Example #1
def main():
    global coordinator_db, username

    parser = argparse.ArgumentParser(
        description="Run a web-interface for monitoring the cluster.")
    utils.add_redis_params(parser)
    parser.add_argument(
        "--port", "-p", help="port on which the GUI accepts HTTP connections",
        type=int, default=4280)
    parser.add_argument(
        "--user", help="the username to run under "
        "(default: %(default)s)", default=getpass.getuser())
    args = parser.parse_args()

    coordinator_db = redis_utils.CoordinatorDB(
        args.redis_host, args.redis_port, args.redis_db)
    username = args.user

    try:
        bottle.run(host='0.0.0.0', port=args.port)
    except socket.error, e:
        print e
        # Return error 42 to indicate that we can't bind, so that scripts
        # calling this one can handle that case specially
        return constants.CANNOT_BIND
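
Most of these entry points call utils.add_redis_params(parser) before parsing. That helper is not shown in any example; judging from the later reads of args.redis_host, args.redis_port, and args.redis_db, a minimal sketch could look like the following (the flag names, defaults, and help strings are assumptions, not the actual Themis implementation):

import argparse

def add_redis_params(parser):
    # Hypothetical sketch: register the redis options that the examples
    # later read back as args.redis_host / args.redis_port / args.redis_db.
    parser.add_argument("--redis_host", default="localhost",
                        help="redis server hostname (assumed default)")
    parser.add_argument("--redis_port", type=int, default=6379,
                        help="redis server port (assumed default)")
    parser.add_argument("--redis_db", type=int, default=0,
                        help="redis database index (assumed default)")

parser = argparse.ArgumentParser()
add_redis_params(parser)
args = parser.parse_args([])
assert (args.redis_host, args.redis_port, args.redis_db) == \
    ("localhost", 6379, 0)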
Example #2
def main():
    global coordinator_db, log_directory

    parser = argparse.ArgumentParser(
        description="provides a GUI for information on job history")
    utils.add_redis_params(parser)
    parser.add_argument(
        "--port", "-p", help="port on which the GUI accepts HTTP connections",
        type=int, default=4280)
    parser.add_argument("log_directory", help="base log directory for the "
                        "coordinator")
    args = parser.parse_args()

    coordinator_db = redis_utils.CoordinatorDB(
        args.redis_host, args.redis_port, args.redis_db)

    log_directory = args.log_directory

    try:
        bottle.run(host='0.0.0.0', port=args.port)
    except socket.error, e:
        print e
        # Return error 42 to indicate that we can't bind, so that scripts
        # calling this one can handle that case specially
        return constants.CANNOT_BIND
Example #3
def main():
    global coordinator_db, username

    parser = argparse.ArgumentParser(
        description="Run a web-interface for monitoring the cluster.")
    utils.add_redis_params(parser)
    parser.add_argument("--port",
                        "-p",
                        help="port on which the GUI accepts HTTP connections",
                        type=int,
                        default=4280)
    parser.add_argument("--user",
                        help="the username to run under "
                        "(default: %(default)s)",
                        default=getpass.getuser())
    args = parser.parse_args()

    coordinator_db = redis_utils.CoordinatorDB(args.redis_host,
                                               args.redis_port, args.redis_db)
    username = args.user

    try:
        bottle.run(host='0.0.0.0', port=args.port)
    except socket.error, e:
        print e
        # Return error 42 to indicate that we can't bind, so that scripts
        # calling this one can handle that case specially
        return constants.CANNOT_BIND
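
Several of these scripts exit with constants.CANNOT_BIND when the port is already in use. The inline comment pins the value at 42, so a sketch of the assumed constants module is just:

# constants.py -- sketch; the value 42 comes from the inline comment above
CANNOT_BIND = 42

A wrapper script can then compare the child process's exit status against this value and, for instance, retry on a different port.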
Example #4
def main():
    parser = argparse.ArgumentParser(
        description="Run a command or script in over ssh in parallel across "
        "all nodes in the cluster")
    parser.add_argument(
        "--user", help="the username to run under "
        "(default: %(default)s)", default=getpass.getuser())
    parser.add_argument(
        "--hosts", help="a comma-delimited list of hosts to use instead of "
        "contacting redis")
    parser.add_argument(
        "--ignore_bad_hosts", help="if set, ignore hosts that couldn't be "
        "reached, rather than failing", action="store_true")
    parser.add_argument(
        "--master", "-m", help="if set, also run command on master node",
        action="store_true")
    parser.add_argument(
        "command", help="the command to be run", nargs="+")

    themis_utils.add_redis_params(parser)

    args = parser.parse_args()
    username = args.user
    hosts = args.hosts
    if hosts is not None:
        # Separate comma-delimited list
        hosts = filter(lambda x: len(x) > 0, hosts.split(','))

    redis_client = redis.StrictRedis(
        host=args.redis_host, port=args.redis_port, db=args.redis_db)

    return_code, _, _ = parallel_ssh(
        redis_client, args.command, username, hosts, args.ignore_bad_hosts,
        args.master)
    return return_code
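
One portability detail in the --hosts handling above: filter(...) returns a list on Python 2, which this code relies on, but a lazy iterator on Python 3. A standalone sketch of the same splitting logic that behaves identically under both:

def parse_host_list(hosts):
    # Split a comma-delimited host string, dropping empty entries
    # (e.g. from a trailing comma); None means "no explicit host list".
    if hosts is None:
        return None
    return [host for host in hosts.split(',') if host]

assert parse_host_list("a,b,,c,") == ["a", "b", "c"]
assert parse_host_list(None) is None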
Example #5
def main():
    parser = argparse.ArgumentParser(
        description="set up read request queues to replay a given job's inputs"
    )
    parser.add_argument("job_description_file",
                        help="file describing the job "
                        "whose inputs are to be replayed")
    parser.add_argument("job_ids",
                        nargs="+",
                        help="the job IDs of the jobs being replayed",
                        type=int)
    parser.add_argument("--skip_phase_zero",
                        default=False,
                        action="store_true",
                        help="don't generate read requests for phase zero")
    parser.add_argument("--skip_phase_one",
                        default=False,
                        action="store_true",
                        help="don't generate read requests for phase one")
    parser.add_argument("--phase_zero_sample_size",
                        default=125000000,
                        help="how much data to sample from each file in phase "
                        "zero (default: %(default)s)",
                        type=int)

    themis_utils.add_redis_params(parser)

    args = parser.parse_args()
    return reload_read_request_queues(**vars(args))
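
The reload_read_request_queues(**vars(args)) call forwards every parsed option as a keyword argument, so the function's parameters must match the argparse destination names exactly. A minimal, self-contained illustration of the pattern (the names here are invented for the demo, not taken from Themis):

import argparse

def run(job_ids, skip_phase_zero=False):
    # Parameter names must match the argparse destinations exactly.
    return job_ids, skip_phase_zero

parser = argparse.ArgumentParser()
parser.add_argument("job_ids", nargs="+", type=int)
parser.add_argument("--skip_phase_zero", action="store_true")
args = parser.parse_args(["1", "2", "--skip_phase_zero"])
assert vars(args) == {"job_ids": [1, 2], "skip_phase_zero": True}
assert run(**vars(args)) == ([1, 2], True)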
Example #6
def main():
    global coordinator_db, log_directory

    parser = argparse.ArgumentParser(
        description="provides a GUI for information on job history")
    utils.add_redis_params(parser)
    parser.add_argument("--port",
                        "-p",
                        help="port on which the GUI accepts HTTP connections",
                        type=int,
                        default=4280)
    parser.add_argument("log_directory",
                        help="base log directory for the "
                        "coordinator")
    args = parser.parse_args()

    coordinator_db = redis_utils.CoordinatorDB(args.redis_host,
                                               args.redis_port, args.redis_db)

    log_directory = args.log_directory

    try:
        bottle.run(host='0.0.0.0', port=args.port)
    except socket.error, e:
        print e
        # Return error 42 to indicate that we can't bind, so that scripts
        # calling this one can handle that case specially
        return constants.CANNOT_BIND
Example #7
def main():
    parser = argparse.ArgumentParser(
        description="submits a job to the Themis coordinator")
    utils.add_redis_params(parser)
    parser.add_argument("job_specification_file",
                        help="a JSON file giving enough information about the "
                        "job for Themis to run it")
    parser.add_argument("--non_blocking", default=False, action="store_true",
                        help="don't wait for jobs to complete before returning")
    args = parser.parse_args()
    return run_job(**vars(args))
Example #8
def main():
    parser = argparse.ArgumentParser(
        description="dump the contents of the coordinator's read request "
        "queues")
    utils.add_redis_params(parser)

    parser.add_argument("directive", choices=["list", "flush"], help="specify "
                        "which action to perform on read request queues")

    args = parser.parse_args()

    return read_request_queues(**vars(args))
Example #9
def main():
    parser = argparse.ArgumentParser(
        description="Check status of a Themis cluster")

    parser.add_argument("cluster_ID", help="ID of the cluster", type=int)

    utils.add_redis_params(parser)
    args = parser.parse_args()

    redis_client = redis.StrictRedis(
        host=args.redis_host, port=args.redis_port, db=args.redis_db)

    return check_cluster_status(args.cluster_ID, redis_client)
Example #10
def main():
    parser = argparse.ArgumentParser(
        description="Launch a Themis cluster on EC2")

    parser.add_argument("config", help="Cloud provider config file")

    parser.add_argument("cluster_name", help="Unique cluster name")
    parser.add_argument("cluster_size",
                        type=int,
                        help="The number of worker nodes")
    parser.add_argument("instance_type",
                        help="VM instance type of the worker nodes")
    parser.add_argument("AMI_ID", help="Amazon Machine Image")
    parser.add_argument("master_instance_type",
                        help="VM instance type of the master node")
    parser.add_argument("subnet_ID", help="Subnet IDS for launch")
    parser.add_argument("security_group_ID", help="Security Group ID")
    parser.add_argument(
        "S3_bucket", help="S3 bucket to use for storing configuration files")
    parser.add_argument("private_key", help="Private key file for ssh")
    parser.add_argument("public_key", help="Public key file for ssh")
    parser.add_argument("themis_config_directory",
                        help="Local directory containing Themis "
                        "config files to upload to S3.")
    parser.add_argument("--placement_group",
                        help="The optional placement group to use")
    parser.add_argument("--EBS_optimized",
                        action="store_true",
                        default=False,
                        help="Launch VMs with EBS optimization on")
    parser.add_argument(
        "--username",
        default="ec2-user",
        help="Username to use for logging into EC2. Default %(default)s")

    utils.add_redis_params(parser)
    args = parser.parse_args()

    provider_info = authenticate("amazon", args.config)

    redis_client = redis.StrictRedis(host=args.redis_host,
                                     port=args.redis_port,
                                     db=args.redis_db)

    return launch_amazon_cluster(
        provider_info, args.cluster_name, args.cluster_size,
        args.instance_type, args.AMI_ID, args.master_instance_type,
        args.subnet_ID, args.security_group_ID, args.S3_bucket,
        args.private_key, args.public_key, args.themis_config_directory,
        args.placement_group, args.EBS_optimized, args.username, redis_client)
Example #11
def main():
    parser = argparse.ArgumentParser(
        description="Launch a Themis cluster on EC2")

    parser.add_argument("config", help="Cloud provider config file")

    parser.add_argument("cluster_name", help="Unique cluster name")
    parser.add_argument(
        "cluster_size", type=int, help="The number of worker nodes")
    parser.add_argument(
        "instance_type", help="VM instance type of the worker nodes")
    parser.add_argument("AMI_ID", help="Amazon Machine Image")
    parser.add_argument(
        "master_instance_type", help="VM instance type of the master node")
    parser.add_argument(
        "subnet_ID", help="Subnet IDS for launch")
    parser.add_argument("security_group_ID", help="Security Group ID")
    parser.add_argument(
        "S3_bucket", help="S3 bucket to use for storing configuration files")
    parser.add_argument(
        "private_key", help="Private key file for ssh")
    parser.add_argument(
        "public_key", help="Public key file for ssh")
    parser.add_argument(
        "themis_config_directory", help="Local directory containing Themis "
        "config files to upload to S3.")
    parser.add_argument(
        "--placement_group", help="The optional placement group to use")
    parser.add_argument(
        "--EBS_optimized", action="store_true", default=False,
        help="Launch VMs with EBS optimization on")
    parser.add_argument(
        "--username", default="ec2-user",
        help="Username to use for logging into EC2. Default %(default)s")

    utils.add_redis_params(parser)
    args = parser.parse_args()

    provider_info = authenticate("amazon", args.config)

    redis_client = redis.StrictRedis(
        host=args.redis_host, port=args.redis_port, db=args.redis_db)

    return launch_amazon_cluster(
        provider_info, args.cluster_name, args.cluster_size, args.instance_type,
        args.AMI_ID, args.master_instance_type, args.subnet_ID,
        args.security_group_ID, args.S3_bucket, args.private_key,
        args.public_key, args.themis_config_directory, args.placement_group,
        args.EBS_optimized, args.username, redis_client)
Example #12
def main():

    parser = argparse.ArgumentParser(
        description="Cluster utility program for getting IP addresses")
    utils.add_redis_params(parser)
    parser.add_argument("command",
                        help="Utility command. Valid commands: all, live")
    args = parser.parse_args()

    coordinator_db = redis_utils.CoordinatorDB(args.redis_host,
                                               args.redis_port, args.redis_db)

    assert args.command in ["all", "live"]

    return cluster_utils(args.command, coordinator_db)
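
The assert above re-validates input that argparse can reject on its own: declaring choices, as the directive argument in Example #9 does, makes the parser print a usage error instead of raising an AssertionError. The equivalent declaration would be:

parser.add_argument("command", choices=["all", "live"],
                    help="Utility command")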
Example #13
def main():
    parser = argparse.ArgumentParser(description="run a script on all nodes")
    parser.add_argument(
        "--user", help="the username as whom this job will be run "
        "(default: %(default)s)", default=getpass.getuser())
    parser.add_argument(
        "--hosts", help="a comma-delimited list of hosts to use instead of "
        "contacting redis")
    parser.add_argument(
        "command", help="the command to be run", nargs=argparse.REMAINDER)

    themis_utils.add_redis_params(parser)

    args = parser.parse_args()
    return run_script(**vars(args))
Example #14
def main():

    parser = argparse.ArgumentParser(
        description="Cluster utility program for getting IP addresses")
    utils.add_redis_params(parser)
    parser.add_argument(
        "command", help="Utility command. Valid commands: all, live")
    args = parser.parse_args()

    coordinator_db = redis_utils.CoordinatorDB(
        args.redis_host, args.redis_port, args.redis_db)

    assert args.command in ["all", "live"]

    return cluster_utils(args.command, coordinator_db)
Example #15
def main():
    parser = argparse.ArgumentParser(
        description="Launch a Themis cluster on Google")

    parser.add_argument("config", help="Cloud provider config file")

    parser.add_argument("cluster_name", help="Unique cluster name")
    parser.add_argument("cluster_size",
                        type=int,
                        help="The number of worker nodes")
    parser.add_argument("instance_type",
                        help="VM instance type of the worker nodes")
    parser.add_argument("local_ssds",
                        help="Number of local SSDs to add to each node",
                        type=int)
    parser.add_argument("persistent_ssds",
                        help="Number of persistent SSDs to add to each node",
                        type=int)
    parser.add_argument("image", help="Google Cloud Compute Engine VM Image")
    parser.add_argument("master_instance_type",
                        help="VM instance type of the master node")
    parser.add_argument("network", help="Network to run in")
    parser.add_argument("zone", help="Compute Engine Zone (eg. us-central1-f)")
    parser.add_argument(
        "bucket", help="Storage bucket to use for storing configuration files")
    parser.add_argument("private_key", help="Private key file for ssh")
    parser.add_argument("public_key", help="Public key file for ssh")
    parser.add_argument("themis_config_directory",
                        help="Local directory containing Themis "
                        "config files to upload to Storage.")

    utils.add_redis_params(parser)
    args = parser.parse_args()

    provider_info = authenticate("google", args.config)

    redis_client = redis.StrictRedis(host=args.redis_host,
                                     port=args.redis_port,
                                     db=args.redis_db)

    return launch_google_cluster(args.cluster_name, args.cluster_size,
                                 args.instance_type, args.local_ssds,
                                 args.persistent_ssds, args.image,
                                 args.master_instance_type, args.network,
                                 args.zone, args.bucket, args.private_key,
                                 args.public_key, args.themis_config_directory,
                                 provider_info, redis_client)
Example #16
def main():
    parser = argparse.ArgumentParser(description="Launch a Themis cluster on Google")

    parser.add_argument("config", help="Cloud provider config file")

    parser.add_argument("cluster_name", help="Unique cluster name")
    parser.add_argument("cluster_size", type=int, help="The number of worker nodes")
    parser.add_argument("instance_type", help="VM instance type of the worker nodes")
    parser.add_argument("local_ssds", help="Number of local SSDs to add to each node", type=int)
    parser.add_argument("persistent_ssds", help="Number of persistent SSDs to add to each node", type=int)
    parser.add_argument("image", help="Google Cloud Compute Engine VM Image")
    parser.add_argument("master_instance_type", help="VM instance type of the master node")
    parser.add_argument("network", help="Network to run in")
    parser.add_argument("zone", help="Compute Engine Zone (eg. us-central1-f)")
    parser.add_argument("bucket", help="Storage bucket to use for storing configuration files")
    parser.add_argument("private_key", help="Private key file for ssh")
    parser.add_argument("public_key", help="Public key file for ssh")
    parser.add_argument(
        "themis_config_directory", help="Local directory containing Themis " "config files to upload to Storage."
    )

    utils.add_redis_params(parser)
    args = parser.parse_args()

    provider_info = authenticate("google", args.config)

    redis_client = redis.StrictRedis(host=args.redis_host, port=args.redis_port, db=args.redis_db)

    return launch_google_cluster(
        args.cluster_name,
        args.cluster_size,
        args.instance_type,
        args.local_ssds,
        args.persistent_ssds,
        args.image,
        args.master_instance_type,
        args.network,
        args.zone,
        args.bucket,
        args.private_key,
        args.public_key,
        args.themis_config_directory,
        provider_info,
        redis_client,
    )
Example #17
def main():
    parser = argparse.ArgumentParser(description="run a script on all nodes")
    parser.add_argument("--user",
                        help="the username as whom this job will be run "
                        "(default: %(default)s)",
                        default=getpass.getuser())
    parser.add_argument(
        "--hosts",
        help="a comma-delimited list of hosts to use instead of "
        "contacting redis")
    parser.add_argument("command",
                        help="the command to be run",
                        nargs=argparse.REMAINDER)

    themis_utils.add_redis_params(parser)

    args = parser.parse_args()
    return run_script(**vars(args))
Example #18
def main():
    parser = argparse.ArgumentParser(description="Terminate a Themis cluster")

    parser.add_argument("cluster_ID", help="Unique cluster ID", type=int)
    parser.add_argument(
        "--provider", help="The provider to use (amazon or google). Must be " "set if cluster is not found in redis."
    )
    parser.add_argument(
        "--zone",
        help="zone that the cluster is running in. Must be set if "
        "cluster is not found in redis and provider is google.",
    )

    utils.add_redis_params(parser)
    args = parser.parse_args()

    redis_client = redis.StrictRedis(host=args.redis_host, port=args.redis_port, db=args.redis_db)

    return terminate_cluster(args.cluster_ID, redis_client, args.provider, args.zone)
Example #19
def main():
    parser = argparse.ArgumentParser(
        description="set up read request queues to replay a given job's inputs")
    parser.add_argument("job_description_file", help="file describing the job "
                        "whose inputs are to be replayed")
    parser.add_argument(
        "job_ids", nargs="+", help="the job IDs of the jobs being replayed",
        type=int)
    parser.add_argument("--skip_phase_zero", default=False, action="store_true",
                        help="don't generate read requests for phase zero")
    parser.add_argument("--skip_phase_one", default=False, action="store_true",
                        help="don't generate read requests for phase one")
    parser.add_argument("--phase_zero_sample_size", default=125000000,
                        help="how much data to sample from each file in phase "
                        "zero (default: %(default)s)", type=int)

    themis_utils.add_redis_params(parser)

    args = parser.parse_args()
    return reload_read_request_queues(**vars(args))
Example #20
def main():
    parser = argparse.ArgumentParser(
        description="Terminate a Themis cluster")

    parser.add_argument("cluster_ID", help="Unique cluster ID", type=int)
    parser.add_argument(
        "--provider", help="The provider to use (amazon or google). Must be "
        "set if cluster is not found in redis.")
    parser.add_argument(
        "--zone", help="zone that the cluster is running in. Must be set if "
        "cluster is not found in redis and provider is google.")

    utils.add_redis_params(parser)
    args = parser.parse_args()

    redis_client = redis.StrictRedis(
        host=args.redis_host, port=args.redis_port, db=args.redis_db)

    return terminate_cluster(
        args.cluster_ID, redis_client, args.provider, args.zone)
Example #21
def main():
    global hdfs_host, hdfs_port, redis_host, redis_port, redis_db

    parser = argparse.ArgumentParser(
        description="WebHDFS proxy that re-writes file requests to facilitate "
        "increased I/O parallelism")
    parser.add_argument("hdfs_namenode",
                        help="the host:port of the HDFS namenode for which "
                        "this script will serve as a proxy")

    utils.add_redis_params(parser)

    args = parser.parse_args()

    redis_host = args.redis_host
    redis_port = args.redis_port
    redis_db = args.redis_db

    hdfs_namenode_parts = filter(lambda x: len(x) > 0,
                                 args.hdfs_namenode.split(':'))

    if len(hdfs_namenode_parts) == 2:
        hdfs_host, hdfs_port = hdfs_namenode_parts
        hdfs_port = int(hdfs_port)
    else:
        hdfs_host = hdfs_namenode_parts[0]
        hdfs_port = 50070

    print "Proxying %s:%d" % (hdfs_host, hdfs_port)

    redis_client = new_redis_client()

    # Reconstruct the path mapping from the existing state of HDFS
    build_hdfs_redis_state(redis_client)

    bottle.run(host='0.0.0.0', port=5000, server="paste")
    bottle.debug(True)
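
The namenode argument accepts either host or host:port, falling back to WebHDFS's conventional default port 50070. A two-and-three compatible rework of that parsing, shown as a standalone sketch rather than the original code:

def parse_namenode(namenode, default_port=50070):
    # Accept "host", "host:port", or stray colons such as "host:".
    parts = [part for part in namenode.split(':') if part]
    if len(parts) == 2:
        return parts[0], int(parts[1])
    return parts[0], default_port

assert parse_namenode("nn.example.com:8020") == ("nn.example.com", 8020)
assert parse_namenode("nn.example.com") == ("nn.example.com", 50070)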
Example #22
def main():
    global provider_info, redis_client

    parser = argparse.ArgumentParser(
        description="Run a web-interface for provisioning cloud clusters")
    parser.add_argument("config", help="Cloud provider config file")

    utils.add_redis_params(parser)
    parser.add_argument("--port",
                        "-p",
                        help="port on which the GUI accepts HTTP connections",
                        type=int,
                        default=4281)
    args = parser.parse_args()

    redis_client = redis.StrictRedis(host=args.redis_host,
                                     port=args.redis_port,
                                     db=args.redis_db)
    # Test the connection to redis to fail early if redis isn't running.
    clusters = redis_client.smembers("clusters")

    # Perform Amazon configuration
    amazon_info = authenticate("amazon", args.config)
    if amazon_info is not None:
        print "Fetching Amazon provider information..."
        provider_info["amazon"] = amazon_info

        aws = plumbum.local["aws"]

        # Fetch EC2 configuration information.
        # Since these commands take some time we'll run them in the background
        subnet_cmd = aws["--profile"]["themis"]["ec2"]["describe-subnets"] & BG
        placement_group_cmd =\
            aws["--profile"]["themis"]["ec2"]["describe-placement-groups"] & BG
        security_group_cmd =\
            aws["--profile"]["themis"]["ec2"]["describe-security-groups"] & BG
        AMI_cmd =\
            aws["--profile"]["themis"]["ec2"]["describe-images"]\
            ["--owners"]["self"] & BG
        S3_cmd = aws["--profile"]["themis"]["s3api"]["list-buckets"] & BG

        print "Gathering information for subnets..."
        stdout = wait_on_background_command(subnet_cmd)

        result = json.loads(stdout)
        subnets = result["Subnets"]
        subnets = [(x["SubnetId"], x["AvailabilityZone"]) for x in subnets]
        provider_info["amazon"]["subnets"] = subnets

        print "Gathering information for placement groups..."
        stdout = wait_on_background_command(placement_group_cmd)

        result = json.loads(stdout)
        placement_groups = result["PlacementGroups"]
        placement_groups = [x["GroupName"] for x in placement_groups]
        provider_info["amazon"]["placement_groups"] = placement_groups

        print "Gathering information for security groups..."
        stdout = wait_on_background_command(security_group_cmd)

        result = json.loads(stdout)
        security_groups = result["SecurityGroups"]
        security_groups = [(x["GroupName"], x["GroupId"])
                           for x in security_groups]
        provider_info["amazon"]["security_groups"] = security_groups

        print "Gathering information for AMIs..."
        stdout = wait_on_background_command(AMI_cmd)

        result = json.loads(stdout)
        images = result["Images"]
        HVM_images = [(x["Name"], x["ImageId"]) for x in images \
                      if x["VirtualizationType"] == "hvm"]
        PV_images = [(x["Name"], x["ImageId"]) for x in images \
                      if x["VirtualizationType"] == "paravirtual"]
        provider_info["amazon"]["HVM_images"] = HVM_images
        provider_info["amazon"]["PV_images"] = PV_images

        print "Gathering information for S3 buckets..."
        stdout = wait_on_background_command(S3_cmd)

        result = json.loads(stdout)
        buckets = result["Buckets"]
        buckets = [x["Name"] for x in buckets]
        provider_info["amazon"]["buckets"] = buckets

        # Load instance type and device information
        print "Gathering information for instance types..."
        parser = ConfigParser.SafeConfigParser()
        parser.read(INSTANCE_TYPE_CONFIG)

        device_map = {}
        instances = []
        for instance_type, num_devices in parser.items("devices"):
            device_map[instance_type] = int(num_devices)
            instances.append(instance_type)
        provider_info["amazon"]["instances"] = instances
        provider_info["amazon"]["device_map"] = device_map

        vm_type_map = {}
        for instance_type, vm_type in parser.items("vm_type"):
            vm_type_map[instance_type] = vm_type
        provider_info["amazon"]["vm_type_map"] = vm_type_map

        placement_groups_map = {}
        for instance_type, placement_groups_enabled in parser.items(
                "placement_groups"):
            placement_groups_map[instance_type] = placement_groups_enabled
        provider_info["amazon"]["placement_groups_map"] = placement_groups_map

        ebs_optimized_map = {}
        for instance_type, ebs_optimized in parser.items("EBS_optimized"):
            ebs_optimized_map[instance_type] = ebs_optimized
        provider_info["amazon"]["ebs_optimized_map"] = ebs_optimized_map

    # Perform Google configuration
    google_info = authenticate("google", args.config)
    if google_info is not None:
        print "Fetching Google provider information..."
        provider_info["google"] = google_info

        gcloud = plumbum.local["gcloud"]
        gsutil = plumbum.local["gsutil"]

        # Get list of zones
        print "Retrieving zone information..."
        zones = gcloud["compute"]["zones"]["list"]\
                ["--format"]["json"]()
        zones = json.loads(zones)
        zones = [x["name"] for x in zones]
        if len(zones) == 0:
            print >> sys.stderr, "Found no zones"
            sys.exit(1)
        provider_info["google"]["zones"] = zones

        print "Retrieving network information..."
        networks = gcloud["compute"]["networks"]["list"]["--format"]["json"]()
        networks = json.loads(networks)
        networks = [x["name"] for x in networks]
        provider_info["google"]["networks"] = networks

        print "Retrieving image information"
        images = gcloud["compute"]["images"]["list"]["--no-standard-images"]\
                 ["--format"]["json"]()
        images = json.loads(images)
        images = [x["name"] for x in images]
        provider_info["google"]["images"] = images

        print "Retrieving storage bucket information"
        buckets = gsutil["ls"]()
        buckets = buckets.split("\n")
        buckets = [bucket for bucket in buckets if len(bucket) > 0]
        buckets = [
            bucket.split("gs://")[1].split("/")[0] for bucket in buckets
        ]
        provider_info["google"]["buckets"] = buckets

        print "Retrieving instance type information"
        instances = gcloud["compute"]["machine-types"]["list"]\
                    ["--format"]["json"]()
        instances = json.loads(instances)
        instances = [x["name"] for x in instances]
        instances = list(set(instances))
        instances.sort()
        provider_info["google"]["instances"] = instances
    try:
        bottle.run(host='0.0.0.0', port=args.port)
    except socket.error, e:
        print e
        # Return error 42 to indicate that we can't bind, so that scripts
        # calling this one can handle that case specially
        return constants.CANNOT_BIND
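
The cmd & BG expressions use plumbum's background-execution modifier: each aws invocation starts immediately and returns a future, and wait_on_background_command presumably joins that future and hands back its output. A minimal sketch of such a helper under that assumption (plumbum futures do expose .wait() and .stdout):

from plumbum import BG, local

def wait_on_background_command(future):
    # Hypothetical stand-in for the helper used above: block until the
    # backgrounded command exits, then return its standard output.
    future.wait()
    return future.stdout

future = local["ls"]["-1"] & BG  # starts running immediately
print(wait_on_background_command(future))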
Example #23
def main():
    global provider_info, redis_client

    parser = argparse.ArgumentParser(
        description="Run a web-interface for provisioning cloud clusters")
    parser.add_argument("config", help="Cloud provider config file")

    utils.add_redis_params(parser)
    parser.add_argument(
        "--port", "-p", help="port on which the GUI accepts HTTP connections",
        type=int, default=4281)
    args = parser.parse_args()

    redis_client = redis.StrictRedis(
        host=args.redis_host, port=args.redis_port, db=args.redis_db)
    # Test the connection to redis to fail early if redis isn't running.
    clusters = redis_client.smembers("clusters")

    # Perform Amazon configuration
    amazon_info = authenticate("amazon", args.config)
    if amazon_info is not None:
        print "Fetching Amazon provider information..."
        provider_info["amazon"] = amazon_info

        aws = plumbum.local["aws"]

        # Fetch EC2 configuration information.
        # Since these commands take some time we'll run them in the background
        subnet_cmd = aws["--profile"]["themis"]["ec2"]["describe-subnets"] & BG
        placement_group_cmd =\
            aws["--profile"]["themis"]["ec2"]["describe-placement-groups"] & BG
        security_group_cmd =\
            aws["--profile"]["themis"]["ec2"]["describe-security-groups"] & BG
        AMI_cmd =\
            aws["--profile"]["themis"]["ec2"]["describe-images"]\
            ["--owners"]["self"] & BG
        S3_cmd = aws["--profile"]["themis"]["s3api"]["list-buckets"] & BG

        print "Gathering information for subnets..."
        stdout = wait_on_background_command(subnet_cmd)

        result = json.loads(stdout)
        subnets = result["Subnets"]
        subnets = [(x["SubnetId"], x["AvailabilityZone"]) for x in subnets]
        provider_info["amazon"]["subnets"] = subnets

        print "Gathering information for placement groups..."
        stdout = wait_on_background_command(placement_group_cmd)

        result = json.loads(stdout)
        placement_groups = result["PlacementGroups"]
        placement_groups = [x["GroupName"] for x in placement_groups]
        provider_info["amazon"]["placement_groups"] = placement_groups

        print "Gathering information for security groups..."
        stdout = wait_on_background_command(security_group_cmd)

        result = json.loads(stdout)
        security_groups = result["SecurityGroups"]
        security_groups = [(x["GroupName"], x["GroupId"]) for x in security_groups]
        provider_info["amazon"]["security_groups"] = security_groups

        print "Gathering information for AMIs..."
        stdout = wait_on_background_command(AMI_cmd)

        result = json.loads(stdout)
        images = result["Images"]
        HVM_images = [(x["Name"], x["ImageId"]) for x in images \
                      if x["VirtualizationType"] == "hvm"]
        PV_images = [(x["Name"], x["ImageId"]) for x in images \
                      if x["VirtualizationType"] == "paravirtual"]
        provider_info["amazon"]["HVM_images"] = HVM_images
        provider_info["amazon"]["PV_images"] = PV_images

        print "Gathering information for S3 buckets..."
        stdout = wait_on_background_command(S3_cmd)

        result = json.loads(stdout)
        buckets = result["Buckets"]
        buckets = [x["Name"] for x in buckets]
        provider_info["amazon"]["buckets"] = buckets

        # Load instance type and device information
        print "Gathering information for instance types..."
        parser = ConfigParser.SafeConfigParser()
        parser.read(INSTANCE_TYPE_CONFIG)

        device_map = {}
        instances = []
        for instance_type, num_devices in parser.items("devices"):
            device_map[instance_type] = int(num_devices)
            instances.append(instance_type)
        provider_info["amazon"]["instances"] = instances
        provider_info["amazon"]["device_map"] = device_map

        vm_type_map = {}
        for instance_type, vm_type in parser.items("vm_type"):
            vm_type_map[instance_type] = vm_type
        provider_info["amazon"]["vm_type_map"] = vm_type_map

        placement_groups_map = {}
        for instance_type, placement_groups_enabled in parser.items("placement_groups"):
            placement_groups_map[instance_type] = placement_groups_enabled
        provider_info["amazon"]["placement_groups_map"] = placement_groups_map

        ebs_optimized_map = {}
        for instance_type, ebs_optimized in parser.items("EBS_optimized"):
            ebs_optimized_map[instance_type] = ebs_optimized
        provider_info["amazon"]["ebs_optimized_map"] = ebs_optimized_map

    # Perform Google configuration
    google_info = authenticate("google", args.config)
    if google_info is not None:
        print "Fetching Google provider information..."
        provider_info["google"] = google_info

        gcloud = plumbum.local["gcloud"]
        gsutil = plumbum.local["gsutil"]

        # Get list of zones
        print "Retrieving zone information..."
        zones = gcloud["compute"]["zones"]["list"]\
                ["--format"]["json"]()
        zones = json.loads(zones)
        zones = [x["name"] for x in zones]
        if len(zones) == 0:
            print >>sys.stderr, "Found no zones"
            sys.exit(1)
        provider_info["google"]["zones"] = zones

        print "Retrieving network information..."
        networks = gcloud["compute"]["networks"]["list"]["--format"]["json"]()
        networks = json.loads(networks)
        networks = [x["name"] for x in networks]
        provider_info["google"]["networks"] = networks

        print "Retrieving image information"
        images = gcloud["compute"]["images"]["list"]["--no-standard-images"]\
                 ["--format"]["json"]()
        images = json.loads(images)
        images = [x["name"] for x in images]
        provider_info["google"]["images"] = images

        print "Retrieving storage bucket information"
        buckets = gsutil["ls"]()
        buckets = buckets.split("\n")
        buckets = [bucket for bucket in buckets if len(bucket) > 0]
        buckets = [bucket.split("gs://")[1].split("/")[0] for bucket in buckets]
        provider_info["google"]["buckets"] = buckets

        print "Retrieving instance type information"
        instances = gcloud["compute"]["machine-types"]["list"]\
                    ["--format"]["json"]()
        instances = json.loads(instances)
        instances = [x["name"] for x in instances]
        instances = list(set(instances))
        instances.sort()
        provider_info["google"]["instances"] = instances
    try:
        bottle.run(host='0.0.0.0', port=args.port)
    except socket.error, e:
        print e
        # Return error 42 to indicate that we can't bind, so that scripts
        # calling this one can handle that case specially
        return constants.CANNOT_BIND
Example #24
                                 valsort_output_dir)
    if delete_files == "yes":
        # Paranoia check:
        if valsort_output_dir == "/":
            sys.exit("Can't delete /")

        success = parallel_ssh(host_list, "rm", "-rf %s" % valsort_output_dir,
                               verbose)
        if not success:
            sys.exit("Valsort output deletion failed.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Validates graysort input or output files")
    utils.add_redis_params(parser)

    parser.add_argument("job_specification_file",
                        help="a JSON file giving enough information "
                        "about the job for Themis to run it")
    parser.add_argument(
        "-j",
        "--job",
        help="if specified, validate output files for this job "
        "ID, otherwise validate input files",
        type=int,
        default=None)
    parser.add_argument(
        "-p",
        "--parallel",
        type=int,
Example #25
    # Ask user to delete files
    if delete_files:
        # Paranoia check:
        if valsort_output_dir == "/":
            sys.exit("Can't delete /")

        success = parallel_ssh(
            host_list, "rm", "-rf %s" % valsort_output_dir, verbose)
        if not success:
            sys.exit("Valsort output deletion failed.")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Validates graysort input or output files")
    utils.add_redis_params(parser)

    parser.add_argument(
        "job_specification_file", help="a JSON file giving enough information "
        "about the job for Themis to run it")
    parser.add_argument(
        "-j", "--job", help="if specified, validate output files for this job "
        "ID, otherwise validate input files", type=int, default=None)
    parser.add_argument(
        "-p", "--parallel", type=int, default=1,
        help="number of parallel workers to use per disk (default %(default)s)")
    parser.add_argument(
        "-c", "--cleanup", default=False, action="store_true",
        help="Clean up by only deleting valsort outputs instead of validating")
    parser.add_argument(
        "-i", "--intermediates", default=False, action="store_true",
Example #26
def main():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config", help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument("log_directory", help="the base log directory where "
                        "the job runner stores its logs")
    parser.add_argument("batch_nonce", help="the nonce for all batches "
                        "executed by this node coordinator", type=int)
    parser.add_argument("--keepalive_refresh", help="the interval, in seconds, "
                        "between refreshes of the key that this node "
                        "coordinator uses to tell the cluster coordinator that "
                        "it's still alive", type=int)
    parser.add_argument("--keepalive_timeout", help="the amount of time that "
                        "must pass without receiving a keepalive message from "
                        "this node coordinator before the cluster coordinator "
                        "considers it to be dead (default: %(default)s "
                        "seconds)", type=int, default=10)
    parser.add_argument("--profiler", help="path to the binary of a profiling"
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options", help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload", help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")

    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    node_coordinator_log = os.path.join(
        args.log_directory, "node_coordinators",
        "%s.log" % (socket.getfqdn()))

    utils.backup_if_exists(node_coordinator_log)

    logging.basicConfig(
        format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s",
        datefmt="%m-%d %H:%M:%S",
        filename=node_coordinator_log)

    coordinator = None

    def signal_handler(signal_id, frame):
        log.error("Caught signal %s" % (str(signal_id)))
        os.killpg(0, signal.SIGKILL)

        sys.exit(1)

    signal.signal(signal.SIGUSR1, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        coordinator = NodeCoordinator(**vars(args))
        coordinator.run()
    except:
        # Log and print the exception you just caught
        exception_info = sys.exc_info()

        exception = exception_info[1]

        log.exception(exception)

        traceback.print_exception(*exception_info)

        if (not isinstance(exception, SystemExit)) and coordinator is not None:
            log.error("Marking current batch as failed")
            coordinator.fail_current_batch(
                "Node coordinator error: " + str(exception_info[1]))

    finally:
        if coordinator is not None:
            coordinator.stop_keepalive()
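
utils.backup_if_exists is not shown in these examples; given the call site, it plausibly moves any previous log aside before logging.basicConfig reopens the same path. One way such a helper could look (pure assumption, not the Themis implementation):

import os

def backup_if_exists(path):
    # Hypothetical sketch: rotate an existing file to numbered backups
    # (node.log -> node.log.1, node.log.2, ...) so a fresh log can be
    # written at the original path.
    if not os.path.exists(path):
        return
    suffix = 1
    while os.path.exists("%s.%d" % (path, suffix)):
        suffix += 1
    os.rename(path, "%s.%d" % (path, suffix))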
Example #27
def main():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config",
                        help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument("log_directory",
                        help="the base log directory where "
                        "the job runner stores its logs")
    parser.add_argument("batch_nonce",
                        help="the nonce for all batches "
                        "executed by this node coordinator",
                        type=int)
    parser.add_argument(
        "--keepalive_refresh",
        help="the interval, in seconds, "
        "between refreshes of the key that this node "
        "coordinator uses to tell the cluster coordinator that "
        "it's still alive",
        type=int)
    parser.add_argument("--keepalive_timeout",
                        help="the amount of time that "
                        "must pass without receiving a keepalive message from "
                        "this node coordinator before the cluster coordinator "
                        "considers it to be dead (default: %(default)s "
                        "seconds)",
                        type=int,
                        default=10)
    parser.add_argument("--profiler",
                        help="path to the binary of a profiling"
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options",
                        help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload",
                        help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")

    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    node_coordinator_log = os.path.join(args.log_directory,
                                        "node_coordinators",
                                        "%s.log" % (socket.getfqdn()))

    utils.backup_if_exists(node_coordinator_log)

    logging.basicConfig(
        format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s",
        datefmt="%m-%d %H:%M:%S",
        filename=node_coordinator_log)

    coordinator = None

    def signal_handler(signal_id, frame):
        log.error("Caught signal %s" % (str(signal_id)))
        os.killpg(0, signal.SIGKILL)

        sys.exit(1)

    signal.signal(signal.SIGUSR1, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        coordinator = NodeCoordinator(**vars(args))
        coordinator.run()
    except:
        # Log and print the exception you just caught
        exception_info = sys.exc_info()

        exception = exception_info[1]

        log.exception(exception)

        traceback.print_exception(*exception_info)

        if (not isinstance(exception, SystemExit)) and coordinator is not None:
            log.error("Marking current batch as failed")
            coordinator.fail_current_batch("Node coordinator error: " +
                                           str(exception_info[1]))

    finally:
        if coordinator is not None:
            coordinator.stop_keepalive()
Example #28
def main():
    # Load cluster.conf
    parser = ConfigParser.SafeConfigParser()
    parser.read(CLUSTER_CONF)

    # Get default log directory
    log_directory = parser.get("cluster", "log_directory")

    parser = argparse.ArgumentParser(
        description="coordinates the execution of Themis jobs")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config", help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument("--log_directory", "-l",
                        help="the directory in which to store coordinator logs "
                        "(default: %(default)s)", default=log_directory)
    parser.add_argument("--keepalive_refresh", help="the length of time node "
                        "coordinators should wait between refreshing keepalive "
                        "information (default: %(default)s seconds)", type=int,
                        default=2)
    parser.add_argument("--keepalive_timeout", help="the amount of time that "
                        "must pass without receiving a keepalive message from "
                        "a node coordinator before the cluster coordinator "
                        "considers that node to be dead (default: %(default)s "
                        "seconds)", type=int, default=10)
    parser.add_argument("--profiler", help="path to the binary of a profiling"
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options", help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload", help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")

    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.config = os.path.abspath(args.config)

    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    job_status_gui = None
    job_status_gui_out_fp = None

    resource_monitor_gui = None
    resource_monitor_gui_out_fp = None

    coordinator = None

    try:
        # To make the GUI ports distinct for each user but deterministic for
        # a single user, use an offset (2000 or 3200) + (the md5 hash of the
        # user's username mod 1000), rounded down to a multiple of 10, as
        # each web GUI's port number
        username_md5sum = hashlib.md5()
        username_md5sum.update(getpass.getuser())

        job_status_gui_port = (
            (int(username_md5sum.hexdigest(), 16) % 1000 + 2000) / 10) * 10
        resource_monitor_gui_port = (
            (int(username_md5sum.hexdigest(), 16) % 1000 + 3200) / 10) * 10

        print ""

        # Start the resource monitor web GUI
        resource_monitor_gui, resource_monitor_gui_out_fp = \
            start_resource_monitor_gui(args, resource_monitor_gui_port)

        # Start the job status web GUI
        job_status_gui, job_status_gui_out_fp = start_job_status_gui(
            args, job_status_gui_port)

        print ""

        coordinator = ClusterCoordinator(**vars(args))
        coordinator.run()
    finally:
        if job_status_gui is not None:
            log.info("Stopping job status GUI (PID %d)" % (job_status_gui.pid))
            os.killpg(job_status_gui.pid, signal.SIGTERM)
            job_status_gui.wait()

        if job_status_gui_out_fp is not None:
            job_status_gui_out_fp.flush()
            job_status_gui_out_fp.close()

        if resource_monitor_gui is not None:
            log.info("Stopping resource monitor GUI (PID %d)" % (
                    resource_monitor_gui.pid))
            os.killpg(resource_monitor_gui.pid, signal.SIGTERM)
            resource_monitor_gui.wait()

        if resource_monitor_gui_out_fp is not None:
            resource_monitor_gui_out_fp.flush()
            resource_monitor_gui_out_fp.close()

        if coordinator is not None:
            log.info("Stopping node coordinators")
            coordinator.stop_node_coordinators()
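
The GUI port derivation hashes the username so that each user gets a stable, distinct port, then rounds down to a multiple of 10; note that / is integer division under Python 2, which is what the code above assumes. The same computation written to run under either interpreter:

import hashlib

def gui_port(username, base):
    # Stable per-user port: md5(username) mod 1000, offset by base,
    # rounded down to a multiple of 10 (// is explicit floor division).
    digest = int(hashlib.md5(username.encode("utf-8")).hexdigest(), 16)
    return ((digest % 1000 + base) // 10) * 10

# With base=2000 the result always lies in [2000, 2990] and ends in 0.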
Example #29
def main():
    # Load cluster.conf
    parser = ConfigParser.SafeConfigParser()
    parser.read(CLUSTER_CONF)

    # Get default log directory
    log_directory = parser.get("cluster", "log_directory")

    parser = argparse.ArgumentParser(
        description="coordinates the execution of Themis jobs")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config",
                        help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument(
        "--log_directory",
        "-l",
        help="the directory in which to store coordinator logs "
        "(default: %(default)s)",
        default=log_directory)
    parser.add_argument(
        "--keepalive_refresh",
        help="the length of time node "
        "coordinators should wait between refreshing keepalive "
        "information (default: %(default)s seconds)",
        type=int,
        default=2)
    parser.add_argument("--keepalive_timeout",
                        help="the amount of time that "
                        "must pass without receiving a keepalive message from "
                        "a node coordinator before the cluster coordinator "
                        "considers that node to be dead (default: %(default)s "
                        "seconds)",
                        type=int,
                        default=10)
    parser.add_argument("--profiler",
                        help="path to the binary of a profiling"
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options",
                        help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload",
                        help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")

    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.config = os.path.abspath(args.config)

    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    job_status_gui = None
    job_status_gui_out_fp = None

    resource_monitor_gui = None
    resource_monitor_gui_out_fp = None

    coordinator = None

    try:
        # To make the GUI ports distinct for each user but deterministic for
        # a single user, use an offset (2000 or 3200) + (the md5 hash of the
        # user's username mod 1000), rounded down to a multiple of 10, as
        # each web GUI's port number
        username_md5sum = hashlib.md5()
        username_md5sum.update(getpass.getuser())

        job_status_gui_port = (
            (int(username_md5sum.hexdigest(), 16) % 1000 + 2000) / 10) * 10
        resource_monitor_gui_port = (
            (int(username_md5sum.hexdigest(), 16) % 1000 + 3200) / 10) * 10

        print ""

        # Start the resource monitor web GUI
        resource_monitor_gui, resource_monitor_gui_out_fp = \
            start_resource_monitor_gui(args, resource_monitor_gui_port)

        # Start the job status web GUI
        job_status_gui, job_status_gui_out_fp = start_job_status_gui(
            args, job_status_gui_port)

        print ""

        coordinator = ClusterCoordinator(**vars(args))
        coordinator.run()
    finally:
        if job_status_gui is not None:
            log.info("Stopping job status GUI (PID %d)" % (job_status_gui.pid))
            os.killpg(job_status_gui.pid, signal.SIGTERM)
            job_status_gui.wait()

        if job_status_gui_out_fp is not None:
            job_status_gui_out_fp.flush()
            job_status_gui_out_fp.close()

        if resource_monitor_gui is not None:
            log.info("Stopping resource monitor GUI (PID %d)" %
                     (resource_monitor_gui.pid))
            os.killpg(resource_monitor_gui.pid, signal.SIGTERM)
            resource_monitor_gui.wait()

        if resource_monitor_gui_out_fp is not None:
            resource_monitor_gui_out_fp.flush()
            resource_monitor_gui_out_fp.close()

        if coordinator is not None:
            log.info("Stopping node coordinators")
            coordinator.stop_node_coordinators()
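
Finally, the teardown above stops each GUI with os.killpg(pid, signal.SIGTERM), which only works if the GUI subprocess was started as the leader of its own process group. A sketch of how a helper like start_job_status_gui might arrange that (hypothetical; only the killpg side appears in the original):

import os
import subprocess

def start_gui(script, port, log_path):
    # Hypothetical: launch the GUI in a new session/process group (via
    # setsid) so os.killpg(gui.pid, SIGTERM) can later stop it cleanly.
    out_fp = open(log_path, "w")
    gui = subprocess.Popen(
        ["python", script, "--port", str(port)],
        stdout=out_fp, stderr=subprocess.STDOUT,
        preexec_fn=os.setsid)
    return gui, out_fp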