def main():
    global coordinator_db, username

    parser = argparse.ArgumentParser(
        description="Run a web-interface for monitoring the cluster.")
    utils.add_redis_params(parser)
    parser.add_argument(
        "--port", "-p", help="port on which the GUI accepts HTTP connections",
        type=int, default=4280)
    parser.add_argument(
        "--user", help="the username to run under (default: %(default)s)",
        default=getpass.getuser())
    args = parser.parse_args()

    coordinator_db = redis_utils.CoordinatorDB(
        args.redis_host, args.redis_port, args.redis_db)
    username = args.user

    try:
        bottle.run(host='0.0.0.0', port=args.port)
    except socket.error as e:
        print e
        # Return error 42 to indicate that we can't bind, so that scripts
        # calling this one can handle that case specially
        return constants.CANNOT_BIND


def main():
    global coordinator_db, log_directory

    parser = argparse.ArgumentParser(
        description="provides a GUI for information on job history")
    utils.add_redis_params(parser)
    parser.add_argument(
        "--port", "-p", help="port on which the GUI accepts HTTP connections",
        type=int, default=4280)
    parser.add_argument(
        "log_directory", help="base log directory for the coordinator")
    args = parser.parse_args()

    coordinator_db = redis_utils.CoordinatorDB(
        args.redis_host, args.redis_port, args.redis_db)
    log_directory = args.log_directory

    try:
        bottle.run(host='0.0.0.0', port=args.port)
    except socket.error as e:
        print e
        # Return error 42 to indicate that we can't bind, so that scripts
        # calling this one can handle that case specially
        return constants.CANNOT_BIND


def main(): parser = argparse.ArgumentParser( description="Run a command or script in over ssh in parallel across " "all nodes in the cluster") parser.add_argument( "--user", help="the username to run under " "(default: %(default)s)", default=getpass.getuser()) parser.add_argument( "--hosts", help="a comma-delimited list of hosts to use instead of " "contacting redis") parser.add_argument( "--ignore_bad_hosts", help="if set, ignore hosts that couldn't be " "reached, rather than failing", action="store_true") parser.add_argument( "--master", "-m", help="if set, also run command on master node", action="store_true") parser.add_argument( "command", help="the command to be run", nargs="+") themis_utils.add_redis_params(parser) args = parser.parse_args() username = args.user hosts = args.hosts if hosts != None: # Separate comma-delimited list hosts = filter(lambda x: len(x) > 0, hosts.split(',')) redis_client = redis.StrictRedis( host=args.redis_host, port=args.redis_port, db=args.redis_db) return_code, _, _ = parallel_ssh( redis_client, args.command, username, hosts, args.ignore_bad_hosts, args.master) return return_code
def main(): parser = argparse.ArgumentParser( description="set up read request queues to replay a given job's inputs" ) parser.add_argument("job_description_file", help="file describing the job " "whose inputs are to be replayed") parser.add_argument("job_ids", nargs="+", help="the job IDs of the jobs being replayed", type=int) parser.add_argument("--skip_phase_zero", default=False, action="store_true", help="don't generate read requests for phase zero") parser.add_argument("--skip_phase_one", default=False, action="store_true", help="don't generate read requests for phase one") parser.add_argument("--phase_zero_sample_size", default=125000000, help="how much data to sample from each file in phase " "zero (default: %(default)s)", type=int) themis_utils.add_redis_params(parser) args = parser.parse_args() return reload_read_request_queues(**vars(args))
def main(): parser = argparse.ArgumentParser( description="submits a job to the Themis coordinator") utils.add_redis_params(parser) parser.add_argument("job_specification_file", help="a JSON file giving enough information about the " "job for Themis to run it") parser.add_argument("--non_blocking", default=False, action="store_true", help="don't wait for jobs to complete before returning") args = parser.parse_args() return run_job(**vars(args))
def main(): parser = argparse.ArgumentParser( description="dump the contents of the coordinator's read request " "queues") utils.add_redis_params(parser) parser.add_argument("directive", choices=["list", "flush"], help="specify " "which action to perform on read request queues") args = parser.parse_args() return read_request_queues(**vars(args))
def main(): parser = argparse.ArgumentParser( description="Check status of a Themis cluster") parser.add_argument("cluster_ID", help="ID of the cluster", type=int) utils.add_redis_params(parser) args = parser.parse_args() redis_client = redis.StrictRedis( host=args.redis_host, port=args.redis_port, db=args.redis_db) return check_cluster_status(args.cluster_ID, redis_client)
def main(): parser = argparse.ArgumentParser( description="Launch a Themis cluster on EC2") parser.add_argument("config", help="Cloud provider config file") parser.add_argument("cluster_name", help="Unique cluster name") parser.add_argument("cluster_size", type=int, help="The number of worker nodes") parser.add_argument("instance_type", help="VM instance type of the worker nodes") parser.add_argument("AMI_ID", help="Amazon Machine Image") parser.add_argument("master_instance_type", help="VM instance type of the master node") parser.add_argument("subnet_ID", help="Subnet IDS for launch") parser.add_argument("security_group_ID", help="Security Group ID") parser.add_argument( "S3_bucket", help="S3 bucket to use for storing configuration files") parser.add_argument("private_key", help="Private key file for ssh") parser.add_argument("public_key", help="Public key file for ssh") parser.add_argument("themis_config_directory", help="Local directory containing Themis " "config files to upload to S3.") parser.add_argument("--placement_group", help="The optional placement group to use") parser.add_argument("--EBS_optimized", action="store_true", default=False, help="Launch VMs with EBS optimization on") parser.add_argument( "--username", default="ec2-user", help="Username to use for logging into EC2. Default %(default)s") utils.add_redis_params(parser) args = parser.parse_args() provider_info = authenticate("amazon", args.config) redis_client = redis.StrictRedis(host=args.redis_host, port=args.redis_port, db=args.redis_db) return launch_amazon_cluster( provider_info, args.cluster_name, args.cluster_size, args.instance_type, args.AMI_ID, args.master_instance_type, args.subnet_ID, args.security_group_ID, args.S3_bucket, args.private_key, args.public_key, args.themis_config_directory, args.placement_group, args.EBS_optimized, args.username, redis_client)
def main(): parser = argparse.ArgumentParser( description="Launch a Themis cluster on EC2") parser.add_argument("config", help="Cloud provider config file") parser.add_argument("cluster_name", help="Unique cluster name") parser.add_argument( "cluster_size", type=int, help="The number of worker nodes") parser.add_argument( "instance_type", help="VM instance type of the worker nodes") parser.add_argument("AMI_ID", help="Amazon Machine Image") parser.add_argument( "master_instance_type", help="VM instance type of the master node") parser.add_argument( "subnet_ID", help="Subnet IDS for launch") parser.add_argument("security_group_ID", help="Security Group ID") parser.add_argument( "S3_bucket", help="S3 bucket to use for storing configuration files") parser.add_argument( "private_key", help="Private key file for ssh") parser.add_argument( "public_key", help="Public key file for ssh") parser.add_argument( "themis_config_directory", help="Local directory containing Themis " "config files to upload to S3.") parser.add_argument( "--placement_group", help="The optional placement group to use") parser.add_argument( "--EBS_optimized", action="store_true", default=False, help="Launch VMs with EBS optimization on") parser.add_argument( "--username", default="ec2-user", help="Username to use for logging into EC2. Default %(default)s") utils.add_redis_params(parser) args = parser.parse_args() provider_info = authenticate("amazon", args.config) redis_client = redis.StrictRedis( host=args.redis_host, port=args.redis_port, db=args.redis_db) return launch_amazon_cluster( provider_info, args.cluster_name, args.cluster_size, args.instance_type, args.AMI_ID, args.master_instance_type, args.subnet_ID, args.security_group_ID, args.S3_bucket, args.private_key, args.public_key, args.themis_config_directory, args.placement_group, args.EBS_optimized, args.username, redis_client)
def main(): parser = argparse.ArgumentParser( description="Cluster utility program for getting IP addresses") utils.add_redis_params(parser) parser.add_argument("command", help="Utility command. Valid commands: all, live") args = parser.parse_args() coordinator_db = redis_utils.CoordinatorDB(args.redis_host, args.redis_port, args.redis_db) assert args.command in ["all", "live"] return cluster_utils(args.command, coordinator_db)
def main(): parser = argparse.ArgumentParser(description="run a script on all nodes") parser.add_argument( "--user", help="the username as whom this job will be run " "(default: %(default)s)", default=getpass.getuser()) parser.add_argument( "--hosts", help="a comma-delimited list of hosts to use instead of " "contacting redis") parser.add_argument( "command", help="the command to be run", nargs=argparse.REMAINDER) themis_utils.add_redis_params(parser) args = parser.parse_args() return run_script(**vars(args))
def main(): parser = argparse.ArgumentParser( description="Cluster utility program for getting IP addresses") utils.add_redis_params(parser) parser.add_argument( "command", help="Utility command. Valid commands: all, live") args = parser.parse_args() coordinator_db = redis_utils.CoordinatorDB( args.redis_host, args.redis_port, args.redis_db) assert args.command in ["all", "live"] return cluster_utils(args.command, coordinator_db)
def main(): parser = argparse.ArgumentParser( description="Launch a Themis cluster on Google") parser.add_argument("config", help="Cloud provider config file") parser.add_argument("cluster_name", help="Unique cluster name") parser.add_argument("cluster_size", type=int, help="The number of worker nodes") parser.add_argument("instance_type", help="VM instance type of the worker nodes") parser.add_argument("local_ssds", help="Number of local SSDs to add to each node", type=int) parser.add_argument("persistent_ssds", help="Number of persistent SSDs to add to each node", type=int) parser.add_argument("image", help="Google Cloud Compute Engine VM Image") parser.add_argument("master_instance_type", help="VM instance type of the master node") parser.add_argument("network", help="Network to run in") parser.add_argument("zone", help="Compute Engine Zone (eg. us-central1-f)") parser.add_argument( "bucket", help="Storage bucket to use for storing configuration files") parser.add_argument("private_key", help="Private key file for ssh") parser.add_argument("public_key", help="Public key file for ssh") parser.add_argument("themis_config_directory", help="Local directory containing Themis " "config files to upload to Storage.") utils.add_redis_params(parser) args = parser.parse_args() provider_info = authenticate("google", args.config) redis_client = redis.StrictRedis(host=args.redis_host, port=args.redis_port, db=args.redis_db) return launch_google_cluster(args.cluster_name, args.cluster_size, args.instance_type, args.local_ssds, args.persistent_ssds, args.image, args.master_instance_type, args.network, args.zone, args.bucket, args.private_key, args.public_key, args.themis_config_directory, provider_info, redis_client)
def main(): parser = argparse.ArgumentParser(description="Launch a Themis cluster on Google") parser.add_argument("config", help="Cloud provider config file") parser.add_argument("cluster_name", help="Unique cluster name") parser.add_argument("cluster_size", type=int, help="The number of worker nodes") parser.add_argument("instance_type", help="VM instance type of the worker nodes") parser.add_argument("local_ssds", help="Number of local SSDs to add to each node", type=int) parser.add_argument("persistent_ssds", help="Number of persistent SSDs to add to each node", type=int) parser.add_argument("image", help="Google Cloud Compute Engine VM Image") parser.add_argument("master_instance_type", help="VM instance type of the master node") parser.add_argument("network", help="Network to run in") parser.add_argument("zone", help="Compute Engine Zone (eg. us-central1-f)") parser.add_argument("bucket", help="Storage bucket to use for storing configuration files") parser.add_argument("private_key", help="Private key file for ssh") parser.add_argument("public_key", help="Public key file for ssh") parser.add_argument( "themis_config_directory", help="Local directory containing Themis " "config files to upload to Storage." ) utils.add_redis_params(parser) args = parser.parse_args() provider_info = authenticate("google", args.config) redis_client = redis.StrictRedis(host=args.redis_host, port=args.redis_port, db=args.redis_db) return launch_google_cluster( args.cluster_name, args.cluster_size, args.instance_type, args.local_ssds, args.persistent_ssds, args.image, args.master_instance_type, args.network, args.zone, args.bucket, args.private_key, args.public_key, args.themis_config_directory, provider_info, redis_client, )
def main(): parser = argparse.ArgumentParser(description="run a script on all nodes") parser.add_argument("--user", help="the username as whom this job will be run " "(default: %(default)s)", default=getpass.getuser()) parser.add_argument( "--hosts", help="a comma-delimited list of hosts to use instead of " "contacting redis") parser.add_argument("command", help="the command to be run", nargs=argparse.REMAINDER) themis_utils.add_redis_params(parser) args = parser.parse_args() return run_script(**vars(args))
def main(): parser = argparse.ArgumentParser(description="Terminate a Themis cluster") parser.add_argument("cluster_ID", help="Unique cluster ID", type=int) parser.add_argument( "--provider", help="The provider to use (amazon or google). Must be " "set if cluster is not found in redis." ) parser.add_argument( "--zone", help="zone that the cluster is running in. Must be set if " "cluster is not found in redis and provider is google.", ) utils.add_redis_params(parser) args = parser.parse_args() redis_client = redis.StrictRedis(host=args.redis_host, port=args.redis_port, db=args.redis_db) return terminate_cluster(args.cluster_ID, redis_client, args.provider, args.zone)
def main(): parser = argparse.ArgumentParser( description="set up read request queues to replay a given job's inputs") parser.add_argument("job_description_file", help="file describing the job " "whose inputs are to be replayed") parser.add_argument( "job_ids", nargs="+", help="the job IDs of the jobs being replayed", type=int) parser.add_argument("--skip_phase_zero", default=False, action="store_true", help="don't generate read requests for phase zero") parser.add_argument("--skip_phase_one", default=False, action="store_true", help="don't generate read requests for phase one") parser.add_argument("--phase_zero_sample_size", default=125000000, help="how much data to sample from each file in phase " "zero (default: %(default)s)", type=int) themis_utils.add_redis_params(parser) args = parser.parse_args() return reload_read_request_queues(**vars(args))
def main(): parser = argparse.ArgumentParser( description="Terminate a Themis cluster") parser.add_argument("cluster_ID", help="Unique cluster ID", type=int) parser.add_argument( "--provider", help="The provider to use (amazon or google). Must be " "set if cluster is not found in redis.") parser.add_argument( "--zone", help="zone that the cluster is running in. Must be set if " "cluster is not found in redis and provider is google.") utils.add_redis_params(parser) args = parser.parse_args() redis_client = redis.StrictRedis( host=args.redis_host, port=args.redis_port, db=args.redis_db) return terminate_cluster( args.cluster_ID, redis_client, args.provider, args.zone)
def main():
    global hdfs_host, hdfs_port, redis_host, redis_port, redis_db

    parser = argparse.ArgumentParser(
        description="WebHDFS proxy that re-writes file requests to "
        "facilitate increased I/O parallelism")
    parser.add_argument(
        "hdfs_namenode", help="the host:port of the HDFS namenode for which "
        "this script will serve as a proxy")
    utils.add_redis_params(parser)

    args = parser.parse_args()

    redis_host = args.redis_host
    redis_port = args.redis_port
    redis_db = args.redis_db

    hdfs_namenode_parts = filter(
        lambda x: len(x) > 0, args.hdfs_namenode.split(':'))

    if len(hdfs_namenode_parts) == 2:
        hdfs_host, hdfs_port = hdfs_namenode_parts
        hdfs_port = int(hdfs_port)
    else:
        hdfs_host = hdfs_namenode_parts[0]
        hdfs_port = 50070

    print "Proxying %s:%d" % (hdfs_host, hdfs_port)

    redis_client = new_redis_client()

    # Reconstruct the path mapping from the existing state of HDFS
    build_hdfs_redis_state(redis_client)

    # Enable debug mode before starting the server; bottle.run() blocks, so
    # calling bottle.debug() after it would never take effect.
    bottle.debug(True)
    bottle.run(host='0.0.0.0', port=5000, server="paste")


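# Illustrative sketch (not part of the original script): the namenode argument
# is split on ':' above, so an explicit "host:port" overrides the port while a
# bare hostname falls back to the 50070 default used in main(). The helper and
# host names below are hypothetical.
def _example_parse_namenode(namenode):
    parts = filter(lambda x: len(x) > 0, namenode.split(':'))
    if len(parts) == 2:
        return parts[0], int(parts[1])
    return parts[0], 50070

# _example_parse_namenode("namenode.example.com:8020")
#     -> ("namenode.example.com", 8020)
# _example_parse_namenode("namenode.example.com")
#     -> ("namenode.example.com", 50070)

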
def main():
    global provider_info, redis_client

    parser = argparse.ArgumentParser(
        description="Run a web-interface for provisioning cloud clusters")
    parser.add_argument("config", help="Cloud provider config file")
    utils.add_redis_params(parser)
    parser.add_argument(
        "--port", "-p", help="port on which the GUI accepts HTTP connections",
        type=int, default=4281)
    args = parser.parse_args()

    redis_client = redis.StrictRedis(
        host=args.redis_host, port=args.redis_port, db=args.redis_db)
    # Test the connection to redis to fail early if redis isn't running.
    clusters = redis_client.smembers("clusters")

    # Perform Amazon configuration
    amazon_info = authenticate("amazon", args.config)
    if amazon_info is not None:
        print "Fetching Amazon provider information..."
        provider_info["amazon"] = amazon_info

        aws = plumbum.local["aws"]

        # Fetch EC2 configuration information.
        # Since these commands take some time we'll run them in the background
        subnet_cmd = aws["--profile"]["themis"]["ec2"]["describe-subnets"] & BG
        placement_group_cmd = \
            aws["--profile"]["themis"]["ec2"]["describe-placement-groups"] & BG
        security_group_cmd = \
            aws["--profile"]["themis"]["ec2"]["describe-security-groups"] & BG
        AMI_cmd = \
            aws["--profile"]["themis"]["ec2"]["describe-images"]\
               ["--owners"]["self"] & BG
        S3_cmd = aws["--profile"]["themis"]["s3api"]["list-buckets"] & BG

        print "Gathering information for subnets..."
        stdout = wait_on_background_command(subnet_cmd)
        result = json.loads(stdout)
        subnets = result["Subnets"]
        subnets = [(x["SubnetId"], x["AvailabilityZone"]) for x in subnets]
        provider_info["amazon"]["subnets"] = subnets

        print "Gathering information for placement groups..."
        stdout = wait_on_background_command(placement_group_cmd)
        result = json.loads(stdout)
        placement_groups = result["PlacementGroups"]
        placement_groups = [x["GroupName"] for x in placement_groups]
        provider_info["amazon"]["placement_groups"] = placement_groups

        print "Gathering information for security groups..."
        stdout = wait_on_background_command(security_group_cmd)
        result = json.loads(stdout)
        security_groups = result["SecurityGroups"]
        security_groups = [
            (x["GroupName"], x["GroupId"]) for x in security_groups]
        provider_info["amazon"]["security_groups"] = security_groups

        print "Gathering information for AMIs..."
        stdout = wait_on_background_command(AMI_cmd)
        result = json.loads(stdout)
        images = result["Images"]
        HVM_images = [(x["Name"], x["ImageId"]) for x in images
                      if x["VirtualizationType"] == "hvm"]
        PV_images = [(x["Name"], x["ImageId"]) for x in images
                     if x["VirtualizationType"] == "paravirtual"]
        provider_info["amazon"]["HVM_images"] = HVM_images
        provider_info["amazon"]["PV_images"] = PV_images

        print "Gathering information for S3 buckets..."
        stdout = wait_on_background_command(S3_cmd)
        result = json.loads(stdout)
        buckets = result["Buckets"]
        buckets = [x["Name"] for x in buckets]
        provider_info["amazon"]["buckets"] = buckets

        # Load instance type and device information
        print "Gathering information for instance types..."
        parser = ConfigParser.SafeConfigParser()
        parser.read(INSTANCE_TYPE_CONFIG)

        device_map = {}
        instances = []
        for instance_type, num_devices in parser.items("devices"):
            device_map[instance_type] = int(num_devices)
            instances.append(instance_type)
        provider_info["amazon"]["instances"] = instances
        provider_info["amazon"]["device_map"] = device_map

        vm_type_map = {}
        for instance_type, vm_type in parser.items("vm_type"):
            vm_type_map[instance_type] = vm_type
        provider_info["amazon"]["vm_type_map"] = vm_type_map

        placement_groups_map = {}
        for instance_type, placement_groups_enabled in parser.items(
                "placement_groups"):
            placement_groups_map[instance_type] = placement_groups_enabled
        provider_info["amazon"]["placement_groups_map"] = placement_groups_map

        ebs_optimized_map = {}
        for instance_type, ebs_optimized in parser.items("EBS_optimized"):
            ebs_optimized_map[instance_type] = ebs_optimized
        provider_info["amazon"]["ebs_optimized_map"] = ebs_optimized_map

    # Perform Google configuration
    google_info = authenticate("google", args.config)
    if google_info is not None:
        print "Fetching Google provider information..."
        provider_info["google"] = google_info

        gcloud = plumbum.local["gcloud"]
        gsutil = plumbum.local["gsutil"]

        # Get list of zones
        print "Retrieving zone information..."
        zones = gcloud["compute"]["zones"]["list"]["--format"]["json"]()
        zones = json.loads(zones)
        zones = [x["name"] for x in zones]
        if len(zones) == 0:
            print >>sys.stderr, "Found no zones"
            sys.exit(1)
        provider_info["google"]["zones"] = zones

        print "Retrieving network information..."
        networks = gcloud["compute"]["networks"]["list"]["--format"]["json"]()
        networks = json.loads(networks)
        networks = [x["name"] for x in networks]
        provider_info["google"]["networks"] = networks

        print "Retrieving image information"
        images = gcloud["compute"]["images"]["list"]["--no-standard-images"]\
            ["--format"]["json"]()
        images = json.loads(images)
        images = [x["name"] for x in images]
        provider_info["google"]["images"] = images

        print "Retrieving storage bucket information"
        buckets = gsutil["ls"]()
        buckets = buckets.split("\n")
        buckets = [bucket for bucket in buckets if len(bucket) > 0]
        buckets = [
            bucket.split("gs://")[1].split("/")[0] for bucket in buckets]
        provider_info["google"]["buckets"] = buckets

        print "Retrieving instance type information"
        instances = gcloud["compute"]["machine-types"]["list"]\
            ["--format"]["json"]()
        instances = json.loads(instances)
        instances = [x["name"] for x in instances]
        instances = list(set(instances))
        instances.sort()
        provider_info["google"]["instances"] = instances

    try:
        bottle.run(host='0.0.0.0', port=args.port)
    except socket.error as e:
        print e
        # Return error 42 to indicate that we can't bind, so that scripts
        # calling this one can handle that case specially
        return constants.CANNOT_BIND


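# Illustrative sketch (not part of the original script): the provider queries
# above use plumbum's BG modifier to launch several slow CLI invocations
# concurrently and only then block on each result; wait_on_background_command()
# is presumably a thin wrapper over the future's wait()/stdout, plus error
# handling. The helper below is hypothetical and uses one stand-in command.
def _example_background_aws_call():
    aws = plumbum.local["aws"]
    # '& BG' starts the command and returns a future instead of blocking
    future = aws["--profile"]["themis"]["ec2"]["describe-subnets"] & BG
    # ... launch other commands or do other work here ...
    future.wait()          # block until the background command exits
    return future.stdout   # captured standard output

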
    # Ask user to delete files
    if delete_files:
        # Paranoia check:
        if valsort_output_dir == "/":
            sys.exit("Can't delete /")

        success = parallel_ssh(
            host_list, "rm", "-rf %s" % valsort_output_dir, verbose)
        if not success:
            sys.exit("Valsort output deletion failed.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Validates graysort input or output files")
    utils.add_redis_params(parser)
    parser.add_argument(
        "job_specification_file", help="a JSON file giving enough "
        "information about the job for Themis to run it")
    parser.add_argument(
        "-j", "--job", help="if specified, validate output files for this "
        "job ID, otherwise validate input files", type=int, default=None)
    parser.add_argument(
        "-p", "--parallel", type=int, default=1,
        help="number of parallel workers to use per disk "
        "(default %(default)s)")
    parser.add_argument(
        "-c", "--cleanup", default=False, action="store_true",
        help="Clean up by only deleting valsort outputs instead of "
        "validating")
    parser.add_argument(
        "-i", "--intermediates", default=False, action="store_true",


def main(): parser = argparse.ArgumentParser(description="") parser.add_argument("themis_binary", help="path to the Themis binary") parser.add_argument("config", help="a YAML file giving configuration " "options for Themis") parser.add_argument("log_directory", help="the base log directory where " "the job runner stores its logs") parser.add_argument("batch_nonce", help="the nonce for all batches " "executed by this node coordinator", type=int) parser.add_argument("--keepalive_refresh", help="the interval, in seconds, " "between refreshes of the key that this node " "coordinator uses to tell the cluster coordinator that " "it's still alive", type=int) parser.add_argument("--keepalive_timeout", help="the amount of time that " "must pass without receiving a keepalive message from " "this node coordinator before the cluster coordinator " "considers it to be dead (default: %(default)s " "seconds)", type=int, default=10) parser.add_argument("--profiler", help="path to the binary of a profiling" "tool to use, for example valgrind or operf") parser.add_argument("--profiler_options", help="options surrounded by " "quotes to pass to the profiler") parser.add_argument("--ld_preload", help="Path to a library to be " "preloaded using LD_PRELOAD.") utils.add_redis_params(parser) utils.add_interfaces_params(parser) args = parser.parse_args() args.log_directory = create_log_directory(args.log_directory) log.info("Logging to %s" % (args.log_directory)) node_coordinator_log = os.path.join( args.log_directory, "node_coordinators", "%s.log" % (socket.getfqdn())) utils.backup_if_exists(node_coordinator_log) logging.basicConfig( format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s", datefmt="%m-%d %H:%M:%S", filename=node_coordinator_log) coordinator = None def signal_handler(signal_id, frame): log.error("Caught signal %s" % (str(signal_id))) os.killpg(0, signal.SIGKILL) sys.exit(1) signal.signal(signal.SIGUSR1, signal_handler) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) try: coordinator = NodeCoordinator(**vars(args)) coordinator.run() except: # Log and print the exception you just caught exception_info = sys.exc_info() exception = exception_info[1] log.exception(exception) traceback.print_exception(*exception_info) if (not isinstance(exception, SystemExit)) and coordinator is not None: log.error("Marking current batch as failed") coordinator.fail_current_batch( "Node coordinator error: " + str(exception_info[1])) finally: if coordinator is not None: coordinator.stop_keepalive()
def main(): parser = argparse.ArgumentParser(description="") parser.add_argument("themis_binary", help="path to the Themis binary") parser.add_argument("config", help="a YAML file giving configuration " "options for Themis") parser.add_argument("log_directory", help="the base log directory where " "the job runner stores its logs") parser.add_argument("batch_nonce", help="the nonce for all batches " "executed by this node coordinator", type=int) parser.add_argument( "--keepalive_refresh", help="the interval, in seconds, " "between refreshes of the key that this node " "coordinator uses to tell the cluster coordinator that " "it's still alive", type=int) parser.add_argument("--keepalive_timeout", help="the amount of time that " "must pass without receiving a keepalive message from " "this node coordinator before the cluster coordinator " "considers it to be dead (default: %(default)s " "seconds)", type=int, default=10) parser.add_argument("--profiler", help="path to the binary of a profiling" "tool to use, for example valgrind or operf") parser.add_argument("--profiler_options", help="options surrounded by " "quotes to pass to the profiler") parser.add_argument("--ld_preload", help="Path to a library to be " "preloaded using LD_PRELOAD.") utils.add_redis_params(parser) utils.add_interfaces_params(parser) args = parser.parse_args() args.log_directory = create_log_directory(args.log_directory) log.info("Logging to %s" % (args.log_directory)) node_coordinator_log = os.path.join(args.log_directory, "node_coordinators", "%s.log" % (socket.getfqdn())) utils.backup_if_exists(node_coordinator_log) logging.basicConfig( format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s", datefmt="%m-%d %H:%M:%S", filename=node_coordinator_log) coordinator = None def signal_handler(signal_id, frame): log.error("Caught signal %s" % (str(signal_id))) os.killpg(0, signal.SIGKILL) sys.exit(1) signal.signal(signal.SIGUSR1, signal_handler) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) try: coordinator = NodeCoordinator(**vars(args)) coordinator.run() except: # Log and print the exception you just caught exception_info = sys.exc_info() exception = exception_info[1] log.exception(exception) traceback.print_exception(*exception_info) if (not isinstance(exception, SystemExit)) and coordinator is not None: log.error("Marking current batch as failed") coordinator.fail_current_batch("Node coordinator error: " + str(exception_info[1])) finally: if coordinator is not None: coordinator.stop_keepalive()
def main():
    # Load cluster.conf
    parser = ConfigParser.SafeConfigParser()
    parser.read(CLUSTER_CONF)
    # Get default log directory
    log_directory = parser.get("cluster", "log_directory")

    parser = argparse.ArgumentParser(
        description="coordinates the execution of Themis jobs")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument(
        "config", help="a YAML file giving configuration options for Themis")
    parser.add_argument(
        "--log_directory", "-l", help="the directory in which to store "
        "coordinator logs (default: %(default)s)", default=log_directory)
    parser.add_argument(
        "--keepalive_refresh", help="the length of time node coordinators "
        "should wait between refreshing keepalive information "
        "(default: %(default)s seconds)", type=int, default=2)
    parser.add_argument(
        "--keepalive_timeout", help="the amount of time that must pass "
        "without receiving a keepalive message from a node coordinator "
        "before the cluster coordinator considers that node to be dead "
        "(default: %(default)s seconds)", type=int, default=10)
    parser.add_argument(
        "--profiler", help="path to the binary of a profiling tool to use, "
        "for example valgrind or operf")
    parser.add_argument(
        "--profiler_options", help="options surrounded by quotes to pass to "
        "the profiler")
    parser.add_argument(
        "--ld_preload", help="Path to a library to be preloaded using "
        "LD_PRELOAD.")
    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.config = os.path.abspath(args.config)
    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    job_status_gui = None
    job_status_gui_out_fp = None
    resource_monitor_gui = None
    resource_monitor_gui_out_fp = None
    coordinator = None

    try:
        # To make the status GUI port distinct for each user but deterministic
        # for a single user, use 2000 + (the md5 hash of the user's username
        # mod 1000) as the web GUI's port number
        username_md5sum = hashlib.md5()
        username_md5sum.update(getpass.getuser())

        job_status_gui_port = (
            (int(username_md5sum.hexdigest(), 16) % 1000 + 2000) / 10) * 10
        resource_monitor_gui_port = (
            (int(username_md5sum.hexdigest(), 16) % 1000 + 3200) / 10) * 10

        print ""
        # Start the resource monitor web GUI
        resource_monitor_gui, resource_monitor_gui_out_fp = \
            start_resource_monitor_gui(args, resource_monitor_gui_port)

        # Start the job status web GUI
        job_status_gui, job_status_gui_out_fp = start_job_status_gui(
            args, job_status_gui_port)
        print ""

        coordinator = ClusterCoordinator(**vars(args))
        coordinator.run()
    finally:
        if job_status_gui is not None:
            log.info(
                "Stopping job status GUI (PID %d)" % (job_status_gui.pid))
            os.killpg(job_status_gui.pid, signal.SIGTERM)
            job_status_gui.wait()

        if job_status_gui_out_fp is not None:
            job_status_gui_out_fp.flush()
            job_status_gui_out_fp.close()

        if resource_monitor_gui is not None:
            log.info("Stopping resource monitor GUI (PID %d)" % (
                resource_monitor_gui.pid))
            os.killpg(resource_monitor_gui.pid, signal.SIGTERM)
            resource_monitor_gui.wait()

        if resource_monitor_gui_out_fp is not None:
            resource_monitor_gui_out_fp.flush()
            resource_monitor_gui_out_fp.close()

        if coordinator is not None:
            log.info("Stopping node coordinators")
            coordinator.stop_node_coordinators()


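# Illustrative sketch (not part of the original script): the port arithmetic
# above hashes the username so each user gets a stable, mostly-unique GUI
# port. The helper below is hypothetical and just restates that computation.
def _example_gui_port(username, offset):
    # md5 of the username, interpreted as a (large) integer
    digest = int(hashlib.md5(username).hexdigest(), 16)
    # Map into [offset, offset + 1000) and round down to a multiple of 10
    # (integer division under Python 2).
    return ((digest % 1000 + offset) / 10) * 10

# For example, _example_gui_port("alice", 2000) reproduces the job status GUI
# port and _example_gui_port("alice", 3200) the resource monitor GUI port for
# a hypothetical user "alice"; the two ranges ([2000, 3000) and [3200, 4200))
# never overlap, so the GUIs never contend for the same port.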