def ExistingSecurityGroup(self, args):
    ''' Look up the security group named args.nsg_name.

    Runs "aws ec2 describe-security-groups" for args.region and scans the
    returned "SecurityGroups" list for an entry whose "GroupName" equals
    args.nsg_name.  On a match the entry's "GroupId" is stored in
    args.nsg_id.

    Returns 0 when the group was found.  Returns 1 when the name is
    empty / None / "None", when the describe command fails, or when no
    group carries that name.
    '''
    wanted = args.nsg_name
    trace(2, "\"%s\"" % (wanted))

    # refuse to search for an unset name
    if wanted in ("", None, "None"):
        error("NetworkSecurityGroup name is \"%s\"" % wanted)
        return 1

    # The whole group list for the region is fetched and searched locally.
    # (the double space before --region is what the command has always had)
    cmd = "%s --region %s" % ("aws ec2 describe-security-groups ", args.region)
    status, stdout_text, _ = self.DoCmd(cmd)
    if status != 0:
        error("Problems describing security groups")
        return 1

    # linear scan by name -- first match wins
    groups = json.loads(stdout_text)["SecurityGroups"]
    for position, group in enumerate(groups):
        if group["GroupName"] == wanted:
            args.nsg_id = group["GroupId"]
            debug(2, "%2d %-12s \"%s\"" % (position,
                                           group["GroupId"],
                                           group["GroupName"]))
            return 0

    # fell off the end of the list: no such group
    trace(2, "Did not find security group: \"%s\"" % wanted)
    return 1
def GetIPSetupCorrectly(self, args):
    ''' Called after 'running' status to read back the VM's public address.

    Runs "aws ec2 describe-instances" for args.vm_id in args.region and
    stores the first instance's 'PublicDnsName' in args.vm_ip.  The
    address has to be re-read because, on aws, it changes across
    stop/start cycles.

    As a cross-check, the 'KeyName' the instance reports is compared with
    args.key_name (the ssh key name given when the VM was created).

    Returns 0 on success, 1 if the describe command fails or the key
    names differ.
    '''
    # full description of the instance -- a large json record, of which
    # only a couple of fields are needed here
    cmd = "aws ec2 describe-instances"
    cmd += " --instance-id %s" % args.vm_id
    cmd += " --region %s" % args.region  # us-west-2
    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        # without this check json.loads() below would choke on the
        # (empty or non-json) output of a failed command
        error("Problems describing instance \"%s\"" % args.vm_id)
        return 1

    decoded_output = json.loads(output)
    instance = decoded_output['Reservations'][0]['Instances'][0]
    args.vm_ip = instance['PublicDnsName']
    key_name = instance['KeyName']
    debug(1, "ip: %s keyname: \"%s\"" % (args.vm_ip, key_name))

    # Paranoid check: the key name the VM reports should always match
    # the one we were configured with.
    if (key_name != args.key_name):
        # format the message first -- error() is given one string
        # everywhere else in this file
        error("args.key_name:\"%s\" != version vm thinks its using:\"%s\"" %
              (args.key_name, key_name))
        return 1

    return 0
def DoCmdNoError(self, cmd):
    ''' ali specific blocking command -- returns command output, doesn't report error

    Logs 'cmd', runs it through the shell and waits for it to finish.
    Only stdout is captured; stderr is not piped, so communicate() hands
    back None for it.

    When the command fails and there is no stderr text, the error
    description is looked for in stdout, which for ali looks like json
    followed by free text:

        {
            "Message": "The specified InstanceId does not exist.",
            "Code": "InvalidInstanceId.NotFound"
        }
        Detail of Server Exception:
        HTTP Status: 404 Error:InvalidInstanceId.NotFound ...

    Returns the tuple (returncode, stdout, errval) where errval is the
    "Message" text when it could be extracted, otherwise whatever
    communicate() returned for stderr.
    '''
    debug(1, cmd)
    self.Log(cmd)
    child = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    output, errval = child.communicate()  # returns data from stdout, stderr
    debug(3, output)  # full output for trace

    # 'not errval' rather than 'errval == ""': stderr is not piped, so
    # errval is None here and an equality test against "" never matched,
    # which left the message extraction below unreachable.
    if child.returncode != 0 and not errval:
        # b'}' works for both a python2 str and python3 bytes stdout
        pos = output.find(b'}')
        if pos != -1:
            jsonbuf = output[:pos + 1]  # only the stuff up to the first '}'
            try:
                errval = json.loads(jsonbuf)['Message']
            except (ValueError, KeyError, TypeError):
                # output was not the json error record we hoped for;
                # this function must not raise, so leave errval alone
                debug(1, "no error message found in command output")
        # No '}' at all: nothing to extract. The raw output is still
        # passed back so callers keep seeing what they see today.

    return (child.returncode, output, errval)  # retcode, stdout, stderr
def CreateVM(self, args):
    ''' Creates a new VM. 'args' holds parameters

    Steps, in order:
      1. refuse if args.vm_id already names an instance
      2. check the ".pem" ssh key file (CheckSSHKeyFilePath)
      3. create the security group if needed (CreateNSG, sets args.nsg_id)
      4. look up the image id (GetImageId, sets args.image_id)
      5. "aws ec2 run-instances" -- sets args.vm_id from the reply
      6. "aws ec2 create-tags" to name the instance args.vm_name
      7. WaitTillRunning(args, "running", TIMEOUT_1)

    Once the instance exists, args is saved with ArgSaveToFile even if
    step 6 or 7 failed, so the recorded vm id is available afterwards.

    Returns 0 on success, non-zero on failure.
    '''
    if (args.vm_id != "None" and args.vm_id is not None):
        error(
            "Instance \"%s\" already exists, run 'deleteVM' first, or 'clean' if stale arg list"
            % args.vm_id)
        return 1

    args.vm_ip = ""  # make sure IP address is clear

    # ssh key file, builds path from options, checks existence
    retcode = self.CheckSSHKeyFilePath(args, ".pem")
    if (retcode != 0):
        return retcode

    # security group, created if needed, nothing done if it already exists
    self.Inform("CreateNSG")
    if (self.CreateNSG(args) != 0):  # sets args.nsg_id
        return 1
    trace(2, "nsg_id: \"%s\" %s" % (args.nsg_name, args.nsg_id))

    # look up image-name, get back the region specific image id
    self.Inform("GetImageId")
    if (self.GetImageId(args) != 0):
        return 1
    trace(2, "image_id: \"%s\" %s" % (args.image_name, args.image_id))

    # with security group and image id, we can now create the instance
    self.Inform("run-instances")
    cmd = "aws ec2 run-instances"
    cmd += " --image-id %s" % args.image_id  # via self.GetImageId()
    cmd += " --instance-type %s" % args.instance_type  # t2.micro
    cmd += " --region %s" % args.region  # us-west-2
    cmd += " --key-name %s" % args.key_name  # my-security-key
    cmd += " --security-group-ids %s" % args.nsg_id  # Security Group
    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        error("Problems creating VM \"%s\"" % args.vm_name)
        return 1  # nothing was created, so nothing to save or delete

    # decode the JSON output
    decoded_output = json.loads(output)
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))
    args.vm_id = decoded_output['Instances'][0]['InstanceId']
    args.vm_ip = ""  # no IP address until we see it running

    # Name the instance.  Done as a separate call instead of inside
    # run-instances, where getting spaces/quotes right was tricky in the
    # bash code this was originally written in.
    self.Inform("create-tags")
    cmd = "aws ec2 create-tags"
    cmd += " --resource %s" % args.vm_id
    cmd += " --tags Key=Name,Value=%s" % args.vm_name  # time-stamped name
    # tag in the region the instance was launched in; without this the
    # CLI falls back to its configured default region
    cmd += " --region %s" % args.region
    retcode, output, errval = self.DoCmd(cmd)

    # wait till the instance is up and running
    if (retcode == 0):
        retcode = self.WaitTillRunning(args, "running", TIMEOUT_1)

    # The instance exists at this point whatever happened above, so save
    # the vm ID and other fields regardless of retcode.
    self.ArgSaveToFile(args)
    debug(2, "createVM returning %d" % retcode)
    return retcode  # 0: success, 1: failure
def CreateSecurityGroup(self, args):
    ''' creates security group. saves it in args.nsg_id

    Steps:
      1. if args.vpcid is unset, take the first VPC reported by
         "aws ec2 describe-vpcs" for args.region
      2. "aws ec2 create-security-group" named args.nsg_name; the
         returned "GroupId" is stored in args.nsg_id
      3. tag the group with Key=Name,Value=args.nsg_name
      4. add one ingress rule per entry in the table below

    Returns 0 on success.  A failure in steps 1-3 returns right away
    with a non-zero code; in step 4 every rule is still attempted and
    the last non-zero return code is passed back.
    '''
    trace(2, "\"%s\" %s" % (args.nsg_name, args.nsg_id))

    # Get the users VPC id if we don't have it
    if args.vpcid in ("", None, "None"):
        cmd = "aws ec2 describe-vpcs"
        cmd += " --region %s" % args.region
        retcode, output, errval = self.DoCmd(cmd)
        if (retcode != 0):
            return retcode
        decoded_output = json.loads(output)
        debug(2, json.dumps(decoded_output, indent=4, sort_keys=True))
        vpcs = decoded_output["Vpcs"]
        if not vpcs:
            # indexing [0] on an empty list would raise IndexError
            error("No VPC found in region \"%s\"" % args.region)
            return 1
        args.vpcid = vpcs[0]["VpcId"]
        debug(1, "args.vpcid <--- %s" % args.vpcid)

    # create the security group, with a meaningful description
    desc = "NSG Generated for %s" % args.vm_name
    cmd = "aws ec2 create-security-group"
    cmd += " --group-name %s" % args.nsg_name
    cmd += " --description \"%s\"" % desc
    cmd += " --vpc-id %s" % args.vpcid
    cmd += " --region %s" % args.region
    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        return retcode

    # get the groupid of the new security group
    decoded_output = json.loads(output)
    debug(2, json.dumps(decoded_output, indent=4, sort_keys=True))
    args.nsg_id = decoded_output["GroupId"]
    debug(1, "args.nsg_id <--- %s" % args.nsg_id)

    # tag new group with our group name (same region as the group itself)
    cmd = "aws ec2 create-tags"
    cmd += " --resource %s" % args.nsg_id
    cmd += " --tags Key=Name,Value=%s" % args.nsg_name
    cmd += " --region %s" % args.region
    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        return retcode

    # Security rules -- kept as a plain table so they are easy to change.
    # Slow, but this code is rarely used; understandability matters more.
    ingress = [
        {
            "IpProtocol": "tcp",
            "ToPort": 22,
            "FromPort": 22,
            "CidrIp": "0.0.0.0/0",
            "Description": "For SSH",
        },
        {
            "IpProtocol": "tcp",
            "ToPort": 443,
            "FromPort": 443,
            "CidrIp": "0.0.0.0/0",
            "Description": "For SSL",
        },
        {
            "IpProtocol": "tcp",
            "ToPort": 5000,
            "FromPort": 5000,
            "CidrIp": "0.0.0.0/0",
            "Description": "For NVIDIA DIGITS6",
        },
        {
            "IpProtocol": "icmp",
            "ToPort": -1,
            "FromPort": 8,
            "CidrIp": "0.0.0.0/0",
            "Description": "To allow to be pinged",
        },
    ]
    # egress rules -- as of 1/2018 there aren't any...

    outer_retcode = 0
    for rule in ingress:
        # the rules carry no "Name" field; the description identifies them
        self.Inform("CreateNSG rule %s.%s" %
                    (args.nsg_name, rule["Description"]))

        cmd = "aws ec2 authorize-security-group-ingress"
        cmd += " --group-id %s" % args.nsg_id
        cmd += " --ip-permissions '[{"  # mini-embedded json like
        cmd += " \"IpProtocol\":\"%s\"," % rule["IpProtocol"]
        # KEEP 'To' before 'From' - no effect for tcp, but required for
        # how Wildcard ICMP type is defined
        cmd += " \"ToPort\":%s," % rule["ToPort"]
        cmd += " \"FromPort\":%s," % rule["FromPort"]
        cmd += " \"IpRanges\": [{"
        cmd += " \"CidrIp\":\"%s\"," % rule["CidrIp"]
        cmd += " \"Description\":\"%s\"" % rule["Description"]
        cmd += " }]"
        cmd += " }]'"
        cmd += " --region %s" % args.region
        retcode, output, errval = self.DoCmd(cmd)
        if (retcode != 0):
            outer_retcode = retcode  # keep any non-zero return code

    return outer_retcode
def CreateVM(self, args):
    ''' Creates a new VM. 'args' holds parameters

    Steps, in order:
      1. refuse if args.vm_id already names an instance
      2. locate the public (".pub") and private ssh key files
      3. build the "ssh-keys=<user>:<public key>" metadata string
      4. optionally add a GPU accelerator (and a "-<n>x<type>gpu" suffix
         to args.vm_name)
      5. run "gcloud ... instances create" and read args.vm_id and
         args.vm_ip out of its json reply
      6. save args (ArgSaveToFile) and drop the address from the ssh
         known_hosts file
      7. verify the returned name, sleep briefly, then WaitTillRunning

    Returns 0 only if every step succeeded, non-zero otherwise.
    '''
    if (args.vm_id != "None" and args.vm_id is not None):
        error(
            "Instance \"%s\" already exists, run 'deleteVM' first, or 'clean' if stale arg list"
            % args.vm_id)
        return 1

    # make sure our persistent IP address is clear
    args.vm_ip = ""

    # public ssh key file: builds path from options, checks existence,
    # leaves the result in args.key_file
    retcode = self.CheckSSHKeyFilePath(args, ".pub")
    if (retcode != 0):
        return retcode
    keyfile_pub = args.key_file

    # Outside of createVM it is the private key file that is needed for
    # all the local ssh'ing, so args.key_file is re-pointed at it now.
    retcode = self.CheckSSHKeyFilePath(args, "")
    if (retcode != 0):
        return retcode

    # Metadata consists of the user name and the ssh public key.  With
    # Google the entire public key string is passed in the metadata, e.g.
    #     ssh-keys=newtonl:ssh-rsa AAAAB3Nza... user@host
    # The key file already starts with "ssh-rsa ", so it is not added.
    with open(keyfile_pub, "r") as f:
        # strip the file's trailing newline so it does not end up inside
        # the quoted command-line argument
        ssh_rsa_data = f.read().strip()
    metadata = "ssh-keys=%s:%s" % (args.user, ssh_rsa_data)

    # With Google there is no network security group to create here.
    # GPUs can be requested at VM init time; with other CSPs the
    # number/type of GPUs is a function of the "instance_type".
    accelerator = None  # don't assign gpus unless asked to
    accelerator_count = 0  # used for delay before ping below
    if (args.accelerator_type not in (None, "", "None")
            and args.accelerator_count > 0):
        accelerator = "%s,count=%d" % (args.accelerator_type,
                                       args.accelerator_count)
        accelerator_count = args.accelerator_count

        # If adding GPUs, add that info to the VM name.  Accelerator
        # types look like "nvidia-tesla-p100", too long for a VM name,
        # so only what follows the last '-' is used.  Google names must
        # be all lowercase letters/numbers.
        if (args.vm_name.find("gpu") == -1):  # haven't added "gpu" yet
            gpu_model = args.accelerator_type[
                args.accelerator_type.rfind("-") + 1:]
            args.vm_name += "-%dx%sgpu" % (args.accelerator_count, gpu_model)

    # Create the VM
    # NOTE: with gcp it is not necessary to assign "network firewall
    # rules" when creating the VM -- they are added later.
    self.Inform("CreateVM")
    cmd = "gcloud --format=\"json\" beta compute"
    cmd += " --project \"%s\" " % args.project  # "my-project"
    cmd += "instances create \"%s\"" % args.vm_name
    cmd += " --zone \"%s\"" % args.region  # "us-west1-b"
    cmd += " --quiet"  # reduces noise output
    cmd += " --machine-type \"%s\"" % args.instance_type  # "n1-standard-1"
    cmd += " --subnet \"%s\"" % args.subnet  # default
    cmd += " --metadata \"%s\"" % metadata
    cmd += " --maintenance-policy \"%s\"" % args.maintenance_policy  # "TERMINATE"
    cmd += " --service-account \"%s\"" % args.service_account
    # (--scopes is intentionally not passed)
    if (accelerator is not None):  # optional if we want GPUs
        cmd += " --accelerator type=%s" % accelerator  # nvidia-tesla-p100,count=1
    cmd += " --min-cpu-platform \"%s\"" % args.min_cpu_platform  # "Automatic"
    cmd += " --image \"%s\"" % args.image_name
    cmd += " --image-project \"%s\"" % args.image_project
    cmd += " --boot-disk-size %d" % args.boot_disk_size  # 32, in GB
    cmd += " --boot-disk-type \"%s\"" % args.boot_disk_type  # "pd-standard"
    cmd += " --boot-disk-device-name \"%s\"" % args.vm_name  # same as VM name

    # execute the command
    rc, output, errval = self.DoCmd(cmd)
    if (rc != 0):
        error("Problems creating VM \"%s\"" % args.vm_name)
        return rc

    # Pull the vm id and the public IP address out of the reply.
    # NOTE: with gcp the address is part of the 'create' output, so
    # there is no need to poll for it.
    decoded_output = json.loads(output)
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))

    # The reply is a list that could describe several instances; our
    # request is for exactly one, hence the [0].
    created = decoded_output[0]
    args.vm_id = created['id']  # may not be needed, all vm_name based
    args.vm_ip = created['networkInterfaces'][0]['accessConfigs'][0]['natIP']

    # Save the vm id now, whether or not the VM turns out to be fully
    # usable: it has to be deleted later in any case.
    self.ArgSaveToFile(args)

    # Google reuses IP addresses a lot.  An old address with a new VM
    # left in the known_hosts file breaks the ssh done by
    # WaitTillRunning, so remove it now (it is also removed on delete).
    self.DeleteIPFromSSHKnownHostsFile(args)

    # quick sanity check -- the name returned from the create command
    # must be the one we asked for
    returned_name = created["name"]
    if (returned_name != args.vm_name):
        error(
            "sanity check: vm name returned \"%s\" != vm_name \"%s\" given to create command"
            % (returned_name, args.vm_name))
        # show the whole reply to help diagnose (the dump used to be
        # computed and thrown away)
        debug(0, json.dumps(decoded_output, indent=4, sort_keys=True))
        return 1

    # WORKAROUND, seen on gcloud only: the VM is up and the first ping
    # succeeds, but the following ssh times out and everything after
    # that fails.  A delay before the first ping avoids it: 5 seconds
    # gave ~30% errors, 10 seconds works for "n1-standard-1" without
    # gpus.  10 more seconds are added per GPU (empirical value).
    delay = 10 + (accelerator_count * 10)
    debug(
        0,
        "WORKAROUND: external network connect - sleep for %d seconds before ping"
        % (delay))
    time.sleep(delay)  # wait before ANY command to the vm

    # gcp returns from create only once the vm is up, so this should be
    # quick -- but it confirms we can really ping and ssh into the VM.
    # (rc is necessarily 0 here; failures returned above.)
    rc = self.WaitTillRunning(args, "RUNNING", TIMEOUT_1)

    # 0 only if the VM is fully up, we have its public IP and can ssh in
    debug(2, "createVM returning %d" % rc)
    return rc  # 0: success, 1: failure