예제 #1
0
 def testVMNotCreatedByExecGroup(self):
     """Checks _VMCreatedByExecGroup is falsy for a default LabelsValue."""
     helper = tpu_utils.Instance(self.track)
     spec = helper.BuildInstanceSpec(
         'fake-instance', 'fake-zone', 'fake-machine-type', -1, False,
         'default')
     # Default-constructed labels: nothing is passed to LabelsValue().
     spec.labels = self.instances_messages.Instance.LabelsValue()
     created_by_exec_group = helper._VMCreatedByExecGroup(spec)
     self.assertFalse(created_by_exec_group)
예제 #2
0
 def testInstanceIsRunning(self):
     """Checks IsRunning equals True when the status is set to RUNNING."""
     helper = tpu_utils.Instance(self.track)
     spec = helper.BuildInstanceSpec(
         'fake-instance', 'fake-zone', 'fake-machine-type', -1, False,
         'default')
     status_enum = self.instances_messages.Instance.StatusValueValuesEnum
     spec.status = status_enum.RUNNING
     running = helper.IsRunning(spec)
     # Equality with True (not mere truthiness) is what is being asserted.
     self.assertEqual(running, True)
예제 #3
0
 def testVMCreatedByExecGroup(self):
     """Checks _VMCreatedByExecGroup is truthy with createMultipleLabels()."""
     helper = tpu_utils.Instance(self.track)
     spec = helper.BuildInstanceSpec(
         'fake-instance', 'fake-zone', 'fake-machine-type', -1, False,
         'default')
     labels_cls = self.instances_messages.Instance.LabelsValue
     # The label entries come from the test fixture helper on this class.
     spec.labels = labels_cls(
         additionalProperties=self.createMultipleLabels())
     created_by_exec_group = helper._VMCreatedByExecGroup(spec)
     self.assertTrue(created_by_exec_group)
예제 #4
0
 def testImageFamilyFromTensorflowVersion(self):
     """Checks the image family produced for several version/flag pairs."""
     # Each row: (version string, boolean second argument, expected family).
     cases = [
         ['1.6', True, 'tf-1-6-gpu'],
         ['1.6.35', False, 'tf-1-6-35'],
         ['1.6', False, 'tf-1-6'],
     ]
     helper = tpu_utils.Instance(self.track)
     for version, flag, expected_family in cases:
         actual_family = helper._ImageFamilyFromTensorflowVersion(
             version, flag)
         self.assertEqual(actual_family, expected_family)
예제 #5
0
 def testInstanceIsNotRunning(self):
     """Checks IsRunning is falsy for every status other than RUNNING."""
     helper = tpu_utils.Instance(self.track)
     spec = helper.BuildInstanceSpec(
         'fake-instance', 'fake-zone', 'fake-machine-type', -1, False,
         'default')
     statuses = self.instances_messages.Instance.StatusValueValuesEnum
     non_running = [
         status for status in statuses if status != statuses.RUNNING
     ]
     # The same spec object is reused; only its status changes per iteration.
     for status in non_running:
         spec.status = status
         self.assertFalse(helper.IsRunning(spec))
예제 #6
0
    def Run(self, args):
        """Collects descriptive rows for an execution group's VM and TPU node.

        Args:
            args: Parsed command arguments; `execution_group_name` and `zone`
                are read here, and `args` is first passed to
                `tpu_utils.DefaultArgs.ValidateZone`.

        Returns:
            A list of `GetResult` rows. If the VM lookup raises
            `HttpNotFoundError` the list holds a single 'Not Found' row.
            If only the TPU node lookup raises it, the VM rows are followed
            by a 'TPU Node status:' 'Not Found' row.
        """
        tpu_utils.DefaultArgs.ValidateZone(args)

        rows = []
        vm_helper = tpu_utils.Instance(self.ReleaseTrack())
        try:
            vm = vm_helper.Get(args.execution_group_name, args.zone)
        except HttpNotFoundError:
            # As it stands, we provide vm-only option but no tpu-only option.
            # So if there is no VM, then we can safely short-circuit and claim
            # the execution group is not found.
            rows.append(GetResult('Execution Group Status:', 'Not Found'))
            return rows

        # The `and` chain yields the first falsy operand, so a missing
        # interface list or entry is reported as-is instead of raising.
        vm_ip = (vm.networkInterfaces and vm.networkInterfaces[0]
                 and vm.networkInterfaces[0].networkIP)
        rows.extend([
            GetResult('Compute Engine Instance IP Address:', vm_ip),
            GetResult('Compute Engine Created:', vm.creationTimestamp),
            GetResult('Compute Engine Machine Type:', vm.machineType),
        ])

        tpu_helper = tpu_utils.TPUNode(self.ReleaseTrack())
        try:
            tpu_node = tpu_helper.Get(args.execution_group_name, args.zone)
        except HttpNotFoundError:
            tpu_node = None
            rows.append(GetResult('TPU Node status:', 'Not Found'))

        if not tpu_node:
            return rows

        # Same short-circuit style as the VM address above.
        tpu_ip = (tpu_node.networkEndpoints and tpu_node.networkEndpoints[0]
                  and tpu_node.networkEndpoints[0].ipAddress)
        preemptible = (tpu_node.schedulingConfig
                       and tpu_node.schedulingConfig.preemptible)
        rows.extend([
            GetResult('TPU Accelerator Type:', tpu_node.acceleratorType),
            GetResult('TPU IP Address:', tpu_ip),
            GetResult('TPU TF Version:', tpu_node.tensorflowVersion),
            GetResult('TPU Service Account:', tpu_node.serviceAccount),
            GetResult('TPU Created:', tpu_node.createTime),
            GetResult('TPU State:', tpu_node.state),
            GetResult('TPU Health:', tpu_node.health),
            GetResult('TPU Preemptible:', preemptible),
        ])
        return rows
예제 #7
0
    def testImageFamilyResolutionFromTensorflowVersion(self):
        """Resolves images for two versions after registering expectations."""
        helper = tpu_utils.Instance(self.track)

        # Each row: (expected image family, project, version, boolean flag).
        cases = (
            ('tf-1-6-gpu', 'fake-project', '1.6', True),
            ('tf-1-7-35', 'fake-project-2', '1.7.35', False),
        )
        for family, project, version, flag in cases:
            # Register the expected family lookup before triggering it.
            self.ExpectComputeImagesGetFamily(family, project,
                                              'fake-image-self-link')
            helper.ResolveImageFromTensorflowVersion(version, project, flag)
예제 #8
0
 def testImageFamilyFromTensorflowVersionWithParseError(self):
     """Checks versions '1.7-RC3' and '1.6-RC1' raise ParseError."""
     helper = tpu_utils.Instance(self.track)
     parse_error = tpu_utils.TensorflowVersionParser.ParseError

     rc3_pattern = (
         r'Invalid tensorflow version:1.7-RC3 \(non-empty modifier\); please '
         'set the --gce-image flag')
     with self.assertRaisesRegex(parse_error, rc3_pattern):
         helper._ImageFamilyFromTensorflowVersion('1.7-RC3', False)

     rc1_pattern = (
         r'Invalid tensorflow version:1.6-RC1 \(non-empty modifier\); please set '
         'the --gce-image flag')
     with self.assertRaisesRegex(parse_error, rc1_pattern):
         helper._ImageFamilyFromTensorflowVersion('1.6-RC1', True)
예제 #9
0
 def _makeTestInstanceSpec(self,
                           instance_name,
                           preemptible_vm,
                           source_image,
                           network='default'):
     """Builds an instance spec with fixed zone, machine type and size value.

     Args:
         instance_name: Passed as the first positional argument.
         preemptible_vm: Forwarded as the `preemptible` keyword.
         source_image: Forwarded as the `source_image` keyword.
         network: Forwarded as the `network` keyword; defaults to 'default'.

     Returns:
         Whatever `tpu_utils.Instance.BuildInstanceSpec` returns.
     """
     # Only these three values vary between callers; the rest is fixed.
     variable_options = dict(preemptible=preemptible_vm,
                             source_image=source_image,
                             network=network)
     helper = tpu_utils.Instance(self.track)
     spec = helper.BuildInstanceSpec(
         instance_name, 'central2-a', 'n1-standard-1', 250,
         **variable_options)
     return spec
예제 #10
0
  def Run(self, args):
    """Deletes the execution group's VM (unless args.tpu_only) and TPU node.

    Both delete requests are issued before waiting on either returned
    operation reference. An HttpNotFoundError at any step is reported via
    log.status and does not abort the remaining steps.

    Args:
      args: Parsed command arguments; `execution_group_name`, `zone` and
        `tpu_only` are read here.

    Returns:
      A list with the wait result of each operation that was started and
      awaited without HttpNotFoundError.
    """
    tpu_utils.DefaultArgs.ValidateZone(args)

    group_name = args.execution_group_name
    vm_missing = 'Instance:{} not found, possibly already deleted.'
    tpu_missing = 'TPU Node:{} not found, possibly already deleted.'
    results = []

    vm_op = None
    if not args.tpu_only:
      instance = tpu_utils.Instance(self.ReleaseTrack())
      try:
        vm_op = instance.Delete(group_name, args.zone)
      except HttpNotFoundError:
        log.status.Print(vm_missing.format(group_name))

    tpu_op = None
    tpu = tpu_utils.TPUNode(self.ReleaseTrack())
    try:
      tpu_op = tpu.Delete(group_name, args.zone)
    except HttpNotFoundError:
      log.status.Print(tpu_missing.format(group_name))

    # `instance` is only bound when a VM delete was attempted, which is also
    # the only way vm_op can be set.
    if vm_op:
      try:
        vm_result = instance.WaitForOperationNoResources(
            vm_op, 'Deleting GCE VM')
        results.append(vm_result)
      except HttpNotFoundError:
        log.status.Print(vm_missing.format(group_name))

    if tpu_op:
      try:
        tpu_result = tpu.WaitForOperationNoResources(
            tpu_op, 'Deleting TPU node')
        results.append(tpu_result)
      except HttpNotFoundError:
        log.status.Print(tpu_missing.format(group_name))

    return results
예제 #11
0
  def Run(self, args):
    """Lists execution groups in args.zone with a coarse status for each.

    Every listed VM produces one ListResult. The status is 'Paused' when no
    TPU node shares the VM's name, 'Running' when both helpers report
    IsRunning, and 'Unknown Status' otherwise.

    Args:
      args: Parsed command arguments; only `zone` is read here.

    Returns:
      The ListResult entries, sorted.
    """
    instance_helper = tpu_utils.Instance(self.ReleaseTrack())
    vms_by_name = {vm.name: vm for vm in instance_helper.List(args.zone)}

    node_helper = tpu_utils.TPUNode(self.ReleaseTrack())
    tpus_by_name = {
        node_helper.NodeName(tpu): tpu for tpu in node_helper.List(args.zone)
    }

    results = []
    for name, vm in vms_by_name.items():
      if name not in tpus_by_name:
        status = 'Paused'
      elif (instance_helper.IsRunning(vm)
            and node_helper.IsRunning(tpus_by_name[name])):
        status = 'Running'
      else:
        status = 'Unknown Status'
      results.append(ListResult(name, status))
    return sorted(results)
예제 #12
0
    def Run(self, args):
        """Creates the TPU node (unless args.vm_only) and starts the VM.

        Args:
            args: Parsed command arguments; `execution_group_name`, `zone`,
                `vm_only`, `accelerator_type`, `tf_version`, `preemptible`
                and `network` are read here.

        Returns:
            A list of operation wait results: the VM start result first,
            then the TPU creation result, each only if its operation was
            started. The list is empty when the TPU create call raises
            HttpConflictError or the VM start call raises HttpNotFoundError.
        """
        tpu_utils.DefaultArgs.ValidateZone(args)

        group_name = args.execution_group_name
        results = []
        tpu = tpu_utils.TPUNode(self.ReleaseTrack())

        tpu_op = None
        if not args.vm_only:
            try:
                tpu_op = tpu.Create(group_name, args.accelerator_type,
                                    args.tf_version, args.zone,
                                    args.preemptible, args.network)
            except HttpConflictError:
                log.status.Print('TPU Node with name:{} already exists, '
                                 'try a different name'.format(group_name))
                return results

        instance = tpu_utils.Instance(self.ReleaseTrack())
        vm_op = None
        try:
            vm_op = instance.Start(group_name, args.zone)
        except HttpNotFoundError:
            log.status.Print(
                'Instance:{} not found, possibly deleted.'.format(group_name))
            return results

        # Both requests are already issued; now wait, VM first.
        if vm_op:
            vm_result = instance.WaitForOperation(vm_op, 'Starting GCE VM')
            results.append(vm_result)

        if tpu_op:
            tpu_result = tpu.WaitForOperation(
                tpu_op, 'Creating TPU node:{}'.format(group_name))
            results.append(tpu_result)

        return results
예제 #13
0
    def Run(self, args):
        """Creates the TPU node and/or VM of an execution group, then SSHes in.

        With `args.dry_run` set, only `self.DryRun(args)` is invoked. An
        unset `args.tf_version` is filled in from
        `LatestStableTensorflowVersion`. The TPU node is created unless
        `args.vm_only`; the VM is created unless `args.tpu_only`. Both create
        requests are issued before waiting on either operation.

        Args:
            args: Parsed command arguments. `args.tf_version` is overwritten
                here when it is initially falsy.

        Returns:
            A list of results in this order: TPU wait result, VM wait result,
            SSH result (each only when applicable). The list is empty for a
            dry run, and when a handled HttpNotFoundError or
            HttpConflictError is logged.
        """
        results = []
        if args.dry_run:
            self.DryRun(args)
            return results

        tpu = tpu_utils.TPUNode(self.ReleaseTrack())
        if not args.tf_version:
            try:
                args.tf_version = tpu.LatestStableTensorflowVersion(args.zone)
            except HttpNotFoundError:
                log.err.Print(
                    'Could not find stable Tensorflow version, please '
                    'set tensorflow version flag using --tf-version')
                return results

        create_tpu = not args.vm_only
        create_vm = not args.tpu_only

        if create_tpu:
            try:
                tpu_op = tpu.Create(args.name, args.accelerator_type,
                                    args.tf_version, args.zone,
                                    args.preemptible, args.network)
            except HttpConflictError:
                log.err.Print('TPU Node with name:{} already exists, '
                              'try a different name'.format(args.name))
                return results

        if create_vm:
            instance = tpu_utils.Instance(self.ReleaseTrack())
            # An explicitly supplied image wins; otherwise resolve one from
            # the TF version.
            image = args.gce_image or (
                instance.ResolveImageFromTensorflowVersion(
                    args.tf_version, 'ml-images', args.use_dl_images))
            try:
                vm_op = instance.Create(
                    args.name, args.zone, args.machine_type,
                    utils.BytesToGb(args.disk_size), args.preemptible_vm,
                    image, args.network)
            except HttpConflictError:
                message = ('VM with name:{} already exists, '
                           'try a different name.').format(args.name)
                if create_tpu:
                    # The TPU create request above was already issued.
                    message += (' TPU Node:{} creation is underway and will '
                                'need to be deleted.'.format(args.name))
                log.err.Print(message)
                return results

        if create_tpu:
            tpu_result = tpu.WaitForOperation(
                tpu_op, 'Creating TPU node:{}'.format(args.name))
            results.append(tpu_result)

        if create_vm:
            vm_result = instance.WaitForOperation(
                vm_op, 'Creating GCE VM:{}'.format(args.name))
            results.append(vm_result)

            ssh = tpu_utils.SSH(self.ReleaseTrack())
            results.append(ssh.SSHToInstance(args, vm_result))

        return results