def testEphemeralPackage(self, mock_mkdtemp):
    """Builds a real ephemeral package and verifies its tarball name.

    mkdtemp is mocked so the package is built under the test's temp dir.
    """
    mock_mkdtemp.return_value = self._tmp_dir
    if os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR'):
      # This test requires setuptools which is not available.
      logging.info('Skipping testEphemeralPackage')
      return
    package_path = dependency_utils.build_ephemeral_package()
    package_name = os.path.basename(package_path)
    self.assertRegex(package_name, r'tfx_ephemeral-.*\.tar.gz')
def testEphemeralPackageMocked(self, mock_subprocess_call, mock_mkdtemp):
    """Verifies build_ephemeral_package with the sdist subprocess mocked out.

    The fake subprocess call checks the command shape and plants a dummy
    tarball in the expected dist/ directory, which the function under test
    should then return.
    """
    testdata_dir = os.path.join(os.path.dirname(__file__), 'testdata')
    csv_fixture = os.path.join(testdata_dir, 'test.csv')
    expected_package = 'mypackage.tar.gz'

    def fake_sdist_call(cmd):
      # Command is expected to be [python, setup.py, 'sdist'].
      self.assertEqual(3, len(cmd))
      self.assertEqual(sys.executable, cmd[0])
      self.assertEqual('sdist', cmd[2])
      setup_file = cmd[1]
      dist_dir = os.path.join(os.path.dirname(setup_file), 'dist')
      tf.io.gfile.makedirs(dist_dir)
      # Plant a dummy package where build_ephemeral_package will look.
      tf.io.gfile.copy(csv_fixture, os.path.join(dist_dir, expected_package))

    mock_subprocess_call.side_effect = fake_sdist_call
    mock_mkdtemp.return_value = self._tmp_dir
    package_path = dependency_utils.build_ephemeral_package()
    self.assertEqual(expected_package, os.path.basename(package_path))
def start_cmle_training(input_dict: Dict[Text, List[types.Artifact]],
                        output_dict: Dict[Text, List[types.Artifact]],
                        exec_properties: Dict[Text, Any],
                        executor_class_path: Text,
                        training_inputs: Dict[Text, Any]):
  """Start a trainer job on CMLE.

  This is done by forwarding the inputs/outputs/exec_properties to the
  tfx.scripts.run_executor module on a CMLE training job interpreter.

  Args:
    input_dict: Passthrough input dict for tfx.components.Trainer.executor.
    output_dict: Passthrough input dict for tfx.components.Trainer.executor.
    exec_properties: Passthrough input dict for
      tfx.components.Trainer.executor.
    executor_class_path: class path for TFX core default trainer.
    training_inputs: Training input for CMLE training job. 'pythonModule',
      'pythonVersion' and 'runtimeVersion' will be inferred by the runner. For
      the full set of parameters supported, refer to
      https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models#creating_a_model_version.

  Returns:
    None

  Raises:
    RuntimeError: if the Google Cloud AI Platform training job failed.
  """
  training_inputs = training_inputs.copy()

  # Remove cmle_args from exec_properties so the CMLE-side trainer doesn't
  # try to launch another CMLE job and recurse into itself.
  # Use `or {}` so a missing (or None) 'custom_config' is a no-op instead of
  # raising TypeError on the membership test (`key in None`), as the previous
  # `if key in exec_properties.get('custom_config')` did.
  custom_config = exec_properties.get('custom_config') or {}
  for gaip_training_key in ('cmle_training_args', 'gaip_training_args'):
    # pop with a default removes the key if present, in place.
    custom_config.pop(gaip_training_key, None)

  json_inputs = artifact_utils.jsonify_artifact_dict(input_dict)
  tf.logging.info('json_inputs=\'%s\'.', json_inputs)
  json_outputs = artifact_utils.jsonify_artifact_dict(output_dict)
  tf.logging.info('json_outputs=\'%s\'.', json_outputs)
  json_exec_properties = json.dumps(exec_properties)
  tf.logging.info('json_exec_properties=\'%s\'.', json_exec_properties)

  # Configure CMLE job
  api_client = discovery.build('ml', 'v1')
  job_args = [
      '--executor_class_path', executor_class_path, '--inputs', json_inputs,
      '--outputs', json_outputs, '--exec-properties', json_exec_properties
  ]
  training_inputs['args'] = job_args
  training_inputs['pythonModule'] = 'tfx.scripts.run_executor'
  training_inputs['pythonVersion'] = _get_caip_python_version()
  # runtimeVersion should be same as <major>.<minor> of currently
  # installed tensorflow version.
  training_inputs['runtimeVersion'] = _get_tf_runtime_version()

  # Pop project_id so CMLE doesn't complain about an unexpected parameter.
  # It's been a stowaway in cmle_args and has finally reached its destination.
  project = training_inputs.pop('project')
  project_id = 'projects/{}'.format(project)

  package_uris = training_inputs.get('packageUris', [])
  if package_uris:
    tf.logging.info('Following packageUris \'%s\' are provided by user.',
                    package_uris)
  else:
    # No user-provided packages: build an ephemeral TFX package and stage it
    # in the job dir so the CMLE interpreter can install it.
    local_package = dependency_utils.build_ephemeral_package()
    # TODO(b/125451545): Use a safe temp dir instead of jobDir.
    cloud_package = os.path.join(training_inputs['jobDir'],
                                 os.path.basename(local_package))
    io_utils.copy_file(local_package, cloud_package, True)
    training_inputs['packageUris'] = [cloud_package]
    tf.logging.info('Package %s will be used',
                    training_inputs['packageUris'])

  job_name = 'tfx_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
  job_spec = {'jobId': job_name, 'trainingInput': training_inputs}

  # Submit job to CMLE
  tf.logging.info('Submitting job=\'{}\', project=\'{}\' to CMLE.'.format(
      job_name, project))
  request = api_client.projects().jobs().create(
      body=job_spec, parent=project_id)
  request.execute()

  # Wait for CMLE job to finish
  job_id = '{}/jobs/{}'.format(project_id, job_name)
  request = api_client.projects().jobs().get(name=job_id)
  response = request.execute()
  while response['state'] not in ('SUCCEEDED', 'FAILED'):
    time.sleep(_POLLING_INTERVAL_IN_SECONDS)
    response = request.execute()

  if response['state'] == 'FAILED':
    err_msg = 'Job \'{}\' did not succeed. Detailed response {}.'.format(
        job_name, response)
    tf.logging.error(err_msg)
    raise RuntimeError(err_msg)

  # CMLE training complete
  tf.logging.info('Job \'{}\' successful.'.format(job_name))