Пример #1
0
    def testPipelineSchemaSuccessfulRun(self):
        """get_schema succeeds once a SchemaGen schema artifact exists."""
        # Register the pipeline first.
        create_flags = {
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_DSL_PATH: self.pipeline_path
        }
        beam_handler.BeamHandler(create_flags).create_pipeline()

        schema_flags = {
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_NAME: self.pipeline_name,
        }
        handler = beam_handler.BeamHandler(schema_flags)

        # Plant a fake schema file where SchemaGen would have written it.
        schema_dir = base_driver._generate_output_uri(  # pylint: disable=protected-access
            os.path.join(self.pipeline_root, 'SchemaGen'), 'schema', 3)
        fileio.makedirs(schema_dir)
        with open(os.path.join(schema_dir, 'schema.pbtxt'), 'w') as f:
            f.write('SCHEMA')

        with self.captureWritesToStream(sys.stdout) as captured:
            handler.get_schema()
            copied_schema = os.path.abspath('schema.pbtxt')
            self.assertIn('Path to schema: {}'.format(copied_schema),
                          captured.contents())
            self.assertIn(
                '*********SCHEMA FOR {}**********'.format(
                    self.pipeline_name.upper()), captured.contents())
            self.assertTrue(fileio.exists(copied_schema))
Пример #2
0
    def testUpdatePipeline(self):
        """update_pipeline overwrites an existing pipeline's stored args."""
        # Create the pipeline from the first DSL file.
        first_dsl = os.path.join(self.chicago_taxi_pipeline_dir,
                                 'test_pipeline_beam_1.py')
        beam_handler.BeamHandler({
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_DSL_PATH: first_dsl
        }).create_pipeline()

        # Update it with the second DSL file.
        second_dsl = os.path.join(self.chicago_taxi_pipeline_dir,
                                  'test_pipeline_beam_2.py')
        handler = beam_handler.BeamHandler({
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_DSL_PATH: second_dsl
        })
        handler.update_pipeline()

        pipeline_dir = os.path.join(
            handler._handler_home_dir,
            self.pipeline_args[labels.PIPELINE_NAME], '')
        self.assertTrue(
            fileio.exists(os.path.join(pipeline_dir, 'pipeline_args.json')))
Пример #3
0
  def testPipelineSchemaSuccessfulRun(self):
    """get_schema copies the schema locally and echoes its contents."""
    # Register the pipeline first.
    beam_handler.BeamHandler({
        labels.ENGINE_FLAG: self.engine,
        labels.PIPELINE_DSL_PATH: self.pipeline_path
    }).create_pipeline()

    handler = beam_handler.BeamHandler({
        labels.ENGINE_FLAG: self.engine,
        labels.PIPELINE_NAME: self.pipeline_name,
    })

    # Plant a fake schema artifact in the pipeline root.
    schema_dir = os.path.join(self.pipeline_root, 'SchemaGen', 'output', '3')
    tf.io.gfile.makedirs(schema_dir)
    with open(os.path.join(schema_dir, 'schema.pbtxt'), 'w') as f:
      f.write('SCHEMA')

    with self.captureWritesToStream(sys.stdout) as captured:
      handler.get_schema()
      local_schema = os.path.join(os.getcwd(), 'schema.pbtxt')
      self.assertIn('Path to schema: {}'.format(local_schema),
                    captured.contents())
      self.assertIn(
          '*********SCHEMA FOR {}**********'.format(self.pipeline_name.upper()),
          captured.contents())
      self.assertTrue(tf.io.gfile.exists(local_schema))
Пример #4
0
def detect_handler(flags_dict: Dict[Text, Any]) -> base_handler.BaseHandler:
  """Detect handler from the environment.

  Details:
    When the engine flag is set to 'auto', this method first finds all the
    packages in the local environment. The environment is first checked
    for multiple orchestrators and if true the user must rerun the command with
    required engine. If only one orchestrator is present, the engine is set to
    that.

  Args:
    flags_dict: A dictionary containing the flags of a command.

  Returns:
    Corresponding Handler object.
  """
  # Decode the pip output instead of str()-ing the bytes object; str() embeds
  # the b'...' repr, so the substring checks below only worked by accident.
  packages_list = subprocess.check_output(['pip', 'freeze',
                                           '--local']).decode('utf-8')
  if (labels.AIRFLOW_PACKAGE_NAME in packages_list and
      labels.KUBEFLOW_PACKAGE_NAME in packages_list):
    sys.exit('Multiple orchestrators found. Choose one using --engine flag.')
  if labels.AIRFLOW_PACKAGE_NAME in packages_list:
    click.echo('Detected Airflow.')
    flags_dict[labels.ENGINE_FLAG] = 'airflow'
    from tfx.tools.cli.handler import airflow_handler  # pylint: disable=g-import-not-at-top
    return airflow_handler.AirflowHandler(flags_dict)
  elif labels.KUBEFLOW_PACKAGE_NAME in packages_list:
    click.echo('Detected Kubeflow.')
    flags_dict[labels.ENGINE_FLAG] = 'kubeflow'
    from tfx.tools.cli.handler import kubeflow_handler  # pylint: disable=g-import-not-at-top
    return kubeflow_handler.KubeflowHandler(flags_dict)
  # TODO(b/132286477):Update to beam runner later.
  else:
    click.echo('Detected Beam.')
    flags_dict[labels.ENGINE_FLAG] = 'beam'
    from tfx.tools.cli.handler import beam_handler  # pylint: disable=g-import-not-at-top
    return beam_handler.BeamHandler(flags_dict)
Пример #5
0
def create_handler(flags_dict: Dict[Text, Any]) -> base_handler.BaseHandler:
  """Instantiate the handler selected by the --engine flag.

  Args:
    flags_dict: A dictionary containing the flags of a command.

  Raises:
    RuntimeError: When engine is not supported by TFX.

  Returns:
    Corresponding Handler object.

  """
  engine = flags_dict[labels.ENGINE_FLAG]
  if engine == 'airflow':
    from tfx.tools.cli.handler import airflow_handler  # pylint: disable=g-import-not-at-top
    return airflow_handler.AirflowHandler(flags_dict)
  if engine == 'kubeflow':
    from tfx.tools.cli.handler import kubeflow_handler  # pylint: disable=g-import-not-at-top
    return kubeflow_handler.KubeflowHandler(flags_dict)
  if engine == 'beam':
    from tfx.tools.cli.handler import beam_handler  # pylint: disable=g-import-not-at-top
    return beam_handler.BeamHandler(flags_dict)
  if engine == 'auto':
    # Fall back to environment detection.
    return detect_handler(flags_dict)
  raise RuntimeError('Engine {} is not supported.'.format(engine))
Пример #6
0
def create_handler(flags_dict: Dict[Text, Any]) -> base_handler.BaseHandler:
    """Instantiate the handler selected by the --engine flag.

    Verifies that the requested orchestrator package is installed before
    constructing its handler.

    Args:
      flags_dict: A dictionary containing the flags of a command.

    Raises:
      RuntimeError: When engine is not supported by TFX.

    Returns:
      Corresponding Handler object.
    """
    engine = flags_dict[labels.ENGINE_FLAG]
    packages_list = str(subprocess.check_output(['pip', 'freeze', '--local']))
    if engine == 'airflow':
        if labels.AIRFLOW_PACKAGE_NAME not in packages_list:
            sys.exit('Airflow not found.')
        from tfx.tools.cli.handler import airflow_handler  # pylint: disable=g-import-not-at-top
        return airflow_handler.AirflowHandler(flags_dict)
    if engine == 'kubeflow':
        if labels.KUBEFLOW_PACKAGE_NAME not in packages_list:
            sys.exit('Kubeflow not found.')
        from tfx.tools.cli.handler import kubeflow_handler  # pylint: disable=g-import-not-at-top
        return kubeflow_handler.KubeflowHandler(flags_dict)
    if engine == 'beam':
        from tfx.tools.cli.handler import beam_handler  # pylint: disable=g-import-not-at-top
        return beam_handler.BeamHandler(flags_dict)
    if engine == 'local':
        from tfx.tools.cli.handler import local_handler  # pylint: disable=g-import-not-at-top
        return local_handler.LocalHandler(flags_dict)
    if engine == 'auto':
        # Fall back to environment detection.
        return detect_handler(flags_dict)
    raise RuntimeError('Engine {} is not supported.'.format(engine))
Пример #7
0
 def testCompilePipelineNoPipelineArgs(self):
     """compile_pipeline exits with an error for a nonexistent DSL path."""
     handler = beam_handler.BeamHandler({
         labels.ENGINE_FLAG: self.engine,
         labels.PIPELINE_DSL_PATH: 'wrong_pipeline_path.py'
     })
     with self.assertRaisesRegex(SystemExit, 'Invalid pipeline path'):
         handler.compile_pipeline()
Пример #8
0
    def testDeletePipeline(self):
        """delete_pipeline removes the stored pipeline info."""
        # Register a pipeline to delete.
        beam_handler.BeamHandler({
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_DSL_PATH: self.pipeline_path
        }).create_pipeline()

        # Delete it and verify that its info file is gone.
        handler = beam_handler.BeamHandler({
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_NAME: self.pipeline_name
        })
        handler.delete_pipeline()
        self.assertFalse(
            fileio.exists(handler._get_pipeline_info_path(self.pipeline_name)))
Пример #9
0
 def testCompilePipeline(self):
     """compile_pipeline reports success for a valid DSL file."""
     handler = beam_handler.BeamHandler({
         labels.ENGINE_FLAG: self.engine,
         labels.PIPELINE_DSL_PATH: self.pipeline_path
     })
     with self.captureWritesToStream(sys.stdout) as captured:
         handler.compile_pipeline()
     self.assertIn('Pipeline compiled successfully', captured.contents())
Пример #10
0
 def testCreatePipeline(self):
     """create_pipeline persists the pipeline args file."""
     handler = beam_handler.BeamHandler({
         labels.ENGINE_FLAG: self.engine,
         labels.PIPELINE_DSL_PATH: self.pipeline_path
     })
     handler.create_pipeline()
     self.assertTrue(
         fileio.exists(handler._get_pipeline_args_path(self.pipeline_name)))
Пример #11
0
    def testPipelineSchemaNoPipelineRoot(self):
        """get_schema exits when the pipeline has never been run."""
        # Register the pipeline but never run it.
        beam_handler.BeamHandler({
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_DSL_PATH: self.pipeline_path
        }).create_pipeline()

        handler = beam_handler.BeamHandler({
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_NAME: self.pipeline_name,
        })
        with self.assertRaises(SystemExit) as err:
            handler.get_schema()
        self.assertEqual(
            str(err.exception),
            'Create a run before inferring schema. If pipeline is already running, then wait for it to successfully finish.'
        )
Пример #12
0
 def testSavePipeline(self):
     """_save_pipeline creates a directory under the handler home."""
     handler = beam_handler.BeamHandler({
         labels.ENGINE_FLAG: self.engine,
         labels.PIPELINE_DSL_PATH: self.pipeline_path
     })
     handler._save_pipeline({labels.PIPELINE_NAME: self.pipeline_name})
     saved_dir = os.path.join(handler._handler_home_dir, self.pipeline_name)
     self.assertTrue(fileio.exists(saved_dir))
Пример #13
0
  def testDeletePipeline(self):
    """delete_pipeline removes the pipeline folder from the handler home."""
    # Register a pipeline to delete.
    beam_handler.BeamHandler({
        labels.ENGINE_FLAG: self.engine,
        labels.PIPELINE_DSL_PATH: self.pipeline_path
    }).create_pipeline()

    # Delete it and verify that its folder is gone.
    handler = beam_handler.BeamHandler({
        labels.ENGINE_FLAG: self.engine,
        labels.PIPELINE_NAME: self.pipeline_name
    })
    handler.delete_pipeline()
    pipeline_dir = os.path.join(
        handler._handler_home_dir, self.pipeline_args[labels.PIPELINE_NAME], '')
    self.assertFalse(tf.io.gfile.exists(pipeline_dir))
Пример #14
0
 def testCompilePipelineNoPipelineArgs(self):
     """compile_pipeline exits when the DSL yields no pipeline args."""
     handler = beam_handler.BeamHandler({
         labels.ENGINE_FLAG: self.engine,
         labels.PIPELINE_DSL_PATH: self.pipeline_path
     })
     with self.assertRaises(SystemExit) as err:
         handler.compile_pipeline()
     self.assertEqual(
         str(err.exception),
         'Unable to compile pipeline. Check your pipeline dsl.')
Пример #15
0
 def testDeletePipelineNonExistentPipeline(self):
     """delete_pipeline exits when the pipeline was never created."""
     flags = {
         labels.ENGINE_FLAG: self.engine,
         labels.PIPELINE_NAME: self.pipeline_name
     }
     handler = beam_handler.BeamHandler(flags)
     with self.assertRaises(SystemExit) as err:
         handler.delete_pipeline()
     self.assertEqual(
         str(err.exception),
         'Pipeline "{}" does not exist.'.format(flags[labels.PIPELINE_NAME]))
Пример #16
0
  def testGetRun(self):
    """get_run prints a not-supported message for the Beam engine."""
    # Seed a pipeline directory under beam home.
    pipeline_dir = os.path.join(
        os.environ['BEAM_HOME'], self.pipeline_args[labels.PIPELINE_NAME])
    tf.io.gfile.makedirs(pipeline_dir)

    flags = {labels.ENGINE_FLAG: self.engine, labels.RUN_ID: self.run_id}
    with self.captureWritesToStream(sys.stdout) as captured:
      beam_handler.BeamHandler(flags).get_run()
    self.assertIn('Not supported for Beam.', captured.contents())
Пример #17
0
 def testCreatePipeline(self):
     """create_pipeline writes pipeline_args.json into the handler path."""
     handler = beam_handler.BeamHandler({
         labels.ENGINE_FLAG: self.engine,
         labels.PIPELINE_DSL_PATH: self.pipeline_path
     })
     handler.create_pipeline()
     pipeline_dir = handler._get_handler_pipeline_path(
         self.pipeline_args[labels.PIPELINE_NAME])
     self.assertTrue(
         tf.io.gfile.exists(os.path.join(pipeline_dir, 'pipeline_args.json')))
Пример #18
0
 def testSavePipeline(self):
     """_save_pipeline stores extracted args under the handler home."""
     handler = beam_handler.BeamHandler({
         labels.ENGINE_FLAG: self.engine,
         labels.PIPELINE_DSL_PATH: self.pipeline_path
     })
     handler._save_pipeline(handler._extract_pipeline_args())
     saved_dir = os.path.join(handler._handler_home_dir,
                              self.pipeline_args[labels.PIPELINE_NAME])
     self.assertTrue(tf.io.gfile.exists(saved_dir))
Пример #19
0
    def testPipelineSchemaNoSchemaGenOutput(self):
        """get_schema exits when the root exists but SchemaGen wrote nothing."""
        beam_handler.BeamHandler({
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_DSL_PATH: self.pipeline_path
        }).create_pipeline()

        handler = beam_handler.BeamHandler({
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_NAME: self.pipeline_name,
        })
        # Pipeline root exists but holds no SchemaGen artifacts.
        fileio.makedirs(self.pipeline_root)
        with self.assertRaises(SystemExit) as err:
            handler.get_schema()
        self.assertEqual(
            str(err.exception),
            'Either SchemaGen component does not exist or pipeline is still running. If pipeline is running, then wait for it to successfully finish.'
        )
Пример #20
0
 def testCreateRunNoPipeline(self):
     """create_run exits when the pipeline was never created."""
     flags = {
         labels.ENGINE_FLAG: self.engine,
         labels.PIPELINE_NAME: self.pipeline_name
     }
     handler = beam_handler.BeamHandler(flags)
     with self.assertRaises(SystemExit) as err:
         handler.create_run()
     self.assertEqual(
         str(err.exception),
         'Pipeline "{}" does not exist.'.format(flags[labels.PIPELINE_NAME]))
Пример #21
0
 def testUpdatePipelineNoPipeline(self):
     """update_pipeline exits when the pipeline was never created."""
     handler = beam_handler.BeamHandler({
         labels.ENGINE_FLAG: self.engine,
         labels.PIPELINE_DSL_PATH: self.pipeline_path
     })
     with self.assertRaises(SystemExit) as err:
         handler.update_pipeline()
     self.assertEqual(
         str(err.exception), 'Pipeline "{}" does not exist.'.format(
             self.pipeline_args[labels.PIPELINE_NAME]))
Пример #22
0
 def testCreatePipelineExistentPipeline(self):
     """create_pipeline exits when the pipeline already exists."""
     handler = beam_handler.BeamHandler({
         labels.ENGINE_FLAG: self.engine,
         labels.PIPELINE_DSL_PATH: self.pipeline_path
     })
     handler.create_pipeline()
     # A second create of the same pipeline must fail.
     with self.assertRaises(SystemExit) as err:
         handler.create_pipeline()
     self.assertEqual(
         str(err.exception), 'Pipeline "{}" already exists.'.format(
             self.pipeline_args[labels.PIPELINE_NAME]))
Пример #23
0
    def testTerminateRun(self):
        """terminate_run prints a not-supported message for Beam."""
        # Seed a pipeline directory in the dags folder.
        pipeline_dir = os.path.join(os.environ['BEAM_HOME'], 'dags',
                                    self.pipeline_name)
        tf.io.gfile.makedirs(pipeline_dir)

        handler = beam_handler.BeamHandler({
            labels.ENGINE_FLAG: self.engine,
            labels.RUN_ID: self.run_id
        })
        with self.captureWritesToStream(sys.stdout) as captured:
            handler.terminate_run()
        self.assertIn('Not supported for Beam.', captured.contents())
Пример #24
0
def detect_handler(flags_dict: Dict[Text, Any]) -> base_handler.BaseHandler:
    """Detect handler from the environment.

    When the engine flag is set to 'auto', the locally installed packages are
    inspected. If more than one orchestrator is installed the user must rerun
    the command with an explicit engine flag; otherwise the single detected
    orchestrator (or Beam as the fallback) is selected.

    Args:
      flags_dict: A dictionary containing the flags of a command.

    Returns:
      Corresponding Handler object.
    """
    packages_list = pip_utils.get_package_names()
    airflow_found = labels.AIRFLOW_PACKAGE_NAME in packages_list
    kubeflow_found = labels.KUBEFLOW_PACKAGE_NAME in packages_list
    if airflow_found and kubeflow_found:
        sys.exit(
            'Multiple orchestrators found. Choose one using --engine flag.')
    if airflow_found:
        click.echo('Detected Airflow.')
        click.echo(
            'Use --engine flag if you intend to use a different orchestrator.')
        flags_dict[labels.ENGINE_FLAG] = 'airflow'
        from tfx.tools.cli.handler import airflow_handler  # pylint: disable=g-import-not-at-top
        return airflow_handler.AirflowHandler(flags_dict)
    if kubeflow_found:
        click.echo('Detected Kubeflow.')
        click.echo(
            'Use --engine flag if you intend to use a different orchestrator.')
        flags_dict[labels.ENGINE_FLAG] = 'kubeflow'
        from tfx.tools.cli.handler import kubeflow_handler  # pylint: disable=g-import-not-at-top
        return kubeflow_handler.KubeflowHandler(flags_dict)
    # Neither orchestrator installed: fall back to Beam.
    click.echo('Detected Beam.')
    click.echo(
        '[WARNING] Default engine will be changed to "local" in the near future.'
    )
    click.echo(
        'Use --engine flag if you intend to use a different orchestrator.')
    flags_dict[labels.ENGINE_FLAG] = 'beam'
    from tfx.tools.cli.handler import beam_handler  # pylint: disable=g-import-not-at-top
    return beam_handler.BeamHandler(flags_dict)
Пример #25
0
    def testListPipelinesNonEmpty(self):
        """list_pipelines prints every pipeline directory in beam home."""
        # Seed two pipeline directories.
        for pipeline_name in ('pipeline_1', 'pipeline_2'):
            fileio.makedirs(
                os.path.join(os.environ['BEAM_HOME'], pipeline_name))

        # Now list the pipelines and check both show up.
        handler = beam_handler.BeamHandler({labels.ENGINE_FLAG: self.engine})
        with self.captureWritesToStream(sys.stdout) as captured:
            handler.list_pipelines()
        self.assertIn('pipeline_1', captured.contents())
        self.assertIn('pipeline_2', captured.contents())
Пример #26
0
    def testCreateRun(self):
        """create_run invokes the stored pipeline DSL."""
        # Seed a pipeline directory with its saved args.
        pipeline_dir = os.path.join(
            os.environ['BEAM_HOME'], self.pipeline_args[labels.PIPELINE_NAME])
        fileio.makedirs(pipeline_dir)
        with open(os.path.join(pipeline_dir, 'pipeline_args.json'), 'w') as f:
            json.dump(self.pipeline_args, f)

        # Run the pipeline and check the DSL path appears in the output.
        handler = beam_handler.BeamHandler({
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_NAME: self.pipeline_name
        })
        with self.captureWritesToStream(sys.stdout) as captured:
            handler.create_run()
        self.assertIn(self.pipeline_path, captured.contents())
Пример #27
0
    def testCreateRun(self, mock_call):
        """create_run shells out exactly once with the stored DSL path."""
        # Seed a pipeline directory in beam home.
        pipeline_dir = os.path.join(
            os.environ['BEAM_HOME'], self.pipeline_args[labels.PIPELINE_NAME])
        fileio.makedirs(pipeline_dir)

        handler = beam_handler.BeamHandler({
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_NAME: self.pipeline_name
        })
        with open(handler._get_pipeline_args_path(self.pipeline_name),
                  'w') as f:
            json.dump(self.pipeline_args, f)

        handler.create_run()

        # The mocked subprocess call must receive the DSL path.
        mock_call.assert_called_once()
        self.assertIn(self.pipeline_path, mock_call.call_args[0][0])
Пример #28
0
 def testListPipelinesEmpty(self):
     """list_pipelines reports when no pipelines exist."""
     handler = beam_handler.BeamHandler({labels.ENGINE_FLAG: self.engine})
     with self.captureWritesToStream(sys.stdout) as captured:
         handler.list_pipelines()
     self.assertIn('No pipelines to display.', captured.contents())