Example #1
    def testTaxiPipelineCheckDagConstruction(self):
        airflow_config = {
            'schedule_interval': None,
            'start_date': datetime.datetime(2019, 1, 1),
        }

        # Create directory structure and write expected user module file.
        os.makedirs(os.path.join(self._test_dir, 'taxi'))
        module_file = os.path.join(self._test_dir, 'taxi/taxi_utils.py')
        with open(module_file, 'w') as f:
            f.write('# Placeholder user module file.')

        # Patch $HOME so the pipeline module resolves its paths under the test dir.
        original_home = os.environ['HOME']
        os.environ['HOME'] = self._test_dir
        try:
            from tfx.examples.chicago_taxi_pipeline import taxi_pipeline_simple  # pylint: disable=g-import-not-at-top
        finally:
            # Restore $HOME even if the import fails.
            os.environ['HOME'] = original_home

        logical_pipeline = taxi_pipeline_simple._create_pipeline(
            pipeline_name='Test',
            pipeline_root=self._test_dir,
            data_root=self._test_dir,
            module_file=module_file,
            serving_model_dir=self._test_dir,
            metadata_path=self._test_dir,
            beam_pipeline_args=[])
        self.assertEqual(9, len(logical_pipeline.components))
        pipeline = AirflowDagRunner(
            AirflowPipelineConfig(airflow_config)).run(logical_pipeline)
        self.assertIsInstance(pipeline, models.DAG)
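The test relies on a self._test_dir fixture created elsewhere in the test class. A minimal sketch of a setUp that could provide it (hypothetical, not part of the original example; assumes os and tempfile are imported):

    def setUp(self):
        super().setUp()
        # Isolated scratch directory; the test reuses it as pipeline root,
        # data root, serving dir and fake $HOME.
        self._test_dir = os.path.join(
            os.environ.get('TEST_TMPDIR', tempfile.mkdtemp()),
            self._testMethodName)
        os.makedirs(self._test_dir, exist_ok=True)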
Example #2
    def run(self):
        """Define an airflow pipeline and run it."""
        # Clear the local log folder before building the DAG.
        logging.info('Cleaning local log folder: %s', self.LOCAL_LOG_DIR)
        os.makedirs(self.LOCAL_LOG_DIR, exist_ok=True)
        # self.remove_folders(self.LOCAL_LOG_DIR)

        self.dag = AirflowDagRunner(AirflowPipelineConfig(
            self._airflow_config)).run(self._create_pipeline())
        return self
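Airflow still needs a module-level DAG object in the file it scans. Assuming this method lives on a class named PipelineRunner (a name not given in the snippet, constructor arguments omitted), the wiring at the bottom of the DAG file could look like:

# Module-level binding so Airflow's scanner detects the DAG built by run().
DAG = PipelineRunner().run().dag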
Example #3
def create_dag(name, url, output_dir, airflow_config):
    pipeline_name = name
    pipeline_root = os.path.join(output_dir, 'pipelines', pipeline_name)
    metadata_path = os.path.join(output_dir, 'metadata', pipeline_name,
                                 'metadata.db')

    crawler = NewsCrawler(url=url)
    tfx_pipeline = pipeline.Pipeline(pipeline_name=pipeline_name,
                                     pipeline_root=pipeline_root,
                                     components=[crawler],
                                     enable_cache=False,
                                     metadata_connection_config=metadata.sqlite_metadata_connection_config(
                                         metadata_path))

    return AirflowDagRunner(AirflowPipelineConfig(airflow_config)).run(tfx_pipeline)
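For Airflow to discover the returned DAG, the call usually sits at module level in a file under the dags folder, bound to a module-level name; a sketch with placeholder values (the name, URL and output directory are assumptions):

import datetime

_airflow_config = {
    'schedule_interval': None,
    'start_date': datetime.datetime(2019, 1, 1),
}
# Module-level binding so Airflow's scanner picks the DAG up.
DAG = create_dag(name='news_crawler',
                 url='https://example.com/rss.xml',
                 output_dir='/output',
                 airflow_config=_airflow_config)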
Example #4
def create_dag(name,
               url,
               airflow_config,
               backup_dir="pipelines_backup",
               mongo_ip=None,
               mongo_port=None,
               dag_type="default",
               output_dir="/output",
               updated_collections=None,
               update_collections=None):
    # Avoid mutable default arguments; fall back to fresh lists per call.
    updated_collections = updated_collections or []
    update_collections = update_collections or []

    pipeline_name = name.replace(".py", "")
    pipeline_root = os.path.join(output_dir, 'pipelines', pipeline_name)
    metadata_path = os.path.join(output_dir, 'metadata', pipeline_name,
                                 'metadata.db')

    components = []
    if dag_type == "default":
        crawler = NewsCrawler(url=url)
        mongo = MongoImport(ip=mongo_ip,
                            port=mongo_port,
                            rss_feed=crawler.outputs["rss_feed"],
                            colname=pipeline_name)
        components = components + [crawler, mongo]
    elif dag_type == "backup":
        load_news = OldNewsImport(backup_dir=os.path.join(output_dir, backup_dir),
                                  ip=mongo_ip,
                                  port=mongo_port)
        components = components + [load_news]
    elif dag_type == "update":
        update_news = UpdateMongoNews(ip=mongo_ip,
                                      port=mongo_port,
                                      updated_collections=updated_collections,
                                      update_collections=update_collections)
        components = components + [update_news]

    airflow_config["catchup"] = False
    tfx_pipeline = pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=False,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path))

    return AirflowDagRunner(
        AirflowPipelineConfig(airflow_config)).run(tfx_pipeline)
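Because create_dag is a factory, one DAG file can register several pipelines by binding each result to its own module-level name, for example via globals(); a sketch with assumed feed names and URLs:

_feeds = {
    'feed_a': 'https://example.com/a.rss',
    'feed_b': 'https://example.com/b.rss',
}
_config = {
    'schedule_interval': None,
    'start_date': datetime.datetime(2019, 1, 1),
}
for _name, _url in _feeds.items():
    # Each DAG needs its own module-level binding for Airflow to detect it.
    globals()[_name] = create_dag(name=_name, url=_url, airflow_config=_config)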
Example #5
    def testTaxiPipelineCheckDagConstruction(self):
        airflow_config = {
            'schedule_interval': None,
            'start_date': datetime.datetime(2019, 1, 1),
        }
        logical_pipeline = taxi_pipeline_simple._create_pipeline(
            pipeline_name='Test',
            pipeline_root=self._test_dir,
            data_root=self._test_dir,
            module_file=self._test_dir,
            serving_model_dir=self._test_dir,
            metadata_path=self._test_dir,
            beam_pipeline_args=[])
        self.assertEqual(9, len(logical_pipeline.components))
        pipeline = AirflowDagRunner(
            AirflowPipelineConfig(airflow_config)).run(logical_pipeline)
        self.assertIsInstance(pipeline, models.DAG)
Example #6
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, schema_gen, example_validator, transform,
          trainer, model_resolver, evaluator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      beam_pipeline_args=beam_pipeline_args)


# 'DAG' below needs to be kept for Airflow to detect the DAG.
DAG = AirflowDagRunner(AirflowPipelineConfig(_airflow_config)).run(
    _create_pipeline(
        pipeline_name=_pipeline_name,
        pipeline_root=_pipeline_root,
        data_root=_data_root,
        module_file=_module_file,
        serving_model_dir=_serving_model_dir,
        metadata_path=_metadata_path,
        beam_pipeline_args=_beam_pipeline_args))
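A quick way to confirm that Airflow can actually load the file defining DAG above is a DagBag smoke test; a sketch, assuming the file sits in the configured dags folder and the DAG id matches _pipeline_name:

from airflow.models import DagBag

dag_bag = DagBag()
# import_errors is empty when every scanned DAG file parsed cleanly.
assert not dag_bag.import_errors
assert _pipeline_name in dag_bag.dags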
Example #7
# Example code and the metadata library are relative to $HOME, but you can
# store these files anywhere on your local filesystem.
_tfx_root = os.path.join(os.environ['HOME'], 'tfx')
_pipeline_root = os.path.join(_tfx_root, 'pipelines')
_metadata_path = os.path.join(_tfx_root, 'metadata', _pipeline_name,
                              'metadata.db')
_log_root = os.path.join(_tfx_root, 'logs')

# Airflow-specific configs; these will be passed directly to airflow
_airflow_config = {
    'schedule_interval': None,
    'start_date': datetime.datetime(2019, 1, 1),
}


def _create_pipeline():
    """Implements the chicago taxi pipeline with TFX."""
    return pipeline.Pipeline(
        pipeline_name=_pipeline_name,
        pipeline_root=_pipeline_root,
        components=test_utils.create_e2e_components(_data_root),
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            _metadata_path),
    )


# Airflow checks for the 'DAG' keyword when discovering DAGs.
airflow_pipeline = AirflowDagRunner(
    AirflowPipelineConfig(_airflow_config)).run(_create_pipeline())
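For reference, the keys in _airflow_config correspond to arguments of a plain Airflow DAG; a rough hand-written equivalent of what the runner ends up constructing (a sketch only, under the assumption that the config dict is forwarded to the DAG constructor):

from airflow import models

dag = models.DAG(
    dag_id=_pipeline_name,
    schedule_interval=None,
    start_date=datetime.datetime(2019, 1, 1))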
Example #8
    baseline_model=model_resolver.outputs['model'],
    # Change threshold will be ignored if there is no baseline (first run).
    eval_config=eval_config)

pusher = Pusher(model=trainer.outputs['model'],
                model_blessing=model_analyzer.outputs['blessing'],
                push_destination=pusher_pb2.PushDestination(
                    filesystem=pusher_pb2.PushDestination.Filesystem(
                        base_directory=serving_model_dir)))

tfx_pipeline = pipeline.Pipeline(
    pipeline_name=pipeline_name,
    pipeline_root=pipeline_dir,
    components=[
        example_gen, statistics_gen, schema_gen, validate_stats, transform,
        trainer, model_resolver, model_analyzer, pusher
    ],
    enable_cache=True,
    metadata_connection_config=metadata.sqlite_metadata_connection_config(
        metadata_path),
    # 0 means auto-detect based on the number of CPUs available at execution
    # time.
    beam_pipeline_args=['--direct_num_workers=0'])

# 'DAG' below needs to be kept for Airflow to detect the DAG.
airflow_config = {
    'schedule_interval': None,
    'start_date': datetime.datetime(2019, 1, 1)
}
DAG = AirflowDagRunner(AirflowPipelineConfig(airflow_config)).run(tfx_pipeline)
Example #9
}


def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen, statistics_gen],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path))


# Airflow checks for the 'DAG' keyword to find the DAG.
airflow_pipeline = AirflowDagRunner(
    AirflowPipelineConfig(_airflow_config)).run(
        _create_pipeline(pipeline_name=_pipeline_name,
                         pipeline_root=_pipeline_root,
                         data_root=_data_root,
                         metadata_path=_metadata_path))
Example #10
                                  metrics_specs=[metrics_spec])
    evaluator = Evaluator(examples=example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          eval_config=eval_config)

    filesystem = pusher_pb2.PushDestination.Filesystem(
        base_directory=SERVING_MODEL_DIR)
    push_destination = pusher_pb2.PushDestination(filesystem=filesystem)
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination=push_destination)

    pipeline = Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        components=[
            example_gen, statistics_gen, schema_gen, example_validator,
            transform, trainer, evaluator, pusher
        ],
        enable_cache=True,
        beam_pipeline_args=['--direct_num_workers=0'])
    return pipeline


DAG = AirflowDagRunner(AirflowPipelineConfig(AIRFLOW_CONFIG)).run(
    create_pipeline(pipeline_name=PIPELINE_NAME,
                    pipeline_root=PIPELINE_ROOT,
                    metadata_path=METADATA_PATH))