def testTaxiPipelineCheckDagConstruction(self):
  airflow_config = {
      'schedule_interval': None,
      'start_date': datetime.datetime(2019, 1, 1),
  }

  # Create directory structure and write expected user module file.
  os.makedirs(os.path.join(self._test_dir, 'taxi'))
  module_file = os.path.join(self._test_dir, 'taxi/taxi_utils.py')
  with open(module_file, 'w') as f:
    f.write('# Placeholder user module file.')

  # Patch $HOME directory for pipeline DAG construction.
  original_home = os.environ['HOME']
  os.environ['HOME'] = self._test_dir
  from tfx.examples.chicago_taxi_pipeline import taxi_pipeline_simple  # pylint: disable=g-import-not-at-top
  os.environ['HOME'] = original_home

  logical_pipeline = taxi_pipeline_simple._create_pipeline(
      pipeline_name='Test',
      pipeline_root=self._test_dir,
      data_root=self._test_dir,
      module_file=module_file,
      serving_model_dir=self._test_dir,
      metadata_path=self._test_dir,
      beam_pipeline_args=[])
  self.assertEqual(9, len(logical_pipeline.components))

  pipeline = AirflowDagRunner(
      AirflowPipelineConfig(airflow_config)).run(logical_pipeline)
  self.assertIsInstance(pipeline, models.DAG)
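# A hedged alternative sketch for the $HOME patching above: wrapping the
# import in the standard library's mock.patch.dict restores the environment
# even if the import raises. The helper name below is hypothetical and not
# part of the original test.
import os
from unittest import mock


def _import_pipeline_with_patched_home(test_dir):
  # HOME is restored automatically when the context manager exits.
  with mock.patch.dict(os.environ, {'HOME': test_dir}):
    from tfx.examples.chicago_taxi_pipeline import taxi_pipeline_simple  # pylint: disable=g-import-not-at-top
  return taxi_pipeline_simple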
def run(self):
  """Defines an Airflow pipeline and runs it."""
  # Ensure the local log folder exists (the actual cleanup call is disabled).
  logging.info('Cleaning local log folder: %s', self.LOCAL_LOG_DIR)
  os.makedirs(self.LOCAL_LOG_DIR, exist_ok=True)
  # self.remove_folders(self.LOCAL_LOG_DIR)

  self.dag = AirflowDagRunner(AirflowPipelineConfig(
      self._airflow_config)).run(self._create_pipeline())
  return self
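# Hedged context sketch for run() above: a minimal owner class supplying the
# attributes the method relies on. The class name, log directory, and config
# values are illustrative assumptions, not part of the original snippet.
import datetime
import logging
import os


class PipelineRunner:
  LOCAL_LOG_DIR = os.path.join('/tmp', 'pipeline_logs')  # assumed location

  def __init__(self, create_pipeline_fn):
    self._airflow_config = {
        'schedule_interval': None,
        'start_date': datetime.datetime(2019, 1, 1),
    }
    self._create_pipeline = create_pipeline_fn
    self.dag = None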
def create_dag(name, url, output_dir, airflow_config):
  pipeline_name = name
  pipeline_root = os.path.join(output_dir, 'pipelines', pipeline_name)
  metadata_path = os.path.join(output_dir, 'metadata', pipeline_name,
                               'metadata.db')

  crawler = NewsCrawler(url=url)

  tfx_pipeline = pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[crawler],
      enable_cache=False,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path))

  return AirflowDagRunner(
      AirflowPipelineConfig(airflow_config)).run(tfx_pipeline)
def create_dag(name,
               url,
               airflow_config,
               backup_dir="pipelines_backup",
               mongo_ip=None,
               mongo_port=None,
               dag_type="default",
               output_dir="/output",
               updated_collections=None,
               update_collections=None):
  # Avoid mutable default arguments; fall back to fresh lists instead.
  updated_collections = updated_collections or []
  update_collections = update_collections or []

  pipeline_name = name.replace(".py", "")
  pipeline_root = os.path.join(output_dir, 'pipelines', pipeline_name)
  metadata_path = os.path.join(output_dir, 'metadata', pipeline_name,
                               'metadata.db')

  components = []
  if dag_type == "default":
    crawler = NewsCrawler(url=url)
    mongo = MongoImport(ip=mongo_ip,
                        port=mongo_port,
                        rss_feed=crawler.outputs["rss_feed"],
                        colname=pipeline_name)
    components = components + [crawler, mongo]
  elif dag_type == "backup":
    # Note: the backup root is hardcoded to "/output" rather than output_dir.
    load_news = OldNewsImport(backup_dir=os.path.join("/output", backup_dir),
                              ip=mongo_ip,
                              port=mongo_port)
    components = components + [load_news]
  elif dag_type == "update":
    update_news = UpdateMongoNews(ip=mongo_ip,
                                  port=mongo_port,
                                  updated_collections=updated_collections,
                                  update_collections=update_collections)
    components = components + [update_news]

  airflow_config["catchup"] = False
  tfx_pipeline = pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components,
      enable_cache=False,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path))

  return AirflowDagRunner(
      AirflowPipelineConfig(airflow_config)).run(tfx_pipeline)
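# Hedged usage sketch: how a DAG definition file might call create_dag above.
# The feed URL, Mongo host/port, and schedule are placeholders; Airflow
# collects the returned DAG object from module-level globals.
import datetime

_airflow_config = {
    'schedule_interval': '@hourly',
    'start_date': datetime.datetime(2019, 1, 1),
}

DAG = create_dag(
    name='example_feed',
    url='https://example.com/rss',
    airflow_config=_airflow_config,
    mongo_ip='localhost',
    mongo_port=27017,
    dag_type='default')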
def testTaxiPipelineCheckDagConstruction(self):
  airflow_config = {
      'schedule_interval': None,
      'start_date': datetime.datetime(2019, 1, 1),
  }

  logical_pipeline = taxi_pipeline_simple._create_pipeline(
      pipeline_name='Test',
      pipeline_root=self._test_dir,
      data_root=self._test_dir,
      module_file=self._test_dir,
      serving_model_dir=self._test_dir,
      metadata_path=self._test_dir,
      beam_pipeline_args=[])
  self.assertEqual(9, len(logical_pipeline.components))

  pipeline = AirflowDagRunner(
      AirflowPipelineConfig(airflow_config)).run(logical_pipeline)
  self.assertIsInstance(pipeline, models.DAG)
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, schema_gen, example_validator,
          transform, trainer, model_resolver, evaluator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      beam_pipeline_args=beam_pipeline_args)


# 'DAG' below needs to be kept for Airflow to detect the dag.
DAG = AirflowDagRunner(AirflowPipelineConfig(_airflow_config)).run(
    _create_pipeline(
        pipeline_name=_pipeline_name,
        pipeline_root=_pipeline_root,
        data_root=_data_root,
        module_file=_module_file,
        serving_model_dir=_serving_model_dir,
        metadata_path=_metadata_path,
        beam_pipeline_args=_beam_pipeline_args))
# Example code and the metadata library are relative to $HOME, but you can
# store these files anywhere on your local filesystem.
_tfx_root = os.path.join(os.environ['HOME'], 'tfx')
_pipeline_root = os.path.join(_tfx_root, 'pipelines')
_metadata_path = os.path.join(_tfx_root, 'metadata', _pipeline_name,
                              'metadata.db')
_log_root = os.path.join(_tfx_root, 'logs')

# Airflow-specific configs; these will be passed directly to Airflow.
_airflow_config = {
    'schedule_interval': None,
    'start_date': datetime.datetime(2019, 1, 1),
}


def _create_pipeline():
  """Implements the Chicago taxi pipeline with TFX."""
  return pipeline.Pipeline(
      pipeline_name=_pipeline_name,
      pipeline_root=_pipeline_root,
      components=test_utils.create_e2e_components(_data_root),
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          _metadata_path),
  )


# Airflow scans files for the 'DAG' keyword when discovering dags; the DAG
# object itself is picked up from module-level globals.
airflow_pipeline = AirflowDagRunner(
    AirflowPipelineConfig(_airflow_config)).run(_create_pipeline())
    baseline_model=model_resolver.outputs['model'],
    # Change threshold will be ignored if there is no baseline (first run).
    eval_config=eval_config)

pusher = Pusher(
    model=trainer.outputs['model'],
    model_blessing=model_analyzer.outputs['blessing'],
    push_destination=pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir)))

tfx_pipeline = pipeline.Pipeline(
    pipeline_name=pipeline_name,
    pipeline_root=pipeline_dir,
    components=[
        example_gen, statistics_gen, schema_gen, validate_stats, transform,
        trainer, model_resolver, model_analyzer, pusher
    ],
    enable_cache=True,
    metadata_connection_config=metadata.sqlite_metadata_connection_config(
        metadata_path),
    # 0 means auto-detect based on the number of CPUs available during
    # execution time.
    beam_pipeline_args=['--direct_num_workers=0'])

# 'DAG' below needs to be kept for Airflow to detect the dag.
airflow_config = {
    'schedule_interval': None,
    'start_date': datetime.datetime(2019, 1, 1)
}

DAG = AirflowDagRunner(AirflowPipelineConfig(airflow_config)).run(tfx_pipeline)
}


def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
  """Implements the Chicago taxi pipeline with TFX."""
  examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[example_gen, statistics_gen],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path))


# Airflow checks the 'DAG' keyword to find the dag.
airflow_pipeline = AirflowDagRunner(
    AirflowPipelineConfig(_airflow_config)).run(
        _create_pipeline(
            pipeline_name=_pipeline_name,
            pipeline_root=_pipeline_root,
            data_root=_data_root,
            metadata_path=_metadata_path))
      metrics_specs=[metrics_spec])

  evaluator = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      eval_config=eval_config)

  filesystem = pusher_pb2.PushDestination.Filesystem(
      base_directory=SERVING_MODEL_DIR)
  push_destination = pusher_pb2.PushDestination(filesystem=filesystem)
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      push_destination=push_destination)

  pipeline = Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      components=[
          example_gen, statistics_gen, schema_gen, example_validator,
          transform, trainer, evaluator, pusher
      ],
      enable_cache=True,
      beam_pipeline_args=['--direct_num_workers=0'])
  return pipeline


DAG = AirflowDagRunner(AirflowPipelineConfig(AIRFLOW_CONFIG)).run(
    create_pipeline(
        pipeline_name=PIPELINE_NAME,
        pipeline_root=PIPELINE_ROOT,
        metadata_path=METADATA_PATH))
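# For reference, a hedged sketch of the imports the snippets above assume.
# Module paths follow the TFX 0.2x-era API and may differ in later releases.
import datetime
import os

from airflow import models  # only needed for the isinstance checks in tests
from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.airflow.airflow_dag_runner import AirflowDagRunner
from tfx.orchestration.airflow.airflow_dag_runner import AirflowPipelineConfig
from tfx.proto import pusher_pb2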