    overwrite={'has_diabetes': {
        'transform': [{'method': 'no_transform', 'parameters': {}}]}}
))

# Add a trainer
training_pipeline.add_trainer(FeedForwardTrainer(
    loss='binary_crossentropy',
    last_activation='sigmoid',
    output_units=1,
    metrics=['accuracy'],
    epochs=20))

# Add an evaluator
training_pipeline.add_evaluator(
    TFMAEvaluator(
        slices=[['has_diabetes']],
        metrics={'has_diabetes': ['binary_crossentropy',
                                  'binary_accuracy']}))

# Run the pipeline on a Google Cloud VM and train on GCP as well.
# In order for this to work, the orchestrator and the backend should be in the
# same GCP project. Also, the metadata store and artifact store should be
# accessible by the orchestrator VM and the GCAIP worker VM.

# Note: If you are using a custom Trainer, then you need to build a new Docker
# image based on the ZenML Trainer image, and pass that into the `image`
# parameter in the SingleGPUTrainingGCAIPBackend.

# Define the orchestrator backend
orchestrator_backend = OrchestratorGCPBackend(
    cloudsql_connection_name=cloudsql_connection_name,
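The note above mentions passing a custom Docker image through the `image` parameter of `SingleGPUTrainingGCAIPBackend`. A hedged sketch of what that might look like follows; only the `image` parameter is confirmed by the note, while the project argument, the image URI, and the `.with_backend(...)` attachment are illustrative assumptions.

# A minimal sketch, not from the original example: only `image` is documented
# in the note above; the remaining constructor arguments are assumptions.
training_backend = SingleGPUTrainingGCAIPBackend(
    project=GCP_PROJECT,  # assumption: train in the same GCP project as the orchestrator
    image='gcr.io/my-project/my-custom-trainer:latest',  # hypothetical custom image URI
)

# Assumption: the backend is then attached to the trainer step via the same
# .with_backend(...) pattern used for other backends in these snippets, e.g.
# training_pipeline.add_trainer(FeedForwardTrainer(...).with_backend(training_backend))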
def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
    """
    Builds the training pipeline as a series of TFX components.

    Args:
        config: A ZenML configuration in dictionary format.

    Returns:
        A chronological list of TFX components making up the training
        pipeline.
    """
    steps = config[keys.GlobalKeys.STEPS]

    component_list = []

    ############
    # RAW DATA #
    ############
    data_config = steps[keys.TrainingSteps.DATA]
    data = DataGen(
        source=data_config[keys.StepKeys.SOURCE],
        source_args=data_config[keys.StepKeys.ARGS]
    ).with_id(GDPComponent.DataGen.name)

    statistics_data = StatisticsGen(
        examples=data.outputs.examples
    ).with_id(GDPComponent.DataStatistics.name)

    schema_data = SchemaGen(
        statistics=statistics_data.outputs.output,
    ).with_id(GDPComponent.DataSchema.name)

    component_list.extend([data, statistics_data, schema_data])

    datapoints = data.outputs.examples

    #############
    # SPLITTING #
    #############
    # Block to read the data from the corresponding BQ table
    split_config = steps[keys.TrainingSteps.SPLIT]
    splits = SplitGen(
        input_examples=datapoints,
        source=split_config[keys.StepKeys.SOURCE],
        source_args=split_config[keys.StepKeys.ARGS],
        schema=schema_data.outputs.schema,
        statistics=statistics_data.outputs.output,
    ).with_id(GDPComponent.SplitGen.name)

    datapoints = splits.outputs.examples

    statistics_split = StatisticsGen(
        examples=datapoints
    ).with_id(GDPComponent.SplitStatistics.name)

    schema_split = SchemaGen(
        statistics=statistics_split.outputs.output,
    ).with_id(GDPComponent.SplitSchema.name)

    schema = schema_split.outputs.schema

    component_list.extend([splits, statistics_split, schema_split])

    ##############
    # SEQUENCING #
    ##############
    if keys.TrainingSteps.SEQUENCER in steps:
        sequencer_config = steps[keys.TrainingSteps.SEQUENCER]
        sequencer = Sequencer(
            input_examples=datapoints,
            schema=schema,
            statistics=statistics_split.outputs.statistics,
            source=sequencer_config[keys.StepKeys.SOURCE],
            source_args=sequencer_config[keys.StepKeys.ARGS]
        ).with_id(GDPComponent.Sequencer.name)

        sequencer_statistics = StatisticsGen(
            examples=sequencer.outputs.output_examples
        ).with_id(GDPComponent.SequencerStatistics.name)

        sequencer_schema = SchemaGen(
            statistics=sequencer_statistics.outputs.output,
            infer_feature_shape=True,
        ).with_id(GDPComponent.SequencerSchema.name)

        datapoints = sequencer.outputs.output_examples
        schema = sequencer_schema.outputs.schema

        component_list.extend(
            [sequencer, sequencer_statistics, sequencer_schema])

    #################
    # PREPROCESSING #
    #################
    transform = Transform(
        preprocessing_fn=constants.PREPROCESSING_FN,
        examples=datapoints,
        schema=schema,
        custom_config=steps[keys.TrainingSteps.PREPROCESSER]
    ).with_id(GDPComponent.Transform.name)

    component_list.extend([transform])

    ############
    # TRAINING #
    ############
    training_backend: TrainingLocalBackend = \
        self.backends_dict[TrainingLocalBackend.BACKEND_KEY]

    training_kwargs = {
        'custom_executor_spec': training_backend.get_executor_spec(),
        'custom_config': steps[keys.TrainingSteps.TRAINER]
    }
    training_kwargs['custom_config'].update(
        training_backend.get_custom_config())

    trainer = Trainer(
        transformed_examples=transform.outputs.transformed_examples,
        transform_graph=transform.outputs.transform_graph,
        run_fn=constants.TRAINER_FN,
        schema=schema,
        train_args=trainer_pb2.TrainArgs(),
        eval_args=trainer_pb2.EvalArgs(),
        **training_kwargs
    ).with_id(GDPComponent.Trainer.name)

    component_list.extend([trainer])

    #############
    # EVALUATOR #
    #############
    if keys.TrainingSteps.EVALUATOR in steps:
        from zenml.utils import source_utils
        eval_module = '.'.join(
            constants.EVALUATOR_MODULE_FN.split('.')[:-1])
        eval_module_file = constants.EVALUATOR_MODULE_FN.split('.')[-1]
        abs_path = source_utils.get_absolute_path_from_module(eval_module)
        custom_extractor_path = os.path.join(abs_path,
                                             eval_module_file) + '.py'

        eval_step: TFMAEvaluator = TFMAEvaluator.from_config(
            steps[keys.TrainingSteps.EVALUATOR])
        eval_config = eval_step.build_eval_config()

        evaluator = Evaluator(
            examples=transform.outputs.transformed_examples,
            model=trainer.outputs.model,
            eval_config=eval_config,
            module_file=custom_extractor_path,
        ).with_id(GDPComponent.Evaluator.name)

        component_list.append(evaluator)

    ###########
    # SERVING #
    ###########
    if keys.TrainingSteps.DEPLOYER in steps:
        serving_args = steps[keys.TrainingSteps.DEPLOYER]['args']

        project_id = serving_args['project_id']
        output_base_dir = self.artifact_store.path
        if 'model_name' in serving_args:
            model_name = serving_args['model_name']
        else:
            model_name = self.pipeline_name().replace('-', '_')

        gcaip_deployer = GCAIPDeployer(
            output_base_dir=output_base_dir,
            project_id=project_id,
            model_name=model_name)

        pusher_config = gcaip_deployer.build_pusher_config()
        pusher_executor_spec = gcaip_deployer.get_executor_spec()

        pusher = Pusher(
            model_export=trainer.outputs.output,
            custom_executor_spec=pusher_executor_spec,
            **pusher_config
        ).with_id(GDPComponent.Deployer.name)

        component_list.append(pusher)

    return component_list
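For orientation, the sketch below reconstructs the rough shape of the `config` dictionary that `get_tfx_component_list()` reads, based only on the keys accessed in the method above. The literal source strings and argument dicts are hypothetical placeholders, not the actual ZenML standard configuration.

# A minimal sketch, assuming only what the method above reads from `config`.
# `keys` is the same ZenML constants module used by the pipeline code above.
config = {
    keys.GlobalKeys.STEPS: {
        # Required steps
        keys.TrainingSteps.DATA: {
            keys.StepKeys.SOURCE: 'my_module.MyDataStep',   # placeholder
            keys.StepKeys.ARGS: {},                         # placeholder
        },
        keys.TrainingSteps.SPLIT: {
            keys.StepKeys.SOURCE: 'my_module.MySplitStep',  # placeholder
            keys.StepKeys.ARGS: {},
        },
        keys.TrainingSteps.PREPROCESSER: {},  # passed to Transform as custom_config
        keys.TrainingSteps.TRAINER: {},       # merged with the training backend's custom config
        # Optional steps, only used if present:
        # keys.TrainingSteps.SEQUENCER, keys.TrainingSteps.EVALUATOR,
        # keys.TrainingSteps.DEPLOYER (its 'args' must contain 'project_id'
        # and may contain 'model_name')
    }
}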
        features=['times_pregnant', 'pgc', 'dbp', 'tst', 'insulin', 'bmi',
                  'pedigree', 'age'],
        labels=['has_diabetes'],
        overwrite={'has_diabetes': {
            'transform': [{'method': 'no_transform', 'parameters': {}}]}}
    ).with_backend(processing_backend)
)

# Add a trainer
training_pipeline.add_trainer(FeedForwardTrainer(
    loss='binary_crossentropy',
    last_activation='sigmoid',
    output_units=1,
    metrics=['accuracy'],
    epochs=20))

# Add an evaluator
training_pipeline.add_evaluator(
    TFMAEvaluator(
        slices=[['has_diabetes']],
        metrics={'has_diabetes': ['binary_crossentropy',
                                  'binary_accuracy']}
    ).with_backend(processing_backend)
)

# Define the artifact store
artifact_store = ArtifactStore(
    os.path.join(GCP_BUCKET, 'dataflow_processing/artifact_store'))

# Run the pipeline
training_pipeline.run(artifact_store=artifact_store)
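The snippet above references `processing_backend` and `GCP_BUCKET` without defining them; both would need to exist before the pipeline is assembled. A hedged sketch of how they might be set up is below; the import path, the `ProcessingDataFlowBackend` constructor arguments, and the bucket and project values are assumptions for illustration, not taken verbatim from the original example.

import os

# A minimal sketch, assuming a Dataflow processing backend (import path and
# constructor arguments are assumptions; check the ZenML backend docs).
from zenml.backends.processing import ProcessingDataFlowBackend

GCP_PROJECT = 'my-gcp-project'       # hypothetical GCP project id
GCP_BUCKET = 'gs://my-zenml-bucket'  # hypothetical GCS bucket

processing_backend = ProcessingDataFlowBackend(
    project=GCP_PROJECT,
    staging_location=os.path.join(GCP_BUCKET, 'dataflow_processing/staging'),
)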