def __init__(self): pipeline_description = Pipeline() pipeline_description.add_input(name="inputs") # Ts formatter step = PrimitiveStep(primitive=index.get_primitive( "d3m.primitives.data_transformation.time_series_formatter.DistilTimeSeriesFormatter" )) step.add_argument( name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="inputs.0", ) step.add_output("produce") pipeline_description.add_step(step) # DS to DF on formatted ts DS step = PrimitiveStep(primitive=index.get_primitive( "d3m.primitives.data_transformation.dataset_to_dataframe.Common")) step.add_argument( name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.0.produce", ) step.add_output("produce") pipeline_description.add_step(step) # Grouping Field Compose step = PrimitiveStep(primitive=index.get_primitive( "d3m.primitives.data_transformation.grouping_field_compose.Common") ) step.add_argument( name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.1.produce", ) step.add_output("produce") pipeline_description.add_step(step) # Storc primitive -> KMeans step = PrimitiveStep(primitive=index.get_primitive( "d3m.primitives.clustering.k_means.Sloth")) step.add_argument( name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.2.produce", ) step.add_hyperparameter(name="nclusters", argument_type=ArgumentType.VALUE, data=3) step.add_output("produce") pipeline_description.add_step(step) # Final Output pipeline_description.add_output(name="output predictions", data_reference="steps.3.produce") self.pipeline = pipeline_description
def __init__(self, epochs: int = 10, n_steps: int = 20):
    pipeline_description = Pipeline()
    pipeline_description.add_input(name="inputs")

    # Denormalize primitive
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.denormalize.Common"))
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="inputs.0",
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # DS to DF on input DS
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.0.produce",
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # RetinaNet primitive
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.object_detection.retina_net.ObjectDetectionRN"))
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.1.produce",
    )
    step.add_argument(
        name="outputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.1.produce",
    )
    step.add_hyperparameter(name="n_epochs",
                            argument_type=ArgumentType.VALUE,
                            data=epochs)
    step.add_hyperparameter(name="n_steps",
                            argument_type=ArgumentType.VALUE,
                            data=n_steps)
    step.add_hyperparameter(name="weights_path",
                            argument_type=ArgumentType.VALUE,
                            data="/scratch_dir/")
    step.add_output("produce")
    pipeline_description.add_step(step)

    # Final Output
    pipeline_description.add_output(name="output predictions",
                                    data_reference="steps.2.produce")

    self.pipeline = pipeline_description
def _gen_pipeline(self):
    pipeline = meta_pipeline.Pipeline()
    pipeline.add_input(name='inputs')

    # Step 0: dataset_to_dataframe on the default resource
    step_0 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline.add_step(step_0)

    # Step 1: dataset_to_dataframe on resource '1'
    step_1 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
    step_1.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_1.add_hyperparameter(name='dataframe_resource',
                              argument_type=ArgumentType.VALUE,
                              data='1')
    step_1.add_output('produce')
    pipeline.add_step(step_1)

    # Step 2: dataset_to_dataframe on resource '2'
    step_2 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
    step_2.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_2.add_hyperparameter(name='dataframe_resource',
                              argument_type=ArgumentType.VALUE,
                              data='2')
    step_2.add_output('produce')
    pipeline.add_step(step_2)

    # Step 3: Euclidean nomination over the two resource dataframes
    step_3 = meta_pipeline.PrimitiveStep(
        primitive_description=EuclideanNomination.metadata.query())
    step_3.add_argument(name='inputs_1',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_3.add_argument(name='inputs_2',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_3.add_argument(name='reference',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.0.produce')
    step_3.add_output('produce')
    pipeline.add_step(step_3)

    # Adding output step to the pipeline
    pipeline.add_output(name='Predictions', data_reference='steps.3.produce')

    return pipeline
def test_register(self):
    FooBarPrimitive = create_primitive(
        'e2fc24f8-5b32-4759-be5b-8126a42522a3',
        'd3m.primitives.foo.bar.FooBarPrimitive')

    # To hide any logging or stdout output.
    with self.assertLogs(level=logging.DEBUG) as cm:
        with utils.redirect_to_logging():
            index.register_primitive(
                'd3m.primitives.foo.bar.FooBarPrimitive', FooBarPrimitive)

        # Just to log something, otherwise "assertLogs" can fail.
        logging.getLogger().debug("Start test.")

    index.get_primitive('d3m.primitives.foo.bar.FooBarPrimitive')
def ListPrimitives(self, request, context):
    '''
    List all primitives known to TA2, their IDs, versions, names, and digests.
    Using this information a TA3 should know which primitives may be put into
    a pipeline template. To narrow down potential primitives to use, a TA3 can
    also ask a TA2 to do a solution search and then observe which primitives
    the TA2 is using. If more metadata about primitives is needed, a TA3 can
    use the results of this call to map primitives to metadata (from Python
    code or primitive annotations) on its own.
    '''
    list_primitives = []
    source_primitives = []

    primitives = index.search()
    for prim in primitives:
        try:
            p = index.get_primitive(prim)
            source_primitives.append(p)
        except Exception:
            # Skip primitives that fail to load.
            continue

    for p in source_primitives:
        meta = p.metadata.to_json_structure()
        list_primitives.append(
            primitive_pb2.Primitive(id=meta['id'],
                                    version=meta['version'],
                                    python_path=meta['python_path'],
                                    name=meta['name'],
                                    digest=meta['digest']))

    return core_pb2.ListPrimitivesResponse(primitives=list_primitives)
def test_entrypoint(self):
    working_set_entries = copy.copy(pkg_resources.working_set.entries)
    working_set_entry_keys = copy.copy(pkg_resources.working_set.entry_keys)
    working_set_by_key = copy.copy(pkg_resources.working_set.by_key)

    try:
        distribution = pkg_resources.Distribution(__file__)
        entry_point = pkg_resources.EntryPoint.parse(
            'foo2.bar2.FooBar2Primitive = test_index:FooBar2Primitive',
            dist=distribution)
        distribution._ep_map = {
            'd3m.primitives': {
                'foo2.bar2.FooBar2Primitive': entry_point
            }
        }
        pkg_resources.working_set.add(distribution)

        python_path = 'd3m.primitives.foo2.bar2.FooBar2Primitive'

        self.assertIn(python_path, index.search())
        self.assertIs(index.get_primitive(python_path), FooBar2Primitive)

    finally:
        pkg_resources.working_set.entries = working_set_entries
        pkg_resources.working_set.entry_keys = working_set_entry_keys
        pkg_resources.working_set.by_key = working_set_by_key
def available_primitives():
    primitives_info = []

    with d3m_utils.silence():
        for primitive_path in d3m_index.search():
            if primitive_path in PrimitivesList.BlockList:
                continue

            try:
                primitive = d3m_index.get_primitive(primitive_path)
                primitive_id = primitive.metadata.query()['id']
                version = primitive.metadata.query()['version']
                python_path = primitive.metadata.query()['python_path']
                name = primitive.metadata.query()['name']
                digest = primitive.metadata.query().get('digest', None)
                primitive_info = {
                    'id': primitive_id,
                    'version': version,
                    'python_path': python_path,
                    'name': name,
                    'digest': digest
                }
                primitives_info.append(primitive_info)
            except Exception:
                # Skip primitives whose metadata cannot be loaded.
                continue

    return primitives_info
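# A minimal usage sketch for available_primitives(), purely illustrative: it
# prints whatever the local d3m index exposes after block-list filtering, so
# the output depends entirely on which primitives are installed.
if __name__ == '__main__':
    for info in available_primitives():
        print(info['python_path'], info['version'])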
def __init__(self):
    pipeline_description = Pipeline()
    pipeline_description.add_input(name="inputs")

    # DS to DF on input DS
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="inputs.0",
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # column parser
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.column_parser.Common"))
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.0.produce",
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # Duke primitive
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_cleaning.text_summarization.Duke"))
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.1.produce",
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # Final Output
    pipeline_description.add_output(name="output predictions",
                                    data_reference="steps.2.produce")

    self.pipeline = pipeline_description
def get_dataframe(self, input_data):
    # denormalize
    denormalize = index.get_primitive(
        'd3m.primitives.data_transformation.denormalize.Common')
    hyperparams_class = denormalize.metadata.get_hyperparams()
    primitive = denormalize(hyperparams=hyperparams_class.defaults())
    dataset = primitive.produce(inputs=input_data[0]).value

    # Add Target column into dataset
    dataset = self.mark_columns(dataset)

    # dataset to dataframe
    dataset_dataframe = index.get_primitive(
        'd3m.primitives.data_transformation.dataset_to_dataframe.Common')
    hyperparams_class = dataset_dataframe.metadata.get_hyperparams()
    primitive = dataset_dataframe(hyperparams=hyperparams_class.defaults())
    dataframe = primitive.produce(inputs=dataset).value

    return dataframe
def class_hyperparameter_generator(primitive_name, parameter_name, definition):
    from d3m import index

    value = None
    try:
        hyperparams = index.get_primitive(primitive_name).metadata.query()[
            "primitive_code"]["hyperparams"]
        # Coerce the raw definition into the hyperparameter's declared
        # structural type.
        value = hyperparams[parameter_name]['structural_type'](definition)
    except Exception:
        _logger.error(
            f"Hyperparameter {parameter_name} not valid for {primitive_name}!")
    return value
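# A hedged usage sketch for class_hyperparameter_generator: the primitive path
# and the 'strategy' hyperparameter name below are assumptions for
# illustration and must exist in the local d3m index for this to run.
strategy = class_hyperparameter_generator(
    'd3m.primitives.data_cleaning.imputer.SKlearn', 'strategy', 'median')
if strategy is not None:
    print(type(strategy), strategy)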
def get_preprocessor(input_data, problem, treatment):
    metadata = input_data.metadata
    task_description = schemas_utils.get_task_description(
        problem['problem']['task_keywords'])
    task_type = task_description['task_type']
    semi = task_description['semi']
    data_types = task_description['data_types']
    task = pipeline_utils.infer_primitive_family(task_type=task_type,
                                                 data_types=data_types,
                                                 is_semi=semi)
    main_resource = pipeline_utils.get_tabular_resource_id(dataset=input_data)

    # Loading primitives
    primitives = {
        'DatasetToDataFrame': 'd3m.primitives.data_transformation.dataset_to_dataframe.Common',
        'ColumnParser': 'd3m.primitives.data_transformation.column_parser.Common',
        'ExtractColumnsBySemanticTypes': 'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common',
        'Denormalize': 'd3m.primitives.data_transformation.denormalize.Common',
        'Imputer': 'd3m.primitives.data_cleaning.imputer.SKlearn',
        'SimpleProfiler': 'd3m.primitives.schema_discovery.profiler.Common',
        'TextEncoder': 'd3m.primitives.data_transformation.encoder.DistilTextEncoder',
    }

    loaded_primitives = dict()
    try:
        for primitive_name in primitives.keys():
            loaded_primitives[primitive_name] = index.get_primitive(
                primitives[primitive_name])
    except Exception as e:
        print("Cannot load primitive {}".format(e))

    candidates = []
    for preprocessor in preprocessors:
        if preprocessor.check_task_treatment(task, treatment) \
                and preprocessor.check_expected_data_types(data_types) \
                and preprocessor.check_unsupported_data_types(data_types):
            candidates.append(
                preprocessor(metadata, main_resource, data_types,
                             loaded_primitives, problem))

    if not candidates:
        candidates.append(
            TabularPreprocessor(metadata, main_resource, data_types,
                                loaded_primitives))

    return candidates
def load_hyperparameters(primitive_name):
    primitive = index.get_primitive(primitive_name)
    hyperparameters_metadata = primitive.metadata.query(
    )['primitive_code']['hyperparams']
    hyperparameter_class = typing.get_type_hints(
        primitive.__init__)['hyperparams']

    hyperparameters = {}
    if hyperparameter_class:
        for hp_name, hp_value in hyperparameter_class.configuration.items():
            if 'https://metadata.datadrivendiscovery.org/types/TuningParameter' \
                    in hyperparameters_metadata[hp_name]['semantic_types']:
                hyperparameters[hp_name] = hp_value

    return hyperparameters
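# A minimal sketch of calling load_hyperparameters; the primitive path is an
# assumption and must be installed locally. Each returned value is a d3m
# Hyperparameter instance, so get_default() below comes from the d3m
# hyperparams API.
for name, hyperparameter in load_hyperparameters(
        'd3m.primitives.data_cleaning.imputer.SKlearn').items():
    print(name, hyperparameter.get_default())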
def get_x(self, dataframe):
    # reading images
    image_reader = index.get_primitive(
        'd3m.primitives.data_preprocessing.image_reader.Common')
    hyperparams_class = image_reader.metadata.get_hyperparams()
    primitive = image_reader(hyperparams=hyperparams_class.defaults().replace(
        {'return_result': 'replace'}))

    columns_to_use = primitive._get_columns(dataframe.metadata)
    column_index = columns_to_use[0]
    temp = [
        primitive._read_filename(
            column_index,
            dataframe.metadata.query((row_index, column_index)), value)
        for row_index, value in enumerate(dataframe.iloc[:, column_index])
    ]
    x = np.array(temp, dtype=np.float64)
    return x
def get_y(self, dataframe):
    # extract targets
    get_columns_semantic = index.get_primitive(
        'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
    )
    hyperparams_class = get_columns_semantic.metadata.get_hyperparams()
    primitive = get_columns_semantic(
        hyperparams=hyperparams_class.defaults().replace({
            'semantic_types': (
                'https://metadata.datadrivendiscovery.org/types/TrueTarget',
                'https://metadata.datadrivendiscovery.org/types/Target',
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
        }))
    targets = primitive.produce(inputs=dataframe).value
    y = np.array(targets, dtype=np.int64)
    return y
def update_pipeline(pipeline_to_update, filename=None):
    """
    Updates the pipeline's primitive digests and version numbers.

    Parameters
    ----------
    pipeline_to_update:
        The pipeline in JSON form, without digests. Either this or the
        `filename` parameter is mandatory.
    filename:
        The filename of the pipeline JSON, so we can read it in.

    Returns
    -------
    A pipeline with updated digests.
    """
    if pipeline_to_update is None and filename is None:
        raise ValueError("No pipeline json was given")
    elif pipeline_to_update is None:
        with open(filename, "r") as file:
            # NOTE: must be a pipeline with no digests, or recent digests.
            # NOTE: reading this in as straight JSON doesn't work, so we have
            # to use the pipeline_module.
            pipeline_to_update = pipeline_module.Pipeline.from_json(
                string_or_file=file).to_json_structure()
    else:
        try:
            pipeline_to_update = pipeline_module.Pipeline.from_json(
                json.dumps(pipeline_to_update)).to_json_structure()
        except Exception:
            pass

    for step in pipeline_to_update['steps']:
        # If not updated, check and update.
        primitive = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                step["primitive"]["python_path"]))
        check_step = primitive.to_json_structure()

        # Verify that both the id and the version are up to date.
        id_matches = check_step["primitive"]["id"] == step["primitive"]["id"]
        if not id_matches:
            step["primitive"]["id"] = check_step["primitive"]["id"]
        version_matches = (check_step["primitive"]["version"] ==
                           step["primitive"]["version"])
        if not version_matches:
            step["primitive"]["version"] = check_step["primitive"]["version"]

    return pipeline_to_update
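# A hedged usage sketch for update_pipeline: refresh primitive ids/versions in
# a pipeline description read from disk and write it back. The file name is
# illustrative only.
updated = update_pipeline(None, filename='pipeline.json')
with open('pipeline.json', 'w') as out_file:
    json.dump(updated, out_file, indent=2)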
def add_classifier(pipeline_description, dataset_to_dataframe_step, attributes,
                   targets):
    lr = PrimitiveStep(primitive=SKLogisticRegression)
    lr.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference=attributes)
    lr.add_argument(name='outputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference=targets)
    lr.add_output('produce')
    pipeline_description.add_step(lr)

    construct_pred = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.construct_predictions.Common'))
    construct_pred.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference=pipeline_utils.int_to_step(lr.index))
    construct_pred.add_argument(name='reference',
                                argument_type=ArgumentType.CONTAINER,
                                data_reference=dataset_to_dataframe_step)
    construct_pred.add_output('produce')
    pipeline_description.add_step(construct_pred)

    # Final Output
    pipeline_description.add_output(
        name='output predictions',
        data_reference=pipeline_utils.int_to_step(construct_pred.index))
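# A hypothetical call to add_classifier, assuming a pipeline whose attribute
# and target extraction steps produce 'steps.2.produce' and 'steps.3.produce';
# all three data references are illustrative only.
add_classifier(pipeline_description,
               dataset_to_dataframe_step='steps.0.produce',
               attributes='steps.2.produce',
               targets='steps.3.produce')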
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
from d3m.metadata import hyperparams
import numpy as np

# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest
#                            extract_columns_by_semantic_types(targets) -> ^

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
primitive_0 = index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common')
step_0 = PrimitiveStep(primitive=primitive_0)
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: column_parser
primitive_1 = index.get_primitive(
    'd3m.primitives.data_transformation.column_parser.Common')
step_1 = PrimitiveStep(primitive=primitive_1)
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_output('produce')
from d3m import index
from d3m.metadata.base import ArgumentType, Context
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
import sys

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# d3m.primitives.data_transformation.column_parser.Common
# d3m.primitives.data_cleaning.column_type_profiler.Simon

# Step 0: dataset_to_dataframe
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: Column profiler
step_1 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.schema_discovery.profiler.Common'))
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: DISTIL/NK pca feature selection
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
from d3m.metadata import hyperparams

# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest
#                            extract_columns_by_semantic_types(targets) -> ^

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
primitive_0 = index.get_primitive(
    'd3m.primitives.tods.data_processing.dataset_to_dataframe')
step_0 = PrimitiveStep(primitive=primitive_0)
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: column_parser
primitive_1 = index.get_primitive(
    'd3m.primitives.data_transformation.column_parser.Common')
step_1 = PrimitiveStep(primitive=primitive_1)
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)
def __init__(
    self,
    epochs: int = 5000,
    attention_lstm: bool = True,
):
    pipeline_description = Pipeline()
    pipeline_description.add_input(name="inputs")

    # Ts formatter
    step = PrimitiveStep(
        primitive=index.get_primitive(
            "d3m.primitives.data_transformation.time_series_formatter.DistilTimeSeriesFormatter"
        )
    )
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="inputs.0",
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # DS to DF on formatted ts DS
    step = PrimitiveStep(
        primitive=index.get_primitive(
            "d3m.primitives.data_transformation.dataset_to_dataframe.Common"
        )
    )
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.0.produce",
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # DS to DF on input DS
    step = PrimitiveStep(
        primitive=index.get_primitive(
            "d3m.primitives.data_transformation.dataset_to_dataframe.Common"
        )
    )
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="inputs.0",
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # column parser on input DF
    step = PrimitiveStep(
        primitive=index.get_primitive(
            "d3m.primitives.data_transformation.column_parser.Common"
        )
    )
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.2.produce",
    )
    step.add_output("produce")
    step.add_hyperparameter(
        name="parse_semantic_types",
        argument_type=ArgumentType.VALUE,
        data=[
            "http://schema.org/Boolean",
            "http://schema.org/Integer",
            "http://schema.org/Float",
            "https://metadata.datadrivendiscovery.org/types/FloatVector",
        ],
    )
    pipeline_description.add_step(step)

    # parse target semantic types
    step = PrimitiveStep(
        primitive=index.get_primitive(
            "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common"
        )
    )
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.3.produce",
    )
    step.add_hyperparameter(
        name="semantic_types",
        argument_type=ArgumentType.VALUE,
        data=[
            "https://metadata.datadrivendiscovery.org/types/Target",
        ],
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # LSTM FCN
    step = PrimitiveStep(
        primitive=index.get_primitive(
            "d3m.primitives.time_series_classification.convolutional_neural_net.LSTM_FCN"
        )
    )
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.1.produce",
    )
    step.add_argument(
        name="outputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.4.produce",
    )
    step.add_hyperparameter(
        name="epochs", argument_type=ArgumentType.VALUE, data=epochs
    )
    step.add_hyperparameter(
        name="attention_lstm", argument_type=ArgumentType.VALUE, data=attention_lstm
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # construct predictions
    step = PrimitiveStep(
        primitive=index.get_primitive(
            "d3m.primitives.data_transformation.construct_predictions.Common"
        )
    )
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.5.produce",
    )
    step.add_argument(
        name="reference",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.2.produce",
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # Final Output
    pipeline_description.add_output(
        name="output predictions", data_reference="steps.6.produce"
    )

    self.pipeline = pipeline_description
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name="inputs")

# Step 0: DS to DF on input DS
step_0 = PrimitiveStep(primitive=index.get_primitive(
    "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
step_0.add_argument(name="inputs",
                    argument_type=ArgumentType.CONTAINER,
                    data_reference="inputs.0")
step_0.add_output("produce")
pipeline_description.add_step(step_0)

# Step 1: Simple Profiler Column Role Annotation
step_1 = PrimitiveStep(primitive=index.get_primitive(
    "d3m.primitives.schema_discovery.profiler.Common"))
step_1.add_argument(
    name="inputs",
    argument_type=ArgumentType.CONTAINER,
    data_reference="steps.0.produce",
)
step_1.add_output("produce")
pipeline_description.add_step(step_1)

# Step 2: column parser on input DF
step_2 = PrimitiveStep(primitive=index.get_primitive(
    "d3m.primitives.data_transformation.column_parser.Common"))
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest
#                            extract_columns_by_semantic_types(targets) -> ^

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
primitive_0 = index.get_primitive(
    'd3m.primitives.tods.data_processing.dataset_to_dataframe')
step_0 = PrimitiveStep(primitive=primitive_0)
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: Column Parser
primitive_1 = index.get_primitive(
    'd3m.primitives.data_transformation.column_parser.Common')
step_1 = PrimitiveStep(primitive=primitive_1)
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)
def generate_only():
    # Creating pipeline
    pipeline_description = Pipeline()
    pipeline_description.add_input(name='inputs')

    # Step 0: dataset_to_dataframe
    step_0 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline_description.add_step(step_0)

    # Step 1: profiler
    step_1 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.schema_discovery.profiler.Common'))
    step_1.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.0.produce')
    step_1.add_output('produce')
    pipeline_description.add_step(step_1)

    # Step 2: column_parser
    step_2 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.column_parser.Common'))
    step_2.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_output('produce')
    pipeline_description.add_step(step_2)

    # Step 3: DFS Single Table
    step_3 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization'
    ))
    step_3.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_3.add_output('produce')
    pipeline_description.add_step(step_3)

    # Step 4: learn model
    step_4 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.regression.xgboost_gbtree.Common'))
    step_4.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.3.produce')
    step_4.add_argument(name='outputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_4.add_output('produce')
    pipeline_description.add_step(step_4)

    # Step 5: construct output
    step_5 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.construct_predictions.Common'))
    step_5.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.4.produce')
    step_5.add_argument(name='reference',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_5.add_output('produce')
    pipeline_description.add_step(step_5)

    # Final Output
    pipeline_description.add_output(name='output predictions',
                                    data_reference='steps.5.produce')

    # Generate .yml file for the pipeline
    import featuretools_ta1
    from pipeline_tests.utils import generate_pipeline

    dataset_name = 'LL1_retail_sales_total_MIN_METADATA'
    dataset_path = '/featuretools_ta1/datasets/seed_datasets_current'
    primitive_name = 'd3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization'
    version = featuretools_ta1.__version__
    test_name = os.path.splitext(os.path.basename(__file__))[0][5:]
    yml, pipeline_run_file = generate_pipeline(
        primitive_name=primitive_name,
        pipeline_description=pipeline_description,
        dataset_name=dataset_name,
        test_name=test_name)

    # fit-score command
    fs_cmd = 'python3 -m d3m runtime -d /featuretools_ta1/datasets/ fit-score -p {}'.format(
        yml)
    fs_cmd += ' -r {}/{}/{}_problem/problemDoc.json'.format(
        dataset_path, dataset_name, dataset_name)
    fs_cmd += ' -i {}/{}/TRAIN/dataset_TRAIN/datasetDoc.json'.format(
        dataset_path, dataset_name)
    fs_cmd += ' -t {}/{}/TEST/dataset_TEST/datasetDoc.json'.format(
        dataset_path, dataset_name)
    fs_cmd += ' -a {}/{}/SCORE/dataset_SCORE/datasetDoc.json'.format(
        dataset_path, dataset_name)
    fs_cmd += ' -O {}'.format(pipeline_run_file)

    # Run pipeline to save pipeline_run file
    os.system(fs_cmd)

    # Create and return command for running from pipeline_run file
    pipeline_run_cmd = 'python3 -m d3m --pipelines-path /featuretools_ta1/MIT_FeatureLabs/{}/{}/pipelines/'.format(
        primitive_name, version)
    pipeline_run_cmd += ' runtime -d /featuretools_ta1/datasets/ fit-score'
    pipeline_run_cmd += ' -u {}'.format(pipeline_run_file)

    return pipeline_run_cmd
def build_pipeline(pipeline_info, pipeline_mapping, stdout=None):
    default_stdout = sys.stdout
    if stdout is not None:
        sys.stdout = stdout

    # Creating pipeline
    pipeline_description = Pipeline()
    pipeline_description.add_input(name='inputs')

    for primitive_info in pipeline_info:
        print(primitive_info.python_path)
        print(primitive_info.hyperparameter)
        print(primitive_info.ancestors)

        if primitive_info.python_path == 'HEAD':
            dataset_fullname = primitive_info.hyperparameter['dataset_folder']
            print(dataset_fullname)
            continue
        elif primitive_info.python_path == 'ENDING':
            ancestors = primitive_info.ancestors
            end_step_num = pipeline_mapping[ancestors['inputs']] - 1
            pipeline_description.add_output(
                name='output predictions',
                data_reference='steps.' + str(end_step_num) + '.produce')
        else:
            primitive = index.get_primitive(primitive_info.python_path)
            step = PrimitiveStep(primitive=primitive)
            hyperparameters = primitive_info.hyperparameter
            ancestors = primitive_info.ancestors

            # Add arguments, either from ancestor steps or from the pipeline
            # input.
            if ancestors['inputs'] != 0:
                for ances_key in ancestors.keys():
                    print(ances_key, ancestors[ances_key],
                          pipeline_mapping[ancestors[ances_key]] - 1)
                    step_num = pipeline_mapping[ancestors[ances_key]] - 1
                    step.add_argument(
                        name=ances_key,
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.' + str(step_num) + '.produce')
            else:
                step.add_argument(name='inputs',
                                  argument_type=ArgumentType.CONTAINER,
                                  data_reference='inputs.0')

            # Add hyperparameters.
            for hyper in hyperparameters.keys():
                hyper_value = hyperparameters[hyper]
                step.add_hyperparameter(name=hyper,
                                        argument_type=ArgumentType.VALUE,
                                        data=hyper_value)

            step.add_output('produce')
            pipeline_description.add_step(step)

    # Output to json
    data = pipeline_description.to_json()
    with open('example_pipeline.json', 'w') as f:
        f.write(data)
    print(data)

    # yaml = pipeline_description.to_yaml()
    # with open('example_pipeline.yml', 'w') as f:
    #     f.write(yaml)
    # print(yaml)

    sys.stdout.flush()
    sys.stdout = default_stdout
def _gen_pipeline(self):
    # Pipeline context is just metadata, ignore for now.
    pipeline = meta_pipeline.Pipeline()
    # Define inputs. This will be read in automatically as a Dataset object.
    pipeline.add_input(name='inputs')

    # Step 0: DatasetToDataFrame
    step_0 = meta_pipeline.PrimitiveStep(
        primitive_description=DatasetToDataFramePrimitive.metadata.query())
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline.add_step(step_0)

    # Step 1: Simple Profiler Column Role Annotation
    step_1 = meta_pipeline.PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.schema_discovery.profiler.Common"))
    step_1.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.0.produce",
    )
    step_1.add_output("produce")
    pipeline.add_step(step_1)

    # Step 2: ColumnParser
    step_2 = meta_pipeline.PrimitiveStep(
        primitive_description=ColumnParserPrimitive.metadata.query())
    step_2.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_output('produce')
    pipeline.add_step(step_2)

    # Step 3: Extract Attributes
    step_3 = meta_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.query())
    step_3.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_3.add_output('produce')
    step_3.add_hyperparameter(
        name='semantic_types',
        argument_type=ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
    pipeline.add_step(step_3)

    # Step 4: Extract Targets
    step_4 = meta_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.query())
    step_4.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_4.add_output('produce')
    step_4.add_hyperparameter(
        name='semantic_types',
        argument_type=ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
    pipeline.add_step(step_4)

    # Step 5: Transform the attributes dataframe into an ndarray.
    step_5 = meta_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    step_5.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.3.produce'  # Inputs here are the outputs from step 3.
    )
    step_5.add_output('produce')
    pipeline.add_step(step_5)

    # Step 6: Run RandomizedPolyPCA.
    step_6 = meta_pipeline.PrimitiveStep(
        primitive_description=RandomizedPolyPCA.metadata.query())
    step_6.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.5.produce'  # Inputs here are the outputs from step 5.
    )
    step_6.add_hyperparameter(name='n_components',
                              argument_type=ArgumentType.VALUE,
                              data=15)
    step_6.add_hyperparameter(name='degree',
                              argument_type=ArgumentType.VALUE,
                              data=2)
    step_6.add_output('produce')
    pipeline.add_step(step_6)

    # Step 7: Convert numpy-formatted attribute data to a dataframe.
    step_7 = meta_pipeline.PrimitiveStep(
        primitive_description=NDArrayToDataFramePrimitive.metadata.query())
    step_7.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.6.produce'  # Inputs here are the outputs from step 6.
    )
    step_7.add_output('produce')
    pipeline.add_step(step_7)

    # Step 8: Gradient boosting regression on the low-rank data (inputs and
    # outputs for sklearn primitives are both dataframes).
    step_8 = meta_pipeline.PrimitiveStep(
        primitive_description=d3m.primitives.regression.gradient_boosting.SKlearn.metadata.query())
    step_8.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.7.produce')
    step_8.add_argument(name='outputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.4.produce')
    step_8.add_hyperparameter(name='n_estimators',
                              argument_type=ArgumentType.VALUE,
                              data=50000)
    step_8.add_hyperparameter(name='learning_rate',
                              argument_type=ArgumentType.VALUE,
                              data=0.002)
    step_8.add_hyperparameter(name='max_depth',
                              argument_type=ArgumentType.VALUE,
                              data=2)
    # step_8.add_hyperparameter(
    #     name='loss',
    #     argument_type=ArgumentType.VALUE,
    #     data='ls'
    # )
    step_8.add_output('produce')
    pipeline.add_step(step_8)

    # Step 9: Finally, generate a properly-formatted output dataframe from the
    # prediction outputs, using the input dataframe as a reference.
    step_9 = meta_pipeline.PrimitiveStep(
        primitive_description=ConstructPredictionsPrimitive.metadata.query())
    step_9.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.8.produce'  # Inputs here are the prediction column.
    )
    step_9.add_argument(
        name='reference',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.1.produce'  # Reference is the dataframed input dataset.
    )
    step_9.add_output('produce')
    pipeline.add_step(step_9)

    # Adding output step to the pipeline
    pipeline.add_output(name='output', data_reference='steps.9.produce')

    return pipeline
from d3m import index
from d3m.metadata.base import ArgumentType, Context
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
import sys

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: Denormalize primitive
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: dataset_to_dataframe
step_1 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_hyperparameter(name='dataframe_resource',
                          argument_type=ArgumentType.VALUE,
                          data='learningData')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: DISTIL/NK VAR primitive
def __init__(self):
    pipeline_description = Pipeline()
    pipeline_description.add_input(name="inputs")

    # DS to DF on input DS
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
    step.add_argument(name="inputs",
                      argument_type=ArgumentType.CONTAINER,
                      data_reference="inputs.0")
    step.add_output("produce")
    pipeline_description.add_step(step)

    # Simon
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_cleaning.column_type_profiler.Simon"))
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.0.produce",
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # column parser on input DF
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.column_parser.Common"))
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.1.produce",
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # XG Boost
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.classification.xgboost_gbtree.Common"))
    step.add_argument(name="inputs",
                      argument_type=ArgumentType.CONTAINER,
                      data_reference="steps.2.produce")
    step.add_argument(name="outputs",
                      argument_type=ArgumentType.CONTAINER,
                      data_reference="steps.2.produce")
    step.add_output("produce")
    step.add_hyperparameter(name="add_index_columns",
                            argument_type=ArgumentType.VALUE,
                            data=True)
    pipeline_description.add_step(step)

    # construct predictions
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.construct_predictions.Common"))
    step.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.3.produce",
    )
    step.add_argument(
        name="reference",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.0.produce",
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # Final Output
    pipeline_description.add_output(name="output predictions",
                                    data_reference="steps.4.produce")

    self.pipeline = pipeline_description
from d3m import index
from d3m.metadata.base import ArgumentType, Context
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: Denormalize primitive
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: dataset_to_dataframe
step_1 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_hyperparameter(name='dataframe_resource',
                          argument_type=ArgumentType.VALUE,
                          data='learningData')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: DISTIL/NK data cleaning
step_2 = PrimitiveStep(primitive=index.get_primitive(
from d3m import index
from d3m.metadata.base import ArgumentType, Context
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
import sys

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: Denormalize primitive
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: DISTIL/NK Storc primitive
step_1 = PrimitiveStep(
    primitive=index.get_primitive('d3m.primitives.clustering.k_means.Sloth'))
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_hyperparameter(name='nclusters',
                          argument_type=ArgumentType.VALUE,
                          data=10)
step_1.add_hyperparameter(name='long_format',
                          argument_type=ArgumentType.VALUE,
                          data=True)
step_1.add_output('produce')
def _gen_pipeline(self):
    pipeline = d3m_pipeline.Pipeline()
    # Define inputs. This will be read in automatically as a Dataset object.
    pipeline.add_input(name='inputs')

    # A Denormalize step (to join multiple tabular resources) would go here,
    # but there is no entry point for Denormalize.

    # Step 0: Dataset -> Dataframe
    step_0 = d3m_pipeline.PrimitiveStep(
        primitive_description=DatasetToDataFramePrimitive.metadata.query())
    step_0.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline.add_step(step_0)

    # Step 1: Simple Profiler Column Role Annotation
    step_1 = d3m_pipeline.PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.schema_discovery.profiler.Common"))
    step_1.add_argument(
        name="inputs",
        argument_type=d3m_base.ArgumentType.CONTAINER,
        data_reference="steps.0.produce",
    )
    step_1.add_output("produce")
    pipeline.add_step(step_1)

    # Step 2: ColumnParser
    step_2 = d3m_pipeline.PrimitiveStep(
        primitive_description=ColumnParserPrimitive.metadata.query())
    step_2.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_output('produce')
    pipeline.add_step(step_2)

    # Step 3: Imputer
    step_3 = d3m_pipeline.PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_cleaning.imputer.SKlearn'))
    step_3.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_3.add_hyperparameter(name='use_semantic_types',
                              argument_type=d3m_base.ArgumentType.VALUE,
                              data=True)
    step_3.add_output('produce')
    pipeline.add_step(step_3)

    # Step 4: Extract attributes from dataset into a dedicated dataframe.
    step_4 = d3m_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.query())
    step_4.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.3.produce')
    step_4.add_output('produce')
    step_4.add_hyperparameter(
        name='semantic_types',
        argument_type=d3m_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
    pipeline.add_step(step_4)

    # Step 5: Binary encoding for categorical features.
    step_5 = d3m_pipeline.PrimitiveStep(
        primitive_description=BinaryEncoderPrimitive.metadata.query())
    step_5.add_hyperparameter(name='min_binary',
                              argument_type=d3m_base.ArgumentType.VALUE,
                              data=2)
    step_5.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.4.produce')
    step_5.add_output('produce')
    pipeline.add_step(step_5)

    # Step 6: Extract Targets
    step_6 = d3m_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.query())
    step_6.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_6.add_hyperparameter(
        name='semantic_types',
        argument_type=d3m_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
    step_6.add_output('produce')
    pipeline.add_step(step_6)

    # Step 7: Transform the targets dataframe into an ndarray.
    step_7 = d3m_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    step_7.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.6.produce')
    step_7.add_output('produce')
    pipeline.add_step(step_7)

    # Step 8: Transform the features dataframe into an ndarray.
    step_8 = d3m_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    step_8.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.5.produce')
    step_8.add_output('produce')
    pipeline.add_step(step_8)

    attributes = 'steps.8.produce'
    targets = 'steps.7.produce'

    # Step 9: Run SparsePCA on the attributes.
    step_9 = d3m_pipeline.PrimitiveStep(
        primitive_description=SparsePCA.metadata.query())
    step_9.add_argument(
        name='inputs',
        argument_type=d3m_base.ArgumentType.CONTAINER,
        data_reference=attributes  # Inputs here are the outputs from step 8.
    )
    step_9.add_hyperparameter(name='n_components',
                              argument_type=d3m_base.ArgumentType.VALUE,
                              data=4)
    step_9.add_hyperparameter(name='beta',
                              argument_type=d3m_base.ArgumentType.VALUE,
                              data=1e-8)
    step_9.add_hyperparameter(name='alpha',
                              argument_type=d3m_base.ArgumentType.VALUE,
                              data=1e-3)
    step_9.add_hyperparameter(name='degree',
                              argument_type=d3m_base.ArgumentType.VALUE,
                              data=2)
    step_9.add_output('produce')
    pipeline.add_step(step_9)

    # Step 10: Convert the numpy-formatted PCA outputs to a dataframe.
    step_10 = d3m_pipeline.PrimitiveStep(
        primitive_description=NDArrayToDataFramePrimitive.metadata.query())
    step_10.add_argument(name='inputs',
                         argument_type=d3m_base.ArgumentType.CONTAINER,
                         data_reference='steps.9.produce')
    step_10.add_output('produce')
    pipeline.add_step(step_10)

    # Step 11: Horizontally concatenate the PCA features with the encoded
    # attributes.
    step_11 = d3m_pipeline.PrimitiveStep(
        primitive_description=HorizontalConcatPrimitive.metadata.query())
    step_11.add_argument(name='left',
                         argument_type=d3m_base.ArgumentType.CONTAINER,
                         data_reference='steps.10.produce')
    step_11.add_argument(name='right',
                         argument_type=d3m_base.ArgumentType.CONTAINER,
                         data_reference='steps.5.produce')
    step_11.add_output('produce')
    pipeline.add_step(step_11)

    # Step 12: Gradient boosting regression on the low-rank data (inputs and
    # outputs for sklearn primitives are both dataframes).
    step_12 = d3m_pipeline.PrimitiveStep(
        primitive_description=d3m.primitives.regression.gradient_boosting.SKlearn.metadata.query())
    step_12.add_argument(name='inputs',
                         argument_type=d3m_base.ArgumentType.CONTAINER,
                         data_reference='steps.11.produce')
    step_12.add_argument(name='outputs',
                         argument_type=d3m_base.ArgumentType.CONTAINER,
                         data_reference='steps.6.produce')
    step_12.add_hyperparameter(name='n_estimators',
                               argument_type=d3m_base.ArgumentType.VALUE,
                               data=10000)
    step_12.add_hyperparameter(name='learning_rate',
                               argument_type=d3m_base.ArgumentType.VALUE,
                               data=0.001)
    step_12.add_hyperparameter(name='max_depth',
                               argument_type=d3m_base.ArgumentType.VALUE,
                               data=2)
    step_12.add_output('produce')
    pipeline.add_step(step_12)

    # Step 13: Generate a properly-formatted output dataframe from the
    # dataframed prediction outputs, using the input dataframe as a reference.
    step_13 = d3m_pipeline.PrimitiveStep(
        primitive_description=ConstructPredictionsPrimitive.metadata.query())
    step_13.add_argument(
        name='inputs',
        argument_type=d3m_base.ArgumentType.CONTAINER,
        data_reference='steps.12.produce'  # Inputs here are the prediction column.
    )
    step_13.add_argument(
        name='reference',
        argument_type=d3m_base.ArgumentType.CONTAINER,
        data_reference='steps.1.produce'  # Reference is the dataframed input dataset.
    )
    step_13.add_output('produce')
    pipeline.add_step(step_13)

    # Final Output
    pipeline.add_output(name='output', data_reference='steps.13.produce')

    return pipeline