def _gen_pipeline(self):
    """Build the pipeline: DataConversion -> AdjacencySpectralEmbedding -> RankClassification.

    Returns the assembled meta_pipeline.Pipeline with a single 'inputs'
    input and a 'Predictions' output taken from the final step.
    """
    pipeline = meta_pipeline.Pipeline()
    pipeline.add_input(name='inputs')

    # Step 0: convert the raw dataset input into the form the embedding expects.
    conversion = meta_pipeline.PrimitiveStep(
        primitive_description=DataConversion.metadata.query())
    conversion.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
    conversion.add_output('produce')
    pipeline.add_step(conversion)

    # Step 1: adjacency spectral embedding of the graph.
    embedding = meta_pipeline.PrimitiveStep(
        primitive_description=AdjacencySpectralEmbedding.metadata.query())
    embedding.add_argument(name='inputs',
                          argument_type=ArgumentType.CONTAINER,
                          data_reference='steps.0.produce')
    # Hyperparameters are applied in the same order as the original builder.
    for hp_name, hp_value in (('which_elbow', 1),
                              ('max_dimension', 2),
                              ('use_attributes', False)):
        embedding.add_hyperparameter(name=hp_name,
                                     argument_type=ArgumentType.VALUE,
                                     data=hp_value)
    embedding.add_output('produce')
    pipeline.add_step(embedding)

    # Step 2: rank-based classification on the embedded vertices.
    classifier = meta_pipeline.PrimitiveStep(
        primitive_description=RankClassification.metadata.query())
    classifier.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
    classifier.add_output('produce')
    pipeline.add_step(classifier)

    # Expose the classifier output as the pipeline's predictions.
    pipeline.add_output(name='Predictions', data_reference='steps.2.produce')
    return pipeline
def _gen_pipeline(self):
    """Build the pipeline: LoadGraphs -> LargestConnectedComponent ->
    LaplacianSpectralEmbedding -> GaussianClustering.

    Returns the assembled meta_pipeline.Pipeline with a single 'inputs'
    input and a 'Predictions' output taken from the final step.
    """
    pipeline = meta_pipeline.Pipeline()
    pipeline.add_input(name='inputs')

    def _append_step(primitive_class, input_ref, hyperparams=()):
        # Wire one PrimitiveStep to `input_ref`, set its hyperparameters
        # (in order), expose 'produce', and append it to the pipeline.
        step = meta_pipeline.PrimitiveStep(
            primitive_description=primitive_class.metadata.query())
        step.add_argument(name='inputs',
                          argument_type=ArgumentType.CONTAINER,
                          data_reference=input_ref)
        for hp_name, hp_value in hyperparams:
            step.add_hyperparameter(name=hp_name,
                                    argument_type=ArgumentType.VALUE,
                                    data=hp_value)
        step.add_output('produce')
        pipeline.add_step(step)

    _append_step(LoadGraphs, 'inputs.0')
    _append_step(LargestConnectedComponent, 'steps.0.produce')
    _append_step(LaplacianSpectralEmbedding, 'steps.1.produce',
                 (('max_dimension', 5), ('use_attributes', True)))
    _append_step(GaussianClustering, 'steps.2.produce',
                 (('max_clusters', 10),))

    # Expose the cluster assignments as the pipeline's predictions.
    pipeline.add_output(name='Predictions', data_reference='steps.3.produce')
    return pipeline
def _gen_pipeline(self):
    """Build the nomination pipeline: three dataframe views of the input
    dataset feed EuclideanNomination.

    Steps 1 and 2 extract dataset resources '1' and '2'; step 0 extracts
    the default resource and serves as the reference for nomination.
    """
    pipeline = meta_pipeline.Pipeline()
    pipeline.add_input(name='inputs')

    # All three dataframe views use the same Common primitive; resolve it once.
    to_dataframe = index.get_primitive(
        'd3m.primitives.data_transformation.dataset_to_dataframe.Common')

    # Step 0: reference dataframe (default resource).
    step_0 = PrimitiveStep(primitive=to_dataframe)
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline.add_step(step_0)

    # Step 1: dataframe for dataset resource '1'.
    step_1 = PrimitiveStep(primitive=to_dataframe)
    step_1.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_1.add_hyperparameter(name='dataframe_resource',
                              argument_type=ArgumentType.VALUE,
                              data='1')
    step_1.add_output('produce')
    pipeline.add_step(step_1)

    # Step 2: dataframe for dataset resource '2'.
    step_2 = PrimitiveStep(primitive=to_dataframe)
    step_2.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_2.add_hyperparameter(name='dataframe_resource',
                              argument_type=ArgumentType.VALUE,
                              data='2')
    step_2.add_output('produce')
    pipeline.add_step(step_2)

    # Step 3: nominate matches between the two resource views, using the
    # default-resource dataframe as the reference.
    step_3 = meta_pipeline.PrimitiveStep(
        primitive_description=EuclideanNomination.metadata.query())
    step_3.add_argument(name='inputs_1',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_3.add_argument(name='inputs_2',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_3.add_argument(name='reference',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.0.produce')
    step_3.add_output('produce')
    pipeline.add_step(step_3)

    pipeline.add_output(name='Predictions', data_reference='steps.3.produce')
    return pipeline
def _gen_pipeline(self):
    """Build the pipeline: LargestConnectedComponent ->
    OutOfSampleLaplacianSpectralEmbedding -> GaussianClustering.

    Returns the assembled meta_pipeline.Pipeline with a single 'inputs'
    input and a 'Predictions' output taken from the final step.
    """
    # FIX: dropped the `context=Context.TESTING` constructor argument for
    # consistency with the other pipeline builders in this file — the d3m
    # Pipeline constructor no longer takes `context` (it defaulted to
    # TESTING in older releases, so behavior is unchanged there).
    pipeline = meta_pipeline.Pipeline()
    pipeline.add_input(name='inputs')

    # Step 0: restrict the graph to its largest connected component.
    step_0 = meta_pipeline.PrimitiveStep(
        primitive_description=LargestConnectedComponent.metadata.query())
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline.add_step(step_0)

    # Step 1: out-of-sample Laplacian spectral embedding.
    step_1 = meta_pipeline.PrimitiveStep(
        primitive_description=OutOfSampleLaplacianSpectralEmbedding.metadata.query())
    step_1.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.0.produce')
    step_1.add_output('produce')
    pipeline.add_step(step_1)

    # Step 2: cluster the embedded vertices with a Gaussian mixture.
    step_2 = meta_pipeline.PrimitiveStep(
        primitive_description=GaussianClustering.metadata.query())
    step_2.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_output('produce')
    pipeline.add_step(step_2)

    # Expose the cluster assignments as the pipeline's predictions.
    pipeline.add_output(name='Predictions', data_reference='steps.2.produce')
    return pipeline
def _gen_pipeline(self):
    """Build a one-step pipeline running SpectralGraphClustering on the input.

    Returns the assembled meta_pipeline.Pipeline with a single 'inputs'
    input and a 'results' output taken from the only step.
    """
    pipeline = meta_pipeline.Pipeline()
    pipeline.add_input(name='inputs')

    # The single primitive consumes the dataset and emits cluster labels.
    clustering = meta_pipeline.PrimitiveStep(
        primitive_description=SpectralGraphClustering.metadata.query())
    clustering.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
    clustering.add_output('produce')
    pipeline.add_step(clustering)

    pipeline.add_output(name='results', data_reference='steps.0.produce')
    return pipeline
def _gen_pipeline(self):
    """Build a one-step pipeline running SeededGraphMatching on the input.

    Returns the assembled meta_pipeline.Pipeline with a single 'inputs'
    input and a 'Predictions' output taken from the only step.
    """
    pipeline = meta_pipeline.Pipeline()
    pipeline.add_input(name='inputs')

    # The single primitive consumes the dataset and emits the matching.
    matcher = meta_pipeline.PrimitiveStep(
        primitive_description=SeededGraphMatching.metadata.query())
    matcher.add_argument(
        name='inputs',
        argument_type=meta_pipeline.metadata_base.ArgumentType.CONTAINER,
        data_reference='inputs.0')
    matcher.add_output('produce')
    pipeline.add_step(matcher)

    pipeline.add_output(name='Predictions', data_reference='steps.0.produce')
    return pipeline
def update_pipeline(pipeline_to_update, filename=None):
    """Synchronize a pipeline's primitive ids and versions with the installed primitives.

    Parameters
    ----------
    pipeline_to_update:
        The pipeline in JSON-structure (dict) form, WITHOUT digests (or with
        recent digests). Either this or `filename` is mandatory.
    filename:
        Path of a pipeline json file to read when `pipeline_to_update` is None.

    Returns
    -------
    The pipeline JSON structure with each step's primitive id and version
    brought in line with the locally installed primitive.

    Raises
    ------
    ValueError
        When neither `pipeline_to_update` nor `filename` is provided.
    """
    if pipeline_to_update is None and filename is None:
        raise ValueError("No pipeline json was given")

    if pipeline_to_update is None:
        with open(filename, "r") as file:
            # NOTE: must be a pipeline with no digests, or recent digests
            # NOTE: reading this in as straight JSON doesn't work so we have to
            # round-trip through pipeline_module
            pipeline_to_update = pipeline_module.Pipeline.from_json(
                string_or_file=file
            ).to_json_structure()
    else:
        try:
            # Normalize the caller-supplied structure through the d3m parser.
            pipeline_to_update = pipeline_module.Pipeline.from_json(
                json.dumps(pipeline_to_update)
            ).to_json_structure()
        except Exception:
            # Deliberate best-effort: if the structure cannot round-trip
            # (e.g. stale digests), keep the caller's structure as-is and
            # still update ids/versions below.
            pass

    for step in pipeline_to_update['steps']:
        # Re-resolve each primitive from the local index and sync id/version.
        primitive = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(step["primitive"]["python_path"])
        )
        check_step = primitive.to_json_structure()
        if check_step["primitive"]["id"] != step["primitive"]["id"]:
            step["primitive"]["id"] = check_step["primitive"]["id"]
        if check_step["primitive"]["version"] != step["primitive"]["version"]:
            step["primitive"]["version"] = check_step["primitive"]["version"]

    return pipeline_to_update
def _gen_pipeline(self):
    """Build a one-step SeededGraphMatching pipeline with tuned hyperparameters.

    Returns the assembled meta_pipeline.Pipeline with a single 'inputs'
    input and a 'Predictions' output taken from the only step.
    """
    # FIX: dropped the `context=Context.TESTING` constructor argument for
    # consistency with the other pipeline builders in this file — the d3m
    # Pipeline constructor no longer takes `context` (it defaulted to
    # TESTING in older releases, so behavior is unchanged there).
    pipeline = meta_pipeline.Pipeline()
    pipeline.add_input(name='inputs')

    step_0 = meta_pipeline.PrimitiveStep(
        primitive_description=SeededGraphMatching.metadata.query())
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    # Number of restarts and convergence threshold for the matcher.
    step_0.add_hyperparameter(name='reps',
                              argument_type=ArgumentType.VALUE,
                              data=10)
    step_0.add_hyperparameter(name='threshold',
                              argument_type=ArgumentType.VALUE,
                              data=0.1)
    step_0.add_output('produce')
    pipeline.add_step(step_0)

    pipeline.add_output(name='Predictions', data_reference='steps.0.produce')
    return pipeline
def _gen_pipeline(self):
    """Build a binary-classification pipeline around TensorMachinesBinaryClassification.

    Flow: dataset -> dataframe -> profiled -> parsed -> (attributes, targets)
    -> imputed attribute ndarray -> tensor-machine classifier -> formatted
    predictions dataframe.
    """
    #pipeline context is just metadata, ignore for now
    pipeline = meta_pipeline.Pipeline()
    #define inputs. This will be read in automatically as a Dataset object.
    pipeline.add_input(name='inputs')

    # Step 0: DatasetToDataFrame
    step_0 = meta_pipeline.PrimitiveStep(
        primitive_description=DatasetToDataFramePrimitive.metadata.query())
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline.add_step(step_0)

    # Step 1: Simple Profiler Column Role Annotation
    step_1 = meta_pipeline.PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.schema_discovery.profiler.Common"))
    step_1.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.0.produce",
    )
    step_1.add_output("produce")
    pipeline.add_step(step_1)

    # Step 2: ColumnParser (comment previously mislabeled this as "Step 1")
    step_2 = meta_pipeline.PrimitiveStep(
        primitive_description=ColumnParserPrimitive.metadata.query())
    step_2.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_output('produce')
    pipeline.add_step(step_2)

    # Step 3: Extract Attributes
    step_3 = meta_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.query())
    step_3.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_3.add_output('produce')
    step_3.add_hyperparameter(
        name='semantic_types',
        argument_type=ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
    pipeline.add_step(step_3)

    # Step 4: Impute missing attributes
    step_4 = meta_pipeline.PrimitiveStep(
        primitive_description=SimpleImputerPrimitive.metadata.query())
    step_4.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.3.produce')
    step_4.add_output('produce')
    pipeline.add_step(step_4)

    # Step 5: Convert attributes to ndarray
    step_5 = meta_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    step_5.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.4.produce')
    step_5.add_output('produce')
    pipeline.add_step(step_5)

    # Step 6: Extract Targets (from the parsed dataframe, step 2)
    step_6 = meta_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.query())
    step_6.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_6.add_output('produce')
    step_6.add_hyperparameter(
        name='semantic_types',
        argument_type=ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
    pipeline.add_step(step_6)

    # Step 7: Transform targets into an ndarray
    step_7 = meta_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    step_7.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.6.produce')
    step_7.add_output('produce')
    pipeline.add_step(step_7)

    # Step 8: use TensorMachinesBinaryClassification
    step_8 = meta_pipeline.PrimitiveStep(
        primitive_description=TensorMachinesBinaryClassification.metadata.query())
    step_8.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.5.produce'  # imputed attribute ndarray from step 5
    )
    step_8.add_argument(
        name='outputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.7.produce'  # target ndarray from step 7
    )
    step_8.add_output('produce')
    pipeline.add_step(step_8)

    # Step 9: convert numpy-formatted prediction outputs to a dataframe
    step_9 = meta_pipeline.PrimitiveStep(
        primitive_description=NDArrayToDataFramePrimitive.metadata.query())
    step_9.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.8.produce')
    step_9.add_output('produce')
    pipeline.add_step(step_9)

    # Step 10: generate a properly-formatted output dataframe from the
    # prediction outputs using the profiled input dataframe as a reference
    step_10 = meta_pipeline.PrimitiveStep(
        primitive_description=ConstructPredictionsPrimitive.metadata.query())
    step_10.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.9.produce'  # the prediction column
    )
    step_10.add_argument(
        name='reference',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.1.produce'  # the profiled dataframe of the input dataset
    )
    step_10.add_output('produce')
    pipeline.add_step(step_10)

    # Adding output step to the pipeline
    pipeline.add_output(name='output', data_reference='steps.10.produce')
    return pipeline
def generate_imputer_pipeline(task_type, random_id=False):
    """Build a fixed pipeline exercising the BYU RandomSamplingImputer.

    Flow: dataset -> dataframe -> profiled -> parsed -> (attributes, targets)
    -> random-sampling imputation -> random-forest learner -> predictions.

    Parameters
    ----------
    task_type:
        Either 'classification' or 'regression'; selects the final
        random-forest learner and (unless `random_id`) a fixed pipeline id.
    random_id:
        When True, use a fresh uuid4 for the pipeline id instead of the
        fixed per-task id.

    Raises
    ------
    ValueError
        If `task_type` is neither 'classification' nor 'regression'.
    """
    if random_id:
        pipeline_id = str(uuid.uuid4())
    elif task_type == 'classification':
        pipeline_id = '168d3fbf-a3fe-456a-93a3-d2720ef8cb42'
    elif task_type == 'regression':
        pipeline_id = 'faeb3eb9-648f-4059-b067-791ebff47bc4'
    else:
        raise ValueError('Invalid task_type: {}'.format(task_type))

    # Make sure the imputer primitive is resolvable through the d3m index.
    d3m_index.register_primitive(
        RandomSamplingImputer.metadata.query()['python_path'],
        RandomSamplingImputer
    )

    pipeline = pipeline_module.Pipeline(pipeline_id)
    pipeline.add_input(name='inputs')
    # step_counter tracks the index of the step about to be added; the
    # `steps.N.produce` references below are derived from it.
    step_counter = 0

    # Dataset -> DataFrame.
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.dataset_to_dataframe.Common'
        )
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference='inputs.0'
    )
    step.add_output('produce')
    pipeline.add_step(step)
    raw_data_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1

    # Infer column semantic types.
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.schema_discovery.profiler.Common'
        )
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    profiled_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1

    # Parse string columns into typed values.
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.column_parser.Common'
        )
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=profiled_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    parsed_data_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1

    # Select attribute columns.
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
        )
    )
    step.add_hyperparameter(
        name='semantic_types',
        argument_type=metadata_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute']
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=parsed_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    raw_attributes_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1

    # Select the true-target column(s).
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
        )
    )
    step.add_hyperparameter(
        name='semantic_types',
        argument_type=metadata_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=parsed_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    true_targets_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1

    # Impute missing attribute values by random sampling — the primitive
    # this pipeline exists to exercise.
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_preprocessing.random_sampling_imputer.BYU'
        )
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_attributes_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    imputed_attributes_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1

    if task_type == 'regression':
        # Random-forest regressor over the imputed attributes.
        step = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                'd3m.primitives.regression.random_forest.SKlearn'
            )
        )
        step.add_hyperparameter(
            name='use_semantic_types',
            argument_type=metadata_base.ArgumentType.VALUE,
            data=True
        )
        step.add_argument(
            name='inputs',
            argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=imputed_attributes_data_reference
        )
        step.add_argument(
            name='outputs',
            argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=true_targets_data_reference
        )
        step.add_output('produce')
        pipeline.add_step(step)
        step_counter += 1
    elif task_type == 'classification':
        # Random-forest classifier over the imputed attributes.
        step = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                'd3m.primitives.classification.random_forest.SKlearn'
            )
        )
        step.add_hyperparameter(
            name='use_semantic_types',
            argument_type=metadata_base.ArgumentType.VALUE,
            data=True
        )
        step.add_argument(
            name='inputs',
            argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=imputed_attributes_data_reference
        )
        step.add_argument(
            name='outputs',
            argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=true_targets_data_reference
        )
        step.add_output('produce')
        pipeline.add_step(step)
        step_counter += 1
    else:
        raise ValueError('Invalid task_type: {}'.format(task_type))

    # Format the learner's predictions against the original dataframe.
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.construct_predictions.Common'
        )
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference='steps.{}.produce'.format(step_counter - 1)
    )
    step.add_argument(
        name='reference',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    step_counter += 1

    pipeline.add_output(
        name='predictions',
        data_reference='steps.{}.produce'.format(step_counter - 1)
    )
    return pipeline
def generate_profiler_pipeline(task_type, random_id=False):
    """Build a fixed pipeline exercising the BYU SemanticProfilerPrimitive.

    Flow: dataset -> dataframe -> BYU profiler -> parsed -> (attributes,
    targets) -> SKlearn imputation -> random-forest learner -> predictions.

    Parameters
    ----------
    task_type:
        Either 'classification' or 'regression'; selects the final
        random-forest learner and (unless `random_id`) a fixed pipeline id.
    random_id:
        When True, use a fresh uuid4 for the pipeline id instead of the
        fixed per-task id.

    Raises
    ------
    ValueError
        If `task_type` is neither 'classification' nor 'regression'.
    """
    if random_id:
        pipeline_id = str(uuid.uuid4())
    elif task_type == 'classification':
        pipeline_id = 'f4ebb9c9-ef15-491d-9a39-595c20f3e78e'
    elif task_type == 'regression':
        pipeline_id = '9f5f6042-6582-494a-bc4b-92c7797a6614'
    else:
        raise ValueError('Invalid task_type: {}'.format(task_type))

    # Make sure the profiler primitive is resolvable through the d3m index.
    d3m_index.register_primitive(
        SemanticProfilerPrimitive.metadata.query()['python_path'],
        SemanticProfilerPrimitive
    )

    pipeline = pipeline_module.Pipeline(pipeline_id)
    pipeline.add_input(name='inputs')
    # step_counter tracks the index of the step about to be added; the
    # `steps.N.produce` references below are derived from it.
    step_counter = 0

    # Dataset -> DataFrame.
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.dataset_to_dataframe.Common'
        )
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference='inputs.0'
    )
    step.add_output('produce')
    pipeline.add_step(step)
    raw_data_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1

    # Infer column semantic types with the BYU profiler — the primitive
    # this pipeline exists to exercise.
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.schema_discovery.profiler.BYU'
        )
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    profiled_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1

    # Parse string columns into typed values.
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.column_parser.Common'
        )
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=profiled_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    parsed_data_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1

    # Select attribute columns.
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
        )
    )
    step.add_hyperparameter(
        name='semantic_types',
        argument_type=metadata_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute']
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=parsed_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    raw_attributes_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1

    # Select the true-target column(s).
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
        )
    )
    step.add_hyperparameter(
        name='semantic_types',
        argument_type=metadata_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=parsed_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    true_targets_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1

    # Impute missing attribute values with the SKlearn imputer.
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_cleaning.imputer.SKlearn'
        )
    )
    step.add_hyperparameter(
        name='use_semantic_types',
        argument_type=metadata_base.ArgumentType.VALUE,
        data=True
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_attributes_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    imputed_attributes_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1

    if task_type == 'regression':
        # Random-forest regressor over the imputed attributes.
        step = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                'd3m.primitives.regression.random_forest.SKlearn'
            )
        )
        step.add_hyperparameter(
            name='use_semantic_types',
            argument_type=metadata_base.ArgumentType.VALUE,
            data=True
        )
        step.add_argument(
            name='inputs',
            argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=imputed_attributes_data_reference
        )
        step.add_argument(
            name='outputs',
            argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=true_targets_data_reference
        )
        step.add_output('produce')
        pipeline.add_step(step)
        step_counter += 1
    elif task_type == 'classification':
        # Random-forest classifier over the imputed attributes.
        step = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                'd3m.primitives.classification.random_forest.SKlearn'
            )
        )
        step.add_hyperparameter(
            name='use_semantic_types',
            argument_type=metadata_base.ArgumentType.VALUE,
            data=True
        )
        step.add_argument(
            name='inputs',
            argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=imputed_attributes_data_reference
        )
        step.add_argument(
            name='outputs',
            argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=true_targets_data_reference
        )
        step.add_output('produce')
        pipeline.add_step(step)
        step_counter += 1
    else:
        raise ValueError('Invalid task_type: {}'.format(task_type))

    # Format the learner's predictions against the original dataframe.
    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.construct_predictions.Common'
        )
    )
    step.add_argument(
        name='inputs',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference='steps.{}.produce'.format(step_counter - 1)
    )
    step.add_argument(
        name='reference',
        argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    step_counter += 1

    pipeline.add_output(
        name='predictions',
        data_reference='steps.{}.produce'.format(step_counter - 1)
    )
    return pipeline
def _gen_pipeline():
    """Build a regression pipeline: GRASTA low-rank features -> linear SVR.

    Flow: dataset -> dataframe -> profiled -> parsed -> (attributes, targets)
    -> imputed -> ndarray -> GRASTA -> dataframe -> SKLinearSVR -> formatted
    predictions dataframe.
    """
    #pipeline context is just metadata, ignore for now
    pipeline = meta_pipeline.Pipeline()
    #define inputs. This will be read in automatically as a Dataset object.
    pipeline.add_input(name='inputs')

    # Step 0: DatasetToDataFrame
    step_0 = meta_pipeline.PrimitiveStep(
        primitive_description=DatasetToDataFramePrimitive.metadata.query())
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline.add_step(step_0)

    # Step 1: Profiler to infer semantic types
    step_1 = meta_pipeline.PrimitiveStep(
        primitive_description=SimpleProfilerPrimitive.metadata.query())
    step_1.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.0.produce')
    step_1.add_output('produce')
    pipeline.add_step(step_1)

    # Step 2: ColumnParser
    step_2 = meta_pipeline.PrimitiveStep(
        primitive_description=ColumnParserPrimitive.metadata.query())
    step_2.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_output('produce')
    pipeline.add_step(step_2)

    # Step 3: Extract Attributes
    step_3 = meta_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.query())
    step_3.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_3.add_output('produce')
    step_3.add_hyperparameter(
        name='semantic_types',
        argument_type=ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
    # NOTE(review): hard-coded column indices — presumably dataset-specific
    # (id/non-numeric columns); confirm against the target dataset's schema.
    step_3.add_hyperparameter(name='exclude_columns',
                              argument_type=ArgumentType.VALUE,
                              data=[0, 1, 6, 7])
    pipeline.add_step(step_3)

    # Step 4: Impute missing data and nans
    step_4 = meta_pipeline.PrimitiveStep(
        primitive_description=SKImputer.metadata.query())
    step_4.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.3.produce')
    step_4.add_output('produce')
    step_4.add_hyperparameter(name='use_semantic_types',
                              argument_type=ArgumentType.VALUE,
                              data=True)
    step_4.add_hyperparameter(name='return_result',
                              argument_type=ArgumentType.VALUE,
                              data='replace')
    pipeline.add_step(step_4)

    # Step 5: Extract Targets (from the parsed dataframe, step 2)
    step_5 = meta_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.query())
    step_5.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_5.add_output('produce')
    step_5.add_hyperparameter(
        name='semantic_types',
        argument_type=ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
    pipeline.add_step(step_5)

    # Step 6: Transform attributes dataframe into an ndarray
    step_6 = meta_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    step_6.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.4.produce')
    step_6.add_output('produce')
    pipeline.add_step(step_6)

    # Step 7: Run GRASTA (low-rank subspace tracking on the attribute matrix)
    step_7 = meta_pipeline.PrimitiveStep(
        primitive_description=GRASTA.metadata.query())
    step_7.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.6.produce')
    step_7.add_output('produce')
    pipeline.add_step(step_7)

    # Step 8: Convert numpy-formatted attribute data to a dataframe
    step_8 = meta_pipeline.PrimitiveStep(
        primitive_description=NDArrayToDataFramePrimitive.metadata.query())
    step_8.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.7.produce')
    step_8.add_output('produce')
    pipeline.add_step(step_8)

    # Step 9: Linear SVR on low-rank data (inputs and outputs for sklearns
    # are both dataframes)
    step_9 = meta_pipeline.PrimitiveStep(
        primitive_description=SKLinearSVR.metadata.query())
    step_9.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.8.produce')
    step_9.add_argument(name='outputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.5.produce')
    step_9.add_output('produce')
    pipeline.add_step(step_9)

    # Step 10: Finally generate a properly-formatted output dataframe from
    # the prediction outputs using the input dataframe as a reference
    step_10 = meta_pipeline.PrimitiveStep(
        primitive_description=ConstructPredictionsPrimitive.metadata.query())
    step_10.add_argument(name='inputs',
                         argument_type=ArgumentType.CONTAINER,
                         data_reference='steps.9.produce')
    step_10.add_argument(name='reference',
                         argument_type=ArgumentType.CONTAINER,
                         data_reference='steps.0.produce')
    step_10.add_output('produce')
    pipeline.add_step(step_10)

    # Adding output step to the pipeline
    pipeline.add_output(name='output', data_reference='steps.10.produce')
    return pipeline
def generate_metafeature_pipeline(task_type, random_id=False):
    """Build a D3M pipeline that extracts metafeatures and fits a random forest.

    The pipeline is: dataset_to_dataframe -> profiler -> column_parser ->
    metafeature_extractor (side-effect only; its output is not consumed) ->
    extract attributes / extract true targets -> SKlearn imputer ->
    SKlearn random-forest (classifier or regressor) -> construct_predictions.

    Parameters
    ----------
    task_type : str
        Either 'classification' or 'regression'; selects both the fixed
        pipeline id and the final estimator primitive.
    random_id : bool
        When True, use a freshly generated UUID4 as the pipeline id instead
        of the fixed per-task id.

    Returns
    -------
    pipeline_module.Pipeline

    Raises
    ------
    ValueError
        If ``task_type`` is neither 'classification' nor 'regression'.
        (With ``random_id=True`` an invalid task_type is only detected at
        estimator-selection time, matching the original control flow.)
    """
    if random_id:
        pipeline_id = str(uuid.uuid4())
    elif task_type == 'classification':
        pipeline_id = 'baa68a80-3a7d-472d-8d4f-54918cc1bd8f'
    elif task_type == 'regression':
        pipeline_id = '28e413f9-6085-4e34-b2c2-a5182a322a4b'
    else:
        raise ValueError('Invalid task_type: {}'.format(task_type))

    # Make sure the BYU metafeature extractor is resolvable by python path
    # before any get_primitive lookups happen.
    d3m_index.register_primitive(
        MetafeatureExtractor.metadata.query()['python_path'],
        MetafeatureExtractor
    )

    pipeline = pipeline_module.Pipeline(pipeline_id)
    pipeline.add_input(name='inputs')
    step_counter = 0

    def add_step(primitive_path, arguments, hyperparameters=()):
        """Append one primitive step and return its 'produce' data reference.

        ``arguments`` and ``hyperparameters`` are sequences of (name, value)
        pairs; hyperparameters are applied before arguments, matching the
        original per-step ordering.
        """
        nonlocal step_counter
        step = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(primitive_path)
        )
        for name, data in hyperparameters:
            step.add_hyperparameter(
                name=name,
                argument_type=metadata_base.ArgumentType.VALUE,
                data=data
            )
        for name, data_reference in arguments:
            step.add_argument(
                name=name,
                argument_type=metadata_base.ArgumentType.CONTAINER,
                data_reference=data_reference
            )
        step.add_output('produce')
        pipeline.add_step(step)
        data_reference = 'steps.{}.produce'.format(step_counter)
        step_counter += 1
        return data_reference

    raw_data_data_reference = add_step(
        'd3m.primitives.data_transformation.dataset_to_dataframe.Common',
        arguments=[('inputs', 'inputs.0')]
    )
    profiled_data_reference = add_step(
        'd3m.primitives.schema_discovery.profiler.Common',
        arguments=[('inputs', raw_data_data_reference)]
    )
    parsed_data_data_reference = add_step(
        'd3m.primitives.data_transformation.column_parser.Common',
        arguments=[('inputs', profiled_data_reference)]
    )
    # Metafeatures are produced as a side effect of running this step; its
    # 'produce' output is intentionally not consumed by any later step.
    add_step(
        'd3m.primitives.metalearning.metafeature_extractor.BYU',
        arguments=[('inputs', parsed_data_data_reference)]
    )
    raw_attributes_data_reference = add_step(
        'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common',
        arguments=[('inputs', parsed_data_data_reference)],
        hyperparameters=[
            ('semantic_types',
             ['https://metadata.datadrivendiscovery.org/types/Attribute'])
        ]
    )
    true_targets_data_reference = add_step(
        'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common',
        arguments=[('inputs', parsed_data_data_reference)],
        hyperparameters=[
            ('semantic_types',
             ['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        ]
    )
    imputed_attributes_data_reference = add_step(
        'd3m.primitives.data_cleaning.imputer.SKlearn',
        arguments=[('inputs', raw_attributes_data_reference)],
        hyperparameters=[('use_semantic_types', True)]
    )

    # Pick the final estimator; both variants share identical wiring.
    if task_type == 'regression':
        estimator_path = 'd3m.primitives.regression.random_forest.SKlearn'
    elif task_type == 'classification':
        estimator_path = 'd3m.primitives.classification.random_forest.SKlearn'
    else:
        raise ValueError('Invalid task_type: {}'.format(task_type))
    predictions_data_reference = add_step(
        estimator_path,
        arguments=[
            ('inputs', imputed_attributes_data_reference),
            ('outputs', true_targets_data_reference),
        ],
        hyperparameters=[('use_semantic_types', True)]
    )

    # Format the raw estimator output against the original dataframe.
    output_data_reference = add_step(
        'd3m.primitives.data_transformation.construct_predictions.Common',
        arguments=[
            ('inputs', predictions_data_reference),
            ('reference', raw_data_data_reference),
        ]
    )
    pipeline.add_output(
        name='predictions',
        data_reference=output_data_reference
    )
    return pipeline
big_pipeline.add_step(step_0) step_0_output = step_0.add_output('output') step_1 = pipeline_module.FittedPipelineStep(fitted1.id, fitted1) step_1.add_input(pipeline_input) big_pipeline.add_step(step_1) step_1_output = step_1.add_output('output') step_2 = pipeline_module.FittedPipelineStep(fitted2.id, fitted2) step_2.add_input(pipeline_input) big_pipeline.add_step(step_2) step_2_output = step_2.add_output('output') concat_step = pipeline_module.PrimitiveStep({ "python_path": "d3m.primitives.dsbox.HorizontalConcat", "id": "dsbox-horizontal-concat", "version": "1.3.0", "name": "DSBox horizontal concat" }) concat_step.add_argument(name='inputs1', argument_type=pipeline_module.ArgumentType.CONTAINER, data_reference=step_0_output) concat_step.add_argument(name='inputs2', argument_type=pipeline_module.ArgumentType.CONTAINER, data_reference=step_1_output) # concat_step.add_argument(name='inputs2', argument_type=pipeline_module.ArgumentType.CONTAINER, data_reference=step_2_output) big_pipeline.add_step(concat_step) concat_step_output = concat_step.add_output('produce') # concat_step = pipeline_module.PrimitiveStep({ # "python_path": "d3m.primitives.dsbox.VerticalConcat", # "id": "dsbox-vertical-concat",
def _gen_pipeline():
    """Construct the SSC-CVX clustering pipeline.

    Data flow: Dataset -> DataFrame -> profiled -> parsed columns ->
    attribute columns -> ndarray -> SSC_CVX clustering -> DataFrame ->
    formatted predictions.
    """
    # Pipeline context is just metadata; it is not set here.
    pipeline = meta_pipeline.Pipeline()
    # The single pipeline input is read in automatically as a Dataset object.
    pipeline.add_input(name='inputs')

    def attach(primitive, data_reference, hyperparameters=(), reference=None):
        """Create a step for ``primitive`` fed from ``data_reference``, apply
        any VALUE hyperparameters, optionally wire a 'reference' argument,
        register the step on the pipeline, and return it."""
        step = meta_pipeline.PrimitiveStep(
            primitive_description=primitive.metadata.query())
        step.add_argument(name='inputs',
                          argument_type=ArgumentType.CONTAINER,
                          data_reference=data_reference)
        if reference is not None:
            step.add_argument(name='reference',
                              argument_type=ArgumentType.CONTAINER,
                              data_reference=reference)
        for hp_name, hp_data in hyperparameters:
            step.add_hyperparameter(name=hp_name,
                                    argument_type=ArgumentType.VALUE,
                                    data=hp_data)
        step.add_output('produce')
        pipeline.add_step(step)
        return step

    # Step 0: Dataset -> DataFrame.
    attach(DatasetToDataFramePrimitive, 'inputs.0')
    # Step 1: profiler infers semantic types.
    attach(SimpleProfilerPrimitive, 'steps.0.produce')
    # Step 2: parse column values into typed data.
    attach(ColumnParserPrimitive, 'steps.1.produce')
    # Step 3: keep only the attribute columns.
    attach(ExtractColumnsBySemanticTypesPrimitive, 'steps.2.produce',
           hyperparameters=[
               ('semantic_types',
                ['https://metadata.datadrivendiscovery.org/types/Attribute']),
           ])
    # Step 4: attribute DataFrame -> ndarray.
    attach(DataFrameToNDArrayPrimitive, 'steps.3.produce')
    # Step 5: cluster the ndarray with SSC-CVX.
    attach(SSC_CVX, 'steps.4.produce',
           hyperparameters=[('n_clusters', 100)])
    # Step 6: cluster labels -> DataFrame.
    attach(NDArrayToDataFramePrimitive, 'steps.5.produce')
    # Step 7: combine the labels with the original dataframe.
    attach(ConstructPredictionsPrimitive, 'steps.6.produce',
           reference='steps.0.produce')

    # Final output.
    pipeline.add_output(name='output', data_reference='steps.7.produce')
    return pipeline
def preprocessing_pipeline():
    """Build the shared DSBox preprocessing pipeline.

    Feature branch: Denormalize -> DatasetToDataFrame -> extract
    PrimaryKey/Attribute columns -> Profiler -> CleaningFeaturizer ->
    CorexText -> Encoder -> MeanImputation -> IQRScaler.
    Target branch: extract Target/TrueTarget columns from the same dataframe.

    Returns:
        Tuple of (pipeline, scaled-features data reference, pipeline input
        reference, extracted-targets data reference) so callers can keep
        wiring further steps onto the same pipeline object.
    """
    # NOTE(review): this local deliberately shadows the function name; the
    # function is not recursive so this is harmless, but renaming it would
    # be a cleanup for a later pass.
    preprocessing_pipeline = pipeline_module.Pipeline(
        'big', context=pipeline_module.PipelineContext.TESTING)
    initial_input = preprocessing_pipeline.add_input(name="inputs")
    # Join multi-resource datasets into a single table.
    denormalize_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.Denormalize").metadata.query()))
    denormalize_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=initial_input)
    preprocessing_pipeline.add_step(denormalize_step)
    denormalize_step_output = denormalize_step.add_output('produce')
    # Dataset -> DataFrame.
    to_dataframe_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.datasets.DatasetToDataFrame").metadata.query())
    )
    to_dataframe_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=denormalize_step_output)
    preprocessing_pipeline.add_step(to_dataframe_step)
    to_dataframe_step_output = to_dataframe_step.add_output("produce")
    # Keep primary key + attribute columns for the feature branch.
    extract_attribute_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.data.ExtractColumnsBySemanticTypes").metadata.
            query()))
    extract_attribute_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=to_dataframe_step_output)
    preprocessing_pipeline.add_step(extract_attribute_step)
    extract_attribute_step_output = extract_attribute_step.add_output(
        "produce")
    # Hyperparameter is added after add_step/add_output; the step object is
    # mutated in place, so ordering here mirrors the rest of this module.
    extract_attribute_step.add_hyperparameter(
        name='semantic_types',
        argument_type=pipeline_module.ArgumentType.VALUE,
        data=(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
            'https://metadata.datadrivendiscovery.org/types/Attribute',
        ))
    # Profile the attribute columns.
    profiler_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.Profiler").metadata.query()))
    profiler_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=extract_attribute_step_output)
    preprocessing_pipeline.add_step(profiler_step)
    profiler_step_output = profiler_step.add_output("produce")
    # Clean the profiled features.
    clean_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.CleaningFeaturizer").metadata.query()))
    clean_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=profiler_step_output)
    preprocessing_pipeline.add_step(clean_step)
    clean_step_output = clean_step.add_output("produce")
    # Featurize text columns with CorexText.
    corex_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.CorexText").metadata.query()))
    corex_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=clean_step_output)
    preprocessing_pipeline.add_step(corex_step)
    corex_step_output = corex_step.add_output("produce")
    # Encode categorical features.
    encoder_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.Encoder").metadata.query()))
    encoder_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=corex_step_output)
    preprocessing_pipeline.add_step(encoder_step)
    encoder_step_output = encoder_step.add_output("produce")
    # Mean-impute missing values.
    impute_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.MeanImputation").metadata.query()))
    impute_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=encoder_step_output)
    preprocessing_pipeline.add_step(impute_step)
    impute_step_output = impute_step.add_output("produce")
    # Scale features with the IQR scaler.
    scalar_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.IQRScaler").metadata.query()))
    scalar_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=impute_step_output)
    preprocessing_pipeline.add_step(scalar_step)
    scalar_step_output = scalar_step.add_output("produce")
    # Target branch: extract the target columns from the raw dataframe
    # (note: fed from to_dataframe_step_output, not the feature branch).
    extract_target_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.data.ExtractColumnsBySemanticTypes").metadata.
            query()))
    extract_target_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=to_dataframe_step_output)
    preprocessing_pipeline.add_step(extract_target_step)
    extract_target_step_output = extract_target_step.add_output("produce")
    extract_target_step.add_hyperparameter(
        name='semantic_types',
        argument_type=pipeline_module.ArgumentType.VALUE,
        data=('https://metadata.datadrivendiscovery.org/types/Target',
              'https://metadata.datadrivendiscovery.org/types/TrueTarget'))
    # The scaled-feature output is returned to the caller rather than being
    # registered as a pipeline output here:
    # preprocessing_pipeline.add_output(name="produce", data_reference=scalar_step_output)
    return preprocessing_pipeline, scalar_step_output, initial_input, extract_target_step_output
def _gen_pipeline(self):
    """Build the L1LowRank + SKRidge regression pipeline.

    Data flow: Dataset -> DataFrame -> parsed columns -> {attributes,
    targets} -> attributes ndarray -> L1LowRank -> DataFrame -> SKRidge
    (against the target dataframe) -> construct_predictions -> output.
    """
    # Pipeline context is just metadata, ignore for now.
    pipeline = meta_pipeline.Pipeline()
    # Define inputs. This will be read in automatically as a Dataset object.
    pipeline.add_input(name='inputs')

    # Step 0: DatasetToDataFrame
    step_0 = meta_pipeline.PrimitiveStep(
        primitive_description=DatasetToDataFramePrimitive.metadata.query())
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline.add_step(step_0)

    # Step 1: ColumnParser
    step_1 = meta_pipeline.PrimitiveStep(
        primitive_description=ColumnParserPrimitive.metadata.query())
    step_1.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.0.produce')
    step_1.add_output('produce')
    pipeline.add_step(step_1)

    # Step 2: Extract Attributes
    step_2 = meta_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.
        metadata.query())
    step_2.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_output('produce')
    step_2.add_hyperparameter(
        name='semantic_types',
        argument_type=ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
    pipeline.add_step(step_2)

    # Step 3: Extract Targets (also fed from the parsed columns of step 1)
    step_3 = meta_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.
        metadata.query())
    step_3.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_3.add_output('produce')
    step_3.add_hyperparameter(
        name='semantic_types',
        argument_type=ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
    pipeline.add_step(step_3)

    # Step 4: transform the attributes dataframe into an ndarray.
    step_4 = meta_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    step_4.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.2.produce'  # attribute columns from step 2
    )
    step_4.add_output('produce')
    pipeline.add_step(step_4)

    # Step 5: run L1LowRank on the attribute ndarray.
    step_5 = meta_pipeline.PrimitiveStep(
        primitive_description=L1LowRank.metadata.query())
    step_5.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.4.produce'  # outputs from step 4
    )
    step_5.add_output('produce')
    pipeline.add_step(step_5)

    # Step 6: convert numpy-formatted attribute data back to a dataframe.
    step_6 = meta_pipeline.PrimitiveStep(
        primitive_description=NDArrayToDataFramePrimitive.metadata.query())
    step_6.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.5.produce'  # outputs from step 5
    )
    step_6.add_output('produce')
    pipeline.add_step(step_6)

    # Step 7: Ridge regression on the low-rank data (sklearn wrappers take
    # and return dataframes for both inputs and outputs).
    step_7 = meta_pipeline.PrimitiveStep(
        primitive_description=SKRidge.metadata.query())
    step_7.add_hyperparameter(name='max_iter',
                              argument_type=ArgumentType.VALUE,
                              data=10000)
    step_7.add_hyperparameter(name='tol',
                              argument_type=ArgumentType.VALUE,
                              data=0.01)
    step_7.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.6.produce')
    step_7.add_argument(name='outputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.3.produce')
    step_7.add_output('produce')
    pipeline.add_step(step_7)

    # Step 8: generate a properly-formatted output dataframe from the
    # prediction outputs using the input dataframe as a reference.
    step_8 = meta_pipeline.PrimitiveStep(
        primitive_description=ConstructPredictionsPrimitive.metadata.query(
        ))
    step_8.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.7.produce'  # the prediction column
    )
    step_8.add_argument(
        name='reference',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.0.produce'  # the dataframed input dataset
    )
    step_8.add_output('produce')
    pipeline.add_step(step_8)

    # Adding output step to the pipeline.
    pipeline.add_output(name='output', data_reference='steps.8.produce')
    return pipeline
def _gen_pipeline():
    """Build the OWL regression pipeline.

    Data flow: Dataset -> DataFrame -> profiled -> parsed columns ->
    {attributes, targets} -> imputed attributes -> ndarrays ->
    OWLRegression -> DataFrame -> construct_predictions -> output.
    """
    # Pipeline context is just metadata, ignore for now.
    pipeline = meta_pipeline.Pipeline()
    # Define inputs. This will be read in automatically as a Dataset object.
    pipeline.add_input(name='inputs')

    # Step 0: Dataset -> Dataframe
    step_0 = meta_pipeline.PrimitiveStep(
        primitive_description=DatasetToDataFramePrimitive.metadata.query())
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline.add_step(step_0)

    # Step 1: profiler to infer semantic types.
    step_1 = meta_pipeline.PrimitiveStep(
        primitive_description=SimpleProfilerPrimitive.metadata.query())
    step_1.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.0.produce')
    step_1.add_output('produce')
    pipeline.add_step(step_1)

    # Step 2: ColumnParser
    step_2 = meta_pipeline.PrimitiveStep(
        primitive_description=ColumnParserPrimitive.metadata.query())
    step_2.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_output('produce')
    pipeline.add_step(step_2)

    # Step 3: extract attribute columns.
    step_3 = meta_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.
        metadata.query())
    step_3.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_3.add_output('produce')
    step_3.add_hyperparameter(
        name='semantic_types',
        argument_type=ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
    pipeline.add_step(step_3)

    # Step 4: extract target columns.
    step_4 = meta_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.
        metadata.query())
    step_4.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_4.add_output('produce')
    step_4.add_hyperparameter(
        name='semantic_types',
        argument_type=ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
    pipeline.add_step(step_4)

    # Step 5: impute missing data and NaNs in the attribute columns,
    # replacing them in place ('replace') rather than appending columns.
    step_5 = meta_pipeline.PrimitiveStep(
        primitive_description=SKImputer.metadata.query())
    step_5.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.3.produce')
    step_5.add_output('produce')
    step_5.add_hyperparameter(name='use_semantic_types',
                              argument_type=ArgumentType.VALUE,
                              data=True)
    step_5.add_hyperparameter(name='return_result',
                              argument_type=ArgumentType.VALUE,
                              data='replace')
    pipeline.add_step(step_5)

    # Step 6: transform the (imputed) attributes dataframe into an ndarray.
    step_6 = meta_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    step_6.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.5.produce')
    step_6.add_output('produce')
    pipeline.add_step(step_6)

    # Step 7: transform the targets dataframe into an ndarray.
    step_7 = meta_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    step_7.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.4.produce')
    step_7.add_output('produce')
    pipeline.add_step(step_7)

    # Step 8: OWLRegression with hand-tuned optimization and ordered-weight
    # hyperparameters.
    step_8 = meta_pipeline.PrimitiveStep(
        primitive_description=OWLRegression.metadata.query())
    step_8.add_hyperparameter(name='normalize',
                              argument_type=ArgumentType.VALUE,
                              data=True)
    step_8.add_hyperparameter(name='learning_rate',
                              argument_type=ArgumentType.VALUE,
                              data=2e-1)
    step_8.add_hyperparameter(name='tol',
                              argument_type=ArgumentType.VALUE,
                              data=1e-3)
    step_8.add_hyperparameter(name='weight_max_val',
                              argument_type=ArgumentType.VALUE,
                              data=175)
    step_8.add_hyperparameter(name='weight_max_off',
                              argument_type=ArgumentType.VALUE,
                              data=0)
    step_8.add_hyperparameter(name='weight_min_val',
                              argument_type=ArgumentType.VALUE,
                              data=0)
    step_8.add_hyperparameter(name='weight_min_off',
                              argument_type=ArgumentType.VALUE,
                              data=13)
    step_8.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.6.produce')
    step_8.add_argument(name='outputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.7.produce')
    step_8.add_output('produce')
    pipeline.add_step(step_8)

    # Step 9: convert numpy-formatted prediction outputs to a dataframe.
    step_9 = meta_pipeline.PrimitiveStep(
        primitive_description=NDArrayToDataFramePrimitive.metadata.query())
    step_9.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.8.produce')
    step_9.add_output('produce')
    pipeline.add_step(step_9)

    # Step 10: generate a properly-formatted output dataframe from the
    # dataframed prediction outputs using the input dataframe as a reference.
    step_10 = meta_pipeline.PrimitiveStep(
        primitive_description=ConstructPredictionsPrimitive.metadata.query(
        ))
    step_10.add_argument(name='inputs',
                         argument_type=ArgumentType.CONTAINER,
                         data_reference='steps.9.produce')
    step_10.add_argument(name='reference',
                         argument_type=ArgumentType.CONTAINER,
                         data_reference='steps.0.produce')
    step_10.add_output('produce')
    pipeline.add_step(step_10)

    # Final Output
    pipeline.add_output(name='output', data_reference='steps.10.produce')
    return pipeline
def _gen_pipeline(self):
    """Build the RFM-preconditioned Gaussian kernel ridge regression pipeline.

    Data flow: Dataset -> DataFrame -> profiled -> parsed columns ->
    {attributes, targets} -> ndarrays -> Gaussian KRR -> predictions
    DataFrame -> formatted output.
    """
    pipeline = d3m_pipeline.Pipeline()
    # The single pipeline input is read in automatically as a Dataset object.
    pipeline.add_input(name='inputs')
    # NOTE: no Denormalize step here -- there is no entry point for it.

    container = d3m_base.ArgumentType.CONTAINER
    value = d3m_base.ArgumentType.VALUE

    # Step 0: Dataset -> DataFrame.
    to_dataframe = d3m_pipeline.PrimitiveStep(
        primitive_description=DatasetToDataFramePrimitive.metadata.query())
    to_dataframe.add_argument(name='inputs', argument_type=container,
                              data_reference='inputs.0')
    to_dataframe.add_output('produce')
    pipeline.add_step(to_dataframe)

    # Step 1: simple profiler annotates column roles.
    profiler = d3m_pipeline.PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.schema_discovery.profiler.Common"))
    profiler.add_argument(name='inputs', argument_type=container,
                          data_reference='steps.0.produce')
    profiler.add_output('produce')
    pipeline.add_step(profiler)

    # Step 2: parse column values into typed data.
    parser = d3m_pipeline.PrimitiveStep(
        primitive_description=ColumnParserPrimitive.metadata.query())
    parser.add_argument(name='inputs', argument_type=container,
                        data_reference='steps.1.produce')
    parser.add_output('produce')
    pipeline.add_step(parser)

    # Step 3: pull attribute columns into their own dataframe.
    extract_attributes = d3m_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.
        metadata.query())
    extract_attributes.add_argument(name='inputs', argument_type=container,
                                    data_reference='steps.2.produce')
    extract_attributes.add_output('produce')
    extract_attributes.add_hyperparameter(
        name='semantic_types', argument_type=value,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
    pipeline.add_step(extract_attributes)

    # Step 4: pull target columns into their own dataframe.
    extract_targets = d3m_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.
        metadata.query())
    extract_targets.add_argument(name='inputs', argument_type=container,
                                 data_reference='steps.2.produce')
    extract_targets.add_output('produce')
    extract_targets.add_hyperparameter(
        name='semantic_types', argument_type=value,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
    pipeline.add_step(extract_targets)

    # Step 5: targets dataframe -> ndarray.
    targets_to_ndarray = d3m_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    targets_to_ndarray.add_argument(name='inputs', argument_type=container,
                                    data_reference='steps.4.produce')
    targets_to_ndarray.add_output('produce')
    pipeline.add_step(targets_to_ndarray)

    # Step 6: attributes dataframe -> ndarray.
    attributes_to_ndarray = d3m_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    attributes_to_ndarray.add_argument(name='inputs', argument_type=container,
                                       data_reference='steps.3.produce')
    attributes_to_ndarray.add_output('produce')
    pipeline.add_step(attributes_to_ndarray)

    attributes = 'steps.6.produce'
    targets = 'steps.5.produce'

    # Step 7: RFM-preconditioned Gaussian kernel ridge regression.
    regressor = d3m_pipeline.PrimitiveStep(
        primitive_description=RFMPreconditionedGaussianKRR.metadata.query(
        ))
    regressor.add_argument(name='inputs', argument_type=container,
                           data_reference=attributes)
    regressor.add_argument(name='outputs', argument_type=container,
                           data_reference=targets)
    regressor.add_output('produce')
    pipeline.add_step(regressor)

    # Step 8: numpy-formatted predictions -> dataframe.
    predictions_to_dataframe = d3m_pipeline.PrimitiveStep(
        primitive_description=NDArrayToDataFramePrimitive.metadata.query())
    predictions_to_dataframe.add_argument(name='inputs',
                                          argument_type=container,
                                          data_reference='steps.7.produce')
    predictions_to_dataframe.add_output('produce')
    pipeline.add_step(predictions_to_dataframe)

    # Step 9: format the predictions against the profiled input dataframe.
    construct_predictions = d3m_pipeline.PrimitiveStep(
        primitive_description=ConstructPredictionsPrimitive.metadata.query(
        ))
    construct_predictions.add_argument(name='inputs', argument_type=container,
                                       data_reference='steps.8.produce')
    construct_predictions.add_argument(name='reference',
                                       argument_type=container,
                                       data_reference='steps.1.produce')
    construct_predictions.add_output('produce')
    pipeline.add_step(construct_predictions)

    # Final Output
    pipeline.add_output(name='output', data_reference='steps.9.produce')
    return pipeline
def generate_ensemble_pipeline(self):
    """Assemble an ensemble pipeline from the fitted candidate pipelines.

    Loads each fitted pipeline in ``self.pids`` as a step fed from the shared
    preprocessing pipeline's input, horizontally concatenates all of their
    outputs pairwise, encodes the concatenated predictions, joins them with
    the preprocessed features, and fits ``self.final_step_primitive`` on the
    result. The assembled pipeline is stored on ``self.big_pipeline``; this
    method returns None.

    Raises:
        ValueError: if ``self.pids`` is empty or has only one entry (an
            ensemble needs at least two candidates).
    """
    if not self.pids:
        raise ValueError(
            "No candidate pipeline ids found, unable to generate the ensemble pipeline."
        )
    elif len(self.pids) == 1:
        raise ValueError(
            "Only 1 candidate pipeline id found, unable to generate the ensemble pipeline."
        )
    step_outputs = []
    # The preprocessing pipeline supplies the base pipeline plus references
    # to its feature output, its input, and the extracted targets.
    self.big_pipeline, pipeline_output, pipeline_input, target = self.preprocessing_pipeline(
    )
    # Wrap every fitted candidate pipeline as a step on the big pipeline,
    # all fed from the same pipeline input.
    for each_pid in self.pids:
        each_dsbox_fitted = FittedPipeline.load(self.pipeline_files_dir,
                                                each_pid)
        each_runtime = each_dsbox_fitted.runtime
        each_fitted = runtime_module.FittedPipeline(
            each_pid,
            each_runtime,
            context=pipeline_module.PipelineContext.TESTING)
        each_step = pipeline_module.FittedPipelineStep(
            each_fitted.id, each_fitted)
        each_step.add_input(pipeline_input)
        self.big_pipeline.add_step(each_step)
        step_outputs.append(each_step.add_output('output'))
    # Template step for the pairwise concatenation; each iteration below
    # deep-copies it so every concat step gets its own argument wiring.
    concat_step = pipeline_module.PrimitiveStep({
        "python_path":
        "d3m.primitives.data_preprocessing.horizontal_concat.DSBOX",
        "id": "dsbox-horizontal-concat",
        "version": "1.3.0",
        "name": "DSBox horizontal concat"
    })
    # Fold the candidate outputs left-to-right: the first concat joins
    # outputs 0 and 1; each later concat joins the running result with the
    # next candidate output. len(self.pids) >= 2 is guaranteed above, so
    # concat_step_output is always bound after the loop.
    for i in range(len(self.pids) - 1):
        each_concact_step = copy.deepcopy(concat_step)
        if i == 0:
            each_concact_step.add_argument(
                name='inputs1',
                argument_type=pipeline_module.ArgumentType.CONTAINER,
                data_reference=step_outputs[i])
        else:
            each_concact_step.add_argument(
                name='inputs1',
                argument_type=pipeline_module.ArgumentType.CONTAINER,
                data_reference=concat_step_output)
        each_concact_step.add_argument(
            name='inputs2',
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=step_outputs[i + 1])
        # presumably column_name=i keeps the concatenated prediction columns
        # distinct per candidate -- TODO confirm against the primitive.
        each_concact_step.add_hyperparameter(
            name="column_name",
            argument_type=pipeline_module.ArgumentType.VALUE,
            data=i)
        self.big_pipeline.add_step(each_concact_step)
        # update concat_step_output
        concat_step_output = each_concact_step.add_output('produce')
    # Encode the concatenated candidate predictions.
    encode_res_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.data_preprocessing.encoder.DSBOX").
            metadata.query()))
    encode_res_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=concat_step_output)
    self.big_pipeline.add_step(encode_res_step)
    encode_res_step_output = encode_res_step.add_output("produce")
    # Join the encoded predictions with the preprocessed feature columns.
    concat_step1 = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.data.HorizontalConcat").metadata.query()))
    concat_step1.add_argument(
        name="left",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=encode_res_step_output)
    concat_step1.add_argument(
        name="right",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=pipeline_output)
    concat_step1.add_hyperparameter(
        name="use_index",
        argument_type=pipeline_module.ArgumentType.VALUE,
        data=False)
    self.big_pipeline.add_step(concat_step1)
    concat_output1 = concat_step1.add_output("produce")
    # Final model: fit self.final_step_primitive on the joined features
    # against the targets extracted by the preprocessing pipeline.
    model_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                self.final_step_primitive).metadata.query()))
    model_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=concat_output1)
    model_step.add_argument(
        name="outputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=target)
    self.big_pipeline.add_step(model_step)
    big_output = model_step.add_output("produce")
    # NOTE(review): final_output is assigned but never used; the pipeline
    # output registration is the side effect that matters here.
    final_output = self.big_pipeline.add_output(name="final",
                                                data_reference=big_output)
    self._logger.info("Ensemble pipeline created successfully")
def _gen_pipeline(self):
    """
    Build the fixed regression pipeline:
    dataset -> dataframe -> profile -> parse -> attribute/target split ->
    RandomizedPolyPCA on the attributes -> gradient-boosting regression ->
    formatted predictions.

    Returns:
        meta_pipeline.Pipeline: the assembled pipeline object.
    """
    # Pipeline context is just metadata, ignore for now.
    pipeline = meta_pipeline.Pipeline()
    # Define inputs. This will be read in automatically as a Dataset object.
    pipeline.add_input(name='inputs')
    # Step 0: DatasetToDataFrame
    step_0 = meta_pipeline.PrimitiveStep(
        primitive_description=DatasetToDataFramePrimitive.metadata.query())
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline.add_step(step_0)
    # Step 1: Simple Profiler Column Role Annotation
    step_1 = meta_pipeline.PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.schema_discovery.profiler.Common"))
    step_1.add_argument(
        name="inputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.0.produce",
    )
    step_1.add_output("produce")
    pipeline.add_step(step_1)
    # Step 2: ColumnParser
    step_2 = meta_pipeline.PrimitiveStep(
        primitive_description=ColumnParserPrimitive.metadata.query())
    step_2.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_output('produce')
    pipeline.add_step(step_2)
    # Step 3: Extract Attributes
    step_3 = meta_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.
        metadata.query())
    step_3.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_3.add_output('produce')
    step_3.add_hyperparameter(
        name='semantic_types',
        argument_type=ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
    pipeline.add_step(step_3)
    # Step 4: Extract Targets
    step_4 = meta_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.
        metadata.query())
    step_4.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_4.add_output('produce')
    step_4.add_hyperparameter(
        name='semantic_types',
        argument_type=ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
    pipeline.add_step(step_4)
    # Step 5: transform the attributes dataframe into an ndarray
    step_5 = meta_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    step_5.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.3.produce'  # the extracted attributes (step 3)
    )
    step_5.add_output('produce')
    pipeline.add_step(step_5)
    # Step 6: run RandomizedPolyPCA on the attribute ndarray
    step_6 = meta_pipeline.PrimitiveStep(
        primitive_description=RandomizedPolyPCA.metadata.query())
    step_6.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.5.produce'  # the ndarray attributes (step 5)
    )
    step_6.add_hyperparameter(name='n_components',
                              argument_type=ArgumentType.VALUE,
                              data=15)
    step_6.add_hyperparameter(name='degree',
                              argument_type=ArgumentType.VALUE,
                              data=2)
    step_6.add_output('produce')
    pipeline.add_step(step_6)
    # Step 7: convert numpy-formatted attribute data to a dataframe
    step_7 = meta_pipeline.PrimitiveStep(
        primitive_description=NDArrayToDataFramePrimitive.metadata.query())
    step_7.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.6.produce'  # the PCA features (step 6)
    )
    step_7.add_output('produce')
    pipeline.add_step(step_7)
    # Step 8: gradient-boosting regression on the low-rank data
    # (inputs and outputs for sklearn primitives are both dataframes)
    step_8 = meta_pipeline.PrimitiveStep(
        primitive_description=d3m.primitives.regression.gradient_boosting.
        SKlearn.metadata.query())
    step_8.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.7.produce')
    step_8.add_argument(name='outputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.4.produce')
    # Many shallow trees with a small learning rate.
    step_8.add_hyperparameter(name='n_estimators',
                              argument_type=ArgumentType.VALUE,
                              data=50000)
    step_8.add_hyperparameter(name='learning_rate',
                              argument_type=ArgumentType.VALUE,
                              data=0.002)
    step_8.add_hyperparameter(name='max_depth',
                              argument_type=ArgumentType.VALUE,
                              data=2)
    step_8.add_output('produce')
    pipeline.add_step(step_8)
    # Step 9: generate a properly-formatted output dataframe from the
    # prediction outputs using the input dataframe as a reference
    step_9 = meta_pipeline.PrimitiveStep(
        primitive_description=ConstructPredictionsPrimitive.metadata.query(
        ))
    step_9.add_argument(
        name='inputs',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.8.produce'  # the prediction column (step 8)
    )
    step_9.add_argument(
        name='reference',
        argument_type=ArgumentType.CONTAINER,
        data_reference='steps.1.produce'  # the dataframed input dataset
    )
    step_9.add_output('produce')
    pipeline.add_step(step_9)
    # Adding output step to the pipeline
    pipeline.add_output(name='output', data_reference='steps.9.produce')
    return pipeline
def preprocessing_pipeline(self):
    """
    Build the shared preprocessing sub-pipeline used by the ensemble.

    Chain: denormalize -> dataset-to-dataframe -> extract primary key +
    attribute columns -> profile -> clean -> Corex text featurization ->
    encode -> mean-impute -> IQR-scale.  The target columns are extracted
    separately from the same dataframe so the caller can wire them into a
    model step.

    Returns:
        tuple: ``(pipeline, scaled_attributes_ref, input_ref, target_ref)``
        where the last three are data references into ``pipeline``.
    """
    preprocessing_pipeline = pipeline_module.Pipeline(
        'big', context=pipeline_module.PipelineContext.TESTING)
    initial_input = preprocessing_pipeline.add_input(name="inputs")
    # Join multiple tabular resources of the dataset into one.
    denormalize_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.normalization.denormalize.DSBOX").metadata.
            query()))
    denormalize_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=initial_input)
    preprocessing_pipeline.add_step(denormalize_step)
    denormalize_step_output = denormalize_step.add_output('produce')
    # Convert the denormalized Dataset into a DataFrame.
    to_dataframe_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.data_transformation.dataset_to_dataframe.Common"
            ).metadata.query()))
    to_dataframe_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=denormalize_step_output)
    preprocessing_pipeline.add_step(to_dataframe_step)
    to_dataframe_step_output = to_dataframe_step.add_output("produce")
    # Keep only the primary key and attribute columns for featurization.
    extract_attribute_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.data_transformation.extract_columns_by_semantic_types.DataFrameCommon"
            ).metadata.query()))
    extract_attribute_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=to_dataframe_step_output)
    preprocessing_pipeline.add_step(extract_attribute_step)
    extract_attribute_step_output = extract_attribute_step.add_output(
        "produce")
    extract_attribute_step.add_hyperparameter(
        name='semantic_types',
        argument_type=pipeline_module.ArgumentType.VALUE,
        data=(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
            'https://metadata.datadrivendiscovery.org/types/Attribute',
        ))
    # Annotate column types/roles on the attribute dataframe.
    profiler_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.schema_discovery.profiler.DSBOX").metadata.
            query()))
    profiler_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=extract_attribute_step_output)
    preprocessing_pipeline.add_step(profiler_step)
    profiler_step_output = profiler_step.add_output("produce")
    # Generic cleaning/featurization of the profiled columns.
    clean_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.data_cleaning.cleaning_featurizer.DSBOX").
            metadata.query()))
    clean_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=profiler_step_output)
    preprocessing_pipeline.add_step(clean_step)
    clean_step_output = clean_step.add_output("produce")
    # Corex-based text featurization.
    corex_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.feature_construction.corex_text.CorexText"
            ).metadata.query()))
    corex_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=clean_step_output)
    preprocessing_pipeline.add_step(corex_step)
    corex_step_output = corex_step.add_output("produce")
    # Encode categorical columns.
    encoder_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.data_preprocessing.encoder.DSBOX").
            metadata.query()))
    encoder_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=corex_step_output)
    preprocessing_pipeline.add_step(encoder_step)
    encoder_step_output = encoder_step.add_output("produce")
    # Fill missing values with column means.
    impute_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.data_preprocessing.mean_imputation.DSBOX").
            metadata.query()))
    impute_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=encoder_step_output)
    preprocessing_pipeline.add_step(impute_step)
    impute_step_output = impute_step.add_output("produce")
    # IQR-based scaling of the imputed numeric columns.
    scalar_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.normalization.iqr_scaler.DSBOX").metadata.
            query()))
    scalar_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=impute_step_output)
    preprocessing_pipeline.add_step(scalar_step)
    scalar_step_output = scalar_step.add_output("produce")
    # Extract target columns from the original dataframe (not the attribute
    # branch) so the caller can feed them to a model step.
    extract_target_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.data_transformation.extract_columns_by_semantic_types.DataFrameCommon"
            ).metadata.query()))
    extract_target_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=to_dataframe_step_output)
    preprocessing_pipeline.add_step(extract_target_step)
    extract_target_step_output = extract_target_step.add_output("produce")
    extract_target_step.add_hyperparameter(
        name='semantic_types',
        argument_type=pipeline_module.ArgumentType.VALUE,
        data=('https://metadata.datadrivendiscovery.org/types/Target',
              'https://metadata.datadrivendiscovery.org/types/TrueTarget'))
    # No pipeline-level output is declared here; the caller wires the
    # returned data references into the enclosing ensemble pipeline.
    # preprocessing_pipeline.add_output(name="produce", data_reference=scalar_step_output)
    return preprocessing_pipeline, scalar_step_output, initial_input, extract_target_step_output
def _gen_pipeline(self):
    """
    Build the fixed regression pipeline:
    dataset -> dataframe -> profile -> parse -> impute -> attribute/target
    split -> binary encoding -> SparsePCA features concatenated back onto
    the encoded attributes -> gradient-boosting regression -> formatted
    predictions.

    Returns:
        d3m_pipeline.Pipeline: the assembled pipeline object.
    """
    pipeline = d3m_pipeline.Pipeline()
    # Define inputs. This will be read in automatically as a Dataset object.
    pipeline.add_input(name='inputs')
    # No Denormalize step (to join multiple tabular resources): there is no
    # entry point for Denormalize here.
    # Step 0: Dataset -> Dataframe
    step_0 = d3m_pipeline.PrimitiveStep(
        primitive_description=DatasetToDataFramePrimitive.metadata.query())
    step_0.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline.add_step(step_0)
    # Step 1: Simple Profiler Column Role Annotation
    step_1 = d3m_pipeline.PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.schema_discovery.profiler.Common"))
    step_1.add_argument(
        name="inputs",
        argument_type=d3m_base.ArgumentType.CONTAINER,
        data_reference="steps.0.produce",
    )
    step_1.add_output("produce")
    pipeline.add_step(step_1)
    # Step 2: ColumnParser
    step_2 = d3m_pipeline.PrimitiveStep(
        primitive_description=ColumnParserPrimitive.metadata.query())
    step_2.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_output('produce')
    pipeline.add_step(step_2)
    # Step 3: Imputer
    step_3 = d3m_pipeline.PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_cleaning.imputer.SKlearn'))
    step_3.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_3.add_hyperparameter(name='use_semantic_types',
                              argument_type=d3m_base.ArgumentType.VALUE,
                              data=True)
    step_3.add_output('produce')
    pipeline.add_step(step_3)
    # Step 4: Extract attributes from dataset into a dedicated dataframe
    step_4 = d3m_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.
        metadata.query())
    step_4.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.3.produce')
    step_4.add_output('produce')
    step_4.add_hyperparameter(
        name='semantic_types',
        argument_type=d3m_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
    pipeline.add_step(step_4)
    # Step 5: Binary encoding for categorical features
    step_5 = d3m_pipeline.PrimitiveStep(
        primitive_description=BinaryEncoderPrimitive.metadata.query())
    step_5.add_hyperparameter(name='min_binary',
                              argument_type=d3m_base.ArgumentType.VALUE,
                              data=2)
    step_5.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.4.produce')
    step_5.add_output('produce')
    pipeline.add_step(step_5)
    # Step 6: Extract Targets
    step_6 = d3m_pipeline.PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.
        metadata.query())
    step_6.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_6.add_hyperparameter(
        name='semantic_types',
        argument_type=d3m_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
    step_6.add_output('produce')
    pipeline.add_step(step_6)
    # Step 7: transform targets dataframe into an ndarray
    # NOTE(review): steps.7.produce is never consumed — the model step reads
    # targets from steps.6.produce directly. Kept to preserve step numbering;
    # confirm whether this step can be removed.
    step_7 = d3m_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    step_7.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.6.produce')
    step_7.add_output('produce')
    pipeline.add_step(step_7)
    # Step 8: transform features dataframe into an ndarray
    step_8 = d3m_pipeline.PrimitiveStep(
        primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
    step_8.add_argument(name='inputs',
                        argument_type=d3m_base.ArgumentType.CONTAINER,
                        data_reference='steps.5.produce')
    step_8.add_output('produce')
    pipeline.add_step(step_8)
    attributes = 'steps.8.produce'
    # Step 9: run SparsePCA on the attribute ndarray
    step_9 = d3m_pipeline.PrimitiveStep(
        primitive_description=SparsePCA.metadata.query())
    step_9.add_argument(
        name='inputs',
        argument_type=d3m_base.ArgumentType.CONTAINER,
        data_reference=attributes  # the ndarray attributes (step 8)
    )
    step_9.add_hyperparameter(name='n_components',
                              argument_type=d3m_base.ArgumentType.VALUE,
                              data=4)
    step_9.add_hyperparameter(name='beta',
                              argument_type=d3m_base.ArgumentType.VALUE,
                              data=1e-8)
    step_9.add_hyperparameter(name='alpha',
                              argument_type=d3m_base.ArgumentType.VALUE,
                              data=1e-3)
    step_9.add_hyperparameter(name='degree',
                              argument_type=d3m_base.ArgumentType.VALUE,
                              data=2)
    step_9.add_output('produce')
    pipeline.add_step(step_9)
    # Step 10: convert the numpy-formatted PCA features to a dataframe
    step_10 = d3m_pipeline.PrimitiveStep(
        primitive_description=NDArrayToDataFramePrimitive.metadata.query())
    step_10.add_argument(name='inputs',
                         argument_type=d3m_base.ArgumentType.CONTAINER,
                         data_reference='steps.9.produce')
    step_10.add_output('produce')
    pipeline.add_step(step_10)
    # Step 11: horizontally concatenate the PCA features with the encoded
    # attribute columns
    step_11 = d3m_pipeline.PrimitiveStep(
        primitive_description=HorizontalConcatPrimitive.metadata.query())
    step_11.add_argument(name='left',
                         argument_type=d3m_base.ArgumentType.CONTAINER,
                         data_reference='steps.10.produce')
    step_11.add_argument(name='right',
                         argument_type=d3m_base.ArgumentType.CONTAINER,
                         data_reference='steps.5.produce')
    step_11.add_output('produce')
    pipeline.add_step(step_11)
    # Step 12: gradient-boosting regression on the concatenated features
    # (inputs and outputs for sklearn primitives are both dataframes)
    step_12 = d3m_pipeline.PrimitiveStep(
        primitive_description=d3m.primitives.regression.gradient_boosting.
        SKlearn.metadata.query())
    step_12.add_argument(name='inputs',
                         argument_type=d3m_base.ArgumentType.CONTAINER,
                         data_reference='steps.11.produce')
    step_12.add_argument(name='outputs',
                         argument_type=d3m_base.ArgumentType.CONTAINER,
                         data_reference='steps.6.produce')
    # Many shallow trees with a small learning rate.
    step_12.add_hyperparameter(name='n_estimators',
                               argument_type=d3m_base.ArgumentType.VALUE,
                               data=10000)
    step_12.add_hyperparameter(name='learning_rate',
                               argument_type=d3m_base.ArgumentType.VALUE,
                               data=0.001)
    step_12.add_hyperparameter(name='max_depth',
                               argument_type=d3m_base.ArgumentType.VALUE,
                               data=2)
    step_12.add_output('produce')
    pipeline.add_step(step_12)
    # Step 13: generate a properly-formatted output dataframe from the
    # dataframed prediction outputs using the input dataframe as a reference
    step_13 = d3m_pipeline.PrimitiveStep(
        primitive_description=ConstructPredictionsPrimitive.metadata.query(
        ))
    step_13.add_argument(
        name='inputs',
        argument_type=d3m_base.ArgumentType.CONTAINER,
        data_reference='steps.12.produce'  # the prediction column (step 12)
    )
    step_13.add_argument(
        name='reference',
        argument_type=d3m_base.ArgumentType.CONTAINER,
        data_reference='steps.1.produce'  # the dataframed input dataset
    )
    step_13.add_output('produce')
    pipeline.add_step(step_13)
    # Final Output
    pipeline.add_output(name='output', data_reference='steps.13.produce')
    return pipeline
def generate_ensemble_pipeline(self):
    """
    Assemble the voting ensemble pipeline from the candidate fitted pipelines.

    Each candidate pipeline in ``self.pids`` becomes a step fed from a shared
    input; their prediction outputs are vertically concatenated pairwise and
    handed to the DSBox ensemble-voting primitive, whose output becomes the
    pipeline output.

    Side effects:
        Sets ``self.voting_pipeline`` to the assembled pipeline.

    Raises:
        ValueError: when fewer than two candidate pipeline ids are available.
    """
    if not self.pids:
        raise ValueError(
            "No candidate pipeline ids found, unable to generate the ensemble pipeline."
        )
    if len(self.pids) == 1:
        raise ValueError(
            "Only 1 candidate pipeline id found, unable to generate the ensemble pipeline."
        )
    self.voting_pipeline = pipeline_module.Pipeline(
        'voting', context=pipeline_module.PipelineContext.TESTING)
    shared_input = self.voting_pipeline.add_input(name='inputs')
    # One step per candidate pipeline; remember each step's output reference.
    prediction_refs = []
    for candidate_id in self.pids:
        dsbox_fitted = FittedPipeline.load(self.pipeline_files_dir,
                                           candidate_id)
        wrapped_fitted = runtime_module.FittedPipeline(
            candidate_id,
            dsbox_fitted.runtime,
            context=pipeline_module.PipelineContext.TESTING)
        candidate_step = pipeline_module.FittedPipelineStep(
            wrapped_fitted.id, wrapped_fitted)
        candidate_step.add_input(shared_input)
        self.voting_pipeline.add_step(candidate_step)
        prediction_refs.append(candidate_step.add_output('output'))
    # Template for the pairwise vertical-concatenation steps below.
    concat_template = pipeline_module.PrimitiveStep({
        "python_path":
        "d3m.primitives.data_preprocessing.vertical_concatenate.DSBOX",
        "id": "dsbox-vertical-concat",
        # "version": "1.3.0",
        "name": "DSBox vertically concat"
    })
    # Fold the N prediction outputs into a single one via N-1 chained concats;
    # the running reference starts at the first candidate's output.
    chained_output = prediction_refs[0]
    for merge_partner in prediction_refs[1:]:
        merge_step = copy.deepcopy(concat_template)
        merge_step.add_argument(
            name='inputs1',
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=chained_output)
        merge_step.add_argument(
            name='inputs2',
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=merge_partner)
        self.voting_pipeline.add_step(merge_step)
        chained_output = merge_step.add_output('produce')
    # Voting primitive consumes the fully-concatenated predictions.
    vote_step = pipeline_module.PrimitiveStep({
        "python_path":
        "d3m.primitives.data_preprocessing.ensemble_voting.DSBOX",
        "id": "dsbox-ensemble-voting",
        "version": "1.3.0",
        "name": "DSBox ensemble voting"
    })
    vote_step.add_argument(
        name='inputs',
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=chained_output)
    self.voting_pipeline.add_step(vote_step)
    self.voting_pipeline.add_output(
        name='Metafeatures', data_reference=vote_step.add_output('produce'))
    self._logger.info("Ensemble pipeline created successfully")