def load_pipeline(path, tunables=True, defaults=True):
    """Load a d3m json or yaml pipeline."""
    if not os.path.exists(path):
        # fall back to the templates directory bundled next to this module
        base_path = os.path.abspath(os.path.dirname(__file__))
        path = os.path.join('templates', path)
        path = os.path.join(base_path, path)
        if not os.path.isfile(path):
            raise ValueError('Could not find pipeline: {}'.format(path))

    LOGGER.warning('Loading pipeline from %s', path)
    with open(path) as pipeline:
        if path.endswith('yml'):
            data = yaml.safe_load(pipeline)
        else:
            data = json.load(pipeline)

    pipeline = Pipeline.from_json_structure(data)

    if tunables:
        # extract tunable hyperparameters
        tunable_hyperparameters = extract_tunable_hyperparams(pipeline)
        return pipeline, tunable_hyperparameters

    return pipeline
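# Usage sketch for load_pipeline above (the file names are hypothetical
# placeholders): relative names are resolved against the bundled templates/
# directory, and the return shape depends on `tunables`.
#
#     pipeline, tunable_hyperparameters = load_pipeline('random_forest.json')
#     pipeline = load_pipeline('random_forest.yml', tunables=False)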
def __setstate__(self, state: typing.Dict) -> None:
    """
    This method is used for unpickling the object. It takes a dictionary
    of the object's saved state and restores the object to that state.

    Args:
        state: typing.Dict
            Dictionary of the object's picklable state.
    """
    # print("[INFO] Set state called!")

    fitted = state['fitted_pipe']
    del state['fitted_pipe']

    structure = state['pipeline']
    state['pipeline'] = Pipeline.from_json_structure(structure)

    random_seed = state['random_seed']
    run = Runtime(state['pipeline'], fitted_pipeline_id=state['id'],
                  random_seed=random_seed,
                  volumes_dir=FittedPipeline.runtime_setting.volumes_dir,
                  log_dir=FittedPipeline.runtime_setting.log_dir)
    run.steps_state = fitted

    state['runtime'] = run
    self.__dict__ = state
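# Round-trip sketch for the unpickling support above, assuming the class also
# defines a matching __getstate__ that stores 'fitted_pipe', 'pipeline' (as a
# json structure), 'random_seed' and 'id' (`fitted` is a hypothetical
# FittedPipeline instance):
#
#     import pickle
#     blob = pickle.dumps(fitted)
#     restored = pickle.loads(blob)  # __setstate__ rebuilds the Runtime
#     assert restored.runtime is not None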
def load_pipeline(pipeline_file: typing.Union[str, typing.Dict]):
    """
    Load a pipeline from a pipeline URI.

    Parameters
    ----------
    pipeline_file: Union[str, dict]
        The URI pointing to a json file of the pipeline, or a dict that
        describes the pipeline.

    Returns
    -------
    pipeline: Pipeline
        A Pipeline object, or None if the dict could not be parsed.
    """
    if isinstance(pipeline_file, dict):
        try:
            with d3m_utils.silence():
                pipeline = Pipeline.from_json_structure(pipeline_file)
        except Exception:
            pipeline = None
    else:
        with d3m_utils.silence():
            pipeline = get_pipeline(pipeline_path=pipeline_file, load_all_primitives=False)

    return pipeline
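# Usage sketch: load_pipeline accepts either a path/URI or an already-parsed
# dict (the path is a hypothetical placeholder). A dict that fails to parse
# yields None rather than raising:
#
#     pipeline = load_pipeline('/tmp/pipeline.json')
#     with open('/tmp/pipeline.json') as fin:
#         pipeline = load_pipeline(json.load(fin))
#     if pipeline is None:
#         print('not a valid pipeline structure')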
def load_template():
    with open(join(os.path.dirname(__file__),
                   '../resource/pipelines/example_metalearningdb.json')) as fin:
        json_pipeline = json.load(fin)

    d3m_pipeline = Pipeline.from_json_structure(json_pipeline)
    grpc_pipeline = encode_pipeline_description(d3m_pipeline, ['RAW'], '/tmp')

    return grpc_pipeline
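# Usage sketch: the returned value is a pipeline description already encoded
# for the TA2-TA3 gRPC API (with 'RAW' value support and /tmp as scratch
# space), so a caller can attach it directly to an API request:
#
#     grpc_pipeline = load_template()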
def evaluate(pipeline, data_pipeline, dataset, metrics, problem, scoring_config,
             dataset_uri, timeout_run):
    if is_collection(dataset_uri[7:]):  # strip the 'file://' scheme prefix
        dataset = get_dataset_sample(dataset, problem)

    json_pipeline = convert.to_d3m_json(pipeline)

    if TaskKeyword.GRAPH in problem['problem']['task_keywords'] and \
            json_pipeline['description'].startswith('MtLDB'):
        return {0: {'ACCURACY': 1.0}, 1: {'ACCURACY': 1.0}}

    logger.info("Pipeline to be scored:\n\t%s",
                '\n\t'.join([x['primitive']['python_path'] for x in json_pipeline['steps']]))

    d3m_pipeline = Pipeline.from_json_structure(json_pipeline)

    if 'method' in scoring_config:
        scoring_config.pop('method')

    # Run the scoring in a separate process so it can be abandoned on timeout.
    # Note that `scoring_pipeline` is expected to be available at module scope.
    manager = Manager()
    return_dict = manager.dict()
    p = Process(target=worker, args=(d3m_pipeline, data_pipeline, scoring_pipeline, problem,
                                     dataset, scoring_config, metrics, return_dict))
    p.start()
    p.join(timeout_run)
    p.terminate()

    if 'run_results' not in return_dict or 'run_scores' not in return_dict:
        raise TimeoutError('Reached timeout (%d seconds) to score a pipeline' % timeout_run)

    run_results = return_dict['run_results']
    run_scores = return_dict['run_scores']

    for result in run_results:
        if result.has_error():
            raise RuntimeError(result.pipeline_run.status['message'])
    # save_pipeline_runs(run_results.pipeline_runs)

    combined_folds = d3m.runtime.combine_folds([fold for fold in run_scores])
    scores = {}
    for _, row in combined_folds.iterrows():
        if row['fold'] not in scores:
            scores[row['fold']] = {}
        scores[row['fold']][row['metric']] = row['value']

    return scores
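# The `worker` target is defined elsewhere; a minimal sketch of its contract,
# assuming it delegates to d3m.runtime.evaluate (that exact call is an
# assumption -- only the return_dict keys are required by `evaluate` above):
#
#     def worker(d3m_pipeline, data_pipeline, scoring_pipeline, problem,
#                dataset, scoring_config, metrics, return_dict):
#         scores, results = d3m.runtime.evaluate(...)
#         return_dict['run_scores'] = scores
#         return_dict['run_results'] = results
#
# Whatever the body does, it must store both keys: their absence is how the
# parent process detects a timeout after p.join(timeout_run).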
def load_schema_only(cls, pipeline_id: str, folder_loc: str,
                     pipeline_schema_subdir: str) -> typing.Tuple[Pipeline, typing.Dict]:
    pipeline_dir = os.path.join(folder_loc, pipeline_schema_subdir)
    subpipeline_dir = os.path.join(folder_loc, cls.subpipelines_subdir)
    pipeline_schema = os.path.join(pipeline_dir, pipeline_id + '.json')

    with open(pipeline_schema, 'r') as f:
        structure = json.load(f)

    # search both directories so sub-pipeline references can be resolved
    resolver = Resolver(pipeline_search_paths=[pipeline_dir, subpipeline_dir])
    pipeline = Pipeline.from_json_structure(pipeline_description=structure, resolver=resolver)
    return (pipeline, structure)
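# Usage sketch (the owning class is shown as FittedPipeline, which is an
# assumption, as are the folder names). The Resolver search paths let
# Pipeline.from_json_structure resolve sub-pipeline steps referenced by id:
#
#     pipeline, structure = FittedPipeline.load_schema_only(
#         pipeline_id='0001',
#         folder_loc='/tmp/output',
#         pipeline_schema_subdir='pipelines')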
def add_extra_primitive(self, primitive_name: typing.List[str], location_number: int) -> None:
    """
    Add extra primitives. Usually each one should be
    "d3m.primitives.data_transformation.denormalize.Common",
    "d3m.primitives.data_preprocessing.do_nothing_for_dataset.DSBOX",
    "d3m.primitives.data_augmentation.datamart_query.DSBOX" or
    "d3m.primitives.data_augmentation.datamart_augmentation.DSBOX".
    """
    structure = self.pipeline.to_json_structure()

    for each_primitive_name in primitive_name:
        # because the datamart query and augment primitives must be added at
        # the same time, we support adding multiple primitives in one call
        if location_number == 0:
            input_names = ["inputs.0"]
        else:
            input_names = ["steps." + str(location_number - 1) + ".produce"]
        # if each_primitive_name == "datamart_augmentation":
        #     if location_number >= 2:
        #         input_names = ["steps." + str(location_number - 1) + ".produce",
        #                        "steps." + str(location_number - 2) + ".produce"]
        #     if location_number == 1:  # which should not occur any more
        #         _logger.warning("detected DatamartAugmentation primitive added in second step, which should not happen!")
        #         input_names = ["steps." + str(location_number - 1) + ".produce", "inputs.0"]

        primitive_augment = self.get_primitive_augment(each_primitive_name, input_names)

        # load the saved hyperparameters for this primitive and wrap them in
        # the d3m "VALUE" hyperparam structure
        hyperparams_file_loc = os.path.join(
            self.runtime_setting.scratch_dir,
            self.dataset_id + each_primitive_name + ".json")
        with open(hyperparams_file_loc, "r") as f:
            hyperparams_file = json.load(f)
        new_hyper_file = {}
        for key, value in hyperparams_file.items():
            new_hyper_file[key] = {"type": "VALUE", "data": value}
        primitive_augment['hyperparams'] = new_hyper_file

        # update the output references; each should look like "steps.11.produce"
        output_step_reference = structure["outputs"]
        for i, each_output_step_reference in enumerate(output_step_reference):
            each_output_step_reference_split = each_output_step_reference["data"].split(".")
            each_output_step_reference_split[1] = str(int(each_output_step_reference_split[1]) + 1)
            structure["outputs"][i]["data"] = ".".join(each_output_step_reference_split)

        # add the step in the corresponding position
        detail_steps = structure["steps"]
        detail_steps.insert(location_number, primitive_augment)

        # shift the argument references of every step after the insertion point
        for i in range(location_number + 1, len(detail_steps)):
            each_step = detail_steps[i]
            if "arguments" in each_step:
                for each_argument_key in each_step["arguments"].keys():
                    argument_target = each_step["arguments"][each_argument_key]["data"]
                    if argument_target == "inputs.0":  # and "denormalize" in each_step["primitive"]["python_path"]:
                        argument_target_new = "steps.0.produce"
                        each_step["arguments"][each_argument_key]["data"] = argument_target_new
                    else:
                        argument_target_list = argument_target.split(".")
                        if int(argument_target_list[1]) >= location_number or i == location_number + 1:
                            argument_target_list[1] = str(int(argument_target_list[1]) + 1)
                            argument_target_new = ".".join(argument_target_list)
                            each_step["arguments"][each_argument_key]["data"] = argument_target_new
            # update each_step
            detail_steps[i] = each_step

        # update the original structure
        structure["steps"] = detail_steps

        # insert the unpickled fitted primitive into the runtime
        primitive_pickle_file_loc = os.path.join(
            self.runtime_setting.scratch_dir,
            self.dataset_id + each_primitive_name + ".pkl")
        with open(primitive_pickle_file_loc, "rb") as f:
            primitive_pickle_file = pickle.load(f)
        self.runtime.steps_state.insert(location_number, primitive_pickle_file)
        location_number += 1

    # rebuild the Pipeline from the updated structure
    self.pipeline = Pipeline.from_json_structure(structure)
    # ForkedPdb().set_trace()

    # generate a new runtime, preserving the fitted step states
    steps_state_old = self.runtime.steps_state
    cross_validation_result = self.runtime.cross_validation_result
    self.runtime = Runtime(self.pipeline,
                           fitted_pipeline_id=self.id,
                           random_seed=self.random_seed,
                           volumes_dir=FittedPipeline.runtime_setting.volumes_dir,
                           log_dir=FittedPipeline.runtime_setting.log_dir)
    self.runtime.cross_validation_result = cross_validation_result
    self.runtime.steps_state = steps_state_old
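# Usage sketch for add_extra_primitive (the instance and the short primitive
# keys are assumptions; each key is combined with dataset_id to locate the
# previously saved <dataset_id><key>.json hyperparams and <dataset_id><key>.pkl
# fitted state in the scratch directory):
#
#     fitted.add_extra_primitive(
#         ['datamart_query', 'datamart_augmentation'], location_number=0)
#
# The two steps are inserted at positions 0 and 1, every later step reference
# is shifted by one, and the Runtime is rebuilt with the preserved steps_state.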