def produce(self, *, inputs: Inputs, timeout: float = None,
            iterations: int = None) -> base.CallResult[Outputs]:
    dataframe_resource_id, dataframe = base_utils.get_tabular_resource(
        inputs, self.hyperparams["dataframe_resource"])

    base_file_path = "/".join(
        inputs.metadata._current_metadata.metadata["location_uris"][0].split("/")[:-1])
    graph_path = os.path.join(base_file_path, "graphs", inputs["0"].values[0][0])
    graph = nx.read_gml(graph_path[7:])  # strip the "file://" scheme prefix

    # relabel integer node ids to strings so downstream consumers see a consistent id type
    int2str_map = dict(zip(graph.nodes, [str(n) for n in graph.nodes]))
    graph = nx.relabel_nodes(graph, mapping=int2str_map)

    dataframe.metadata = self._update_metadata(inputs.metadata, dataframe_resource_id)
    assert isinstance(dataframe, container.DataFrame), type(dataframe)

    U_train = {"graph": graph}
    y_train = self.produce_target(inputs=inputs).value
    X_train = self._typify_dataframe(dataframe)

    return base.CallResult([X_train, y_train, U_train])
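# A minimal, self-contained sketch of the node-relabelling step above, using plain
# networkx with a toy graph (the GML file and dataset layout are not reproduced here).
import networkx as nx

toy = nx.Graph()
toy.add_edges_from([(0, 1), (1, 2)])

# map integer node ids to their string form, as produce() does before featurizing
int2str_map = dict(zip(toy.nodes, [str(n) for n in toy.nodes]))
toy = nx.relabel_nodes(toy, mapping=int2str_map)

assert set(toy.nodes) == {"0", "1", "2"}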
def produce(self, *, inputs: container.List, timeout: float = None,
            iterations: int = None) -> base.CallResult[container.Dataset]:
    # build the list of dataframes from the list of inputs
    dataframes = []
    metadata = None
    for input in inputs:
        if isinstance(input, container.DataFrame):
            dataframes.append(input)
        else:
            try:
                _, main_dr = d3m_base_utils.get_tabular_resource(input, None)
                dataframes.append(main_dr)
                metadata = input.metadata
            except ValueError as error:
                raise exceptions.InvalidArgumentValueError(
                    "Failure to find tabular resource in dataset") from error

    if self.hyperparams["column_overlap"] == "exact":
        columns_to_handle = dataframes[0].columns
        if np.sum(
                np.array([
                    np.all(df.columns == columns_to_handle) for df in dataframes
                ])) != len(dataframes):
            raise exceptions.InvalidArgumentValueError(
                "Dataframes don't have the same columns, cannot exact concat")
        concated = pd.concat(dataframes, ignore_index=True)
    elif self.hyperparams["column_overlap"] == "union":
        concated = pd.concat(dataframes, ignore_index=True)
    elif self.hyperparams["column_overlap"] == "intersection":
        concated = pd.concat(dataframes, join="inner", ignore_index=True)

    if self.hyperparams["remove_duplicate_rows"]:
        concated.drop_duplicates(subset="d3mIndex",
                                 keep="first",
                                 inplace=True,
                                 ignore_index=True)

    if metadata is None:
        metadata = container.Dataset({
            "learningData": concated.head(1)
        }, generate_metadata=True).metadata

    outputs = container.Dataset({"learningData": concated}, metadata)
    outputs.metadata = outputs.metadata.update(
        (metadata_base.ALL_ELEMENTS,),
        {"dimension": {
            "length": concated.shape[0]
        }})

    return base.CallResult(outputs)
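# Toy illustration of the three column_overlap behaviours with plain pandas; the
# frames below are invented, not the primitive's real inputs.
import pandas as pd

a = pd.DataFrame({"d3mIndex": [0, 1], "x": [1.0, 2.0], "y": ["a", "b"]})
b = pd.DataFrame({"d3mIndex": [2, 3], "x": [3.0, 4.0], "z": ["c", "d"]})

# "union" keeps every column, padding missing values with NaN
union = pd.concat([a, b], ignore_index=True)

# "intersection" keeps only the shared columns
intersection = pd.concat([a, b], join="inner", ignore_index=True)
assert list(intersection.columns) == ["d3mIndex", "x"]

# "exact" first verifies all frames share identical columns, then concatenates as "union"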
def get_dataframe(dataset: container.Dataset, resource_id: str) -> container.DataFrame:
    # extracts a dataframe from a dataset and ensures its metadata is transferred over

    # grab the resource and its metadata out of the dataset
    dataframe_resource_id, dataframe = base_utils.get_tabular_resource(dataset, resource_id)
    resource_metadata = dict(dataset.metadata.query((dataframe_resource_id,)))

    # copy the resource metadata from the dataset into the resource
    new_metadata = metadata_base.DataMetadata(resource_metadata)
    new_metadata = dataset.metadata.copy_to(new_metadata, (resource_id,))
    new_metadata = new_metadata.remove_semantic_type(
        (), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')
    dataframe.metadata = new_metadata

    return dataframe
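# Hypothetical call site for get_dataframe(); the dataset URI is illustrative, and
# "learningData" is the conventional D3M entry-point resource id.
from d3m import container

dataset = container.Dataset.load("file:///path/to/datasetDoc.json")
learning_df = get_dataframe(dataset, "learningData")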
def produce(self, *, inputs: Inputs, timeout: float = None,
            iterations: int = None) -> base.CallResult[Outputs]:
    dataframe_resource_id, dataframe = base_utils.get_tabular_resource(
        inputs, self.hyperparams['dataframe_resource'])
    dataframe.metadata = self._update_metadata(inputs.metadata, dataframe_resource_id)
    assert isinstance(dataframe, container.DataFrame), type(dataframe)

    return base.CallResult(dataframe)
def produce_target(self, *, inputs: Inputs, timeout: float = None,
                   iterations: int = None) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Running {__name__} produce_target")

    _, dataframe = base_utils.get_tabular_resource(
        inputs, self.hyperparams["dataframe_resource"])
    outputs = dataframe.copy()

    # find the target column and remove all others
    num_cols = outputs.metadata.query(
        (metadata_base.ALL_ELEMENTS,))["dimension"]["length"]
    target_idx = -1
    suggested_target_idx = -1
    for i in range(num_cols):
        semantic_types = outputs.metadata.query(
            (metadata_base.ALL_ELEMENTS, i))["semantic_types"]
        if ("https://metadata.datadrivendiscovery.org/types/Target" in semantic_types
                or "https://metadata.datadrivendiscovery.org/types/TrueTarget" in semantic_types):
            target_idx = i
            outputs = self._update_type_info(semantic_types, outputs, i)
        elif "https://metadata.datadrivendiscovery.org/types/SuggestedTarget" in semantic_types:
            suggested_target_idx = i
        elif "https://metadata.datadrivendiscovery.org/types/PrimaryKey" in semantic_types:
            outputs = self._update_type_info(semantic_types, outputs, i)

    # fall back on suggested target
    if target_idx == -1:
        target_idx = suggested_target_idx

    # flip the d3mIndex to be the df index as well
    outputs = outputs.set_index("d3mIndex", drop=False)

    remove_indices = set(range(num_cols))
    remove_indices.remove(target_idx)
    outputs = outputs.remove_columns(remove_indices)

    logger.debug(f"\n{outputs.dtypes}")
    logger.debug(f"\n{outputs}")

    return base.CallResult(outputs)
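# The index/target reshaping at the end of produce_target, isolated with plain pandas;
# toy data, with remove_columns approximated by drop.
import pandas as pd

df = pd.DataFrame({"d3mIndex": [0, 1, 2], "feature": [9, 8, 7], "target": ["a", "b", "a"]})
df = df.set_index("d3mIndex", drop=False)       # d3mIndex doubles as the frame index
df = df.drop(columns=["d3mIndex", "feature"])   # keep only the target; the index still carries d3mIndex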
def get_dataframe(dataset: container.Dataset, resource_id: str, target_col: int) -> container.DataFrame:
    """
    Extracts a dataframe from a dataset and ensures its metadata is transferred over.
    """
    # grab the resource and its metadata out of the dataset
    dataframe_resource_id, dataframe = base_utils.get_tabular_resource(dataset, resource_id)
    resource_metadata = dict(dataset.metadata.query((dataframe_resource_id,)))

    # copy the resource metadata from the dataset into the resource
    new_metadata = metadata_base.DataMetadata(resource_metadata)
    new_metadata = dataset.metadata.copy_to(new_metadata, (resource_id,))
    new_metadata = new_metadata.remove_semantic_type(
        (), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')

    # add target metadata to specified column
    new_metadata = new_metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, target_col),
        'https://metadata.datadrivendiscovery.org/types/TrueTarget'
    )
    dataframe.metadata = new_metadata

    return dataframe
def produce(
    self,
    *,
    inputs: container.Dataset,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Running {__name__}")

    # get the learning data (the dataset entry point)
    learning_id, learning_df = base_utils.get_tabular_resource(
        inputs, None, pick_entry_point=True)
    learning_df = learning_df.head(
        int(learning_df.shape[0] * self.hyperparams["sample"]))
    learning_df.metadata = self._update_metadata(inputs.metadata, learning_id, learning_df)

    logger.debug(f"\n{learning_df}")

    return base.CallResult(learning_df)
def set_training_data(self, *, inputs: Inputs) -> None:
    # d3m_utils here is d3m.base.utils
    self._target_resource_id, _ = d3m_utils.get_tabular_resource(
        inputs, self.hyperparams["target_resource"])
    self._inputs = inputs
    self._fitted = False
def set_training_data(self, *, inputs: Input) -> None:
    self._training_inputs = inputs
    main_resource_id, main_resource = d3m_utils.get_tabular_resource(
        inputs, None, has_hyperparameter=False)
    self._main_resource_id = main_resource_id
    self._fitted = False
def produce_collection(
    self,
    *,
    inputs: container.Dataset,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Running {__name__}")

    # get the learning data (the dataset entry point)
    learning_id, learning_df = base_utils.get_tabular_resource(
        inputs, None, pick_entry_point=True)
    learning_df = learning_df.head(
        int(learning_df.shape[0] * self.hyperparams["sample"]))
    learning_df.metadata = self._update_metadata(inputs.metadata, learning_id, learning_df)

    # find the column that is acting as the foreign key and extract the resource + column it references
    for i in range(
            learning_df.metadata.query(
                (metadata_base.ALL_ELEMENTS,))["dimension"]["length"]):
        column_metadata = learning_df.metadata.query_column(i)
        if ("foreign_key" in column_metadata
                and column_metadata["foreign_key"]["type"] == "COLUMN"):
            resource_id = column_metadata["foreign_key"]["resource_id"]
            file_column_idx = column_metadata["foreign_key"]["column_index"]

    # get the file collection resource that the foreign key references
    collection_id, collection_df = base_utils.get_tabular_resource(inputs, resource_id)
    collection_df = collection_df.head(learning_df.shape[0])
    collection_df.metadata = self._update_metadata(inputs.metadata, collection_id, collection_df)

    # get the base path
    base_path = collection_df.metadata.query(
        (metadata_base.ALL_ELEMENTS, file_column_idx))["location_base_uris"][0]

    # create fully resolved paths and load
    paths = learning_df.iloc[:, file_column_idx]  # TODO: remove, unused?
    file_paths = []
    for i, row in learning_df.iterrows():
        if i % 100 == 0:
            logger.debug(f"Loaded {i} / {len(learning_df.index)} files")
        try:
            start_end = row["start-end-time-slice-of-recording"]
            start, end = [float(x) for x in start_end.split(",")]
            file_paths.append((os.path.join(base_path, row["filename"]), start, end))
        except AttributeError as e:
            logger.warning("no start/end ts for {}".format(row))
            file_paths.append((os.path.join(base_path, row["filename"]), None, None))

    outputs = self._audio_load(self.hyperparams["n_jobs"], file_paths)

    logger.debug(f"\n{outputs}")

    result_df = pd.DataFrame({"audio": outputs})  # d3m container takes for_ever_

    return base.CallResult(
        container.DataFrame(result_df, generate_metadata=False))
def _evaluate(self, configuration: ConfigurationPoint, cache: PrimitivesCache,
              dump2disk: bool = True) -> typing.Dict:
    start_time = time.time()
    pipeline = self.template.to_pipeline(configuration)
    # TODO: update ResourceManager to run the pipeline: ResourceManager.add_pipeline(pipeline)

    # initialize repeat_time_level
    self._repeat_times_level_2 = 1
    self._repeat_times_level_1 = 1

    # for timeseries forecasting, we can't compare directly
    if self.problem['problem']['task_type'] == TaskType.TIME_SERIES_FORECASTING:
        # just skip for now
        # TODO: add a way to evaluate time series forecasting pipeline quality
        # (something like a sliding window)
        fitted_pipeline = FittedPipeline(
            pipeline=pipeline,
            dataset_id=self.train_dataset1.metadata.query(())['id'],
            metric_descriptions=self.performance_metrics,
            template=self.template,
            problem=self.problem,
            extra_primitive=self.extra_primitive,
            random_seed=self.random_seed)
        fitted_pipeline.fit(cache=cache, inputs=[self.train_dataset1])
        fitted_pipeline.save(self.output_directory)
        training_ground_truth = get_target_columns(self.train_dataset1)
        # fake_metric = calculate_score(training_ground_truth, training_ground_truth,
        #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
        fake_metric = score_prediction(training_ground_truth, [self.train_dataset1],
                                       self.problem, self.performance_metrics,
                                       self.random_seed)

        # HACK: if this is the mean baseline, make its score slightly worse
        if fitted_pipeline.template_name == 'SRI_Mean_Baseline_Template':
            result = fake_metric[0]
            if result['metric'].best_value() < result['metric'].worst_value():
                result['value'] = result['value'] + 0.1
                fake_metric[0].normalize(result['value'])
            else:
                result['value'] = result['value'] - 0.1
                fake_metric[0].normalize(result['value'])

        fitted_pipeline.set_metric(fake_metric[0])
        # e.g. [{'column_name': 'Class', 'metric': 'f1', 'value': 0.1}]
        data = {
            # 2019-7-10: return pipeline.id as id to make debugging easier
            'id': fitted_pipeline.pipeline.id,
            'fid': fitted_pipeline.id,
            'fitted_pipeline': fitted_pipeline,
            'training_metrics': fake_metric,
            'cross_validation_metrics': None,
            'test_metrics': fake_metric,
            'total_runtime': time.time() - start_time,
            'configuration': configuration,
            'ensemble_tuning_result': None,
            'ensemble_tuning_metrics': None,
        }
        fitted_pipeline.auxiliary = dict(data)
        fitted_pipeline.save(self.output_directory)
        return data

    # the following code should only run in the normal validation mode, where the
    # dataset can be split and tested

    # if in cross validation mode
    if self.testing_mode == Mode.CROSS_VALIDATION_MODE:
        self._repeat_times_level_2 = int(self.validation_config['cross_validation'])

        # start training and testing
        fitted_pipeline = FittedPipeline(
            pipeline=pipeline,
            dataset_id=self.train_dataset1.metadata.query(())['id'],
            metric_descriptions=self.performance_metrics,
            template=self.template,
            problem=self.problem,
            extra_primitive=self.extra_primitive,
            random_seed=self.random_seed)
        fitted_pipeline.fit(cache=cache, inputs=[self.train_dataset1])
        training_prediction = fitted_pipeline.get_fit_step_output(
            self.template.get_output_step_number())
        # training_ground_truth = get_target_columns(self.train_dataset1)
        # training_metrics = calculate_score(training_ground_truth, training_prediction,
        #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
        training_metrics = score_prediction(training_prediction, [self.train_dataset1],
                                            self.problem, self.performance_metrics,
                                            self.random_seed)

        cv_metrics = fitted_pipeline.get_cross_validation_metrics()
        test_metrics = copy.deepcopy(training_metrics)
        # use cross validation's avg value as the test score
        for i in range(len(test_metrics)):
            test_metrics[i]["value"] = cv_metrics[i]["value"]

        _logger.info("CV finish")

    # if in normal testing mode (including default testing mode with train/test one time each)
    else:
        # update: 2019.3.19
        # no need to run inside (level 2 split), run based on level 1 split now!
        if self.testing_mode == Mode.TRAIN_TEST_MODE:
            self._repeat_times_level_1 = int(self.validation_config['test_validation'])

        _logger.info(
            "Will use normal train-test mode (n={}) to choose best primitives."
            .format(self._repeat_times_level_2))

        training_metrics = []
        test_metrics = []

        for each_repeat in range(self._repeat_times_level_2):
            # start training and testing
            fitted_pipeline = FittedPipeline(
                pipeline=pipeline,
                dataset_id=self.train_dataset2[each_repeat].metadata.query(())['id'],
                metric_descriptions=self.performance_metrics,
                template=self.template,
                problem=self.problem,
                extra_primitive=self.extra_primitive,
                random_seed=self.random_seed)
            fitted_pipeline.fit(cache=cache, inputs=[self.train_dataset2[each_repeat]])
            # fitted_pipeline.fit(inputs=[self.train_dataset2[each_repeat]])
            training_prediction = fitted_pipeline.get_fit_step_output(
                self.template.get_output_step_number())
            # training_ground_truth = get_target_columns(self.train_dataset2[each_repeat])
            # training_metrics_each = calculate_score(
            #     training_ground_truth, training_prediction,
            #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
            training_metrics_each = score_prediction(
                training_prediction, [self.train_dataset2[each_repeat]],
                self.problem, self.performance_metrics, self.random_seed)

            # only do the test if the test dataset exists
            if self.test_dataset2[each_repeat] is not None:
                results = fitted_pipeline.produce(inputs=[self.test_dataset2[each_repeat]])
                # Note: results == test_prediction
                test_prediction = fitted_pipeline.get_produce_step_output(
                    self.template.get_output_step_number())
                # test_ground_truth = get_target_columns(self.test_dataset2[each_repeat])
                # test_metrics_each = calculate_score(test_ground_truth, test_prediction,
                #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
                test_metrics_each = score_prediction(
                    test_prediction, [self.test_dataset2[each_repeat]],
                    self.problem, self.performance_metrics, self.random_seed)
            else:
                # test_ground_truth = None
                test_prediction = None
                test_metrics_each = copy.deepcopy(training_metrics_each)
                for each in test_metrics_each:
                    each["value"] = each['metric'].worst_value()

            training_metrics.append(training_metrics_each)
            test_metrics.append(test_metrics_each)
        # END for TRAIN_TEST_MODES

    # sample format of the output:
    # [{'metric': 'f1Macro', 'value': 0.48418535913661614, 'values': [0.4841025641025641,
    #   0.4841025641025641, 0.4843509492047203]}]

    # modify the test_metrics and training_metrics format to fit the requirements
    if len(training_metrics) > 1:
        training_metrics = self.conclude_k_fold_metrics(training_metrics)
    else:
        if type(training_metrics[0]) is list:
            training_metrics = training_metrics[0]
    if len(test_metrics) > 1:
        test_metrics = self.conclude_k_fold_metrics(test_metrics)
    else:
        if type(test_metrics[0]) is list:
            test_metrics = test_metrics[0]
    # END evaluation part

    # Save results
    ensemble_tuning_result = None
    ensemble_tuning_metrics = None
    if self.test_dataset1 is None:
        # the dataset either did not need splitting or the split failed, so we will not train again
        fitted_pipeline2 = fitted_pipeline

        # set the metric for calculating the rank
        fitted_pipeline2.set_metric(training_metrics[0])

        cv = fitted_pipeline2.get_cross_validation_metrics()
        if not cv:
            # CandidateCache asserts cv must be a list
            cv = []

        data = {
            # 2019-7-10: return pipeline.id as id to make debugging easier
            'id': fitted_pipeline2.pipeline.id,
            'fid': fitted_pipeline2.id,
            'fitted_pipeline': fitted_pipeline2,
            'training_metrics': training_metrics,
            'cross_validation_metrics': cv,
            'test_metrics': training_metrics,
            'total_runtime': time.time() - start_time,
            'configuration': configuration,
            'ensemble_tuning_result': ensemble_tuning_result,
            'ensemble_tuning_metrics': ensemble_tuning_metrics,
        }
        fitted_pipeline.auxiliary = dict(data)

        if _logger.getEffectiveLevel() <= 10:  # logging.DEBUG
            data_to_logger_info = []
            if 'metric' in data['test_metrics']:
                data_to_logger_info.append(data['test_metrics']['metric'])
            else:
                data_to_logger_info.append("No test metrics metric found")
            if 'value' in data['test_metrics']:
                data_to_logger_info.append(data['test_metrics']['value'])
            else:
                data_to_logger_info.append("No test metrics value found")
            _logger.info(
                'fitted id: %(fitted_pipeline_id)s, metric: %(metric)s, value: %(value)s',
                {
                    'fitted_pipeline_id': fitted_pipeline2.id,
                    'metric': data_to_logger_info[0],
                    'value': data_to_logger_info[1]
                })

        # Save fitted pipeline
        pickled = False
        if self.output_directory is not None and dump2disk:
            try:
                fitted_pipeline2.save(self.output_directory)
                pickled = True
            except Exception as e:
                _logger.warning(
                    f'SKIPPING Pickle test. Saving pipeline failed: {e}')

        # Pickle test
        try:
            if pickled and self.output_directory is not None and dump2disk:
                _logger.debug("Test pickled pipeline. id: {}".format(fitted_pipeline2.id))
                self.test_pickled_pipeline(
                    folder_loc=self.output_directory,
                    pipeline_id=fitted_pipeline2.id,
                    test_dataset=self.train_dataset2[0],
                    test_metrics=training_metrics
                    # test_ground_truth=get_target_columns(self.train_dataset2[0], self.problem)
                )
        except Exception as e:
            _logger.exception('Pickle test Failed', exc_info=True)
    else:
        # update v2019.3.17: running k-fold cross validation on the level-1 split
        if self.quick_mode:
            _logger.info("[INFO] Now in quick mode, will skip training with train_dataset1")
            # in quick mode we do not fit the model with dataset_train1 again; just
            # generate the predictions on dataset_test1 directly and get the rank
            fitted_pipeline2 = fitted_pipeline
            fitted_pipeline2.produce(inputs=[self.test_dataset1])
            test_prediction = fitted_pipeline2.get_produce_step_output(
                self.template.get_output_step_number())
            # test_ground_truth = get_target_columns(self.test_dataset1)
            # test_metrics2 = calculate_score(test_ground_truth, test_prediction,
            #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
            test_metrics2 = score_prediction(test_prediction, [self.test_dataset1],
                                             self.problem, self.performance_metrics,
                                             self.random_seed)
        else:
            _logger.info("[INFO] Now in normal mode, will add extra train with train_dataset1")
            # otherwise train again with dataset_train1 and get the rank
            if self._repeat_times_level_1 > 1:
                # generate a split based on level 1 (all-dataset level k-fold cross validation)
                from common_primitives.kfold_split import KFoldDatasetSplitPrimitive, Hyperparams as hyper_k_fold
                hyperparams_split = hyper_k_fold.defaults()
                hyperparams_split = hyperparams_split.replace({
                    "number_of_folds": self._repeat_times_level_1,
                    "shuffle": True
                })
                if self.task_type == 'CLASSIFICATION':
                    hyperparams_split = hyperparams_split.replace({"stratified": True})
                else:  # e.g. task_type == "REGRESSION"
                    hyperparams_split = hyperparams_split.replace({"stratified": False})
                split_primitive = KFoldDatasetSplitPrimitive(hyperparams=hyperparams_split)
                split_primitive.set_training_data(dataset=self.all_dataset)
                split_primitive.fit()
                query_dataset_list = list(range(self._repeat_times_level_1))
                train_return = split_primitive.produce(inputs=query_dataset_list).value  # ['learningData']
                test_return = split_primitive.produce_score_data(inputs=query_dataset_list).value

                all_test_metrics = []
                for i in range(self._repeat_times_level_1):
                    current_train_dataset = train_return[i]
                    current_test_dataset = test_return[i]
                    fitted_pipeline2 = FittedPipeline(
                        pipeline=pipeline,
                        dataset_id=current_train_dataset.metadata.query(())['id'],
                        metric_descriptions=self.performance_metrics,
                        template=self.template,
                        problem=self.problem,
                        extra_primitive=self.extra_primitive,
                        random_seed=self.random_seed)
                    # retrain and compute ranking/metric using self.train_dataset
                    # fitted_pipeline2.fit(inputs=[self.train_dataset1])
                    fitted_pipeline2.fit(cache=cache, inputs=[current_train_dataset])
                    fitted_pipeline2.produce(inputs=[current_test_dataset])
                    test_prediction = fitted_pipeline2.get_produce_step_output(
                        self.template.get_output_step_number())
                    # test_ground_truth = get_target_columns(current_test_dataset)
                    # test_metrics_temp = calculate_score(test_ground_truth, test_prediction,
                    #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
                    test_metrics_temp = score_prediction(
                        test_prediction, [current_test_dataset],
                        self.problem, self.performance_metrics, self.random_seed)
                    all_test_metrics.append(test_metrics_temp)

                results = self.conclude_k_fold_metrics(all_test_metrics)
                test_metrics2 = results[0]
            else:
                # otherwise still do as previously
                fitted_pipeline2 = FittedPipeline(
                    pipeline=pipeline,
                    dataset_id=self.train_dataset1.metadata.query(())['id'],
                    metric_descriptions=self.performance_metrics,
                    template=self.template,
                    problem=self.problem,
                    extra_primitive=self.extra_primitive,
                    random_seed=self.random_seed)
                # retrain and compute ranking/metric using self.train_dataset
                # fitted_pipeline2.fit(inputs=[self.train_dataset1])
                fitted_pipeline2.fit(cache=cache, inputs=[self.train_dataset1])
                fitted_pipeline2.produce(inputs=[self.test_dataset1])
                test_prediction = fitted_pipeline2.get_produce_step_output(
                    self.template.get_output_step_number())
                # test_ground_truth = get_target_columns(self.test_dataset1)
                # test_metrics2 = calculate_score(test_ground_truth, test_prediction,
                #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
                test_metrics2 = score_prediction(test_prediction, [self.test_dataset1],
                                                 self.problem, self.performance_metrics,
                                                 self.random_seed)

        # update: the new version of the d3m runtime does not allow ".fit()" to be run
        # a second time on a given runtime object, so we need to create a new
        # FittedPipeline object to run runtime.fit() again
        fitted_pipeline_final = FittedPipeline(
            pipeline=pipeline,
            dataset_id=self.all_dataset.metadata.query(())['id'],
            metric_descriptions=self.performance_metrics,
            template=self.template,
            problem=self.problem,
            extra_primitive=self.extra_primitive,
            random_seed=self.random_seed)
        # set the metric for calculating the rank
        fitted_pipeline_final.set_metric(test_metrics2[0])
        # end update v2019.3.17

        # finally, fit the model with all data and save it
        _logger.info(
            "[INFO] Now training the pipeline with the whole dataset and saving the pipeline.")
        fitted_pipeline_final.fit(cache=cache, inputs=[self.all_dataset])

        if self.ensemble_tuning_dataset:
            fitted_pipeline_final.produce(inputs=[self.ensemble_tuning_dataset])
            ensemble_tuning_result = fitted_pipeline_final.get_produce_step_output(
                self.template.get_output_step_number())
            # ensemble_tuning_result_ground_truth = get_target_columns(self.ensemble_tuning_dataset)
            # ensemble_tuning_metrics = calculate_score(ensemble_tuning_result_ground_truth,
            #     ensemble_tuning_result, self.performance_metrics, self.task_type,
            #     SpecialMetric().regression_metric)
            ensemble_tuning_metrics = score_prediction(
                ensemble_tuning_result, [self.ensemble_tuning_dataset],
                self.problem, self.performance_metrics, self.random_seed)

        cv = fitted_pipeline_final.get_cross_validation_metrics()
        if not cv:
            # CandidateCache asserts cv must be a list
            cv = []

        data = {
            # 2019-7-10: return pipeline.id as id to make debugging easier
            'id': fitted_pipeline_final.pipeline.id,
            'fid': fitted_pipeline_final.id,
            'fitted_pipeline': fitted_pipeline_final,
            'training_metrics': training_metrics,
            'cross_validation_metrics': cv,
            'test_metrics': test_metrics2,
            'total_runtime': time.time() - start_time,
            'configuration': configuration,
            'ensemble_tuning_result': ensemble_tuning_result,
            'ensemble_tuning_metrics': ensemble_tuning_metrics,
        }
        fitted_pipeline.auxiliary = dict(data)

        # Save fitted pipeline
        pickled = False
        if self.output_directory is not None and dump2disk:
            try:
                fitted_pipeline_final.save(self.output_directory)
                pickled = True
            except Exception as e:
                _logger.warning(
                    f'SKIPPING Pickle test. Saving pipeline failed: {e}')

        # Pickle test
        if pickled and self.output_directory is not None and dump2disk:
            try:
                # remove the augmented columns in self.test_dataset1 to ensure we can
                # pass the pickling test
                res_id, test_dataset1_df = d3m_utils.get_tabular_resource(
                    dataset=self.test_dataset1, resource_id=None)
                original_columns = []
                remained_columns_number = 0
                for i in range(test_dataset1_df.shape[1]):
                    current_selector = (res_id, ALL_ELEMENTS, i)
                    meta = self.test_dataset1.metadata.query(current_selector)
                    if (AUGMENTED_COLUMN_SEMANTIC_TYPE in meta['semantic_types']
                            or Q_NODE_SEMANTIC_TYPE in meta['semantic_types']):
                        self.test_dataset1.metadata = self.test_dataset1.metadata.remove(
                            selector=current_selector)
                    else:
                        original_columns.append(i)
                        if remained_columns_number != i:
                            self.test_dataset1.metadata = self.test_dataset1.metadata.remove(
                                selector=current_selector)
                            updated_selector = (res_id, ALL_ELEMENTS, remained_columns_number)
                            self.test_dataset1.metadata = self.test_dataset1.metadata.update(
                                selector=updated_selector, metadata=meta)
                        remained_columns_number += 1

                self.test_dataset1[res_id] = self.test_dataset1[res_id].iloc[:, original_columns]
                meta = dict(self.test_dataset1.metadata.query((res_id, ALL_ELEMENTS)))
                dimension = dict(meta['dimension'])
                dimension['length'] = remained_columns_number
                meta['dimension'] = frozendict.FrozenOrderedDict(dimension)
                self.test_dataset1.metadata = self.test_dataset1.metadata.update(
                    (res_id, ALL_ELEMENTS), frozendict.FrozenOrderedDict(meta))
                # end removing augmented columns

                _ = fitted_pipeline_final.produce(inputs=[self.test_dataset1])
                test_prediction3 = fitted_pipeline_final.get_produce_step_output(
                    self.template.get_output_step_number())
                # test_ground_truth_for_test_pickle = get_target_columns(self.test_dataset1)
                # test_metrics3 = calculate_score(test_ground_truth_for_test_pickle,
                #     test_prediction3, self.performance_metrics, self.task_type,
                #     SpecialMetric().regression_metric)
                test_metrics3 = score_prediction(test_prediction3, [self.test_dataset1],
                                                 self.problem, self.performance_metrics,
                                                 self.random_seed)

                _logger.info("Test pickled pipeline. id: {}".format(fitted_pipeline_final.id))
                self.test_pickled_pipeline(
                    folder_loc=self.output_directory,
                    pipeline_id=fitted_pipeline_final.id,
                    test_dataset=self.test_dataset1,
                    test_metrics=test_metrics3
                    # test_ground_truth=test_ground_truth_for_test_pickle
                )
            except Exception as e:
                _logger.exception('Pickle test Failed', exc_info=True)

    # still return the original fitted_pipeline with relation to train_dataset1
    return data
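# A minimal sketch of the level-1 k-fold evaluation idea in _evaluate, with
# scikit-learn's KFold standing in for KFoldDatasetSplitPrimitive and a toy model
# and metric in place of FittedPipeline / score_prediction; all names here are
# illustrative.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)

scores = []
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    model = LogisticRegression().fit(X[train_idx], y[train_idx])
    scores.append(f1_score(y[test_idx], model.predict(X[test_idx])))

# mirror conclude_k_fold_metrics: report the average across folds as the test score
print(np.mean(scores))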
def set_training_data(self, *, dataset: container.Dataset) -> None:  # type: ignore
    main_resource_id, main_resource = base_utils.get_tabular_resource(
        dataset, None, has_hyperparameter=False)
    self._main_resource_id = main_resource_id
    self._dataset = dataset
    self._fitted = False
def produce(self, *, inputs: Inputs, timeout: float = None,
            iterations: int = None) -> base.CallResult[Outputs]:
    dataframe_resource_id, dataframe = base_utils.get_tabular_resource(
        inputs, self.hyperparams["dataframe_resource"])

    # get attribute columns
    hyperparams_class = (
        dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.query()
        ["primitive_code"]["class_type_arguments"]["Hyperparams"])
    primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(
        hyperparams=hyperparams_class.defaults())
    dataframe_meta = primitive.produce(inputs=inputs).value

    attributes = list_columns_with_semantic_types(
        metadata=dataframe_meta.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"
        ],
    )

    base_file_path = "/".join(
        inputs.metadata._current_metadata.metadata["location_uris"][0].split("/")[:-1])
    edge_list = pd.read_csv(
        os.path.join(base_file_path, "graphs", "edgeList.csv"), index_col=0)
    if len(edge_list.columns) > 2:
        graph = nx.from_pandas_edgelist(
            edge_list,
            source=edge_list.columns[0],
            target=edge_list.columns[1],
            edge_attr=edge_list.columns[2],
        )
    else:
        graph = nx.from_pandas_edgelist(edge_list,
                                        source=edge_list.columns[0],
                                        target=edge_list.columns[1])

    if len(attributes) > 1:
        # add attributes to nodes; nodeID stays in the attribute map so every node
        # carries its id
        attribute_node_map = dataframe_meta[dataframe_meta.columns[attributes]]
        attribute_node_map["nodeID"] = attribute_node_map["nodeID"].astype(int)
        attribute_node_map.index = attribute_node_map["nodeID"]
        attribute_cols = attribute_node_map.columns
        attribute_node_map = attribute_node_map.to_dict(orient="index")

        for i in graph.nodes:
            default = {attribute: 0 for attribute in attribute_cols}
            default["nodeID"] = i
            graph.nodes[i].update(attribute_node_map.get(i, default))
    else:
        # the featurizer expects at a minimum nodeIDs to be present
        for i in graph.nodes:
            default = {}
            default["nodeID"] = i
            graph.nodes[i].update(default)

    # int2str_map = dict(zip(graph.nodes, [str(n) for n in graph.nodes]))
    # graph = nx.relabel_nodes(graph, mapping=int2str_map)

    dataframe.metadata = self._update_metadata(inputs.metadata, dataframe_resource_id)
    assert isinstance(dataframe, container.DataFrame), type(dataframe)

    U_train = {"graph": graph}
    y_train = self.produce_target(inputs=inputs).value
    X_train = dataframe  # TODO: use attributes in vertex classification
    X_train = self._typify_dataframe(X_train)
    X_train.value = pd.DataFrame(X_train.value["nodeID"])

    return base.CallResult([X_train, y_train, U_train])
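# Minimal from_pandas_edgelist usage mirroring the branching above; the edge list
# here is a toy, not the dataset's edgeList.csv.
import networkx as nx
import pandas as pd

edges = pd.DataFrame({"source": [0, 1], "target": [1, 2], "weight": [0.5, 1.5]})

# with a third column present, it is attached as an edge attribute
g = nx.from_pandas_edgelist(edges, source="source", target="target", edge_attr="weight")
assert g[0][1]["weight"] == 0.5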
def _get_truth(self, score_dataset: container.Dataset) -> typing.Tuple[pandas.DataFrame, typing.Dict[str, typing.Any]]:
    """
    Extracts true targets from the Dataset's entry point, or the only tabular resource.
    It requires that there is only one primary index column, which it makes the first
    column, named ``d3mIndex``. Then true target columns follow.

    We return a regular Pandas DataFrame with column names matching those in the
    metadata, and a dict mapping target columns to all label values in those columns,
    if available in metadata. We convert all columns to strings to match what would
    be loaded from a ``predictions.csv`` file. It encodes any float vectors as strings.
    """
    main_resource_id, main_resource = base_utils.get_tabular_resource(
        score_dataset, None, has_hyperparameter=False)

    # We first copy before modifying in-place.
    main_resource = container.DataFrame(main_resource, copy=True)
    main_resource = self._encode_columns(main_resource)
    dataframe = self._to_dataframe(main_resource)

    indices = list(score_dataset.metadata.get_index_columns(at=(main_resource_id,)))
    targets = list(score_dataset.metadata.list_columns_with_semantic_types(
        ['https://metadata.datadrivendiscovery.org/types/TrueTarget'],
        at=(main_resource_id,),
    ))

    if not indices:
        raise exceptions.InvalidArgumentValueError("No primary index column.")
    elif len(indices) > 1:
        raise exceptions.InvalidArgumentValueError("More than one primary index column.")
    if not targets:
        raise ValueError("No true target columns.")

    dataframe = dataframe.iloc[:, indices + targets]

    dataframe = dataframe.rename(columns={dataframe.columns[0]: metrics.INDEX_COLUMN})

    if metrics.SCORE_COLUMN in dataframe.columns[1:]:
        raise ValueError("True target column cannot be named \"confidence\". It is a reserved name.")
    if metrics.RANK_COLUMN in dataframe.columns[1:]:
        raise ValueError("True target column cannot be named \"rank\". It is a reserved name.")
    if metrics.INDEX_COLUMN in dataframe.columns[1:]:
        raise ValueError("True target column cannot be named \"d3mIndex\". It is a reserved name.")

    if d3m_utils.has_duplicates(dataframe.columns):
        duplicate_names = list(dataframe.columns)
        for name in set(dataframe.columns):
            duplicate_names.remove(name)
        raise exceptions.InvalidArgumentValueError(
            "True target columns have duplicate names: {duplicate_names}".format(
                duplicate_names=sorted(set(duplicate_names)),
            ),
        )

    all_labels = {}

    for target_column_name, main_resource_column_index in zip(dataframe.columns[1:], targets):
        try:
            column_labels = score_dataset.metadata.query_column_field(
                main_resource_column_index, 'all_distinct_values', at=(main_resource_id,))
        except KeyError:
            continue

        all_labels[target_column_name] = [str(label) for label in column_labels]

    return dataframe, all_labels
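# The duplicate-column detection used in _get_truth, isolated as plain Python; the
# column names are toy values.
columns = ["d3mIndex", "target", "target"]
duplicate_names = list(columns)
for name in set(columns):
    duplicate_names.remove(name)   # removes one occurrence of each distinct name
assert sorted(set(duplicate_names)) == ["target"]  # whatever remains appeared more than once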
def download():
    try:
        logger.debug("Start datamart downloading...")
        search_result = read_file(request.files, 'task', 'json')
        # if the json was not sent via file
        if not search_result and request.form.get('task'):
            search_result = json.loads(request.form.get('task'))
        if search_result is None:
            return wrap_response(
                code='1000',
                msg='FAIL SEARCH - Unable to get search result or input is a bad format!',
                data=None)

        # if data is csv content
        data = read_file(request.files, 'data', 'csv')
        # if data is not csv content but a str path
        if data is not None:
            loaded_dataset = load_csv_data(data)
        elif request.values.get('data'):
            path = request.values.get('data')
            if path.lower().endswith("csv"):
                loaded_dataset = load_csv_data(path)
            else:
                loaded_dataset = load_d3m_dataset(path)
        else:
            loaded_dataset = None

        return_format = request.values.get('format')
        if not return_format or return_format.lower() == "csv":
            return_format = "csv"
        elif return_format.lower() == "d3m":
            return_format = "d3m"
        else:
            return wrap_response(code='1000',
                                 msg='FAIL SEARCH - Unknown return format: ' + str(return_format),
                                 data=None)

        # search without supplied data, not implemented yet
        # TODO: implement this part!
        if loaded_dataset is None:
            return wrap_response(
                code='1000',
                msg='FAIL SEARCH - Unable to load input supplied data',
                data=None)
        # search with supplied data
        else:
            # preprocess on loaded_dataset
            logger.debug("Start running wikifier...")
            search_result_wikifier = DatamartSearchResult(
                search_result={},
                supplied_data=None,
                query_json={},
                search_type="wikifier")
            logger.debug("Wikifier finished, start running download...")
            loaded_dataset = search_result_wikifier.augment(supplied_data=loaded_dataset)
            search_result = DatamartSearchResult.deserialize(search_result['materialize_info'])
            download_result = search_result.download(supplied_data=loaded_dataset)
            logger.debug("Download finished.")
            res_id, result_df = d3m_utils.get_tabular_resource(
                dataset=download_result, resource_id=None)

            non_empty_rows = []
            for i, v in result_df.iterrows():
                if len(v["joining_pairs"]) != 0:
                    non_empty_rows.append(i)

            if len(non_empty_rows) == 0:
                return wrap_response(
                    code='1000',
                    msg='FAIL DOWNLOAD - No joinable rows found!',
                    data=None)

            logger.debug("Start saving the download results...")
            result_df = result_df.iloc[non_empty_rows, :]
            result_df = result_df.reset_index(drop=True)

            # set all cells to str so that we can save correctly
            download_result[res_id] = result_df.astype(str)

            # update structural type
            update_part = {"structural_type": str}
            for i in range(result_df.shape[1]):
                download_result.metadata = download_result.metadata.update(
                    metadata=update_part, selector=(res_id, ALL_ELEMENTS, i))

            # update row length
            update_part = {"length": result_df.shape[0]}
            download_result.metadata = download_result.metadata.update(
                metadata=update_part, selector=(res_id,))

            result_id = str(hash(result_df.values.tobytes()))

            if return_format == "d3m":
                # save the dataset, then zip it and send it to the client
                with tempfile.TemporaryDirectory() as tmpdir:
                    absolute_path_part_length = len(str(tmpdir))
                    save_dir = os.path.join(str(tmpdir), result_id)
                    download_result.save("file://" + save_dir + "/datasetDoc.json")

                    base_path = pathlib.Path(save_dir + '/')
                    data = io.BytesIO()
                    filePaths = retrieve_file_paths(save_dir)

                    zip_file = zipfile.ZipFile(data, 'w')
                    with zip_file:
                        # write each file separately, relative to the temp dir
                        for fileName in filePaths:
                            shorter_path = fileName[absolute_path_part_length:]
                            zip_file.write(fileName, shorter_path)
                    data.seek(0)

                    return send_file(data,
                                     mimetype='application/zip',
                                     as_attachment=True,
                                     attachment_filename='download_result' + result_id + '.zip')
            else:
                data = io.StringIO()
                result_df.to_csv(data, index=False)
                return Response(data.getvalue(), mimetype="text/csv")

    except Exception as e:
        return wrap_response(code='1000',
                             msg="FAIL SEARCH - %s \n %s" % (str(e), str(traceback.format_exc())))
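# The in-memory zip pattern used above, isolated into a helper; retrieve_file_paths
# is replaced with os.walk, and the directory name is illustrative.
import io
import os
import zipfile

def zip_directory(save_dir: str) -> io.BytesIO:
    data = io.BytesIO()
    with zipfile.ZipFile(data, "w") as zip_file:
        for root, _, files in os.walk(save_dir):
            for name in files:
                full_path = os.path.join(root, name)
                # store paths relative to save_dir, as the endpoint does
                zip_file.write(full_path, os.path.relpath(full_path, save_dir))
    data.seek(0)  # rewind so the buffer can be streamed to the client
    return data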
def get_resource(inputs, resource_name):
    _id, _df = base_utils.get_tabular_resource(inputs, resource_name)
    _df.metadata = _update_metadata(inputs.metadata, _id)
    return _id, _df
def produce(
        self,
        *,
        left: Inputs,  # type: ignore
        right: Inputs,  # type: ignore
        timeout: float = None,
        iterations: int = None) -> base.CallResult[Outputs]:

    # attempt to extract the main table
    try:
        left_resource_id, left_df = d3m_base_utils.get_tabular_resource(left, None)
    except ValueError as error:
        raise exceptions.InvalidArgumentValueError(
            "Failure to find tabular resource in left dataset") from error
    try:
        right_resource_id, right_df = d3m_base_utils.get_tabular_resource(right, None)
    except ValueError as error:
        raise exceptions.InvalidArgumentValueError(
            "Failure to find tabular resource in right dataset") from error

    accuracy = self.hyperparams['accuracy']
    if accuracy <= 0.0 or accuracy > 1.0:
        raise exceptions.InvalidArgumentValueError(
            'accuracy of ' + str(accuracy) + ' is out of range')

    left_col = self.hyperparams['left_col']
    right_col = self.hyperparams['right_col']

    # perform join based on semantic type
    join_type = self._get_join_semantic_type(left, left_resource_id, left_col,
                                             right, right_resource_id, right_col)
    joined: pd.DataFrame = None
    if join_type in self._STRING_JOIN_TYPES:
        joined = self._join_string_col(left_df, left_col, right_df, right_col, accuracy)
    elif join_type in self._NUMERIC_JOIN_TYPES:
        joined = self._join_numeric_col(left_df, left_col, right_df, right_col, accuracy)
    elif join_type in self._DATETIME_JOIN_TYPES:
        joined = self._join_datetime_col(left_df, left_col, right_df, right_col, accuracy)
    else:
        raise exceptions.InvalidArgumentValueError(
            'join not supported on type ' + str(join_type))

    # create a new dataset to hold the joined data
    resource_map = {}
    for resource_id, resource in left.items():  # type: ignore
        if resource_id == left_resource_id:
            resource_map[resource_id] = joined
        else:
            resource_map[resource_id] = resource
    result_dataset = container.Dataset(resource_map, generate_metadata=True)

    return base.CallResult(result_dataset)
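# A sketch of the numeric fuzzy-join idea (the _join_numeric_col helper itself is not
# shown here), using pandas merge_asof with a tolerance playing the role of the
# accuracy hyperparameter; the frames and column names are toy values.
import pandas as pd

left_toy = pd.DataFrame({"key": [1.0, 5.0, 10.0], "l": ["a", "b", "c"]})
right_toy = pd.DataFrame({"key": [1.1, 4.8, 20.0], "r": ["x", "y", "z"]})

# merge_asof requires both sides sorted on the join key
joined_toy = pd.merge_asof(
    left_toy.sort_values("key"),
    right_toy.sort_values("key"),
    on="key",
    direction="nearest",
    tolerance=0.5,  # rows further apart than this are left unmatched
)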
def augment():
    try:
        logger.debug("Start running augment...")
        search_result = read_file(request.files, 'task', 'json')
        # if the json was not sent via file
        if not search_result and request.form.get('task'):
            search_result = json.loads(request.form.get('task'))
        if search_result is None:
            return wrap_response(
                code='1000',
                msg='FAIL SEARCH - Unable to get search result',
                data=None)

        # if data is csv content
        data = read_file(request.files, 'data', 'csv')
        # if data is not csv content but a str path
        if data is not None:
            loaded_dataset = load_csv_data(data)
        elif request.values.get('data'):
            path = request.values.get('data')
            if path.lower().endswith("csv"):
                loaded_dataset = load_csv_data(path)
            else:
                loaded_dataset = load_d3m_dataset(path)
        else:
            loaded_dataset = None

        return_format = request.values.get('format')
        if not return_format or return_format.lower() == "csv":
            return_format = "csv"
        elif return_format.lower() == "d3m":
            return_format = "d3m"
        else:
            return wrap_response(code='1000',
                                 msg='FAIL SEARCH - Unknown return format: ' + str(return_format),
                                 data=None)

        # search without supplied data, not implemented yet
        # TODO: implement this part!
        if loaded_dataset is None:
            return wrap_response(
                code='1000',
                msg='FAIL SEARCH - Unable to load input supplied data',
                data=None)
        # search with supplied data
        else:
            columns = request.values.get('columns')
            if columns and type(columns) is not list:
                columns = columns.split(", ")
                logger.info("Required columns found as: " + str(columns))
            columns_formatted = []
            if columns:
                for each in columns:
                    columns_formatted.append(
                        DatasetColumn(resource_id=AUGMENT_RESOURCE_ID,
                                      column_index=int(each)))

            logger.debug("Start running wikifier...")
            # preprocess on loaded_dataset
            search_result_wikifier = DatamartSearchResult(
                search_result={},
                supplied_data=None,
                query_json={},
                search_type="wikifier")
            loaded_dataset = search_result_wikifier.augment(supplied_data=loaded_dataset)
            logger.debug("Wikifier running finished, start running augment...")
            search_result = DatamartSearchResult.deserialize(search_result['materialize_info'])
            augment_result = search_result.augment(
                supplied_data=loaded_dataset, augment_columns=columns_formatted)
            res_id, result_df = d3m_utils.get_tabular_resource(
                dataset=augment_result, resource_id=None)
            augment_result[res_id] = result_df.astype(str)

            # update structural type
            update_part = {"structural_type": str}
            for i in range(result_df.shape[1]):
                augment_result.metadata = augment_result.metadata.update(
                    metadata=update_part, selector=(res_id, ALL_ELEMENTS, i))

            result_id = str(hash(result_df.values.tobytes()))

            # if required to store on disk, save there and return the path
            if request.values.get('destination'):
                logger.info("Saving to a given destination required.")
                save_dir = os.path.join(request.values.get('destination'),
                                        "augment_result" + result_id)
                if os.path.isdir(save_dir) or os.path.exists(save_dir):
                    shutil.rmtree(save_dir)

                # save the dataset
                augment_result.save("file://" + save_dir + "/datasetDoc.json")

                # zip and send to client
                base_path = pathlib.Path(save_dir + '/')
                data = io.BytesIO()
                filePaths = retrieve_file_paths(save_dir)

                zip_file = zipfile.ZipFile(data, 'w')
                with zip_file:
                    # write each file separately
                    for file in filePaths:
                        zip_file.write(file)
                data.seek(0)

                return wrap_response(code='0000', msg='Success', data=save_dir)
            else:
                # save the dataset in a temp directory
                logger.info("Returning the augment result directly required.")
                with tempfile.TemporaryDirectory() as tmpdir:
                    absolute_path_part_length = len(str(tmpdir))
                    save_dir = os.path.join(str(tmpdir), result_id)
                    augment_result.save("file://" + save_dir + "/datasetDoc.json")

                    # zip and send to client
                    base_path = pathlib.Path(save_dir + '/')
                    data = io.BytesIO()
                    filePaths = retrieve_file_paths(save_dir)

                    zip_file = zipfile.ZipFile(data, 'w')
                    with zip_file:
                        # write each file separately, relative to the temp dir
                        for fileName in filePaths:
                            shorter_path = fileName[absolute_path_part_length:]
                            zip_file.write(fileName, shorter_path)
                    data.seek(0)

                    return send_file(data,
                                     mimetype='application/zip',
                                     as_attachment=True,
                                     attachment_filename='download_result' + result_id + '.zip')

    except Exception as e:
        return wrap_response(code='1000',
                             msg="FAIL SEARCH - %s \n %s" % (str(e), str(traceback.format_exc())))
def produce(self, *, inputs: container.Dataset, timeout: float = None,
            iterations: int = None) -> base.CallResult[container.Dataset]:
    # if this is a single resource dataset we don't need to reformat it
    if len(inputs) < 2:
        return base.CallResult(inputs)

    # find the main resource if supplied, infer if not
    main_resource_id, main_resource = base_utils.get_tabular_resource(
        inputs, self.hyperparams["main_resource_id"])
    if main_resource_id is None:
        raise exceptions.InvalidArgumentValueError("no main resource specified")

    # find the csv file column resource if supplied, infer if not
    file_index = self.hyperparams["file_col_index"]
    if file_index is not None:
        if not self._is_csv_file_column(inputs.metadata, main_resource_id, file_index):
            raise exceptions.InvalidArgumentValueError(
                "column idx=" + str(file_index) + " does not contain csv file names")
    else:
        file_index = self._find_csv_file_column(inputs.metadata, main_resource_id)
        if file_index is None:
            raise exceptions.InvalidArgumentValueError(
                "no column contains csv file names")

    # generate the long form timeseries data
    base_path = self._get_base_path(inputs.metadata, main_resource_id, file_index)
    csv_paths = [
        os.path.join(base_path, local_path)
        for local_path in inputs[main_resource_id].iloc[:, file_index]
    ]
    new_dfs = [pd.read_csv(path) for path in csv_paths]
    original_dfs = [
        pd.DataFrame(
            np.tile(row, (df.shape[0], 1)),
            columns=inputs[main_resource_id].columns,
            index=df.index,
        ) for row, df in zip(inputs[main_resource_id].values, new_dfs)
    ]
    combined_dfs = [
        original_df.join(new_df)
        for original_df, new_df in zip(original_dfs, new_dfs)
    ]
    output_data = pd.concat(combined_dfs)
    timeseries_dataframe = container.DataFrame(output_data)
    timeseries_dataframe.reset_index(drop=True, inplace=True)

    # make sure that all timeseries have the same length; most downstream tasks will appreciate this
    if self.hyperparams["equal_length"]:
        min_length = (timeseries_dataframe.groupby(
            timeseries_dataframe.columns[file_index]).count().min().values[0])
        group_count = timeseries_dataframe.groupby(
            timeseries_dataframe.columns[file_index]).cumcount()
        timeseries_dataframe = timeseries_dataframe.assign(group_count=group_count)
        timeseries_dataframe = timeseries_dataframe[
            timeseries_dataframe["group_count"] < min_length]
        timeseries_dataframe = timeseries_dataframe.drop(["group_count"], axis=1)

    # create a dataset to hold the result
    timeseries_dataset = container.Dataset(
        {self._resource_id: timeseries_dataframe}, generate_metadata=True)
    timeseries_dataset.metadata = timeseries_dataset.metadata.update(
        (), {"id": inputs.metadata.query(())["id"]})
    timeseries_dataset.metadata = timeseries_dataset.metadata.update(
        (), {"digest": inputs.metadata.query(())["digest"]})

    # copy main resource column metadata to timeseries dataframe
    num_main_resource_cols = inputs.metadata.query(
        (main_resource_id, metadata_base.ALL_ELEMENTS))["dimension"]["length"]
    for i in range(num_main_resource_cols):
        source = inputs.metadata.query((main_resource_id, metadata_base.ALL_ELEMENTS, i))
        timeseries_dataset.metadata = timeseries_dataset.metadata.update_column(
            i, source, at=(self._resource_id,))

    # remove the foreign key entry from the filename column if it exists
    metadata = dict(
        timeseries_dataset.metadata.query(
            (self._resource_id, metadata_base.ALL_ELEMENTS, file_index)))
    metadata["foreign_key"] = metadata_base.NO_VALUE
    timeseries_dataset.metadata = timeseries_dataset.metadata.update(
        (self._resource_id, metadata_base.ALL_ELEMENTS, file_index), metadata)

    # copy timeseries column metadata to the timeseries if it's available in the
    # metadata (which is not necessarily true anymore)
    source = self._find_timeseries_metadata(inputs)
    i = 0
    start_idx = 0
    if source is not None:
        for col_info in source["file_columns"]:
            timeseries_dataset.metadata = timeseries_dataset.metadata.update_column(
                i + num_main_resource_cols, col_info, at=(self._resource_id,))
            i += 1
        # flag all other columns as attributes
        start_idx = i + num_main_resource_cols
    else:
        # loop over the appended time series columns
        start_idx = original_dfs[0].shape[1]

    for i in range(start_idx, timeseries_dataframe.shape[1]):
        timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
            (self._resource_id, metadata_base.ALL_ELEMENTS, i),
            "https://metadata.datadrivendiscovery.org/types/Attribute",
        )
        struct_type = timeseries_dataset.metadata.query(
            (self._resource_id, metadata_base.ALL_ELEMENTS, i))["structural_type"]
        if struct_type == np.float64:
            timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
                (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                "http://schema.org/Float",
            )
        elif struct_type == np.int64:
            timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
                (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                "http://schema.org/Integer",
            )
        else:
            timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
                (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                "http://schema.org/Text",
            )

    # mark the filename column as a grouping key
    timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
        (self._resource_id, metadata_base.ALL_ELEMENTS, file_index),
        "https://metadata.datadrivendiscovery.org/types/GroupingKey",
    )

    # mark the d3mIndex as a primary multi-key since there are now multiple instances
    # of each value present
    primary_index_col = timeseries_dataset.metadata.list_columns_with_semantic_types(
        ("https://metadata.datadrivendiscovery.org/types/PrimaryKey",),
        at=(self._resource_id,),
    )
    timeseries_dataset.metadata = timeseries_dataset.metadata.remove_semantic_type(
        (self._resource_id, metadata_base.ALL_ELEMENTS, primary_index_col[0]),
        "https://metadata.datadrivendiscovery.org/types/PrimaryKey",
    )
    timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
        (self._resource_id, metadata_base.ALL_ELEMENTS, primary_index_col[0]),
        "https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey",
    )

    return base.CallResult(timeseries_dataset)
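# Toy illustration of the long-form expansion above: each row of the main table is
# repeated once per row of the CSV it references, then the two are joined on index.
# The row values and column names here are invented.
import numpy as np
import pandas as pd

main_row = np.array([0, "file_a.csv"], dtype=object)
ts = pd.DataFrame({"time": [0, 1, 2], "value": [1.0, 2.0, 3.0]})

repeated = pd.DataFrame(
    np.tile(main_row, (ts.shape[0], 1)),
    columns=["d3mIndex", "filename"],
    index=ts.index,
)
long_form = repeated.join(ts)  # 3 rows: main-table columns + time series columns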
def produce(
    self,
    *,
    left: Inputs,  # type: ignore
    right: Inputs,  # type: ignore
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[Outputs]:

    # attempt to extract the main table
    try:
        left_resource_id, left_df = d3m_base_utils.get_tabular_resource(left, None)
    except ValueError as error:
        raise exceptions.InvalidArgumentValueError(
            "Failure to find tabular resource in left dataset"
        ) from error
    try:
        right_resource_id, right_df = d3m_base_utils.get_tabular_resource(right, None)
    except ValueError as error:
        raise exceptions.InvalidArgumentValueError(
            "Failure to find tabular resource in right dataset"
        ) from error

    accuracy = self.hyperparams["accuracy"]
    absolute_accuracy = self.hyperparams["absolute_accuracy"]

    # hyperparams may be parsed as tuples, and floats could arrive as integers if a
    # round number is passed in
    if isinstance(accuracy, collections.abc.Iterable):
        accuracy = [float(a) for a in accuracy]
    else:
        accuracy = float(accuracy)
    if isinstance(absolute_accuracy, collections.abc.Iterable):
        absolute_accuracy = list(absolute_accuracy)

    if type(accuracy) == float and not type(absolute_accuracy) == bool:
        raise exceptions.InvalidArgumentValueError(
            "only 1 value of accuracy provided, but multiple values for absolute accuracy provided"
        )
    if (not type(accuracy) == float) and type(absolute_accuracy) == bool:
        raise exceptions.InvalidArgumentValueError(
            "only 1 value for absolute accuracy provided, but multiple values of accuracy provided"
        )
    if type(accuracy) == float and not absolute_accuracy:
        if accuracy <= 0.0 or accuracy > 1.0:
            raise exceptions.InvalidArgumentValueError(
                "accuracy of " + str(accuracy) + " is out of range"
            )
    elif type(accuracy) == list and type(absolute_accuracy) == list:
        if not len(accuracy) == len(absolute_accuracy):
            raise exceptions.InvalidArgumentValueError(
                "the count of accuracy hyperparams does not match the count of absolute_accuracy hyperparams"
            )
        for i in range(len(accuracy)):
            if (accuracy[i] <= 0.0 or accuracy[i] > 1.0) and not absolute_accuracy[i]:
                raise exceptions.InvalidArgumentValueError(
                    "accuracy of " + str(accuracy[i]) + " is out of range"
                )

    left_col = self.hyperparams["left_col"]
    right_col = self.hyperparams["right_col"]

    if type(left_col) != type(right_col) or (
        type(left_col) == list
        and len(left_col) != len(right_col)
        and type(accuracy) != list
        and len(accuracy) != len(left_col)
    ):
        raise exceptions.InvalidArgumentTypeError(
            "both left_col and right_col need to have the same data type and, "
            "if they are lists, the same list lengths"
        )
    if type(left_col) == str:
        left_col = [left_col]
        right_col = [right_col]
        accuracy = [accuracy]
        absolute_accuracy = [absolute_accuracy]

    join_types = [
        self._get_join_semantic_type(
            left,
            left_resource_id,
            left_col[i],
            right,
            right_resource_id,
            right_col[i],
        )
        for i in range(len(left_col))
    ]

    num_splits = 32
    joined_split = [None for i in range(num_splits)]
    left_df_split = np.array_split(left_df, num_splits)
    jobs = [
        delayed(self._produce_threaded)(
            index=i,
            left_df_full=left_df,
            left_dfs=left_df_split,
            right_df=right_df,
            join_types=join_types,
            left_col=left_col,
            right_col=right_col,
            accuracy=accuracy,
            absolute_accuracy=absolute_accuracy,
        )
        for i in range(num_splits)
    ]
    joined_data = Parallel(n_jobs=self.hyperparams["n_jobs"],
                           backend="loky",
                           verbose=10)(jobs)

    # joined data needs to maintain order to mimic non-split joining
    for i, d in joined_data:
        joined_split[i] = d
    joined = pd.concat(joined_split, ignore_index=True)

    # create a new dataset to hold the joined data
    resource_map = {}
    float_vector_columns = {}
    for resource_id, resource in left.items():  # type: ignore
        if resource_id == left_resource_id:
            for column in joined.columns:
                # need to work around a bug in container.Dataset, it doesn't like vector columns
                if type(joined[column].iloc[0]) == np.ndarray:
                    float_vector_columns[column] = joined[column]
                    joined[column] = np.NAN
            resource_map[resource_id] = joined
        else:
            resource_map[resource_id] = resource

    # Generate metadata for the dataset using only the first row of the resource for speed -
    # metadata generation runs over each cell in the dataframe, but we only care about column
    # level generation. Once that's done, set the actual dataframe value.
    result_dataset = container.Dataset(
        {k: v.head(1) for k, v in resource_map.items()}, generate_metadata=True
    )
    for k, v in resource_map.items():
        result_dataset[k] = v
        result_dataset.metadata = result_dataset.metadata.update(
            (k,), {"dimension": {"length": v.shape[0]}}
        )

    for key in float_vector_columns.keys():
        df = result_dataset[left_resource_id]
        df[key] = float_vector_columns[key]
        float_vec_loc = df.columns.get_loc(key)
        float_vec_col_indices = df.metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/FloatVector",)
        )
        if float_vec_loc not in float_vec_col_indices:
            df.metadata = df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, float_vec_loc),
                "https://metadata.datadrivendiscovery.org/types/FloatVector",
            )

    return base.CallResult(result_dataset)
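# The split / parallel-join / reassemble pattern above, reduced to its core; the
# worker function here is a stand-in for _produce_threaded, and the data is a toy.
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

df = pd.DataFrame({"v": range(100)})
splits = np.array_split(df, 4)

def work(index, part):
    # return the index alongside the result so order can be restored later
    return index, part.assign(v2=part["v"] * 2)

results = Parallel(n_jobs=2)(delayed(work)(i, s) for i, s in enumerate(splits))

# restore the original order before concatenating, as produce() does
ordered = [None] * 4
for i, part in results:
    ordered[i] = part
joined = pd.concat(ordered, ignore_index=True)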
def get_target_columns(dataset: container.Dataset):
    """
    Extracts true targets from the Dataset's entry point, or the only tabular resource.
    It requires that there is only one primary index column, which it makes the first
    column, named ``d3mIndex``. Then true target columns follow.

    We return a regular Pandas DataFrame with column names matching those in the
    metadata. We convert all columns to strings to match what would be loaded from a
    ``predictions.csv`` file. It encodes any float vectors as strings.

    From: d3m/contrib/primitives/compute_scores.py:ComputeScoresPrimitive._get_truth
    """
    main_resource_id, main_resource = base_utils.get_tabular_resource(
        dataset, None, has_hyperparameter=False)

    # We first copy before modifying in-place.
    main_resource = container.DataFrame(main_resource, copy=True)
    main_resource = _encode_columns(main_resource)
    dataframe = _to_dataframe(main_resource)

    indices = list(dataset.metadata.get_index_columns(at=(main_resource_id,)))
    targets = list(
        dataset.metadata.list_columns_with_semantic_types(
            ['https://metadata.datadrivendiscovery.org/types/TrueTarget'],
            at=(main_resource_id,),
        ))

    if not indices:
        raise exceptions.InvalidArgumentValueError("No primary index column.")
    elif len(indices) > 1:
        raise exceptions.InvalidArgumentValueError("More than one primary index column.")
    if not targets:
        raise ValueError("No true target columns.")

    dataframe = dataframe.iloc[:, indices + targets]

    dataframe = dataframe.rename(columns={dataframe.columns[0]: 'd3mIndex'})

    if 'confidence' in dataframe.columns[1:]:
        raise ValueError(
            "True target column cannot be named \"confidence\". It is a reserved name.")
    if 'd3mIndex' in dataframe.columns[1:]:
        raise ValueError(
            "True target column cannot be named \"d3mIndex\". It is a reserved name.")

    if d3m_utils.has_duplicates(dataframe.columns):
        duplicate_names = list(dataframe.columns)
        for name in set(dataframe.columns):
            duplicate_names.remove(name)
        raise exceptions.InvalidArgumentValueError(
            "True target columns have duplicate names: {duplicate_names}".format(
                duplicate_names=sorted(set(duplicate_names)),
            ),
        )

    dataframe = container.DataFrame(dataframe)
    dataframe.metadata = dataframe.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    dataframe.metadata = dataframe.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0), 'http://schema.org/Integer')
    dataframe.metadata = dataframe.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1),
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')

    return dataframe