class MetafeatureExtractor(FeaturizationTransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): """ A primitive which takes a DataFrame and computes metafeatures on the data. Target column is identified by being labeled with 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in 'semantic_types' metadata. Otherwise primitive assumes there is no target column and only metafeatures that do not involve targets are returned. If DataFrame metadata does not include semantic type labels for each column, columns will be classified as CATEGORICAL or NUMERIC according to their dtype: int and float are NUMERIC, all others are CATEGORICAL. Metafeatures are stored in the metadata object of the DataFrame, and the DataFrame itself is returned unchanged """ # This should contain only metadata which cannot be automatically determined from the code. metadata = metadata_base.PrimitiveMetadata({ 'id': '28d12214-8cb0-4ac0-8946-d31fcbcd4142', 'version': __version__, 'name': 'Dataset Metafeature Extraction', 'source': { 'name': 'byu-dml', 'contact': 'mailto:[email protected]', 'uris': ['https://github.com/byu-dml/d3m-primitives'] }, 'installation': [{ 'type': metadata_base.PrimitiveInstallationType.PIP, 'package': 'byudml', 'version': __version__, }], 'location_uris': [ 'https://github.com/byu-dml/d3m-primitives/blob/master/byudml/metafeature_extraction/metafeature_extraction.py' ], 'python_path': __metafeature_path__, 'primitive_family': metadata_base.PrimitiveFamily.METALEARNING, 'algorithm_types': [ metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, metadata_base.PrimitiveAlgorithmType.STATISTICAL_MOMENT_ANALYSIS, metadata_base.PrimitiveAlgorithmType. INFORMATION_THEORETIC_METAFEATURE_EXTRACTION, # metadata_base.PrimitiveAlgorithmType.LANDMARKING_METAFEATURE_EXTRACTION, # TODO # metadata_base.PrimitiveAlgorithmType.MODEL_BASED_METAFEATURE_EXTRACTION, # TODO metadata_base.PrimitiveAlgorithmType. 
STATISTICAL_METAFEATURE_EXTRACTION, ], }) _mapping_file_path = os.path.join( os.path.dirname(os.path.realpath(__file__)), 'metalearn_to_d3m_map.json') def __init__( self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: typing.Dict[str, DockerContainer] = None) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) # prepare the data, target_series, and column_types arguments necessary for metafeature computation def _get_data_for_metafeature_computation(self, metadata, data): column_types = {} target_col_names = [] target_series = None for col_pos, column_name in enumerate(data.columns): column_metadata = metadata.query_column(col_pos) semantic_types = column_metadata.get('semantic_types', tuple()) column_name = column_metadata.get('name', column_name) if not self._remove_redacted_column(data, column_name, semantic_types): self._update_column_type(data, column_name, semantic_types, column_types) self._append_target_column_name(column_name, semantic_types, target_col_names) if INDEX_COLUMN_NAME in data.columns: data.drop(INDEX_COLUMN_NAME, axis=1, inplace=True) del column_types[INDEX_COLUMN_NAME] if len(target_col_names) == 1: target_series = data[target_col_names[0]] data.drop(target_col_names[0], axis=1, inplace=True) elif len(target_col_names) > 1: self.logger.warning( f'\nWARNING: Target dependent metafeatures are not supported for multi-label datasets and will not be computed\n' ) return data, target_series, column_types def _d3m_metafeature_name_to_metalearn_functions(self, d3m_metafeature_name): metalearn_functions = [] mapping = json.load(open(self._mapping_file_path)) for function_name, properties in mapping.items(): metafeature_name = properties['data_metafeatures_path'].split( '.')[0] if metafeature_name == d3m_metafeature_name: metalearn_functions.append(function_name) return metalearn_functions # recursively adds a value to a dictionary given a series of one or more keys def _place_value(self, dictionary, path, value): if len(path) == 0: return value sub_dict = dictionary.get(path[0], {}) dictionary[path[0]] = self._place_value(sub_dict, path[1:], value) return dictionary # parses the mapping file to obtain a list of all the metalearn metafeatures that are classified as inexpensive def _get_inexpensive_subset(self): inexpensive_subset = [] mapping = json.load(open(self._mapping_file_path)) for key, value in mapping.items(): if value['computation_time'] == 'inexpensive': d3m_metafeature_name = value['data_metafeatures_path'].split( '.')[0] if d3m_metafeature_name not in inexpensive_subset: inexpensive_subset.append(d3m_metafeature_name) return inexpensive_subset # returns the user's desired metafeature set according to hyperparam object def _get_metafeatures_to_compute(self): if self.hyperparams['metafeature_subset'] == 'CUSTOM': return self.hyperparams['metafeatures_to_compute'] elif self.hyperparams['metafeature_subset'] == 'INEXPENSIVE': return self._get_inexpensive_subset() elif self.hyperparams['metafeature_subset'] == 'ALL': # Just get every metafeature name in the mapping mapping = json.load(open(self._mapping_file_path)) return [ mf_obj['data_metafeatures_path'].split('.')[0] for mf_obj in mapping.values() ] def _get_landmarking_metafeatures(self): landmarking_mfs = [] mapping = json.load(open(self._mapping_file_path)) for key, value in mapping.items(): if 'landmarking' in value: if value['landmarking'] == True: landmarking_mfs.append(key) return landmarking_mfs # set the 'primitive' and 
'random_seed' fields for metafeatures who's results could vary depending on implementation def _set_implementation_fields(self, data_metafeatures, data_metafeatures_path): landmarking_name = data_metafeatures_path[0] if landmarking_name not in data_metafeatures: primitive_field_path = [landmarking_name, 'primitive'] random_seed_field_path = [landmarking_name, 'random_seed'] primitive_field_val = { 'id': self.metadata.query()['id'], 'version': __version__, 'python_path': self.metadata.query()['python_path'], 'name': self.metadata.query()['name'] } if 'digest' in self.metadata.query(): primitive_field_val['digest'] = self.metadata.query()['digest'] random_seed_field_val = self.random_seed data_metafeatures = self._place_value(data_metafeatures, primitive_field_path, primitive_field_val) data_metafeatures = self._place_value(data_metafeatures, random_seed_field_path, random_seed_field_val) return data_metafeatures # populate metadata with metafeatures and return it def _populate_metadata( self, metafeatures, metadata, ): dataframe_metadata = dict(metadata.query((), )) data_metafeatures = dataframe_metadata.get('data_metafeatures', {}) mapping = json.load(open(self._mapping_file_path)) for column_name in metafeatures.columns: if column_name[-4:] != 'Time': data_metafeatures_path = mapping[column_name][ 'data_metafeatures_path'].split('.') metafeature_val = metafeatures[column_name][0] if pd.notna(metafeature_val) and metafeature_val not in ( mf_consts.TIMEOUT, mf_consts.NO_TARGETS, mf_consts.NUMERIC_TARGETS): if column_name in self._get_landmarking_metafeatures(): data_metafeatures = self._set_implementation_fields( data_metafeatures, data_metafeatures_path) if mapping[column_name]['required_type'] == 'integer': metafeature_val = int(metafeature_val) data_metafeatures = self._place_value( data_metafeatures, data_metafeatures_path, metafeature_val) dataframe_metadata['data_metafeatures'] = data_metafeatures if 'schema' not in dataframe_metadata: dataframe_metadata[ 'schema'] = 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json' if 'structural_type' not in dataframe_metadata: dataframe_metadata['structural_type'] = DataFrame metadata = metadata.update((), dataframe_metadata) return metadata # given a d3m DataFrame, return it with the computed metafeatures (specified by the hyperparam) added to it's metadata def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: if not isinstance(inputs, DataFrame): raise ValueError( 'inputs must be an instance of \'d3m.container.pandas.DataFrame\'' ) metadata = self._produce(inputs.metadata, copy.copy(inputs)) inputs.metadata = metadata.generate(inputs) return CallResult(inputs) # add the column types to the column_types dict and convert the column to the appropriate data types if necessary def _update_column_type(self, data, column_name, semantic_types, column_types): if 'http://schema.org/Float' in semantic_types \ or 'http://schema.org/Integer' in semantic_types and 'https://metadata.datadrivendiscovery.org/types/CategoricalData' not in semantic_types: column_types[column_name] = mf_consts.NUMERIC actual_type = str(data[column_name].dtype) if 'int' not in actual_type and 'float' not in actual_type: data[column_name] = pd.to_numeric(data[column_name]) else: column_types[column_name] = mf_consts.CATEGORICAL # remove redacted column from data by checking if it has one of the redacted semantic types def _remove_redacted_column(self, data, column_name, semantic_types): if 
'https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData' in semantic_types \ or 'https://metadata.datadrivendiscovery.org/types/RedactedTarget' in semantic_types: data.drop(column_name, axis=1, inplace=True) return True return False # check if a column is a target and if so add it to the target_col_names list def _append_target_column_name(self, column_name, semantic_types, target_col_names): if 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in semantic_types: target_col_names.append(column_name) def _produce(self, metadata, data): # get data related inputs for the metafeature computation data, target_series, column_types = self._get_data_for_metafeature_computation( metadata, data) # translate d3m metafeature names to metalearn names d3m_metafeatures_to_compute = self._get_metafeatures_to_compute() if d3m_metafeatures_to_compute is not None: metalearn_metafeatures_to_compute = [] for mf in d3m_metafeatures_to_compute: metalearn_functions = self._d3m_metafeature_name_to_metalearn_functions( mf) metalearn_metafeatures_to_compute.extend(metalearn_functions) else: metalearn_metafeatures_to_compute = None # compute metafeatures and return in metadata metafeatures = Metafeatures().compute( data, target_series, column_types=column_types, metafeature_ids=metalearn_metafeatures_to_compute, seed=self.random_seed) metafeature_df = pd.DataFrame.from_dict([{ mf: metafeatures[mf][mf_consts.VALUE_KEY] for mf in metafeatures }]) metadata = self._populate_metadata(metafeature_df, metadata) return metadata
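# Illustrative usage sketch (not part of the original module): shows how the
# MetafeatureExtractor above could be driven on its own. It assumes the
# module-level Hyperparams class from the type parameters and a
# d3m.container.pandas.DataFrame input whose column metadata is already populated.
def _example_extract_metafeatures(df):
    primitive = MetafeatureExtractor(hyperparams=Hyperparams.defaults())
    result = primitive.produce(inputs=df).value
    # produce() returns the input DataFrame with metafeatures written into its
    # top-level metadata under the 'data_metafeatures' key
    return result.metadata.query(()).get('data_metafeatures', {})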
class RandomSamplingImputer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): """ This imputes missing values in a DataFrame by sampling known values from each column independently. If the training data has no known values in a particular column, no values are imputed. Alternatively, columns with missing values can be dropped. By default columns of all missing values are dropped. """ metadata = metadata_base.PrimitiveMetadata({ 'id': 'ebfeb6f0-e366-4082-b1a7-602fd50acc96', 'version': __version__, 'name': 'Random Sampling Imputer', 'source': { 'name': 'byu-dml', 'contact': 'mailto:[email protected]', 'uris': [ 'https://github.com/byu-dml/d3m-primitives', ] }, 'installation': [{ 'type': metadata_base.PrimitiveInstallationType.PIP, 'package': 'byudml', 'version': __version__, }], 'location_uris': [ 'https://github.com/byu-dml/d3m-primitives/blob/master/byudml/imputer/random_sampling_imputer.py', ], 'python_path': __imputer_path__, 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, 'algorithm_types': [ metadata_base.PrimitiveAlgorithmType.IMPUTATION, ], 'effects': [ # not the case if empty columns are just ignored metadata_base.PrimitiveEffect.NO_MISSING_VALUES, ] }) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed) self._random_state = np.random.RandomState(self.random_seed) self._fitted: bool = False self._training_inputs: Inputs = None self._known_values = None self._drop_cols = None self._drop_col_indices = None def set_training_data(self, *, inputs: Inputs) -> None: self._fitted = False self._training_inputs = inputs self._known_values = [] self._drop_cols = [] self._drop_col_indices = [] def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: if self._fitted: return CallResult(None) if self._training_inputs is None: raise d3m_exceptions.MissingValueError( 'set_training_data must be called before fit') # operate on columns by index, not name for i, (col_name, col) in enumerate(self._training_inputs.iteritems()): drop_col = False if self.hyperparams['drop_missing_values']: if self.hyperparams['how'] == 'all' and col.isnull().all(): drop_col = True elif self.hyperparams['how'] == 'any' and col.isnull().any(): drop_col = True self._drop_cols.append(drop_col) if drop_col: self._drop_col_indices.append(i) col_known_values = None if not drop_col: col_known_values = col.dropna(axis=0, how='any').tolist() self._known_values.append(col_known_values) self._fitted = True self._training_inputs = None # free memory return CallResult(None) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: if not self._fitted: raise d3m_exceptions.PrimitiveNotFittedError( 'fit must be called before produce') if inputs.shape[1] != len(self._known_values): raise d3m_exceptions.DimensionalityMismatchError( 'The number of input columns does not match the training data: {} != {}' .format(inputs.shape[1], len(self._known_values))) outputs = inputs.copy() for i, (col_name, col) in enumerate(inputs.iteritems()): if self._drop_cols[i]: assert self._known_values[i] is None else: indices_of_missing_values = col.isnull() n_missing = indices_of_missing_values.sum() n_known = len(self._known_values[i]) if n_missing > 0 and n_known > 0: # k_known == 0 implies drop_missing_values == False outputs.loc[indices_of_missing_values, col_name] = self._random_state.choice( self._known_values[i], n_missing, replace=True) # 
TODO: update column metadata? outputs = outputs.remove_columns(self._drop_col_indices) # TODO: update global metadata if any values were imputed? return CallResult(outputs) def get_params(self) -> Params: if not self._fitted: raise d3m_exceptions.PrimitiveNotFittedError( 'fit must be called before get_params') return Params(known_values=self._known_values, drop_cols=self._drop_cols, drop_col_indices=self._drop_col_indices) def set_params(self, *, params: Params) -> None: self._fitted = True self._training_inputs = None self._known_values = params['known_values'] self._drop_cols = params['drop_cols'] self._drop_col_indices = params['drop_col_indices']
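# Standalone sketch (not part of the original module) of the sampling strategy the
# imputer above implements, stripped of the d3m wrappers: remember the observed values
# of each column at fit time, then fill missing cells by sampling from them.
# Plain pandas/numpy only; the helper names are hypothetical.
import numpy as np
import pandas as pd

def _fit_known_values(train: pd.DataFrame) -> dict:
    # observed (non-null) values per column
    return {col: train[col].dropna().tolist() for col in train.columns}

def _impute_by_sampling(df: pd.DataFrame, known: dict, seed: int = 0) -> pd.DataFrame:
    rng = np.random.RandomState(seed)
    out = df.copy()
    for col in out.columns:
        missing = out[col].isnull()
        if missing.any() and known.get(col):
            out.loc[missing, col] = rng.choice(known[col], missing.sum(), replace=True)
    return out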
class TimeSeriesFormatterPrimitive( transformer.TransformerPrimitiveBase[container.Dataset, container.Dataset, Hyperparams]): """ Reads the time series files from a given column in an input dataset resource into a new M x N data resource, where each value in timeseries occupies one of M rows. Each row has N columns, representing the union of the fields found in the timeseries files and in the main data resource. The loading process assumes that each series file has an identical set of timestamps. The `GroupingKey` semantic type will be added to the column that contains the file names, and the time column will be marked with the `Time` semantic type. Example output:: filename | time | value | label | ------------------------------------------------- f1.csv | 0 | 0.1 | alpha | f1.csv | 1 | 0.12 | alpha | f1.csv | 2 | 0.13 | alpha | f2.csv | 0 | 0.72 | bravo | f2.csv | 1 | 0.77 | bravo | f2.csv | 2 | 0.67 | bravo | """ _semantic_types = ( "https://metadata.datadrivendiscovery.org/types/FileName", "https://metadata.datadrivendiscovery.org/types/Timeseries", "http://schema.org/Text", "https://metadata.datadrivendiscovery.org/types/Attribute", ) _media_types = ("text/csv", ) _resource_id = "learningData" __author__ = ("Uncharted Software", ) metadata = metadata_base.PrimitiveMetadata({ "id": "6a1ce3ee-ee70-428b-b1ff-0490bdb23023", "version": version.__version__, "name": "Time series formatter", "python_path": "d3m.primitives.data_transformation.time_series_formatter.DistilTimeSeriesFormatter", "keywords": ["series", "reader", "csv"], "source": { "name": "Distil", "contact": "mailto:[email protected]", "uris": [ "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/time_series_formatter.py", "https://gitlab.com/uncharted-distil/distil-primitives", ], }, "installation": [ CYTHON_DEP, { "type": metadata_base.PrimitiveInstallationType.PIP, "package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives" .format(git_commit=d3m_utils.current_git_commit( os.path.dirname(__file__)), ), }, ], "algorithm_types": [ metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION, ], "supported_media_types": _media_types, "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, }) def produce(self, *, inputs: container.Dataset, timeout: float = None, iterations: int = None) -> base.CallResult[container.Dataset]: # if this is a single resource dataset we don't need to reformat it if len(inputs) < 2: return base.CallResult(inputs) # find the main resource if supplied, infer if not main_resource_id, main_resource = base_utils.get_tabular_resource( inputs, self.hyperparams["main_resource_id"]) if main_resource_id is None: raise exceptions.InvalidArgumentValueError( "no main resource specified") # find the csv file column resource if supplied, infer if not file_index = self.hyperparams["file_col_index"] if file_index is not None: if not self._is_csv_file_column(inputs.metadata, main_resource_id, file_index): raise exceptions.InvalidArgumentValueError( "column idx=" + str(file_index) + " from does not contain csv file names") else: file_index = self._find_csv_file_column(inputs.metadata, main_resource_id) if file_index is None: raise exceptions.InvalidArgumentValueError( "no column from contains csv file names") # generate the long form timeseries data base_path = self._get_base_path(inputs.metadata, main_resource_id, file_index) csv_paths = [ os.path.join(base_path, local_path) for local_path in inputs[main_resource_id].iloc[:, 
file_index] ] new_dfs = [pd.read_csv(path) for path in csv_paths] original_dfs = [ pd.DataFrame( np.tile(row, (df.shape[0], 1)), columns=inputs[main_resource_id].columns, index=df.index, ) for row, df in zip(inputs[main_resource_id].values, new_dfs) ] combined_dfs = [ original_df.join(new_df) for original_df, new_df in zip(original_dfs, new_dfs) ] output_data = pd.concat(combined_dfs) timeseries_dataframe = container.DataFrame(output_data) timeseries_dataframe.reset_index(drop=True, inplace=True) # make sure that all timeseries have the same length, most downstream tasks will appreciate this. if self.hyperparams["equal_length"]: min_length = (timeseries_dataframe.groupby( timeseries_dataframe.columns[file_index]).count().min(). values[0]) group_count = timeseries_dataframe.groupby( timeseries_dataframe.columns[file_index]).cumcount() timeseries_dataframe = timeseries_dataframe.assign( group_count=group_count) timeseries_dataframe = timeseries_dataframe[ timeseries_dataframe["group_count"] < min_length] timeseries_dataframe = timeseries_dataframe.drop(["group_count"], axis=1) # create a dataset to hold the result timeseries_dataset = container.Dataset( {self._resource_id: timeseries_dataframe}, generate_metadata=True) timeseries_dataset.metadata = timeseries_dataset.metadata.update( (), {"id": inputs.metadata.query(())["id"]}) timeseries_dataset.metadata = timeseries_dataset.metadata.update( (), {"digest": inputs.metadata.query(())["digest"]}) # copy main resource column metadata to timeseries dataframe num_main_resource_cols = inputs.metadata.query( (main_resource_id, metadata_base.ALL_ELEMENTS))["dimension"]["length"] for i in range(num_main_resource_cols): source = inputs.metadata.query( (main_resource_id, metadata_base.ALL_ELEMENTS, i)) timeseries_dataset.metadata = timeseries_dataset.metadata.update_column( i, source, at=(self._resource_id, )) # remove the foreign key entry from the filename column if it exists metadata = dict( timeseries_dataset.metadata.query( (self._resource_id, metadata_base.ALL_ELEMENTS, file_index))) metadata["foreign_key"] = metadata_base.NO_VALUE timeseries_dataset.metadata = timeseries_dataset.metadata.update( (self._resource_id, metadata_base.ALL_ELEMENTS, file_index), metadata) # copy timeseries column metadata to timeseries if its available in the metadata (which is not necssarily true anymore) source = self._find_timeseries_metadata(inputs) i = 0 start_idx = 0 if source is not None: for col_info in source["file_columns"]: timeseries_dataset.metadata = timeseries_dataset.metadata.update_column( i + num_main_resource_cols, col_info, at=(self._resource_id, )) i += 1 # flag all other columns as attributes start_idx = i + num_main_resource_cols else: # loop over the appended time series columns start_idx = original_dfs[0].shape[1] for i in range(start_idx, timeseries_dataframe.shape[1]): timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type( (self._resource_id, metadata_base.ALL_ELEMENTS, i), "https://metadata.datadrivendiscovery.org/types/Attribute", ) struct_type = timeseries_dataset.metadata.query( (self._resource_id, metadata_base.ALL_ELEMENTS, i))["structural_type"] if struct_type == np.float64: timeseries_dataset.metadata = ( timeseries_dataset.metadata.add_semantic_type( (self._resource_id, metadata_base.ALL_ELEMENTS, i), "http://schema.org/Float", )) elif struct_type == np.int64: timeseries_dataset.metadata = ( timeseries_dataset.metadata.add_semantic_type( (self._resource_id, metadata_base.ALL_ELEMENTS, i), 
"http://schema.org/Integer", )) else: timeseries_dataset.metadata = ( timeseries_dataset.metadata.add_semantic_type( (self._resource_id, metadata_base.ALL_ELEMENTS, i), "http://schema.org/Text", )) # mark the filename column as a grouping key timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type( (self._resource_id, metadata_base.ALL_ELEMENTS, file_index), "https://metadata.datadrivendiscovery.org/types/GroupingKey", ) # mark the d3mIndex as a primary multi-key since there are now multiple instances of the value present primary_index_col = ( timeseries_dataset.metadata.list_columns_with_semantic_types( ("https://metadata.datadrivendiscovery.org/types/PrimaryKey", ), at=(self._resource_id, ), )) timeseries_dataset.metadata = timeseries_dataset.metadata.remove_semantic_type( (self._resource_id, metadata_base.ALL_ELEMENTS, primary_index_col[0]), "https://metadata.datadrivendiscovery.org/types/PrimaryKey", ) timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type( (self._resource_id, metadata_base.ALL_ELEMENTS, primary_index_col[0]), "https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey", ) return base.CallResult(timeseries_dataset) @classmethod def _find_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata, res_id: str) -> typing.Optional[int]: indices = inputs_metadata.list_columns_with_semantic_types( cls._semantic_types, at=(res_id, )) for i in indices: if cls._is_csv_file_column(inputs_metadata, res_id, i): return i return None @classmethod def _is_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata, res_id: str, column_index: int) -> bool: # check to see if a given column is a file pointer that points to a csv file column_metadata = inputs_metadata.query( (res_id, metadata_base.ALL_ELEMENTS, column_index)) if not column_metadata or column_metadata["structural_type"] != str: return False # check if a foreign key exists if "foreign_key" not in column_metadata: return False ref_col_index = column_metadata["foreign_key"]["column_index"] ref_res_id = column_metadata["foreign_key"]["resource_id"] return cls._is_csv_file_reference(inputs_metadata, ref_res_id, ref_col_index) @classmethod def _is_csv_file_reference(cls, inputs_metadata: metadata_base.DataMetadata, res_id: int, column_index: int) -> bool: # check to see if the column is a csv resource column_metadata = inputs_metadata.query( (res_id, metadata_base.ALL_ELEMENTS, column_index)) if not column_metadata or column_metadata["structural_type"] != str: return False semantic_types = column_metadata.get("semantic_types", []) media_types = column_metadata.get("media_types", []) semantic_types_set = set(semantic_types) _semantic_types_set = set(cls._semantic_types) return bool( semantic_types_set.intersection(_semantic_types_set)) and set( cls._media_types).issubset(media_types) @classmethod def _find_timeseries_metadata( cls, dataset: container.Dataset ) -> typing.Optional[metadata_base.DataMetadata]: # loop over the dataset to find the resource that contains the timeseries file col info for resource_id, resource in dataset.items(): metadata = dataset.metadata.query((resource_id, "ALL_ELEMENTS", 0)) if "file_columns" in metadata: return metadata return None def _get_base_path( self, inputs_metadata: metadata_base.DataMetadata, res_id: str, column_index: int, ) -> str: # get the base uri from the referenced column column_metadata = inputs_metadata.query( (res_id, metadata_base.ALL_ELEMENTS, column_index)) ref_col_index = column_metadata["foreign_key"]["column_index"] 
ref_res_id = column_metadata["foreign_key"]["resource_id"] return inputs_metadata.query((ref_res_id, metadata_base.ALL_ELEMENTS, ref_col_index))["location_base_uris"][0] def _get_ref_resource( self, inputs_metadata: metadata_base.DataMetadata, res_id: str, column_index: int, ) -> str: # get the referenced resource from the referenced column column_metadata = inputs_metadata.query( (res_id, metadata_base.ALL_ELEMENTS, column_index)) ref_res_id = column_metadata["foreign_key"]["resource_id"] return ref_res_id
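# Standalone sketch (not part of the original module) of the long-form expansion that
# produce() above performs: each referenced CSV is read, the corresponding main-resource
# row is tiled across its rows, and the pieces are concatenated. Plain pandas only;
# the 'file_col' / 'base_path' arguments stand in for the metadata lookups, and it
# assumes the file columns do not share names with the main-resource columns.
import os
import numpy as np
import pandas as pd

def _to_long_form(main_df: pd.DataFrame, file_col: str, base_path: str = "") -> pd.DataFrame:
    pieces = []
    for _, row in main_df.iterrows():
        series = pd.read_csv(os.path.join(base_path, row[file_col]))   # one file per series
        repeated = pd.DataFrame(np.tile(row.values, (len(series), 1)),
                                columns=main_df.columns, index=series.index)
        pieces.append(repeated.join(series))   # union of main-resource and file columns
    return pd.concat(pieces, ignore_index=True)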
class ColumnParserPrimitive( transformer.TransformerPrimitiveBase[ container.DataFrame, container.DataFrame, Hyperparams ] ): """ A primitive which parses columns and sets the appropriate dtypes according to it's respective metadata. """ metadata = metadata_base.PrimitiveMetadata( { "id": "e8e78214-9770-4c26-9eae-a45bd0ede91a", "version": version.__version__, "name": "Column Parser", "python_path": "d3m.primitives.data_transformation.column_parser.DistilColumnParser", "source": { "name": "Distil", "contact": "mailto:[email protected]", "uris": [ "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/column_parser.py", "https://gitlab.com/uncharted-distil/distil-primitives", ], }, "installation": [ CYTHON_DEP, { "type": metadata_base.PrimitiveInstallationType.PIP, "package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives".format( git_commit=d3m_utils.current_git_commit( os.path.dirname(__file__) ), ), }, ], "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION], "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, } ) def produce( self, *, inputs: container.DataFrame, timeout: float = None, iterations: int = None, ) -> base.CallResult[container.DataFrame]: start = time.time() logger.debug(f"Producing {__name__}") cols = self._get_columns(inputs.metadata) # outputs = container.DataFrame(generate_metadata=False) outputs = [None] * inputs.shape[1] parsing_semantics = self.hyperparams["parsing_semantics"] def fromstring(x: str) -> np.ndarray: # if column isn't a string, we'll just pass it through assuming it doesn't need to be parsed if type(x) is not str: return x return np.fromstring(x, dtype=float, sep=",") for col_index in range(len(inputs.columns)): if col_index in cols: column_metadata = inputs.metadata.query( (metadata_base.ALL_ELEMENTS, col_index) ) semantic_types = column_metadata.get("semantic_types", []) desired_semantics = set(semantic_types).intersection(parsing_semantics) if desired_semantics: if ( "https://metadata.datadrivendiscovery.org/types/FloatVector" in desired_semantics ): outputs[col_index] = inputs.iloc[:, col_index].apply( fromstring, convert_dtype=False ) if outputs[col_index].shape[0] > 0: inputs.metadata = inputs.metadata.update_column( col_index, {"structural_type": type(outputs[col_index][0])}, ) elif "http://schema.org/DateTime" in desired_semantics: outputs[col_index] = inputs.iloc[:, col_index].apply( utils.parse_datetime_to_float, fuzzy=self.hyperparams["fuzzy_time_parsing"], convert_dtype=False, ) inputs.metadata = inputs.metadata.update_column( col_index, {"structural_type": float} ) elif ( "https://metadata.datadrivendiscovery.org/types/CategoricalData" in desired_semantics ): # need to make sure if a categorical type is a numeric string, convert it if inputs[inputs.columns[col_index]][0].isnumeric(): outputs[col_index] = pd.to_numeric( inputs.iloc[:, col_index], errors=self.hyperparams["error_handling"], ) if outputs[col_index].shape[0] > 0: updated_type = type(outputs[col_index][0].item()) inputs.metadata = inputs.metadata.update_column( col_index, {"structural_type": updated_type} ) else: # if it's categorical but not numerical, ensure the string stays outputs[col_index] = inputs.iloc[:, col_index] else: outputs[col_index] = pd.to_numeric( inputs.iloc[:, col_index], errors=self.hyperparams["error_handling"], ) # Update structural type to reflect the results of the to_numeric call. 
We can't rely on the semantic type because # error coersion may result in a type becoming a float due to the presence of NaN. if outputs[col_index].shape[0] > 0: updated_type = type(outputs[col_index][0].item()) inputs.metadata = inputs.metadata.update_column( col_index, {"structural_type": updated_type} ) else: # columns without specified semantics need to be concatenated outputs[col_index] = inputs.iloc[:, col_index] else: # columns not specified still need to be concatenated outputs[col_index] = inputs.iloc[:, col_index] outputs = container.DataFrame(pd.concat(outputs, axis=1)) outputs.metadata = inputs.metadata end = time.time() logger.debug(f"Produce {__name__} completed in {end - start} ms") return base.CallResult(outputs) def _get_columns( self, inputs_metadata: metadata_base.DataMetadata ) -> typing.List[int]: def can_use_column(column_index: int) -> bool: return True columns_to_use, columns_not_to_use = base_utils.get_columns_to_use( inputs_metadata, self.hyperparams["use_columns"], self.hyperparams["exclude_columns"], can_use_column, ) if self.hyperparams["use_columns"] and columns_not_to_use: self.logger.warning( "Not all specified columns can parsed. Skipping columns: %(columns)s", { "columns": columns_not_to_use, }, ) return columns_to_use
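# Minimal sketch (not part of the original module) of the per-semantic-type dispatch that
# produce() above applies column by column. Plain pandas only; the datetime branch
# approximates the primitive's utils.parse_datetime_to_float with epoch seconds, and the
# helper names are hypothetical.
import numpy as np
import pandas as pd

def _parse_float_vector(cell):
    # "1.0,2.0,3.0" -> array([1., 2., 3.]); anything already parsed passes through
    return np.array([float(v) for v in cell.split(",")]) if isinstance(cell, str) else cell

def _parse_column(col: pd.Series, semantic_types: set) -> pd.Series:
    if "https://metadata.datadrivendiscovery.org/types/FloatVector" in semantic_types:
        return col.apply(_parse_float_vector)
    if "http://schema.org/DateTime" in semantic_types:
        return (pd.to_datetime(col) - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)
    # numeric semantics (and numeric-looking categoricals) fall through to to_numeric
    return pd.to_numeric(col, errors="coerce")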
class duke(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): metadata = metadata_base.PrimitiveMetadata({ # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()". 'id': "46612a42-6120-3559-9db9-3aa9a76eb94f", 'version': __version__, 'name': "duke", # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable. 'keywords': ['Dataset Descriptor', 'Text', 'NLP', 'Abstractive Summarization'], 'source': { 'name': __author__, 'uris': [ # Unstructured URIs. "https://github.com/NewKnowledge/duke-d3m-wrapper", ], }, # A list of dependencies in order. These can be Python packages, system packages, or Docker images. # Of course Python packages can also have their own dependencies, but sometimes it is necessary to # install a Python package first to be even able to run setup.py of another package. Or you have # a dependency which is not on PyPi. 'installation': [ { 'type': metadata_base.PrimitiveInstallationType.PIP, 'package_uri': 'git+https://github.com/NewKnowledge/duke-d3m-wrapper.git@{git_commit}#egg=DukeD3MWrapper' .format(git_commit=utils.current_git_commit( os.path.dirname(__file__)), ), }, { "type": "TGZ", "key": "en.model", "file_uri": "http://public.datadrivendiscovery.org/en_1000_no_stem.tar.gz", "file_digest": "3b1238137bba14222ae7c718f535c68a3d7190f244296108c895f1abe8549861" }, ], # The same path the primitive is registered with entry points in setup.py. 'python_path': 'd3m.primitives.distil.duke', # Choose these from a controlled vocabulary in the schema. If anything is missing which would # best describe the primitive, make a merge request. 'algorithm_types': [ metadata_base.PrimitiveAlgorithmType.RECURRENT_NEURAL_NETWORK, ], 'primitive_family': metadata_base.PrimitiveFamily.DATA_CLEANING, }) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, volumes: typing.Dict[str, str] = None) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed, volumes=volumes) self.volumes = volumes def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Produce a summary for the tabular dataset input Parameters ---------- inputs : Input pandas frame Returns ------- Outputs The output is a string summary """ """ Accept a pandas data frame, returns a string summary frame: a pandas data frame containing the data to be processed -> a string summary """ # sub-sample percentage of records from data frame if not self.hyperparams: self.hyperparams['records'] = 1 records = self.hyperparams['records'] frame = inputs.sample(frac=records) # cast frame data type back to original, if numeric, to ensure # that duke can drop them, and not skew results (since d3m # preprocessing prims turn everything into str/object) tmp = frame for i in range(frame.shape[1]): if (frame.metadata.query_column(i)['semantic_types'][0] == 'http://schema.org/Integer'): tmp.ix[:, frame.columns[i]].replace('', 0, inplace=True) tmp[frame.columns[i]] = pandas.to_numeric( tmp[frame.columns[i]], errors='coerce') # converting a string value like '32.0' to an int directly results in an error, so we first # convert everything to a float tmp = tmp.astype({frame.columns[i]: float}) tmp = tmp.astype({frame.columns[i]: int}) elif (frame.metadata.query_column(i)['semantic_types'][0] == 'http://schema.org/Float'): tmp.ix[:, frame.columns[i]].replace('', 0, inplace=True) tmp[frame.columns[i]] = pandas.to_numeric( tmp[frame.columns[i]], errors='coerce') tmp = tmp.astype({frame.columns[i]: float}) # not 
yet sure if dropping CategoticalData is ideal, but it appears to work... # some categorical data may contain useful information, but the d3m transformation is not reversible # and not aware of a way to distinguish numerical from non-numerical CategoricalData elif (frame.metadata.query_column(i)['semantic_types'][0] == 'https://metadata.datadrivendiscovery.org/types/CategoricalData' ): tmp = tmp.drop(columns=[frame.columns[i]]) # print('beginning summarization... \n') # get the path to the ontology class tree resource_package = "Duke" resource_path = '/'.join( ('ontologies', 'class-tree_dbpedia_2016-10.json')) tree_path = pkg_resources.resource_filename(resource_package, resource_path) embedding_path = self.volumes['en.model'] + "/en_1000_no_stem/en.model" row_agg_func = mean_of_rows tree_agg_func = parent_children_funcs(np.mean, max) source_agg_func = mean_of_rows max_num_samples = 1e6 verbose = True duke = DatasetDescriptor( dataset=tmp, tree=tree_path, embedding=embedding_path, row_agg_func=row_agg_func, tree_agg_func=tree_agg_func, source_agg_func=source_agg_func, max_num_samples=max_num_samples, verbose=verbose, ) print('initialized duke dataset descriptor \n') N = 5 out_tuple = duke.get_top_n_words(N) print('finished summarization \n') out_df_duke = pandas.DataFrame.from_records(list(out_tuple)).T out_df_duke.columns = ['subject tags', 'confidences'] # initialize the output dataframe as input dataframe (results will be appended to it) # out_df = d3m_DataFrame(inputs) # create metadata for the duke output dataframe duke_df = d3m_DataFrame(out_df_duke) # first column ('subject tags') col_dict = dict(duke_df.metadata.query( (metadata_base.ALL_ELEMENTS, 0))) col_dict['structural_type'] = type("it is a string") col_dict['name'] = "subject tags" col_dict['semantic_types'] = ( 'http://schema.org/Text', 'https://metadata.datadrivendiscovery.org/types/Attribute') duke_df.metadata = duke_df.metadata.update( (metadata_base.ALL_ELEMENTS, 0), col_dict) # second column ('confidences') col_dict = dict(duke_df.metadata.query( (metadata_base.ALL_ELEMENTS, 1))) col_dict['structural_type'] = type("1.0") col_dict['name'] = "confidences" col_dict['semantic_types'] = ( 'http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute') duke_df.metadata = duke_df.metadata.update( (metadata_base.ALL_ELEMENTS, 1), col_dict) # concatenate final output frame -- not real consensus from program, so commenting out for now #out_df = utils_cp.append_columns(out_df, duke_df) return CallResult(duke_df)
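# Standalone sketch (not part of the original module) of the dtype-restoration step at the
# top of duke.produce: numeric columns that upstream preprocessing turned into strings are
# cast back so the descriptor can drop them. Plain pandas; the semantic_types dict and the
# zero-fill for unparseable values are assumptions, not the original behaviour.
import pandas as pd

def _restore_numeric_dtypes(df: pd.DataFrame, semantic_types: dict) -> pd.DataFrame:
    out = df.copy()
    for col, stype in semantic_types.items():
        if stype in ("http://schema.org/Integer", "http://schema.org/Float"):
            out[col] = pd.to_numeric(out[col].replace("", 0), errors="coerce")
            if stype == "http://schema.org/Integer":
                # cast via float first so strings like "32.0" survive the int cast
                out[col] = out[col].astype(float).fillna(0).astype(int)
    return out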
class ContinuityValidation(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Check whether the series data has a consistent time interval and process it if it does not.

    Parameters
    ----------
    continuity_option: enumeration
        Choose ablation or imputation.
        ablation: delete some rows and increase the timestamp interval to keep the timestamps consistent
        imputation: linearly impute the absent timestamps to keep the timestamps consistent

    interval: float
        Only used in imputation; gives the timestamp interval. 'interval' should be an integral
        multiple of the existing timestamp interval, or the existing timestamp interval should be
        an integral multiple of 'interval'.
    """

    __author__: "DATA Lab at Texas A&M University"

    metadata = metadata_base.PrimitiveMetadata({
        "name": "continuity validation primitive",
        "python_path": "d3m.primitives.tods.data_processing.continuity_validation",
        "source": {
            'name': 'DATA Lab at Texas A&M University',
            'contact': 'mailto:[email protected]',
            'uris': [
                'https://gitlab.com/lhenry15/tods.git',
                'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/ContinuityValidation.py'
            ]
        },
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.CONTINUITY_VALIDATION,
        ],
        "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
        "id": "ef8fb025-d157-476c-8e2e-f8fe56162195",
        "hyperparams_to_tune": ['continuity_option', 'interval'],
        "version": "0.0.1",
    })

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """
        Args:
            inputs: Container DataFrame
            timeout: Default
            iterations: Default

        Returns:
            Container DataFrame with consistent timestamps
        """
        # self.logger.warning('Hi, ContinuityValidation.produce was called!')

        if self.hyperparams['continuity_option'] == 'ablation':
            outputs = self._continuity_ablation(inputs)

        if self.hyperparams['continuity_option'] == 'imputation':
            outputs = self._continuity_imputation(inputs)

        outputs.reset_index(drop=True, inplace=True)
        self._update_metadata(outputs)

        # self._write(outputs)
        return base.CallResult(outputs)

    def _update_metadata(self, outputs):
        outputs.metadata = outputs.metadata.generate(outputs)

    def _continuity_ablation(self, inputs: Inputs):
        ablation_set = self._find_ablation_set(inputs)
        inputs = inputs.loc[inputs['timestamp'].isin(ablation_set)].copy()
        inputs.sort_values("timestamp", inplace=True)
        inputs['d3mIndex'] = list(range(inputs.shape[0]))
        return inputs

    def _find_ablation_set(self, inputs):
        """
        Find the longest subseries of inputs with the minimum timestamp interval.
        """
        # find the minimum and maximum intervals
        min_interval = inputs.iloc[1]['timestamp'] - inputs.iloc[0]['timestamp']
        for i in range(2, inputs.shape[0]):
            curr_interval = inputs.iloc[i]['timestamp'] - inputs.iloc[i - 1]['timestamp']
            if min_interval > curr_interval:
                min_interval = curr_interval

        max_interval = ((inputs.iloc[-1]['timestamp'] - inputs.iloc[0]['timestamp']) +
                        min_interval * (2 - inputs.shape[0]))
        print((inputs.iloc[-1]['timestamp'] - inputs.iloc[0]['timestamp']), inputs.shape[0])

        interval = min_interval
        ablation_set = list()
        origin_set = set(inputs['timestamp'])
        print(min_interval, max_interval)

        # try every candidate interval and starting point; keep the timestamps that fit
        while interval <= max_interval:
            start = 0
            while (inputs.iloc[start]['timestamp'] <= inputs.iloc[0]['timestamp'] + max_interval) and (
                    inputs.iloc[start]['timestamp'] <= inputs.iloc[-1]['timestamp']):
                tmp_list = list()
                tmp = utils.numpy.arange(start=inputs.iloc[start]['timestamp'],
                                         step=interval,
                                         stop=inputs.iloc[-1]['timestamp'])
                for i in tmp:
                    if i in origin_set:
                        tmp_list.append(i)
                    else:
                        break
                ablation_set.append(tmp_list)
                start += 1
            interval += min_interval

        # return the longest candidate
        max_size_index = 0
        for i in range(1, len(ablation_set)):
            if len(ablation_set[i]) > len(ablation_set[max_size_index]):
                max_size_index = i
        return ablation_set[max_size_index]

    def _continuity_imputation(self, inputs: Inputs):
        """
        Linearly impute the missing timestamps and values of inputs.
        """
        interval = self.hyperparams['interval']
        time1 = inputs.iloc[0]['timestamp']

        for i in range(1, inputs.shape[0]):
            time2 = inputs.iloc[i]['timestamp']
            if time2 - time1 != interval:
                # how many rows should be imputed between these two original timestamps
                blank_number = int((time2 - time1) / interval)

                for j in range(1, blank_number):
                    new_row = {
                        'timestamp': [time1 + interval * j],
                        'ground_truth': [int(inputs.iloc[i]['ground_truth'])]
                    }
                    for col in list(inputs.columns.values):
                        if col not in ['d3mIndex', 'timestamp', 'ground_truth']:
                            new_row[col] = [
                                inputs.iloc[i - 1][col] +
                                (inputs.iloc[i][col] - inputs.iloc[i - 1][col]) / blank_number * j
                            ]
                    inputs = inputs.append(utils.pandas.DataFrame(new_row),
                                           ignore_index=True, sort=False)
            time1 = time2

        inputs.sort_values("timestamp", inplace=True)
        inputs['d3mIndex'] = list(range(inputs.shape[0]))
        return inputs

    def _write(self, inputs: Inputs):
        """
        Write inputs to the current directory; only for testing.
        """
        inputs.to_csv(str(time.time()) + '.csv')
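# Pandas-only sketch (not part of the original module) of the 'imputation' branch above:
# put the timestamps on a regular grid and linearly interpolate the value columns. Assumes
# an integer 'timestamp' column and numeric value columns; the names are illustrative.
import pandas as pd

def _impute_missing_timestamps(df: pd.DataFrame, interval: int) -> pd.DataFrame:
    df = df.sort_values("timestamp").set_index("timestamp")
    grid = range(int(df.index.min()), int(df.index.max()) + 1, int(interval))
    df = df.reindex(grid)                   # insert rows for the absent timestamps
    df = df.interpolate(method="linear")    # linear fill for the (numeric) value columns
    df.index.name = "timestamp"
    return df.reset_index()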
class DuplicationValidation(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Check whether the series data contains duplicate rows for a single timestamp, and resolve
    the duplication if it exists.

    Parameters
    ----------
    keep_option: enumeration
        When dropping rows, choose to keep the first one or calculate the average.
    """

    __author__: "DATA Lab at Texas A&M University"

    metadata = metadata_base.PrimitiveMetadata({
        "name": "duplication validation primitive",
        "python_path": "d3m.primitives.tods.data_processing.duplication_validation",
        "source": {
            'name': 'DATA Lab at Texas A&M University',
            'contact': 'mailto:[email protected]',
            'uris': [
                'https://gitlab.com/lhenry15/tods.git',
                'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/DuplicationValidation.py'
            ]
        },
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.DUPLICATION_VALIDATION,
        ],
        "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
        "id": "cf6d8137-73d8-496e-a2e3-49f941ee716d",
        "hyperparams_to_tune": ['keep_option'],
        "version": "0.0.1",
    })

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """
        Args:
            inputs: Container DataFrame
            timeout: Default
            iterations: Default

        Returns:
            Container DataFrame after dropping the duplication
        """
        # self.logger.warning('Hi, DuplicationValidation.produce was called!')

        if self.hyperparams['keep_option'] == 'first':
            outputs = self._timestamp_keep_first(inputs)

        if self.hyperparams['keep_option'] == 'average':
            outputs = self._timestamp_keep_average(inputs)

        self._update_metadata(outputs)

        # self._write(outputs)
        return base.CallResult(outputs)

    def _update_metadata(self, outputs):
        outputs.metadata = outputs.metadata.generate(outputs)

    def _timestamp_keep_first(self, inputs: Inputs):
        return inputs.drop_duplicates(subset=['timestamp'], keep='first')

    def _timestamp_keep_average(self, inputs: Inputs):
        inputs_copy = inputs.copy()
        inputs = inputs.drop_duplicates(subset=['timestamp'], keep='first')
        inputs_copy = inputs_copy.groupby('timestamp').mean().reset_index()

        for col in list(inputs.columns.values):
            if col not in ['d3mIndex', 'timestamp', 'ground_truth']:
                inputs[col] = inputs_copy[col].values

        return inputs

    def _write(self, inputs: Inputs):
        """
        Write inputs to the current directory; only for testing.
        """
        inputs.to_csv(str(time.time()) + '.csv')
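# Pandas-only sketch (not part of the original module) of the two keep_option strategies
# used by the primitive above; the 'timestamp' column name mirrors the primitive's assumption.
import pandas as pd

def _drop_duplicate_timestamps(df: pd.DataFrame, keep_option: str = "first") -> pd.DataFrame:
    if keep_option == "first":
        return df.drop_duplicates(subset=["timestamp"], keep="first")
    # 'average': collapse rows that share a timestamp to the mean of their numeric columns
    return df.groupby("timestamp", as_index=False).mean(numeric_only=True)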
class DeepArPrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): """ Primitive that applies a deep autoregressive forecasting algorithm for time series prediction. The implementation is based off of this paper: https://arxiv.org/pdf/1704.04110.pdf and this implementation: https://gluon-ts.mxnet.io/index.html Training inputs: 1) Feature dataframe, 2) Target dataframe Outputs: Dataframe with predictions for specific time series at specific future time instances Arguments: hyperparams {Hyperparams} -- D3M Hyperparameter object Keyword Arguments: random_seed {int} -- random seed (default: {0}) """ metadata = metadata_base.PrimitiveMetadata( { "id": "3410d709-0a13-4187-a1cb-159dd24b584b", "version": __version__, "name": "DeepAR", "keywords": [ "time series", "forecasting", "recurrent neural network", "autoregressive", ], "source": { "name": __author__, "contact": __contact__, "uris": [ "https://github.com/kungfuai/d3m-primitives", ], }, "installation": [ {"type": "PIP", "package": "cython", "version": "0.29.16"}, { "type": metadata_base.PrimitiveInstallationType.PIP, "package_uri": "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives".format( git_commit=utils.current_git_commit(os.path.dirname(__file__)), ), }, ], "python_path": "d3m.primitives.time_series_forecasting.lstm.DeepAR", "algorithm_types": [ metadata_base.PrimitiveAlgorithmType.RECURRENT_NEURAL_NETWORK, ], "primitive_family": metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING, } ) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed) self._freq = None self._is_fit = False self._all_preds = None def get_params(self) -> Params: return Params( deepar_dataset = self._deepar_dataset, timestamp_column = self._timestamp_column, real_cols = self._real_columns, group_cols = self._grouping_columns, cat_cols = self._cat_columns, output_column = self._output_column, freq = self._freq, reind_freq = self._reind_freq, is_fit = self._is_fit, min_trains = self._min_trains ) def set_params(self, *, params: Params) -> None: self._deepar_dataset = params['deepar_dataset'] self._timestamp_column = params['timestamp_column'] self._real_columns = params['real_cols'] self._grouping_columns = params['group_cols'] self._cat_columns = params['cat_cols'] self._output_column = params['output_column'] self._freq = params['freq'] self._reind_freq = params['reind_freq'] self._is_fit = params['is_fit'] self._min_trains = params['min_trains'] def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: """ Sets primitive's training data Arguments: inputs {Inputs} -- D3M dataframe containing attributes outputs {Outputs} -- D3M dataframe containing targets Raises: ValueError: If multiple columns are annotated with 'Time' or 'DateTime' metadata """ self._output_column = outputs.columns[0] frame = inputs.append_columns(outputs) self._get_cols(frame) self._set_freq(frame) frame, self._min_trains, max_train_length, _ = self._reindex(frame) self._check_window_support(max_train_length) self._deepar_dataset = DeepARDataset( frame, self._grouping_columns, self._cat_columns, self._real_columns, self._timestamp_column, self._target_column, self._freq, self.hyperparams['prediction_length'], self.hyperparams['context_length'], self._target_semantic_types, self.hyperparams['count_data'] ) self._train_data = self._deepar_dataset.get_data() def fit(self, *, timeout: float = None, iterations: int = None) -> 
CallResult[None]: """ Fits DeepAR model using training data from set_training_data and hyperparameters Keyword Arguments: timeout {float} -- timeout, considered (default: {None}) iterations {int} -- iterations, considered (default: {None}) Returns: CallResult[None] """ if iterations is None: iterations = self.hyperparams["epochs"] has_finished = True else: has_finished = False estimator = DeepAREstimator( freq=self._freq, prediction_length=self.hyperparams['prediction_length'], context_length=self.hyperparams['context_length'], use_feat_static_cat=self._deepar_dataset.has_cat_cols() or self._deepar_dataset.has_group_cols(), use_feat_dynamic_real=self._deepar_dataset.has_real_cols(), cardinality=self._deepar_dataset.get_cardinality(), distr_output=self._deepar_dataset.get_distribution_type(), dropout_rate=self.hyperparams['dropout_rate'], trainer=Trainer( epochs=iterations, learning_rate=self.hyperparams['learning_rate'], batch_size=self.hyperparams['training_batch_size'], num_batches_per_epoch=self.hyperparams['steps_per_epoch'] ) ) logger.info(f"Fitting for {iterations} iterations") start_time = time.time() predictor = estimator.train(self._train_data) predictor.batch_size = self.hyperparams['inference_batch_size'] self._is_fit = True logger.info(f"Fit for {iterations} epochs, took {time.time() - start_time}s") if not os.path.isdir(self.hyperparams['weights_dir']): os.mkdir(self.hyperparams['weights_dir']) predictor.serialize(Path(self.hyperparams['weights_dir'])) return CallResult(None, has_finished=has_finished) def produce( self, *, inputs: Inputs, timeout: float = None, iterations: int = None ) -> CallResult[Outputs]: """ Produce primitive's predictions for specific time series at specific future time instances * these specific timesteps / series are specified implicitly by input dataset Arguments: inputs {Inputs} -- D3M dataframe containing attributes Keyword Arguments: timeout {float} -- timeout, not considered (default: {None}) iterations {int} -- iterations, not considered (default: {None}) Raises: PrimitiveNotFittedError: if primitive not fit Returns: CallResult[Outputs] -- (N, 2) dataframe with d3m_index and value for each prediction slice requested. prediction slice = specific horizon idx for specific series in specific regression """ if self._all_preds is None: self._all_preds, self._pred_intervals = self._produce(inputs) point_estimates = np.concatenate( [series[0][idxs] for series, idxs in zip(self._all_preds, self._pred_intervals)] ) result_df = container.DataFrame( {self._output_column: point_estimates}, generate_metadata=True, ) result_df.metadata = result_df.metadata.add_semantic_type( (metadata_base.ALL_ELEMENTS, 0), ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"), ) return CallResult(result_df, has_finished=self._is_fit) def produce_confidence_intervals( self, *, inputs: Inputs, timeout: float = None, iterations: int = None ) -> CallResult[Outputs]: """ produce quantiles for each prediction timestep in dataframe Arguments: inputs {Inputs} -- D3M dataframe containing attributes Keyword Arguments: timeout {float} -- timeout, not considered (default: {None}) iterations {int} -- iterations, considered (default: {None}) Raises: PrimitiveNotFittedError: Returns: CallResult[Outputs] -- Ex. 
0.50 | 0.05 | 0.95 ------------------- 5 | 3 | 7 6 | 4 | 8 5 | 3 | 7 6 | 4 | 8 """ if self._all_preds is None: self._all_preds, self._pred_intervals = self._produce(inputs) all_quantiles = [[] for q in range(len(self.hyperparams['quantiles']) + 1)] for series, idxs in zip(self._all_preds, self._pred_intervals): for i, quantile in enumerate(series): all_quantiles[i].append(quantile[idxs]) all_quantiles = [np.concatenate(quantile) for quantile in all_quantiles] col_names = (0.5,) + self.hyperparams['quantiles'] result_df = container.DataFrame( {col_name: quantile for col_name, quantile in zip(col_names, all_quantiles)}, generate_metadata=True, ) result_df.metadata = result_df.metadata.add_semantic_type( (metadata_base.ALL_ELEMENTS, 0), ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"), ) return CallResult(result_df, has_finished=self._is_fit) def _get_col_names(self, col_idxs, all_col_names): """ transform column indices to column names """ return [all_col_names[i] for i in col_idxs] def _process_special_col(self, col_list, col_type): """ private util function that warns if multiple special columns """ if len(col_list) == 0: return None elif len(col_list) > 1: logger.warn( f"""There are more than one {col_type} marked. This primitive will use the first""" ) return col_list[0] def _sort_by_timestamp(self, frame): """ private util function: convert to pd datetime and sort """ time_name = frame.columns[self._timestamp_column] new_frame = frame.copy() if "http://schema.org/Integer" in frame.metadata.query_column_field( self._timestamp_column, "semantic_types" ): new_frame.iloc[:, self._timestamp_column] = pd.to_datetime( new_frame.iloc[:, self._timestamp_column] - 1, unit = 'D' ) self._freq = 'D' self._reind_freq = 'D' else: new_frame.iloc[:, self._timestamp_column] = pd.to_datetime( new_frame.iloc[:, self._timestamp_column], unit = 's' ) return new_frame.sort_values(by = time_name) def _set_freq(self, frame): """ sets frequency using differences in timestamp column in data frame ASSUMPTION: frequency is the same across all grouped time series """ if len(self._grouping_columns) == 0: if self._freq is None: diff = frame.iloc[1, self._timestamp_column] - frame.iloc[0, self._timestamp_column] self._freq, self._reind_freq = calculate_time_frequency(diff, model = 'gluon') else: if self._freq is None: g_cols = self._get_col_names(self._grouping_columns, frame.columns) for g, df in frame.groupby(g_cols, sort = False): diff = df.iloc[1, self._timestamp_column] - df.iloc[0, self._timestamp_column] break self._freq, self._reind_freq = calculate_time_frequency(diff, model = 'gluon') def _robust_reindex(self, frame): """ reindex dataframe IFF it has > 1 row, interpolate real-valued columns, forward-filling categorical and grouping columns """ frame = self._sort_by_timestamp(frame) original_times = frame.iloc[:, self._timestamp_column] frame = frame.drop_duplicates(subset = frame.columns[self._timestamp_column]) frame.index = frame.iloc[:, self._timestamp_column] if frame.shape[0] > 1: frame = frame.reindex( pd.date_range( frame.index[0], frame.index[-1], freq = self._reind_freq, ) ) frame.iloc[:, self._real_columns] = frame.iloc[:, self._real_columns].interpolate() frame.iloc[:, self._cat_columns + self._grouping_columns] = \ frame.iloc[:, self._cat_columns + self._grouping_columns].ffill() return frame, original_times def _reindex(self, frame): """ reindex data, keeping NA values for target column, but interpolating feature columns """ if len(self._grouping_columns) == 0: df, 
original_times = self._robust_reindex(frame) return df, [df.index[0]], df.shape[0], original_times else: all_dfs, min_trains, original_times = [], {}, OrderedDict() max_train_length = 0 g_cols = self._get_col_names(self._grouping_columns, frame.columns) for grp, df in frame.groupby(g_cols, sort = False): df, orig_times = self._robust_reindex(df) if df.shape[0] > max_train_length: max_train_length = df.shape[0] all_dfs.append(df) min_trains[grp] = df.index[0] original_times[grp] = orig_times return pd.concat(all_dfs), min_trains, max_train_length, original_times def _get_cols(self, frame): """ private util function: get indices of important columns from metadata """ input_metadata = frame.metadata # get target idx (first column by default) target_columns = input_metadata.list_columns_with_semantic_types( ( "https://metadata.datadrivendiscovery.org/types/SuggestedTarget", "https://metadata.datadrivendiscovery.org/types/TrueTarget", "https://metadata.datadrivendiscovery.org/types/Target", ) ) if len(target_columns) == 0: raise ValueError("At least one column must be marked as a target") self._target_column = self._process_special_col( target_columns, "target column" ) # get timestamp idx (first column by default) timestamp_columns = input_metadata.list_columns_with_semantic_types( ( "https://metadata.datadrivendiscovery.org/types/Time", "http://schema.org/DateTime", ) ) self._timestamp_column = self._process_special_col( timestamp_columns, "timestamp column" ) # get grouping idx self._grouping_columns = input_metadata.list_columns_with_semantic_types( ("https://metadata.datadrivendiscovery.org/types/GroupingKey",) ) suggested_group_cols = input_metadata.list_columns_with_semantic_types( ("https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey",) ) if len(self._grouping_columns) == 0: self._grouping_columns = suggested_group_cols def diff(li1, li2): return list(set(li1) - set(li2)) # categorical columns self._cat_columns = input_metadata.list_columns_with_semantic_types( ("https://metadata.datadrivendiscovery.org/types/CategoricalData",) ) self._cat_columns = diff(self._cat_columns, self._grouping_columns + suggested_group_cols) # real valued columns self._real_columns = input_metadata.list_columns_with_semantic_types( ("http://schema.org/Integer", "http://schema.org/Float") ) self._real_columns = diff( self._real_columns, [self._timestamp_column] + [self._target_column] + self._grouping_columns ) # determine whether targets are count data self._target_semantic_types = input_metadata.query_column_field( self._target_column, "semantic_types" ) def _check_window_support(self, max_train_length): """ ensures that at least one series of target series is >= context_length """ if max_train_length < self.hyperparams['prediction_length']: raise ValueError( f"This training set does not support a prediction length of {self.hyperparams['prediction_length']} " + f"because its longest series has length {max_train_length} observations. Please " + f"choose a shorter prediction length." 
) def _get_pred_intervals(self, original_times): """ private util function that retrieves unevenly spaced prediction intervals from data frame """ if len(self._grouping_columns) == 0: intervals = discretize_time_difference( original_times, self._min_trains[0], self._freq, zero_index = True ) all_intervals = [np.array(intervals) + 1] else: all_intervals = [] for grp, times in original_times.items(): if grp in self._min_trains.keys(): intervals = discretize_time_difference( times, self._min_trains[grp], self._freq, zero_index = True ) else: logger.info( f'Series with category {grp} did not exist in training data, ' + f'These predictions will be returned as np.nan.' ) intervals = np.zeros(times.shape[0]).astype(int) all_intervals.append(np.array(intervals) + 1) return all_intervals def _produce(self, inputs: Inputs): """ internal produce method to support produce() and produce_confidence_intervals() methods """ if not self._is_fit: raise PrimitiveNotFittedError("Primitive not fitted.") test_frame = inputs.copy() deepar_forecast = DeepARForecast( self._deepar_dataset, self.hyperparams['weights_dir'], self.hyperparams['output_mean'], self.hyperparams['number_samples'], self.hyperparams['quantiles'] ) test_frame, _, _, original_times = self._reindex(test_frame) pred_intervals = self._get_pred_intervals(original_times) st = time.time() preds = deepar_forecast.predict(test_frame, pred_intervals) logger.info(f'Making predictions took {time.time() - st}s') return preds, pred_intervals
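# --- Illustrative sketch (not part of the primitive) ---------------------
# produce_confidence_intervals() above gathers, for every series, one array
# per quantile level, slices each array at the requested prediction indices,
# and concatenates the results into a single frame whose columns are
# (0.5,) + the requested quantiles. The standalone sketch below mirrors that
# bookkeeping with plain numpy/pandas; the toy arrays and the `quantiles`
# value are assumptions, not values taken from the primitive.
import numpy as np
import pandas as pd

quantiles = (0.05, 0.95)                      # assumed hyperparameter value
# one series with (median, lower, upper) forecasts of length 4
all_preds = [
    [np.array([5., 6., 5., 6.]), np.array([3., 4., 3., 4.]), np.array([7., 8., 7., 8.])],
]
pred_intervals = [np.array([0, 1, 2, 3])]     # indices of requested timestamps

all_quantiles = [[] for _ in range(len(quantiles) + 1)]
for series, idxs in zip(all_preds, pred_intervals):
    for i, quantile in enumerate(series):
        all_quantiles[i].append(quantile[idxs])
all_quantiles = [np.concatenate(q) for q in all_quantiles]

col_names = (0.5,) + quantiles
result_df = pd.DataFrame({name: q for name, q in zip(col_names, all_quantiles)})
print(result_df)   # columns 0.5, 0.05, 0.95 -- the layout shown in the docstring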
class MatrixProfile(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): """ A primitive that performs matrix profile on a DataFrame using Stumpy package Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html Parameters ---------- T_A : ndarray The time series or sequence for which to compute the matrix profile m : int Window size T_B : ndarray The time series or sequence that contain your query subsequences of interest. Default is `None` which corresponds to a self-join. ignore_trivial : bool Set to `True` if this is a self-join. Otherwise, for AB-join, set this to `False`. Default is `True`. Returns ------- out : ndarray The first column consists of the matrix profile, the second column consists of the matrix profile indices, the third column consists of the left matrix profile indices, and the fourth column consists of the right matrix profile indices. """ metadata = metadata_base.PrimitiveMetadata({ '__author__': "DATA Lab @Texas A&M University", 'name': "Matrix Profile", #'python_path': 'd3m.primitives.tods.feature_analysis.matrix_profile', 'python_path': 'd3m.primitives.tods.detection_algorithm.matrix_profile', 'source': {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:[email protected]', 'uris': ['https://gitlab.com/lhenry15/tods/-/blob/Yile/anomaly-primitives/anomaly_primitives/MatrixProfile.py']}, 'algorithm_types': [metadata_base.PrimitiveAlgorithmType.MATRIX_PROFILE,], 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, 'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'MatrixProfilePrimitive')), 'hyperparams_to_tune': ['window_size'], 'version': '0.0.2', }) def __init__(self, *, hyperparams: Hyperparams) -> None: super().__init__(hyperparams=hyperparams) self._clf = MP(window_size = hyperparams['window_size']) self.primitiveNo = PrimitiveCount.primitive_no PrimitiveCount.primitive_no+=1 def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: """ Args: inputs: Container DataFrame timeout: Default iterations: Default Returns: Container DataFrame containing Matrix Profile of selected columns """ # Get cols to fit. 
self._fitted = False self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) self._input_column_names = self._training_inputs.columns if len(self._training_indices) > 0: self._fitted = True else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") if not self._fitted: raise PrimitiveNotFittedError("Primitive not fitted.") sk_inputs = inputs if self.hyperparams['use_semantic_types']: sk_inputs = inputs.iloc[:, self._training_indices] output_columns = [] if len(self._training_indices) > 0: sk_output = self._clf.produce(sk_inputs) if sparse.issparse(sk_output): sk_output = sk_output.toarray() outputs = self._wrap_predictions(inputs, sk_output) if len(outputs.columns) == len(self._input_column_names): outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) #print(outputs) #CallResult(outputs) #print("___") print(outputs.columns) #outputs.columns = [str(x) for x in outputs.columns] return CallResult(outputs) # assert isinstance(inputs, container.DataFrame), type(container.DataFrame) # _, self._columns_to_produce = self._get_columns_to_fit(inputs, self.hyperparams) # #print("columns_to_produce ", self._columns_to_produce) # outputs = inputs # if len(self._columns_to_produce) > 0: # for col in self.hyperparams['use_columns']: # output = self._clf.produce(inputs.iloc[ : ,col]) # outputs = pd.concat((outputs, pd.DataFrame({inputs.columns[col]+'_matrix_profile': output[:,0], # inputs.columns[col]+'_matrix_profile_indices': output[:,1], # inputs.columns[col]+'_left_matrix_profile_indices': output[:,2], # inputs.columns[col]+'_right_matrix_profile_indices': output[:,3]})), axis = 1) # else: # if self.hyperparams['error_on_no_input']: # raise RuntimeError("No input columns were selected") # self.logger.warn("No input columns were selected") # #print(outputs) # self._update_metadata(outputs) # return base.CallResult(outputs) def _update_metadata(self, outputs): outputs.metadata = outputs.metadata.generate(outputs) @classmethod def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): """ Select columns to fit. Args: inputs: Container DataFrame hyperparams: d3m.metadata.hyperparams.Hyperparams Returns: list """ if not hyperparams['use_semantic_types']: return inputs, list(range(len(inputs.columns))) inputs_metadata = inputs.metadata def can_produce_column(column_index: int) -> bool: return cls._can_produce_column(inputs_metadata, column_index, hyperparams) columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, use_columns=hyperparams['use_columns'], exclude_columns=hyperparams['exclude_columns'], can_use_column=can_produce_column) """ Encountered error: when hyperparams['use_columns'] = (2,3) and hyperparams['exclude_columns'] is (1,2) columns_to_produce is still [2] """ return inputs.iloc[:, columns_to_produce], columns_to_produce @classmethod def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: """ Output whether a column can be processed. 
Args: inputs_metadata: d3m.metadata.base.DataMetadata column_index: int Returns: bool """ column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) accepted_structural_types = (int, float, np.integer, np.float64) #changed numpy to np accepted_semantic_types = set() accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") # print(column_metadata) # print(column_metadata['structural_type'], accepted_structural_types) if not issubclass(column_metadata['structural_type'], accepted_structural_types): return False semantic_types = set(column_metadata.get('semantic_types', [])) # print(column_metadata) # print(semantic_types, accepted_semantic_types) if len(semantic_types) == 0: cls.logger.warning("No semantic types found in column metadata") return False # Making sure all accepted_semantic_types are available in semantic_types if len(accepted_semantic_types - semantic_types) == 0: return True return False def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: """ Wrap predictions into dataframe Args: inputs: Container Dataframe predictions: array-like data (n_samples, n_features) Returns: Dataframe """ outputs = d3m_dataframe(predictions, generate_metadata=True) target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams, self.primitiveNo) outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) return outputs @classmethod def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: """ Updata metadata for selected columns. Args: inputs_metadata: metadata_base.DataMetadata outputs: Container Dataframe target_columns_metadata: list Returns: d3m.metadata.base.DataMetadata """ outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) for column_index, column_metadata in enumerate(target_columns_metadata): column_metadata.pop("structural_type", None) outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) return outputs_metadata @classmethod def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams, primitiveNo): """ Add target columns metadata Args: outputs_metadata: metadata.base.DataMetadata hyperparams: d3m.metadata.hyperparams.Hyperparams Returns: List[OrderedDict] """ outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] target_columns_metadata: List[OrderedDict] = [] for column_index in range(outputs_length): column_name = "{0}{1}_{2}".format(cls.metadata.query()['name'], primitiveNo, column_index) column_metadata = OrderedDict() semantic_types = set() semantic_types.add(hyperparams["return_semantic_type"]) column_metadata['semantic_types'] = list(semantic_types) column_metadata["name"] = str(column_name) target_columns_metadata.append(column_metadata) return target_columns_metadata
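# --- Illustrative sketch (not part of the primitive) ---------------------
# The MatrixProfile primitive above delegates the actual computation to an
# MP wrapper around the Stumpy package. A direct, minimal use of Stumpy on a
# single series looks roughly like the sketch below; the toy data and window
# size are assumptions. stumpy.stump() returns one row per subsequence with
# four columns -- matrix profile value, matrix profile index, left index and
# right index -- the same layout described in the class docstring.
import numpy as np
import stumpy

series = np.array([0., 1., 2., 3., 2., 1., 0., 1., 2., 3., 2., 1.])
window_size = 4                                   # assumed hyperparameter value
profile = stumpy.stump(series, m=window_size)     # shape: (len(series) - m + 1, 4)
print(profile[:, 0])                              # matrix profile distances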
class So_GaalPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): """Single-Objective Generative Adversarial Active Learning. SO-GAAL directly generates informative potential outliers to assist the classifier in describing a boundary that can separate outliers from normal data effectively. Moreover, to prevent the generator from falling into the mode collapsing problem, the network structure of SO-GAAL is expanded from a single generator (SO-GAAL) to multiple generators with different objectives (MO-GAAL) to generate a reasonable reference distribution for the whole dataset. Read more in :cite:`liu2019generative`. Parameters ---------- contamination : float in (0., 0.5), optional (default=0.1) The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the decision function. stop_epochs : int, optional (default=20) The number of epochs of training. lr_d : float, optional (default=0.01) The learning rate of the discriminator. lr_g : float, optional (default=0.0001) The learning rate of the generator. decay : float, optional (default=1e-6) The decay parameter for SGD. momentum : float, optional (default=0.9) The momentum parameter for SGD. Attributes ---------- decision_scores_ : numpy array of shape (n_samples,) The outlier scores of the training data. The higher, the more abnormal. Outliers tend to have higher scores. This value is available once the detector is fitted. threshold_ : float The threshold is based on ``contamination``. It is the ``n_samples * contamination`` most abnormal samples in ``decision_scores_``. The threshold is calculated for generating binary outlier labels. labels_ : int, either 0 or 1 The binary labels of the training data. 0 stands for inliers and 1 for outliers/anomalies. It is generated by applying ``threshold_`` on ``decision_scores_``. """ __author__ = "DATA Lab at Texas A&M University", metadata = metadata_base.PrimitiveMetadata({ 'id': '56e6cfe9-d9e9-495f-83da-cfed6fa27da1', 'version': '0.1.0', 'name': 'So_Gaal Anomaly Detection', 'python_path': 'd3m.primitives.tods.detection_algorithm.pyod_sogaal', 'keywords': ['Time Series', 'GAN'], "hyperparams_to_tune": ['stop_epochs', 'lr_d', 'lr_g', 'decay', 'momentum'], 'source': { 'name': 'DATA Lab at Texas A&M University', 'uris': [ 'https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/detection_algorithm/PyodSoGaal.py' ], 'contact': 'mailto:[email protected]' }, 'installation': [{ 'type': metadata_base.PrimitiveInstallationType.PIP, 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'. format(git_commit=d3m_utils.current_git_commit( os.path.dirname(__file__)), ), }], 'algorithm_types': [ metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, ], 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, }) def __init__( self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: Dict[str, DockerContainer] = None) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) self._clf = SO_GAAL( stop_epochs=hyperparams['stop_epochs'], lr_d=hyperparams['lr_d'], lr_g=hyperparams['lr_g'], decay=hyperparams['decay'], momentum=hyperparams['momentum'], contamination=hyperparams['contamination'], ) return def set_training_data(self, *, inputs: Inputs) -> None: """ Set training data for outlier detection.
Args: inputs: Container DataFrame Returns: None """ super().set_training_data(inputs=inputs) def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: """ Fit model with training data. Args: *: Container DataFrame. Time series data up to fit. Returns: None """ return super().fit() def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Process the testing data. Args: inputs: Container DataFrame. Time series data up to outlier detection. Returns: Container DataFrame 1 marks Outliers, 0 marks normal. """ return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) def get_params(self) -> Params: """ Return parameters. Args: None Returns: class Params """ return super().get_params() def set_params(self, *, params: Params) -> None: """ Set parameters for outlier detection. Args: params: class Params Returns: None """ super().set_params(params=params)
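# --- Illustrative sketch (not part of the primitive) ---------------------
# So_GaalPrimitive is a thin D3M wrapper: fit/produce are forwarded to the
# base class, which drives the SO_GAAL detector constructed in __init__.
# Used outside of D3M, the same detector behaves roughly as below; the toy
# data is an assumption and stop_epochs is set low only to keep the sketch
# fast (the primitive's default is 20).
import numpy as np
from pyod.models.so_gaal import SO_GAAL

X_train = np.random.RandomState(0).randn(200, 5)   # assumed toy data
detector = SO_GAAL(stop_epochs=2, lr_d=0.01, lr_g=0.0001,
                   decay=1e-6, momentum=0.9, contamination=0.1)
detector.fit(X_train)
print(detector.decision_scores_[:5])   # higher score => more anomalous
print(detector.labels_[:5])            # 0 = inlier, 1 = outlier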
class SeededGraphMatching( UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): # This should contain only metadata which cannot be automatically determined from the code. metadata = metadata_module.PrimitiveMetadata({ # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()". 'id': 'ff22e721-e4f5-32c9-ab51-b90f32603a56', 'version': "0.1.0", 'name': "jhu.sgm", # The same path the primitive is registered with entry points in setup.py. 'python_path': 'd3m.primitives.jhu_primitives.SeededGraphMatching', # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable. 'keywords': ['graph matching'], 'source': { 'name': "JHU", 'uris': [ # Unstructured URIs. Link to file and link to repo in this case. 'https://github.com/neurodata/primitives-interfaces/jhu_primitives/sgm/sgm.py', # 'https://github.com/youngser/primitives-interfaces/blob/jp-devM1/jhu_primitives/ase/ase.py', 'https://github.com/neurodata/primitives-interfaces.git', ], }, 'installation': [{ 'type': 'UBUNTU', 'package': 'r-base', 'version': '3.4.2' }, { 'type': 'UBUNTU', 'package': 'libxml2-dev', 'version': '2.9.4' }, { 'type': 'UBUNTU', 'package': 'libpcre3-dev', 'version': '2.9.4' },{ 'type': metadata_module.PrimitiveInstallationType.PIP, 'package_uri': 'git+https://github.com/neurodata/primitives-interfaces.git@{git_commit}#egg=jhu_primitives'.format( git_commit=utils.current_git_commit(os.path.dirname(__file__)), ), }], 'algorithm_types': [ metadata_module.PrimitiveAlgorithmType.FRANK_WOLFE_ALGORITHM ], 'primitive_family': metadata_module.PrimitiveFamily.GRAPH_MATCHING }) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: Dict[str, base.DockerContainer] = None) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) self._training_dataset = None self._g1 = None self._g2 = None self._g1_node_attributes = None self._g2_node_attributes = None def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: return CallResult(None) def set_training_data(self, *, inputs: Inputs) -> None: self._training_dataset = inputs self._g1 = self._training_dataset['0'] self._g2 = self._training_dataset['1'] self._g1_node_attributes = list(networkx.get_node_attributes(self._g1, 'nodeID').values()) self._g2_node_attributes = list(networkx.get_node_attributes(self._g2, 'nodeID').values()) #technically, this is unsupervised, as there is no fit function #instead, we just hang on to the training data and run produce with the two graphs and seeds #and use that to predict later on.
def get_params(self) -> None: return Params def set_params(self, *, params: Params) -> None: pass #UnsupervisedLearner def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: #produce takes the training dataset and runs seeded graph matching using the seeds #then predicts using the resulting permutation_matrix permutation_matrix = np.asmatrix(self._seeded_graph_match(training_data=self._training_dataset)) predictions = self._get_predictions(permutation_matrix=permutation_matrix, inputs = inputs) return base.CallResult(predictions) def _get_predictions(self,*, permutation_matrix: np.matrix, inputs: Inputs): testing = inputs['2'] threshold = self.hyperparams['threshold'] for i in range(testing.shape[0]): testing['match'][i] = 0 v1 = testing['G1.nodeID'][i] v2 = testing['G2.nodeID'][i] found = False j = 0 while not found: if self._g1_node_attributes[j] == int(v1): found = True v1 = j j += 1 # print(found) found = False j = 0 while not found: if self._g2_node_attributes[j] == int(v2): found = True v2 = j j += 1 if permutation_matrix[v1, v2] > threshold: testing['match'][i] = 1 else: testing['match'][i] = 0 df = container.DataFrame({"d3mIndex": testing['d3mIndex'], "match": testing['match']}) return df def _seeded_graph_match(self,*, training_data = None): if training_data is None: training_data = self._training_dataset seeds = training_data['2'] new_seeds = pd.DataFrame( {'G1.nodeID': seeds['G1.nodeID'], 'G2.nodeID': seeds['G2.nodeID'], 'match': seeds['match']}) new_seeds = new_seeds[new_seeds['match'] == '1'] # we now have a seeds correspondence of nodeIDs, # but we need a seed correspondence of actual vertex numbers # initialize the integer values to nothing: new_seeds['g1_vertex'] = "" new_seeds['g2_vertex'] = "" # for every seed, locate the corresponding vertex integer for j in range(new_seeds.shape[0]): found = False i = 0 while not found: if (int(new_seeds['G1.nodeID'][j]) == self._g1_node_attributes[i]): new_seeds['g1_vertex'][j] = i found = True i += 1 for j in range(new_seeds.shape[0]): found = False i = 0 while not found: if (int(new_seeds['G2.nodeID'][j]) == self._g2_node_attributes[i]): new_seeds['g2_vertex'][j] = i found = True i += 1 # store the vertex pairs as an m x 2 array and convert to a matrix seeds_array = np.array(new_seeds[['g1_vertex', 'g2_vertex']]) seeds_array = seeds_array.astype(int) seeds = seeds_array nr, nc = seeds.shape seeds = ro.r.matrix(seeds, nrow=nr, ncol=nc) ro.r.assign("seeds", seeds) g1_matrix = networkx.to_numpy_array(self._g1) nr, nc = g1_matrix.shape g1_matrix = ro.r.matrix(g1_matrix, nrow=nr, ncol=nc) ro.r.assign("g1_matrix", g1_matrix) g2_matrix = networkx.to_numpy_array(self._g2) nr, nc = g2_matrix.shape g2_matrix = ro.r.matrix(g2_matrix, nrow=nr, ncol=nc) ro.r.assign("g2_matrix", g2_matrix) reps = self.hyperparams['reps'] ro.r.assign("reps",reps) # run the R code: path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "sgm.interface.R") path = file_path_conversion(path, uri="") cmd = """ source("%s") fn <- function(g1_matrix, g2_matrix, seeds,reps) { sgm.interface(g1_matrix, g2_matrix, seeds,reps) } """ % path result = np.array(ro.r(cmd)(g1_matrix, g2_matrix, seeds,reps)) return container.ndarray(result)
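# --- Illustrative sketch (not part of the primitive) ---------------------
# Both produce() and _seeded_graph_match() above translate nodeIDs into
# vertex positions by scanning the node-attribute lists with a while loop
# for every row. The same lookup can be expressed once with a precomputed
# dictionary, shown below on a toy attribute list (the values are
# assumptions, not data from the primitive).
g1_node_attributes = [10, 11, 12, 13]   # nodeID stored at each vertex position
vertex_of_g1_node = {node_id: idx for idx, node_id in enumerate(g1_node_attributes)}

v1 = vertex_of_g1_node[int("12")]       # nodeID "12" -> vertex index 2
print(v1)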
class TimeSeriesBinnerPrimitive( transformer.TransformerPrimitiveBase[container.DataFrame, container.DataFrame, Hyperparams]): """ Bins according to the binning_operation on timeseries values. The time value can be a datetime stamp or an integer. If there is a GroupingKey column, it will apply binning to the groups. This will also bin on any value columns set, not just for one column. Currently works for downsampling. If the column with the time semantic is a datetime, it can upsample but will leave NaN values for the time. """ _grouping_key_semantic = ( "https://metadata.datadrivendiscovery.org/types/GroupingKey", ) _time_semantic = ("https://metadata.datadrivendiscovery.org/types/Time", ) _target_semantic = ( "https://metadata.datadrivendiscovery.org/types/Target", ) metadata = metadata_base.PrimitiveMetadata({ "id": "5fee7a91-b843-4636-a21e-a02bf0fd7f3a", "version": version.__version__, "name": "Time series binner", "python_path": "d3m.primitives.data_transformation.time_series_binner.DistilTimeSeriesBinner", "source": { "name": "Distil", "contact": "mailto:[email protected]", "uris": [ "https://github.com/uncharted-distil/distil-primitives-contrib/blob/main/main/distil_primitives_contrib/time_series_binner.py", "https://gitlab.com/uncharted-distil/distil-primitives-contrib", ], }, "installation": [ { "type": metadata_base.PrimitiveInstallationType.PIP, "package_uri": "git+https://github.com/uncharted-distil/distil-primitives-contrib.git@{git_commit}#egg=distil-primitives-contrib" .format(git_commit=utils.current_git_commit( os.path.dirname(__file__)), ), }, ], "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_NORMALIZATION], "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, }) def produce( self, *, inputs: container.DataFrame, timeout: float = None, iterations: int = None) -> base.CallResult[container.DataFrame]: if inputs.shape[0] == 0: return base.CallResult(inputs) # cols = distil_utils.get_operating_columns(inputs, self.hyperparams['binning_columns'], self._semantic_types) init_index = inputs.index d3m_index = inputs.columns.get_loc("d3mIndex") d3m_col = inputs["d3mIndex"] group_key_index = self._get_grouping_key_index(inputs.metadata) time_index = self._get_time_index(inputs.metadata) value_indices = self._get_value_indices(inputs.metadata) self.time_col_name = inputs.columns[time_index] self.group_col_name = inputs.columns[group_key_index] self.time_col_dtype = inputs.dtypes[self.time_col_name] self.value_columns = inputs.columns[list(value_indices)] usable_cols = [self.group_col_name, self.time_col_name] + list( self.value_columns) inputs = inputs[usable_cols] groups = inputs.groupby(self.group_col_name, sort=False) outputs = pd.DataFrame() binned_groups = [None] * len(groups) group_col_values = [] i = 0 for group_name, group in groups: timeseries_group = group.drop(columns=[self.group_col_name]) timeseries_group = self._applyBinningOperation(timeseries_group) group_col_values += [group_name] * len(timeseries_group) binned_groups[i] = timeseries_group i += 1 outputs = container.DataFrame(pd.concat(binned_groups)) is_datetime_index = isinstance(outputs.index, pd.DatetimeIndex) if is_datetime_index: datetime_index = outputs.index if len(outputs) <= len(init_index): outputs = outputs.set_index(init_index[0:len(outputs)]) outputs.insert(loc=d3m_index, column="d3mIndex", value=d3m_col[0:len(outputs)]) outputs.metadata = outputs.metadata.update_column( metadata=inputs.metadata.query( (metadata_base.ALL_ELEMENTS, d3m_index)), column_index=d3m_index, ) 
else: # assume index and d3mIndex are int outputs = outputs.set_index(pd.Index(range(0, len(outputs), 1))) d3m_new_col = container.DataFrame( {"d3mIndex": range(0, len(outputs), 1)}) outputs.insert(loc=d3m_index, column="d3mIndex", value=d3m_new_col) outputs.insert(loc=group_key_index, column=self.group_col_name, value=group_col_values) if is_datetime_index: outputs.insert(loc=time_index, column=self.time_col_name, value=datetime_index) outputs.metadata = inputs.metadata.select_columns( [d3m_index, group_key_index, time_index] + list(value_indices)) return base.CallResult(outputs) def _get_grouping_key_index(self, inputs_metadata): group_key_index = self.hyperparams["grouping_key_col"] if group_key_index: return group_key_index grouping_key_indices = inputs_metadata.list_columns_with_semantic_types( self._grouping_key_semantic) if len(grouping_key_indices) > 0: return grouping_key_indices[0] raise exceptions.InvalidArgumentValueError( "no column with grouping key") def _get_time_index(self, inputs_metadata): time_index = self.hyperparams["time_col"] if time_index: return time_index time_indices = inputs_metadata.list_columns_with_semantic_types( self._time_semantic) if len(time_indices) > 0: return time_indices[0] raise exceptions.InvalidArgumentValueError("no column with time") def _get_value_indices(self, inputs_metadata): value_indices = self.hyperparams["value_cols"] if value_indices and len(value_indices) > 0: return value_indices value_indices = inputs_metadata.list_columns_with_semantic_types( self._target_semantic) if len(value_indices) > 0: return value_indices raise exceptions.InvalidArgumentValueError("no columns with target") def _granularityToRule(self): granularity = self.hyperparams["granularity"] if granularity == "seconds": return "S" elif granularity == "minutes": return "T" elif granularity == "hours": return "H" elif granularity == "days": return "D" elif granularity == "weeks": return "W" elif granularity == "months": return "M" elif granularity == "years": return "A" raise exceptions.InvalidArgumentValueError( "Given granularity argument not supported") def _applyBinningOperation(self, timeseries_group): if is_numeric_dtype(self.time_col_dtype): return self._applyIntegerNumericBinning(timeseries_group) timeseries_group = timeseries_group.set_index( pd.DatetimeIndex(timeseries_group[self.time_col_name])) df = timeseries_group.resample(self._granularityToRule()) bin_oper = self.hyperparams["binning_operation"] return getattr(df, bin_oper)() def _applyIntegerNumericBinning(self, timeseries_group): bin_oper = self.hyperparams["binning_operation"] binning_size = self.hyperparams["binning_size"] ( firstTime, right, ) = self._get_starting_bin_value( timeseries_group) # timeseries_group[self.time_col_name][0] lastTime = timeseries_group[self.time_col_name].iloc[ len(timeseries_group) - 1] amount_of_binning_numbers = int( (lastTime - firstTime) / binning_size) + 1 amount_of_binning_intervals = amount_of_binning_numbers + 1 binning_intervals = [ i * binning_size + firstTime for i in range(amount_of_binning_intervals) ] binning_intervals[0] = binning_intervals[0] - int(right) timeseries_group["binned"] = pd.cut( x=timeseries_group[self.time_col_name], bins=binning_intervals, right=right) columnsToOperation = {} columnsToOperation[self.time_col_name] = "max" for value in self.value_columns: columnsToOperation[value] = bin_oper return (timeseries_group.groupby("binned").agg( columnsToOperation).reset_index(drop=True)) def _get_starting_bin_value(self, df): if 
self.hyperparams["binning_starting_value"] == "zero": return ( 0, True, ) else: return ( df[self.time_col_name].iloc[0], False, )
class IsolationForestPrimitive( unsupervised_learning.UnsupervisedLearnerPrimitiveBase[ container.DataFrame, container.DataFrame, Params, Hyperparams]): """ Uses scikit learn's Isolated Forest primitive to detect and label anomalies. """ metadata = metadata_base.PrimitiveMetadata( { "id": "793f0b17-7413-4962-9f1d-0b285540b21f", "version": version.__version__, "name": "Isolation Forest", "python_path": "d3m.primitives.classification.isolation_forest.IsolationForestPrimitive", "source": { "name": "Distil", "contact": "mailto:[email protected]", "uris": [ "https://github.com/uncharted-distil/distil-primitives-contrib/blob/main/main/distil_primitives_contrib/isolation_forest.py", "https://github.com/uncharted-distil/distil-primitives-contrib", ], }, "installation": [ { "type": metadata_base.PrimitiveInstallationType.PIP, "package_uri": "git+https://github.com/uncharted-distil/distil-primitives-contrib.git@{git_commit}#egg=distil-primitives-contrib" .format(git_commit=utils.current_git_commit( os.path.dirname(__file__)), ), }, ], "algorithm_types": [ metadata_base.PrimitiveAlgorithmType.BINARY_CLASSIFICATION, ], "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, }, ) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed) self._model = IsolationForest( n_estimators=self.hyperparams["n_estimators"], random_state=np.random.RandomState(random_seed), ) def set_training_data(self, *, inputs: container.DataFrame) -> None: self._inputs = inputs self._needs_fit = True def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: logger.debug(f"Fitting {__name__}") if self._needs_fit: self._model.fit(self._inputs) self._needs_fit = False return base.CallResult(None) def produce( self, *, inputs: container.DataFrame, timeout: float = None, iterations: int = None, ) -> base.CallResult[container.DataFrame]: if self._needs_fit: self.fit() result = self._model.predict(inputs) result_df = container.DataFrame( { "outlier_label": result, }, generate_metadata=True, ) result_df.metadata = result_df.metadata.add_semantic_type( (metadata_base.ALL_ELEMENTS, 0), "https://metadata.datadrivendiscovery.org/types/PredictedTarget", ) return base.CallResult(result_df) def get_params(self) -> Params: return Params( model=self._model, needs_fit=self._needs_fit, ) def set_params(self, *, params: Params) -> None: self._model = params["model"] self._needs_fit = params["needs_fit"] return
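# --- Illustrative sketch (not part of the primitive) ---------------------
# IsolationForestPrimitive wraps sklearn's IsolationForest and returns its
# raw predict() labels as the "outlier_label" column. Outside of D3M the
# same call chain looks roughly like this (toy data and n_estimators are
# assumptions); note that sklearn encodes inliers as +1 and outliers as -1.
import numpy as np
from sklearn.ensemble import IsolationForest

X = np.vstack([np.random.RandomState(0).randn(100, 2), [[8.0, 8.0]]])
model = IsolationForest(n_estimators=10, random_state=np.random.RandomState(0))
model.fit(X)
labels = model.predict(X)   # +1 = inlier, -1 = outlier
print(labels[-1])           # the injected far-away point is typically flagged as -1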
class TimeSeriesLoaderPrimitive( transformer.TransformerPrimitiveBase[container.DataFrame, container.DataFrame, Hyperparams]): """ Reads the time series files from a given column in an input dataframe into a new M x N dataframe, where each timeseries occupies one of M rows, and each of the row's N entries represents a timestamp. The loading process assumes that each series file has an identical set of timestamps. """ _semantic_types = ( 'https://metadata.datadrivendiscovery.org/types/FileName', 'https://metadata.datadrivendiscovery.org/types/Timeseries') _media_types = ('text/csv', ) __author__ = 'Uncharted Software', metadata = metadata_base.PrimitiveMetadata({ 'id': '1689aafa-16dc-4c55-8ad4-76cadcf46086', 'version': '0.1.0', 'name': 'Time series loader', 'python_path': 'd3m.primitives.data_preprocessing.time_series_to_list.TimeSeriesLoader', 'keywords': ['series', 'reader', 'csv'], 'source': { 'name': 'Uncharted Software', 'contact': 'mailto:[email protected]', 'uris': [ # Unstructured URIs. "https://github.com/NewKnowledge/sloth-d3m-wrapper", ], }, 'installation': [{ 'type': metadata_base.PrimitiveInstallationType.PIP, 'package_uri': 'git+https://gitlab.com/unchartedsoftware/distil-timeseries-loader.git@' + '{git_commit}#egg=distil-timeseries-loader'.format( git_commit=d3m_utils.current_git_commit( os.path.dirname(__file__)), ), }], 'algorithm_types': [ metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION, ], 'supported_media_types': _media_types, 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, }) @classmethod def _find_csv_file_column( cls, inputs_metadata: metadata_base.DataMetadata ) -> typing.Optional[int]: indices = utils.list_columns_with_semantic_types( inputs_metadata, cls._semantic_types) for i in indices: if cls._is_csv_file_column(inputs_metadata, i): return i return None @classmethod def _is_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: # check to see if a given column is a file pointer that points to a csv file column_metadata = inputs_metadata.query( (metadata_base.ALL_ELEMENTS, column_index)) if not column_metadata or column_metadata['structural_type'] != str: return False semantic_types = column_metadata.get('semantic_types', []) media_types = column_metadata.get('media_types', []) return set(cls._semantic_types).issubset(semantic_types) and set( cls._media_types).issubset(media_types) def produce( self, *, inputs: container.DataFrame, timeout: float = None, iterations: int = None) -> base.CallResult[container.DataFrame]: file_index = self.hyperparams['file_col_index'] if file_index is not None: if not self._is_csv_file_column(inputs.metadata, file_index): raise exceptions.InvalidArgumentValueError( 'column idx=' + str(file_index) + ' from ' + str(inputs.columns) + ' does not contain csv file names') else: file_index = self._find_csv_file_column(inputs.metadata) if file_index is None: raise exceptions.InvalidArgumentValueError( 'no column from ' + str(inputs.columns) + ' contains csv file names') value_index = self.hyperparams['value_col_index'] time_index = self.hyperparams['time_col_index'] base_path = inputs.metadata.query( (metadata_base.ALL_ELEMENTS, file_index))['location_base_uris'][0] timeseries_dataframe: pd.DataFrame for idx, file_path in enumerate(inputs.iloc[:, file_index]): csv_path = os.path.join(base_path, file_path) timeseries_row = pd.read_csv(csv_path).transpose() # use the time values as the column headers if idx is 0: timeseries_dataframe = pd.DataFrame( 
columns=timeseries_row.iloc[time_index]) timeseries_dataframe = timeseries_dataframe.append( timeseries_row.iloc[value_index]) # get the index to use a range of ints rather than the value col name timeseries_dataframe = timeseries_dataframe.reset_index(drop=True) # wrap as a D3M container - metadata should be auto generated return base.CallResult(container.DataFrame(data=timeseries_dataframe)) @classmethod def can_accept( cls, *, method_name: str, arguments: typing.Dict[str, typing.Union[metadata_base.Metadata, type]], hyperparams: Hyperparams ) -> typing.Optional[metadata_base.DataMetadata]: output_metadata = super().can_accept(method_name=method_name, arguments=arguments, hyperparams=hyperparams) # If structural types didn't match, don't bother. if output_metadata is None: return None if method_name != 'produce': return output_metadata if 'inputs' not in arguments: return output_metadata inputs_metadata = typing.cast(metadata_base.DataMetadata, arguments['inputs']) # make sure there's a file column that points to a csv (search if unspecified) file_col_index = hyperparams['file_col_index'] if file_col_index is not None: can_use_column = cls._is_csv_file_column(inputs_metadata, file_col_index) if not can_use_column: return None else: inferred_index = cls._find_csv_file_column(inputs_metadata) if inferred_index is None: return None # we don't have access to the data at this point so there's not much that we can # do to figure out the resulting shape etc return inputs_metadata
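# --- Illustrative sketch (not part of the primitive) ---------------------
# produce() above reads one CSV per row of the input frame, transposes it so
# that timestamps become column headers, and appends the value row to a
# growing M x N frame. A standalone version of that reshaping is sketched
# below; the generated file names, column positions, and the assumption that
# every file shares the same timestamps (as the class docstring requires)
# are all illustrative.
import pandas as pd

# create two tiny series files so the sketch is self-contained
for name, values in [("series_0.csv", [1.0, 2.0, 3.0]), ("series_1.csv", [4.0, 5.0, 6.0])]:
    pd.DataFrame({"time": [0, 1, 2], "value": values}).to_csv(name, index=False)

time_index, value_index = 0, 1            # assumed row positions after transpose
rows, columns = [], None
for csv_path in ["series_0.csv", "series_1.csv"]:
    series = pd.read_csv(csv_path).transpose()
    columns = series.iloc[time_index]     # identical across files by assumption
    rows.append(series.iloc[value_index])
timeseries_dataframe = pd.DataFrame(rows)
timeseries_dataframe.columns = columns
timeseries_dataframe = timeseries_dataframe.reset_index(drop=True)
print(timeseries_dataframe)               # 2 rows (series) x 3 columns (timestamps)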
class GaussianClustering(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,Hyperparams]): """ Expecation-Maxmization algorithm for clustering """ # This should contain only metadata which cannot be automatically determined from the code. metadata = metadata_module.PrimitiveMetadata({ # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()". 'id': '5194ef94-3683-319a-9d8d-5c3fdd09de24', 'version': "0.1.0", 'name': "jhu.gclust", # The same path the primitive is registered with entry points in setup.py. 'python_path': 'd3m.primitives.graph_clustering.gaussian_clustering.JHU', # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable. 'keywords': ['graph', 'gaussian clustering'], 'source': { 'name': "JHU", 'uris': [ # Unstructured URIs. Link to file and link to repo in this case. 'https://github.com/neurodata/primitives-interfaces/blob/master/jhu_primitives/gclust/gclust.py', # 'https://github.com/youngser/primitives-interfaces/blob/jp-devM1/jhu_primitives/ase/ase.py', 'https://github.com/neurodata/primitives-interfaces.git', ], 'contact': 'mailto:[email protected]', }, # A list of dependencies in order. These can be Python packages, system packages, or Docker images. # Of course Python packages can also have their own dependencies, but sometimes it is necessary to # install a Python package first to be even able to run setup.py of another package. Or you have # a dependency which is not on PyPi. 'installation': [ { 'type': 'UBUNTU', 'package': 'libxml2-dev', 'version': '2.9.4' }, { 'type': 'UBUNTU', 'package': 'libpcre3-dev', 'version': '2.9.4' }, { 'type': metadata_module.PrimitiveInstallationType.PIP, 'package_uri': 'git+https://github.com/neurodata/primitives-interfaces.git@{git_commit}#egg=jhu_primitives'.format( git_commit=utils.current_git_commit(os.path.dirname(__file__)), ), }], 'description': 'Expecation-Maxmization algorithm for clustering', # URIs at which one can obtain code for the primitive, if available. # 'location_uris': # 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format( # git_commit=utils.current_git_commit(os.path.dirname(__file__)), # ), # ], # Choose these from a controlled vocabulary in the schema. If anything is missing which would # best describe the primitive, make a merge request. 
'algorithm_types': [ "EXPECTATION_MAXIMIZATION_ALGORITHM" ], 'primitive_family': "GRAPH_CLUSTERING", 'preconditions': ['NO_MISSING_VALUES'] }) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: Dict[str, base.DockerContainer] = None) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) self._embedding: container.ndarray = None def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ TODO: YP description **Positional Arguments:** inputs: - A matrix **Optional Arguments:** dim: - The number of clusters in which to assign the data """ if self._embedding is None: self._embedding = inputs[0] nodeIDs = inputs[1] nodeIDS = np.array([int(i) for i in nodeIDs]) max_clusters = self.hyperparams['max_clusters'] if max_clusters < self._embedding.shape[1]: self._embedding = self._embedding[:, :max_clusters].copy() gclust_object = graspyGCLUST(min_components=max_clusters, covariance_type="all") gclust_object.fit(self._embedding) model = gclust_object.model_ predictions = model.predict(self._embedding) testing = inputs[2] testing_nodeIDs = np.asarray(testing['G1.nodeID']) testing_nodeIDs = np.array([int(i) for i in testing_nodeIDs]) final_labels = np.zeros(len(testing)) for i in range(len(testing_nodeIDs)): label = predictions[i] final_labels[i] = int(label) + 1 testing['classLabel'] = final_labels outputs = container.DataFrame(testing[['d3mIndex', 'classLabel']]) outputs[['d3mIndex', 'classLabel']] = outputs[['d3mIndex', 'classLabel']].astype(int) return base.CallResult(outputs) def set_training_data(self, *, inputs: Inputs) -> None: self._training_inputs = inputs def get_params(self) -> Params: return Params(embedding = self._embedding) def set_params(self, *, params: Params) -> None: self._embedding = params['embedding'] def fit(self, *, timeout: float = None, iterations: int = None) -> None: return base.CallResult(None) # clf.fit(self._embedding) # BIC_max = -clf.bic(self._embedding) # cluster_likelihood_max = 1 # cov_type_likelihood_max = "spherical" # for i in range(1, max_clusters): # for k in cov_types: # clf = GaussianMixture(n_components=i, # covariance_type=k) # clf.fit(self._embedding) # current_bic = -clf.bic(self._embedding) # if current_bic > BIC_max: # BIC_max = current_bic # cluster_likelihood_max = i # cov_type_likelihood_max = k # clf = GaussianMixture(n_components = cluster_likelihood_max, # covariance_type = cov_type_likelihood_max) # clf.fit(self._embedding)
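# --- Illustrative sketch (not part of the primitive) ---------------------
# The commented-out block above sketches BIC-based selection over the number
# of mixture components and the covariance type. A runnable version of that
# idea with sklearn's GaussianMixture (toy embedding, component range and
# covariance types are assumptions) looks like this; note that the original
# maximizes -BIC, which is equivalent to minimizing BIC as done here.
import numpy as np
from sklearn.mixture import GaussianMixture

embedding = np.vstack([np.random.RandomState(0).randn(50, 2),
                       np.random.RandomState(1).randn(50, 2) + 5.0])
best_bic, best_model = np.inf, None
for n_components in range(1, 5):
    for covariance_type in ("spherical", "diag", "tied", "full"):
        clf = GaussianMixture(n_components=n_components,
                              covariance_type=covariance_type, random_state=0)
        clf.fit(embedding)
        if clf.bic(embedding) < best_bic:   # lower BIC is better
            best_bic, best_model = clf.bic(embedding), clf
predictions = best_model.predict(embedding)
print(best_model.n_components, best_model.covariance_type)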
class SpectralClustering(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): ''' Primitive that applies sklearn spectral clustering algorithm to unsupervised, supervised or semi-supervised datasets. Training inputs: D3M dataframe with features and labels, and D3M indices Outputs:D3M dataframe with cluster predictions and D3M indices. Clusterlabels are of "suggestTarget" semantic type if the task_type hyperparameter is clustering, and "Attribute" if the task_type is classification. ''' metadata = metadata_base.PrimitiveMetadata({ # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()". 'id': "d13a4529-f0ba-44ee-a867-e0fdbb71d6e2", 'version': __version__, 'name': "tsne", # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable. 'keywords': ['Clustering', 'Graph Clustering'], 'source': { 'name': __author__, 'contact': __contact__, 'uris': [ # Unstructured URIs. "https://github.com/NewKnowledge/D3M-Unsupervised", ], }, # A list of dependencies in order. These can be Python packages, system packages, or Docker images. # Of course Python packages can also have their own dependencies, but sometimes it is necessary to # install a Python package first to be even able to run setup.py of another package. Or you have # a dependency which is not on PyPi. 'installation': [{ 'type': metadata_base.PrimitiveInstallationType.PIP, 'package': 'cython', 'version': '0.29.14', }, { 'type': metadata_base.PrimitiveInstallationType.PIP, 'package_uri': 'git+https://github.com/NewKnowledge/D3M-Unsupervised.git@{git_commit}#egg=D3MUnsupervised' .format(git_commit=utils.current_git_commit( os.path.dirname(__file__)), ), }], # The same path the primitive is registered with entry points in setup.py. 'python_path': 'd3m.primitives.clustering.spectral_graph_clustering.SpectralClustering', # Choose these from a controlled vocabulary in the schema. If anything is missing which would # best describe the primitive, make a merge request. 
'algorithm_types': [ metadata_base.PrimitiveAlgorithmType.SPECTRAL_CLUSTERING, ], 'primitive_family': metadata_base.PrimitiveFamily.CLUSTERING, }) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed) self.sc = SC(n_clusters=self.hyperparams['n_clusters'], n_init=self.hyperparams['n_init'], n_neighbors=self.hyperparams['n_neighbors'], affinity=self.hyperparams['affinity'], random_state=self.random_seed) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Parameters ---------- inputs : dataframe Returns ---------- Outputs The output is a transformed dataframe of X fit into an embedded space, n feature columns will equal n_components hyperparameter For timeseries datasets the output is the dimensions concatenated to the timeseries filename dataframe """ targets = inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/TrueTarget') if not len(targets): targets = inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/TrueTarget') if not len(targets): targets = inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' ) target_names = [list(inputs)[t] for t in targets] index = inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/PrimaryKey') index_names = [list(inputs)[i] for i in index] X_test = inputs.drop(columns=list(inputs)[index[0]]) X_test = X_test.drop(columns=target_names).values # special semi-supervised case - during training, only produce rows with labels series = inputs[target_names] != '' if series.any().any(): inputs = dataframe_utils.select_rows(inputs, np.flatnonzero(series)) X_test = X_test[np.flatnonzero(series)] sc_df = d3m_DataFrame( pandas.DataFrame(self.sc.fit_predict(X_test), columns=['cluster_labels'])) # just add last column of last column ('clusters') col_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, 0))) col_dict['structural_type'] = type(1) if self.hyperparams['task_type'] == 'classification': col_dict['semantic_types'] = ( 'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute') col_dict['name'] = 'cluster_labels' else: col_dict['semantic_types'] = ( 'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget' ) col_dict['name'] = target_names[0] sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict) df_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, ))) df_dict_1 = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, ))) df_dict['dimension'] = df_dict_1 df_dict_1['name'] = 'columns' df_dict_1['semantic_types'] = ( 'https://metadata.datadrivendiscovery.org/types/TabularColumn', ) df_dict_1['length'] = 1 sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, ), df_dict) return CallResult(utils_cp.append_columns(inputs, sc_df))
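# --- Illustrative sketch (not part of the primitive) ---------------------
# The primitive's produce() ultimately boils down to self.sc.fit_predict(X)
# on the non-target, non-index feature matrix, followed by metadata
# bookkeeping. The underlying sklearn call (toy data and hyperparameter
# values are assumptions) is simply:
import numpy as np
from sklearn.cluster import SpectralClustering as SC

X = np.vstack([np.random.RandomState(0).randn(30, 2),
               np.random.RandomState(1).randn(30, 2) + 6.0])
sc = SC(n_clusters=2, n_init=10, n_neighbors=10,
        affinity="nearest_neighbors", random_state=0)
cluster_labels = sc.fit_predict(X)
print(np.bincount(cluster_labels))   # roughly 30 points per cluster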
class Sent2Vec(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): """ Produce numerical representations (features) for short texts or sentences. Parameters ---------- inputs : Input pandas dataframe Returns ------- Outputs The output is a pandas dataframe """ metadata = metadata_base.PrimitiveMetadata({ # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()". "id": "cf450079-9333-4a3f-aed4-b77a4e8c7be7", "version": __version__, "name": "sent2vec_wrapper", # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable. "keywords": ["Sent2Vec", "Embedding", "NLP", "Natural Language Processing"], "source": { "name": __author__, "contact": __contact__, "uris": [ # Unstructured URIs. "https://github.com/NewKnowledge/nk-sent2vec-d3m-wrapper" ], }, # A list of dependencies in order. These can be Python packages, system packages, or Docker images. # Of course Python packages can also have their own dependencies, but sometimes it is necessary to # install a Python package first to be even able to run setup.py of another package. Or you have # a dependency which is not on PyPi. "installation": [ { "type": metadata_base.PrimitiveInstallationType.PIP, "package_uri": "git+https://github.com/NewKnowledge/nk-sent2vec-d3m-wrapper.git@{git_commit}#egg=sent2vec_wrapper" .format(git_commit=utils.current_git_commit( os.path.dirname(__file__))), }, { "type": "FILE", "key": "sent2vec_model", "file_uri": "http://public.datadrivendiscovery.org/twitter_bigrams.bin", "file_digest": "9e8ccfea2aaa4435ca61b05b11b60e1a096648d56fff76df984709339f423dd6", }, ], # The same path the primitive is registered with entry points in setup.py. "python_path": "d3m.primitives.feature_extraction.nk_sent2vec.Sent2Vec", # Choose these from a controlled vocabulary in the schema. If anything is missing which would # best describe the primitive, make a merge request. "algorithm_types": [metadata_base.PrimitiveAlgorithmType.VECTORIZATION], "primitive_family": metadata_base.PrimitiveFamily.FEATURE_EXTRACTION, }) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, volumes: typing.Dict[str, str] = None) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed, volumes=volumes) self.volumes = volumes def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Produce numerical representations (features) for short texts or sentences. 
Parameters ---------- inputs : Input pandas dataframe Returns ------- Outputs The output is a pandas dataframe """ # extract sentences from stored in nested media files text_columns = inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/FileName') base_paths = [ inputs.metadata.query( (metadata_base.ALL_ELEMENTS, t))['location_base_uris'][0].replace('file:///', '/') for t in text_columns ] txt_paths = [[ os.path.join(base_path, filename) for filename in inputs.iloc[:, col] ] for base_path, col in zip(base_paths, text_columns)] txt = [[ open(path, 'r').read().replace('\n', '') for path in path_list ] for path_list in txt_paths] txt_df = pd.DataFrame(np.array(txt).T) # concatenate with text columns that aren't stored in nested files local_text_columns = inputs.metadata.get_columns_with_semantic_type( 'http://schema.org/Text') local_text_columns = [ col for col in local_text_columns if col not in text_columns ] frame = pd.concat((txt_df, inputs[local_text_columns]), axis=1) # delete columns with path names of nested media files outputs = inputs.remove_columns(text_columns) try: vectorizer = _Sent2Vec(path=self.volumes["sent2vec_model"]) #print('loaded sent2vec model', file = sys.__stdout__) output_vectors = [] for col in range(frame.shape[1]): text = frame.iloc[:, col].tolist() embedded_sentences = vectorizer.embed_sentences(sentences=text) output_vectors.append(embedded_sentences) embedded_df = pd.DataFrame( np.array(output_vectors).reshape(len(embedded_sentences), -1)) except ValueError: # just return inputs with file names deleted if vectorizing fails return CallResult(outputs) #print('successfully vectorized text\n', file = sys.__stdout__) # create df with vectorized columns and append to input df embedded_df = d3m_DataFrame(embedded_df) for col in range(embedded_df.shape[1]): col_dict = dict( embedded_df.metadata.query((metadata_base.ALL_ELEMENTS, col))) col_dict['structural_type'] = type(1.0) col_dict['name'] = "vector_" + str(col) col_dict["semantic_types"] = ( "http://schema.org/Float", "https://metadata.datadrivendiscovery.org/types/Attribute", ) embedded_df.metadata = embedded_df.metadata.update( (metadata_base.ALL_ELEMENTS, col), col_dict) df_dict = dict( embedded_df.metadata.query((metadata_base.ALL_ELEMENTS, ))) df_dict_1 = dict( embedded_df.metadata.query((metadata_base.ALL_ELEMENTS, ))) df_dict['dimension'] = df_dict_1 df_dict_1['name'] = 'columns' df_dict_1['semantic_types'] = ( 'https://metadata.datadrivendiscovery.org/types/TabularColumn', ) df_dict_1['length'] = embedded_df.shape[1] embedded_df.metadata = embedded_df.metadata.update( (metadata_base.ALL_ELEMENTS, ), df_dict) return CallResult(outputs.append_columns(embedded_df))
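# --- Illustrative sketch (not part of the primitive) ---------------------
# Before vectorizing, produce() above resolves the FileName columns to paths
# on disk, reads each referenced text file, and lines those texts up with
# any plain-text columns already present in the frame. A pure-pandas version
# of that assembly step is sketched below; the generated file names, their
# contents, and the "title" column are assumptions for illustration only.
import os
import pandas as pd

# create two tiny text files so the sketch is self-contained
for name, text in [("doc_0.txt", "first document"), ("doc_1.txt", "second document")]:
    with open(name, "w") as handle:
        handle.write(text + "\n")

inputs = pd.DataFrame({"raw_text_file": ["doc_0.txt", "doc_1.txt"],
                       "title": ["a", "b"]})
base_path = "."                              # assumed location_base_uris
texts = [open(os.path.join(base_path, fn)).read().replace("\n", "")
         for fn in inputs["raw_text_file"]]
frame = pd.concat((pd.DataFrame({0: texts}), inputs[["title"]]), axis=1)
print(frame)   # one row per input row: file contents plus local text columns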
class StatisticalAbsEnergyPrimitive( transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): """ Primitive to find abs_energy of time series """ __author__ = "DATA Lab at Texas A&M University", metadata = metadata_base.PrimitiveMetadata({ 'id': '73299ffe-d8bb-43c6-a6cc-9261f5e17a5e', 'version': '0.1.0', 'name': 'Time Series Statistical Abs Energy', 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_abs_energy', 'keywords': ['Time Series', 'AbsEnergy'], "hyperparams_to_tune": ['window_size'], 'source': { 'name': 'DATA Lab at Texas A&M University', 'uris': [ 'https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalAbsEnergy.py' ], 'contact': 'mailto:[email protected]' }, 'installation': [{ 'type': metadata_base.PrimitiveInstallationType.PIP, 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'. format(git_commit=d3m_utils.current_git_commit( os.path.dirname(__file__)), ), }], 'algorithm_types': [ metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, ], 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, }) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: """ Args: inputs: Container DataFrame timeout: Default iterations: Default Returns: Container DataFrame containing abs_energy of time series """ self.logger.info('Statistical AbsEnergy Primitive called') # Get cols to fit. self._fitted = False self._training_inputs, self._training_indices = self._get_columns_to_fit( inputs, self.hyperparams) self._input_column_names = self._training_inputs.columns if len(self._training_indices) > 0: # self._clf.fit(self._training_inputs) self._fitted = True else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") if not self._fitted: raise PrimitiveNotFittedError("Primitive not fitted.") statistical_abs_energy_input = inputs if self.hyperparams['use_semantic_types']: statistical_abs_energy_input = inputs.iloc[:, self._training_indices] output_columns = [] if len(self._training_indices) > 0: statistical_abs_energy_output = self._abs_energy( statistical_abs_energy_input, self.hyperparams["window_size"]) if sparse.issparse(statistical_abs_energy_output): statistical_abs_energy_output = statistical_abs_energy_output.toarray( ) outputs = self._wrap_predictions(inputs, statistical_abs_energy_output) #if len(outputs.columns) == len(self._input_column_names): # outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns( return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) self.logger.info('Statistical AbsEnergy Primitive returned') return base.CallResult(outputs) @classmethod def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): """ Select columns to fit. 
Args: inputs: Container DataFrame hyperparams: d3m.metadata.hyperparams.Hyperparams Returns: list """ if not hyperparams['use_semantic_types']: return inputs, list(range(len(inputs.columns))) inputs_metadata = inputs.metadata def can_produce_column(column_index: int) -> bool: return cls._can_produce_column(inputs_metadata, column_index, hyperparams) use_columns = hyperparams['use_columns'] exclude_columns = hyperparams['exclude_columns'] columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use( inputs_metadata, use_columns=use_columns, exclude_columns=exclude_columns, can_use_column=can_produce_column) return inputs.iloc[:, columns_to_produce], columns_to_produce # return columns_to_produce @classmethod def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: """ Output whether a column can be processed. Args: inputs_metadata: d3m.metadata.base.DataMetadata column_index: int Returns: bool """ column_metadata = inputs_metadata.query( (metadata_base.ALL_ELEMENTS, column_index)) accepted_structural_types = (int, float, numpy.integer, numpy.float64) accepted_semantic_types = set() accepted_semantic_types.add( "https://metadata.datadrivendiscovery.org/types/Attribute") if not issubclass(column_metadata['structural_type'], accepted_structural_types): return False semantic_types = set(column_metadata.get('semantic_types', [])) if len(semantic_types) == 0: cls.logger.warning("No semantic types found in column metadata") return False # Making sure all accepted_semantic_types are available in semantic_types if len(accepted_semantic_types - semantic_types) == 0: return True return False @classmethod def _update_predictions_metadata( cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict] ) -> metadata_base.DataMetadata: """ Update metadata for selected columns.
Args: inputs_metadata: metadata_base.DataMetadata outputs: Container Dataframe target_columns_metadata: list Returns: d3m.metadata.base.DataMetadata """ outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) for column_index, column_metadata in enumerate( target_columns_metadata): column_metadata.pop("structural_type", None) outputs_metadata = outputs_metadata.update_column( column_index, column_metadata) return outputs_metadata def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: """ Wrap predictions into dataframe Args: inputs: Container Dataframe predictions: array-like data (n_samples, n_features) Returns: Dataframe """ outputs = d3m_dataframe(predictions, generate_metadata=True) target_columns_metadata = self._add_target_columns_metadata( outputs.metadata, self.hyperparams) outputs.metadata = self._update_predictions_metadata( inputs.metadata, outputs, target_columns_metadata) return outputs @classmethod def _add_target_columns_metadata( cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): """ Add target columns metadata Args: outputs_metadata: metadata.base.DataMetadata hyperparams: d3m.metadata.hyperparams.Hyperparams Returns: List[OrderedDict] """ outputs_length = outputs_metadata.query( (metadata_base.ALL_ELEMENTS, ))['dimension']['length'] target_columns_metadata: List[OrderedDict] = [] for column_index in range(outputs_length): # column_name = "output_{}".format(column_index) column_metadata = OrderedDict() semantic_types = set() semantic_types.add(hyperparams["return_semantic_type"]) column_metadata['semantic_types'] = list(semantic_types) # column_metadata["name"] = str(column_name) target_columns_metadata.append(column_metadata) return target_columns_metadata def _write(self, inputs: Inputs): inputs.to_csv(str(time.time()) + '.csv') def _abs_energy(self, X, window_size): """ statistical abs_energy of time series sequence Args: X : DataFrame Time series. Returns: DataFrame A object with abs_energy """ if (window_size == -1): window_size = len(X) transformed_X = utils.pandas.DataFrame() for column in X.columns: column_value = X[column].values column_abs_energy = np.zeros(len(column_value)) for iter in range(window_size - 1, len(column_value)): sequence = column_value[iter - window_size + 1:iter + 1] column_abs_energy[iter] = np.round(np.sum(sequence * sequence), 4) column_abs_energy[:window_size - 1] = column_abs_energy[window_size - 1] transformed_X[column + "_abs_energy"] = column_abs_energy return transformed_X
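# --- Illustrative sketch (not part of the primitive) ---------------------
# _abs_energy() above computes, for every selected column, a sliding-window
# sum of squares and backfills the first window_size - 1 positions with the
# first full-window value. The same quantity can be obtained with pandas
# rolling windows; the toy series and window size below are assumptions.
import pandas as pd

values = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
window_size = 3                        # assumed hyperparameter value
abs_energy = values.pow(2).rolling(window_size).sum().round(4)
abs_energy[: window_size - 1] = abs_energy[window_size - 1]   # backfill leading positions
print(abs_energy.tolist())             # [14.0, 14.0, 14.0, 29.0, 50.0]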
class IVectorExtractor(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): """ BBN D3M I-vector extractor extracts i-vectors for variable-length input sequences of feature vectors. Input: List of arrays with feature vectors extracted for frames [ num_frames, num_features ] Output: Array of i-vectors of shape [ num_inputs, ivec_dim ] Applications include: audio, time-series classification """ __git_commit__ = utils.current_git_commit(os.path.dirname(__file__)) metadata = metadata_module.PrimitiveMetadata({ 'id': '1c5080bd-7b2f-4dbb-ac5f-0a65b59526a7', 'version': __version__, 'name': "I-vector extractor", 'description': """BBN D3M I-vector extractor extracts i-vectors for variable-length input sequences of feature vectors.\n Input: List of arrays with feature vectors extracted for frames [ num_frames, num_features ]\n Output: Array of i-vectors of shape [ num_inputs, ivec_dim ]\n Applications include: audio, time-series classification""", 'keywords': [], 'source': { 'name': __author__, 'contact': 'mailto:[email protected]', 'uris': [ 'https://github.com/BBN-E/d3m-bbn-primitives/blob/{git_commit}/bbn_primitives/time_series/ivector_extraction.py' .format(git_commit=__git_commit__), 'https://github.com/BBN-E/d3m-bbn-primitives.git', ], }, 'installation': [{ 'type': 'PIP', 'package_uri': 'git+https://github.com/BBN-E/d3m-bbn-primitives.git@{git_commit}#egg={egg}' .format(git_commit=__git_commit__, egg='bbn_primitives'), }], 'python_path': 'd3m.primitives.data_transformation.i_vector_extractor.IVectorExtractor', #'d3m.primitives.bbn.time_series.IVectorExtractor', #'d3m.primitives.data_transformation.ivector_extractor.BBN', 'algorithm_types': [metadata_module.PrimitiveAlgorithmType.DATA_CONVERSION], 'primitive_family': metadata_module.PrimitiveFamily.DATA_TRANSFORMATION, }) def __init__( self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: typing.Dict[str, DockerContainer] = None) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) self._training_inputs = None self._gmm = GaussianMixture( n_components=self.hyperparams['num_gauss'], covariance_type=self.hyperparams['gmm_covariance_type'], max_iter=self.hyperparams['max_gmm_iter']) self._v = None self._fitted: bool = False def set_training_data(self, *, inputs: Inputs) -> None: self._training_inputs = inputs self._fitted = False def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: if self._fitted: return CallResult(None) if self._training_inputs is None: raise Exception('Missing training data') with stopit.ThreadingTimeout(timeout) as timer: # Train GMM _logger.info('Training GMM') num_data = len(self._training_inputs) #for idx in range(num_data): # X = self._training_inputs[idx] # print(X.shape) self._gmm.fit( np.vstack( [x for x in self._training_inputs if len(x.shape) == 2])) # Train i-vector extractor self._v = np.random.randn( self._gmm.n_components * self._gmm.means_.shape[1], self.hyperparams['ivec_dim']) _logger.info('Training i-vector extractor') N = np.zeros((num_data, self._gmm.n_components)) F = np.zeros( (num_data, self._gmm.n_components * self._gmm.means_.shape[1])) # TODO: Do the E-step in mini-batches to prevent memory overflow for idx in range(num_data): X = self._training_inputs[idx] if len(X.shape) != 2: continue gamma = self._gmm.predict_proba(X) N0 = gamma.T.sum(axis=1) F0 = gamma.T.dot(X) N0, F0 = normalize_stats(N0, F0, self._gmm.means_, self._gmm.precisions_cholesky_) N[idx, :] = N0 F[idx, 
:] = F0.flatten() for ivec_iter in range(self.hyperparams['num_ivec_iter']): _logger.info('Training i-vector extractor - iteration %d' % ivec_iter) num_data = len(self._training_inputs) A, C, Amd, Cmd, Nmd = None, None, None, None, None VtV, I = None, None A, C, Amd, Cmd, Nmd = E_step_with_MD(N, F, self._v, VtV, I, A, C, Amd, Cmd, Nmd) em_v = M_step(A, C) md_v = M_step_MD(Amd, Cmd, Nmd, em_v) self._v = md_v.reshape( (self._gmm.n_components * self._gmm.means_.shape[1], self.hyperparams['ivec_dim'])) self._fitted = True if timer.state == timer.EXECUTED: return CallResult(None) else: raise TimeoutError('IVectorExtractor exceeded time limit') def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: with stopit.ThreadingTimeout(timeout) as timer: num_data = len(inputs) outputs = np.empty((num_data, self.hyperparams['ivec_dim']), dtype=self._v.dtype) VtV = compute_VtV(self._v, self._gmm.n_components) I = np.eye(self.hyperparams['ivec_dim'], dtype=self._v.dtype) for idx in range(num_data): X = inputs[idx] if len(X.shape) != 2: outputs[idx] = np.zeros((self.hyperparams['ivec_dim'])) continue gamma = self._gmm.predict_proba(X) N0 = gamma.T.sum(axis=1) F0 = gamma.T.dot(X) N0, F0 = normalize_stats(N0, F0, self._gmm.means_, self._gmm.precisions_cholesky_) ivec = estimate_i(row(N0.astype(self._v.dtype)), row(F0.astype(self._v.dtype)), self._v, VtV, I) outputs[idx] = ivec.flatten() #adding normalization if (self.hyperparams['ivec_normalize']): outputs = preprocessing.normalize(outputs, norm='l2') outputs = d3m_dataframe(outputs, generate_metadata=False) metadata = inputs.metadata.clear( { 'schema': metadata_module.CONTAINER_SCHEMA_VERSION, 'structural_type': type(outputs), 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], 'dimension': { 'length': outputs.shape[0], 'name': 'rows', 'semantic_types': [ 'https://metadata.datadrivendiscovery.org/types/TabularRow' ] } }, for_value=outputs ).update( ((metadata_base.ALL_ELEMENTS, )), { 'dimension': { 'length': outputs.shape[1], 'name': 'columns', 'semantic_types': [ 'https://metadata.datadrivendiscovery.org/types/TabularColumn' ] } } ).update( ((metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS)), { #'structural_type': self._v.dtype, 'semantic_types': [ 'https://metadata.datadrivendiscovery.org/types/Attribute' ], }) # Set metadata attribute. outputs.metadata = metadata if timer.state == timer.EXECUTED: return CallResult(outputs) else: raise TimeoutError('IVectorExtractor exceeded time limit') def get_params(self) -> Params: return Params(weights=self._gmm.weights_, means=self._gmm.means_, covs=self._gmm.covariances_, cov_type=self._gmm.covariance_type, v=self._v) def set_params(self, *, params: Params) -> None: assert self._gmm.covariance_type == params['cov_type'] # Consider adding additional assertations regarding dims self._gmm.weights_ = params['weights'] self._gmm.means_ = params['means'] self._gmm.covariances_ = params['covs'] self._gmm.precisions_cholesky_ = sklearn.mixture.gaussian_mixture._compute_precision_cholesky( params['covs'], params['cov_type']) self._v = params['v']
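# Illustrative sketch, not part of the primitive above: fit() and produce()
# reduce each variable-length sequence to zero-order (N) and first-order (F)
# GMM statistics before estimating i-vectors. A minimal version of that
# sufficient-statistics step with scikit-learn; the normalization and EM steps
# of the primitive (normalize_stats, E_step_with_MD, M_step, estimate_i) are
# deliberately omitted, and the function name is hypothetical.
import numpy as np
from sklearn.mixture import GaussianMixture


def gmm_sufficient_stats(sequences, n_components=8, seed=0):
    gmm = GaussianMixture(n_components=n_components,
                          covariance_type='diag', random_state=seed)
    gmm.fit(np.vstack(sequences))                 # train the UBM on all frames
    dim = sequences[0].shape[1]
    N = np.zeros((len(sequences), n_components))            # zero-order stats
    F = np.zeros((len(sequences), n_components * dim))      # first-order stats
    for idx, X in enumerate(sequences):
        gamma = gmm.predict_proba(X)              # frame-level responsibilities
        N[idx] = gamma.sum(axis=0)                # per-component occupation counts
        F[idx] = gamma.T.dot(X).flatten()         # responsibility-weighted frame sums
    return gmm, N, F


# Example: gmm, N, F = gmm_sufficient_stats([np.random.randn(100, 13) for _ in range(4)])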
class ForecastingNBEATSPrimitive( SupervisedLearnerPrimitiveBase[Inputs, Outputs, ForecastingNBEATSParams, ForecastingNBEATSHyperparams]): """ N-BEATS for time series forecasting """ metadata = metadata_base.PrimitiveMetadata( { 'id': 'bd925663-aeaf-4240-9748-cd77dce33819', 'version': '0.1.0', "name": "N-BEATS models for time series forecasting", 'description': "Pytorch Implementation of N-BEATS. The model is doing local projections to basis " "functions. These functions include \"trends\" (with polynomial functions) and " "\"seasonalities\" (with harmonic functions). The prediction will consist of adding the " "local projections to these basis functions to the last available value in the ts (Naive " "1). The model decomposes the signals successively through different \"blocks\" of a fully " "connected residual NN.", 'python_path': 'd3m.primitives.time_series_forecasting.nbeats.DeepNeuralNetwork', 'source': { 'name': nbeats.__author__, 'uris': ['https://github.com/autonlab/nbeats'], 'contact': 'mailto:[email protected]' }, 'installation': [{ 'type': metadata_base.PrimitiveInstallationType.PIP, 'package_uri': 'git+https://github.com/autonlab/nbeats.git@{git_commit}#egg=nbeats' .format(git_commit=d3m_utils.current_git_commit( os.path.dirname(__file__)), ), }], 'algorithm_types': [ metadata_base.PrimitiveAlgorithmType.DEEP_NEURAL_NETWORK, ], 'primitive_family': metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING, }, ) def __init__(self, *, hyperparams: ForecastingNBEATSHyperparams, random_seed: int = 0) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed) self._is_fitted = False self._device = 'cpu' if not torch.cuda.is_available( ) or hyperparams['device'] == 'cpu' else hyperparams['device'] print("Use " + self._device) self.logger.info("Use " + self._device) self._nbeats = Nbeats( input_size_multiplier=hyperparams['input_size_multiplier'], window_sampling_limit_multiplier=hyperparams[ 'window_sampling_limit_multiplier'], shared_weights=hyperparams['shared_weights'], output_size=hyperparams['output_size'], stack_types=hyperparams['stack_types'], n_blocks=hyperparams['n_blocks'], n_layers=hyperparams['n_layers'], n_hidden=hyperparams['n_hidden'], n_harmonics=hyperparams['n_harmonics'], n_polynomials=hyperparams['n_polynomials'], n_iterations=hyperparams['n_iterations'], learning_rate=hyperparams['learning_rate'], lr_decay=hyperparams['lr_decay'], n_lr_decay_steps=hyperparams['n_lr_decay_steps'], batch_size=hyperparams['batch_size'], loss=hyperparams['loss'], seasonality=hyperparams['seasonality'], # random_seed=random_seed, # FIXME pipelines are tuned on NBeats default seed random_seed=1, device=self._device) self._time_column = None self._integer_time = False self.filter_idxs = [] self._year_column = None self._constant = 0 # the constant term to avoid nan self._y_mean = 0 # the mean of the target variable in the training data def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: if not self._is_fitted: raise PrimitiveNotFittedError("Primitive not fitted.") inputs_copy = inputs.copy() # if datetime columns are integers, parse as # of days if self._integer_time: inputs_copy[self._time_column] = pd.to_datetime( inputs_copy[self._time_column], unit="D") else: inputs_copy[self._time_column] = pd.to_datetime( inputs_copy[self._time_column], unit="s") # find marked 'GroupingKey' or 'SuggestedGroupingKey' grouping_keys = inputs_copy.metadata.get_columns_with_semantic_type( 
"https://metadata.datadrivendiscovery.org/types/GroupingKey") suggested_grouping_keys = inputs_copy.metadata.get_columns_with_semantic_type( "https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey" ) if len(grouping_keys) == 0: grouping_keys = suggested_grouping_keys else: inputs_copy = inputs_copy.drop(columns=[ list(inputs_copy)[i] for i in suggested_grouping_keys ]) # check whether no grouping keys are labeled if len(grouping_keys) == 0: concat = pd.concat([inputs_copy[self._time_column]], axis=1) concat.columns = ['ds'] concat['unique_id'] = 'series1' # We have only one series else: # concatenate columns in `grouping_keys` to unique_id column concat = inputs_copy.loc[:, self.filter_idxs].apply( lambda x: ' '.join([str(v) for v in x]), axis=1) concat = pd.concat([concat, inputs_copy[self._time_column]], axis=1) concat.columns = ['unique_id', 'ds'] X_test = concat[['unique_id', 'ds']] predictions = self._nbeats.predict(X_test) predictions['y_hat'] -= self._constant predictions['y_hat'] = self._fillna(predictions['y_hat']) output = container.DataFrame(predictions['y_hat'], generate_metadata=True) return base.CallResult(output) def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: data = inputs.horizontal_concat(outputs) data = data.copy() # mark datetime column times = data.metadata.list_columns_with_semantic_types(( "https://metadata.datadrivendiscovery.org/types/Time", "http://schema.org/DateTime", )) if len(times) != 1: raise ValueError( f"There are {len(times)} indices marked as datetime values. Please only specify one" ) self._time_column = list(data)[times[0]] # if datetime columns are integers, parse as # of days if ("http://schema.org/Integer" in inputs.metadata.query_column(times[0])["semantic_types"]): self._integer_time = True data[self._time_column] = pd.to_datetime(data[self._time_column], unit="D") else: data[self._time_column] = pd.to_datetime(data[self._time_column], unit="s") # sort by time column data = data.sort_values(by=[self._time_column]) # mark key and grp variables self.key = data.metadata.get_columns_with_semantic_type( "https://metadata.datadrivendiscovery.org/types/PrimaryKey") # mark target variables self._targets = data.metadata.list_columns_with_semantic_types(( "https://metadata.datadrivendiscovery.org/types/SuggestedTarget", "https://metadata.datadrivendiscovery.org/types/TrueTarget", "https://metadata.datadrivendiscovery.org/types/Target", )) self._target_types = [ "i" if "http://schema.org/Integer" in data.metadata.query_column(t)["semantic_types"] else "c" if "https://metadata.datadrivendiscovery.org/types/CategoricalData" in data.metadata.query_column(t)["semantic_types"] else "f" for t in self._targets ] self._targets = [list(data)[t] for t in self._targets] self.target_column = self._targets[0] # see if 'GroupingKey' has been marked # otherwise fall through to use 'SuggestedGroupingKey' grouping_keys = data.metadata.get_columns_with_semantic_type( "https://metadata.datadrivendiscovery.org/types/GroupingKey") suggested_grouping_keys = data.metadata.get_columns_with_semantic_type( "https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey" ) if len(grouping_keys) == 0: grouping_keys = suggested_grouping_keys drop_list = [] else: drop_list = suggested_grouping_keys grouping_keys_counts = [ data.iloc[:, key_idx].nunique() for key_idx in grouping_keys ] grouping_keys = [ group_key for count, group_key in sorted( zip(grouping_keys_counts, grouping_keys)) ] self.filter_idxs = [list(data)[key] for key in 
grouping_keys] # drop index data.drop(columns=[list(data)[i] for i in drop_list + self.key], inplace=True) # check whether no grouping keys are labeled if len(grouping_keys) == 0: concat = pd.concat( [data[self._time_column], data[self.target_column]], axis=1) concat.columns = ['ds', 'y'] concat['unique_id'] = 'series1' # We have only one series else: # concatenate columns in `grouping_keys` to unique_id column concat = data.loc[:, self.filter_idxs].apply( lambda x: ' '.join([str(v) for v in x]), axis=1) concat = pd.concat( [concat, data[self._time_column], data[self.target_column]], axis=1) concat.columns = ['unique_id', 'ds', 'y'] if len(grouping_keys): # Infer frequency freq = self._nbeats.frequency if not freq: freq = pd.infer_freq(concat.head()['ds']) if freq is None and len(concat['unique_id']) > 0: freq = pd.infer_freq(concat[concat['unique_id'] == concat['unique_id'][0]]['ds']) if freq is None: freq = 'D' self.logger.warn('Cannot infer frequency. Use "D".') else: self.logger.info('Inferred frequency: {}'.format(freq)) # Series must be complete in the frequency concat = ForecastingNBEATSPrimitive._ffill_missing_dates_per_serie( concat, freq) # remove duplicates concat = concat.drop_duplicates(['unique_id', 'ds']) self._data = concat self._y_mean = self._data['y'].mean() # if min of y is negative, then add the absolute value of it to the constant if self._data['y'].min() <= 0: self._constant = 1 - self._data['y'].min() def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: y_train = self._data[['unique_id', 'ds', 'y']] y_train['y'] += self._constant self._nbeats.fit(y_train, verbose=False) self._is_fitted = True return base.CallResult(None) def get_params(self) -> Params: return ForecastingNBEATSParams(is_fitted=self._is_fitted, time_column=self._time_column, integer_time=self._integer_time, filter_idxs=self.filter_idxs, y_mean=self._y_mean, nbeats=self._nbeats) def set_params(self, *, params: Params) -> None: self._is_fitted = params['is_fitted'] self._time_column = params['time_column'] self._integer_time = params['integer_time'] self.filter_idxs = params['filter_idxs'] self._y_mean = params['y_mean'] self._nbeats = params['nbeats'] @staticmethod def _ffill_missing_dates_particular_serie(serie, min_date, max_date, freq): date_range = pd.date_range(start=min_date, end=max_date, freq=freq) unique_id = serie['unique_id'].unique() df_balanced = pd.DataFrame({ 'ds': date_range, 'key': [1] * len(date_range), 'unique_id': unique_id[0] }) # Check balance check_balance = df_balanced.groupby( ['unique_id']).size().reset_index(name='count') assert len(set(check_balance['count'].values)) <= 1 df_balanced = df_balanced.merge(serie, how="left", on=['unique_id', 'ds']) df_balanced['y'] = df_balanced['y'].fillna(method='ffill') return df_balanced @staticmethod def _ffill_missing_dates_per_serie(df, freq="D", fixed_max_date=None): """Receives a DataFrame with a date column and forward fills the missing gaps in dates, not filling dates before Parameters ---------- df: DataFrame Input DataFrame key: str or list Name(s) of the column(s) which make a unique time series date_col: str Name of the column that contains the time column freq: str Pandas time frequency standard strings, like "W-THU" or "D" or "M" numeric_to_fill: str or list Name(s) of the columns with numeric values to fill "fill_value" with """ if fixed_max_date is None: df_max_min_dates = df[['unique_id', 'ds']].groupby('unique_id').agg( ['min', 'max']).reset_index() else: df_max_min_dates = df[['unique_id', 
'ds']].groupby('unique_id').agg( ['min']).reset_index() df_max_min_dates['max'] = fixed_max_date df_max_min_dates.columns = df_max_min_dates.columns.droplevel() df_max_min_dates.columns = ['unique_id', 'min_date', 'max_date'] df_list = [] for index, row in df_max_min_dates.iterrows(): df_id = df[df['unique_id'] == row['unique_id']] df_id = ForecastingNBEATSPrimitive._ffill_missing_dates_particular_serie( df_id, row['min_date'], row['max_date'], freq) df_list.append(df_id) df_dates = pd.concat(df_list).reset_index(drop=True).drop( 'key', axis=1)[['unique_id', 'ds', 'y']] return df_dates def _fillna(self, series): if series.isnull().any(): # self.logger.warning("The prediction contains NAN. Fill with mean of prediction.") tofill = series.mean( ) # use the prediction mean if possible. Otherwise use the mean of the training data. if pd.isna(tofill): # self.logger.warn('The predictions are all NAN') tofill = self._y_mean return series.fillna(tofill) return series
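# Illustrative sketch, not part of the primitive above: set_training_data()
# reshapes the input into the (unique_id, ds, y) long format expected by the
# N-BEATS wrapper, concatenating grouping-key columns into a single series
# identifier and inferring the time frequency (falling back to daily). A
# plain-pandas version under that assumption; all column and function names
# here are hypothetical.
import pandas as pd


def to_long_format(df, group_cols, time_col, target_col):
    if group_cols:
        unique_id = df.loc[:, group_cols].apply(
            lambda r: ' '.join(str(v) for v in r), axis=1)
    else:
        unique_id = pd.Series('series1', index=df.index)  # single-series fallback
    long_df = pd.DataFrame({'unique_id': unique_id,
                            'ds': pd.to_datetime(df[time_col]),
                            'y': df[target_col]}).sort_values(['unique_id', 'ds'])
    ds = long_df['ds']
    freq = (pd.infer_freq(ds.head()) if len(ds) >= 3 else None) or 'D'
    return long_df, freq


# Example:
# raw = pd.DataFrame({'store': [1, 1, 1], 'item': ['a', 'a', 'a'],
#                     'date': ['2020-01-01', '2020-01-02', '2020-01-03'],
#                     'sales': [3.0, 4.0, 5.0]})
# long_df, freq = to_long_format(raw, ['store', 'item'], 'date', 'sales')  # freq == 'D'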
class RuleBasedFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): """ Filter the selected columns according to the rule. Parameters ---------- rule: String The rule to follow when performing the filter. Write it as you would write an 'if' condition in Python, wrapping each column index in '#' characters: #col_num#. For example, "#1# > 10" means that the values in column 1 must be greater than 10. The column indices must match those given in 'use_columns'. use_columns: Set A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped. The column indices must match those used in 'rule'. exclude_columns: Set A set of column indices to not operate on. Applicable only if "use_columns" is not provided. return_result: Enumeration Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? add_index_columns: Bool Also include primary index columns if input data has them. Applicable only if "return_result" is set to "new". error_on_no_input: Bool Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False. return_semantic_type: Enumeration[str] Decides what semantic type to attach to generated attributes. """ __author__ = "DATA Lab at Texas A&M University" metadata = metadata_base.PrimitiveMetadata({ "name": "Rule-Based Filtering", "python_path": "d3m.primitives.tods.reinforcement.rule_filter", "source": { 'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:[email protected]', 'uris': [ 'https://gitlab.com/lhenry15/tods.git', ] }, "algorithm_types": [ metadata_base.PrimitiveAlgorithmType.RULE_BASED_FILTER, ], "primitive_family": metadata_base.PrimitiveFamily.REINFORCEMENT, "id": "42744c37-8879-4785-9f18-6de9d612ea93", "hyperparams_to_tune": [ 'rule', ], "version": "0.0.1", }) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Process the testing data. Args: inputs: Container DataFrame. Returns: Container DataFrame after rule-based filtering. """ # Get cols to fit.
self._fitted = False self._training_inputs, self._training_indices = self._get_columns_to_fit( inputs, self.hyperparams) self._input_column_names = self._training_inputs.columns operated_col = [ int(x.strip('#')) for x in re.findall(r'#\d*#', self.hyperparams['rule']) ] if set(operated_col) != set(self._training_indices): # print(operated_col, self._training_indices) raise RuntimeError( "Column numbers in 'rule' and 'use_columns' are not matched.") if len(self._training_indices) > 0: self._fitted = True else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") # if not self._fitted: # raise PrimitiveNotFittedError("Primitive not fitted.") # sk_inputs = inputs # if self.hyperparams['use_semantic_types']: # sk_inputs = inputs.iloc[:, self._training_indices] output_columns = [] if len(self._training_indices) > 0: sk_output = self._rule_based_filter(inputs, self.hyperparams['rule']) if sparse.issparse(sk_output): sk_output = sk_output.toarray() outputs = self._wrap_predictions(inputs, sk_output) if len(outputs.columns) == len(self._input_column_names): outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns( return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) # self._write(outputs) # self.logger.warning('produce was called3') return CallResult(outputs) @classmethod def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): """ Select columns to fit. Args: inputs: Container DataFrame hyperparams: d3m.metadata.hyperparams.Hyperparams Returns: list """ # if not hyperparams['use_semantic_types']: # return inputs, list(range(len(inputs.columns))) inputs_metadata = inputs.metadata def can_produce_column(column_index: int) -> bool: return cls._can_produce_column(inputs_metadata, column_index, hyperparams) use_columns = [] exclude_columns = [] # if hyperparams['columns_using_method'] == 'name': # inputs_cols = inputs.columns.values.tolist() # for i in range(len(inputs_cols)): # if inputs_cols[i] in hyperparams['use_columns_name']: # use_columns.append(i) # elif inputs_cols[i] in hyperparams['exclude_columns_name']: # exclude_columns.append(i) # else: use_columns = hyperparams['use_columns'] exclude_columns = hyperparams['exclude_columns'] columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use( inputs_metadata, use_columns=use_columns, exclude_columns=exclude_columns, can_use_column=can_produce_column) return inputs.iloc[:, columns_to_produce], columns_to_produce # return columns_to_produce @classmethod def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: """ Output whether a column can be processed. 
Args: inputs_metadata: d3m.metadata.base.DataMetadata column_index: int Returns: bool """ column_metadata = inputs_metadata.query( (metadata_base.ALL_ELEMENTS, column_index)) accepted_structural_types = (int, float, numpy.integer, numpy.float64) accepted_semantic_types = set() accepted_semantic_types.add( "https://metadata.datadrivendiscovery.org/types/Attribute") if not issubclass(column_metadata['structural_type'], accepted_structural_types): return False semantic_types = set(column_metadata.get('semantic_types', [])) if len(semantic_types) == 0: cls.logger.warning("No semantic types found in column metadata") return False # Making sure all accepted_semantic_types are available in semantic_types if len(accepted_semantic_types - semantic_types) == 0: return True return False @classmethod def _update_predictions_metadata( cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict] ) -> metadata_base.DataMetadata: """ Updata metadata for selected columns. Args: inputs_metadata: metadata_base.DataMetadata outputs: Container Dataframe target_columns_metadata: list Returns: d3m.metadata.base.DataMetadata """ outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) for column_index, column_metadata in enumerate( target_columns_metadata): column_metadata.pop("structural_type", None) outputs_metadata = outputs_metadata.update_column( column_index, column_metadata) return outputs_metadata def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: """ Wrap predictions into dataframe Args: inputs: Container Dataframe predictions: array-like data (n_samples, n_features) Returns: Dataframe """ outputs = d3m_dataframe(predictions, generate_metadata=True) target_columns_metadata = self._add_target_columns_metadata( outputs.metadata, self.hyperparams) outputs.metadata = self._update_predictions_metadata( inputs.metadata, outputs, target_columns_metadata) return outputs @classmethod def _add_target_columns_metadata( cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): """ Add target columns metadata Args: outputs_metadata: metadata.base.DataMetadata hyperparams: d3m.metadata.hyperparams.Hyperparams Returns: List[OrderedDict] """ outputs_length = outputs_metadata.query( (metadata_base.ALL_ELEMENTS, ))['dimension']['length'] target_columns_metadata: List[OrderedDict] = [] for column_index in range(outputs_length): # column_name = "output_{}".format(column_index) column_metadata = OrderedDict() semantic_types = set() semantic_types.add(hyperparams["return_semantic_type"]) column_metadata['semantic_types'] = list(semantic_types) # column_metadata["name"] = str(column_name) target_columns_metadata.append(column_metadata) return target_columns_metadata def _write(self, inputs: Inputs): inputs.to_csv(str(time.time()) + '.csv') def _rule_based_filter(self, X, rule): """ Filter the selected columns according to the rule. Args: X: slected rows to be performed rule: The rule to follow when performing the filter Returns: Dataframe, results of Rule-Based Filter """ list_result = [0] * X.shape[0] rule = re.sub(r'#\d*#', lambda x: 'row[' + x.group(0).strip('#') + ']', rule) for index, row in X.iterrows(): if not eval(rule): list_result[index] = 1 return utils.pandas.DataFrame({'result': list_result})
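# Illustrative sketch, not part of the primitive above: _rule_based_filter()
# rewrites each '#N#' placeholder in the rule string into a per-row lookup and
# evaluates the resulting expression row by row, flagging violating rows with
# 1. The same substitution step in isolation; eval() is kept only to mirror
# the primitive, and a safer expression parser would be preferable in
# production code. The function name is hypothetical.
import re
import pandas as pd


def apply_rule(df: pd.DataFrame, rule: str) -> pd.DataFrame:
    # '#1# > 10'  ->  'row[1] > 10'
    expr = re.sub(r'#\d+#', lambda m: 'row[' + m.group(0).strip('#') + ']', rule)
    flags = []
    for _, row in df.iterrows():
        flags.append(0 if eval(expr) else 1)  # 1 marks a row violating the rule
    return pd.DataFrame({'result': flags})


# Example: apply_rule(pd.DataFrame({0: [1, 2], 1: [5, 20]}), '#1# > 10')
# returns result = [1, 0]: the first row violates the rule, the second does not.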
class BKFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): """ Filter a time series using the Baxter-King bandpass filter. Parameters ---------- low: int Minimum period for oscillations. Based on the Burns-Mitchell characterization of the U.S. business cycle, Baxter and King suggest 6 for quarterly data and 1.5 for annual data. high: int Maximum period for oscillations. Baxter and King suggest 32 for quarterly data and 8 for annual data. K: int Lead-lag length of the filter. Baxter and King propose a truncation length of 12 for quarterly data and 3 for annual data. use_columns: Set A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped. exclude_columns: Set A set of column indices to not operate on. Applicable only if "use_columns" is not provided. return_result: Enumeration Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false. use_semantic_types: Bool Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe. add_index_columns: Bool Also include primary index columns if input data has them. Applicable only if "return_result" is set to "new". error_on_no_input: Bool Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False. return_semantic_type: Enumeration[str] Decides what semantic type to attach to generated attributes. """ __author__ = "DATA Lab at Texas A&M University" metadata = metadata_base.PrimitiveMetadata({ "name": "Baxter-King Filter Primitive", "python_path": "d3m.primitives.tods.feature_analysis.bk_filter", "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:[email protected]', 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/DuplicationValidation.py']}, "algorithm_types": [metadata_base.PrimitiveAlgorithmType.BK_FILTER,], "primitive_family": metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, "id": "b2bfadc5-dbca-482c-b188-8585e5f245c4", "hyperparams_to_tune": ['low', 'high', 'K'], "version": "0.0.1", }) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Process the testing data. Args: inputs: Container DataFrame. Returns: Container DataFrame after BKFilter. """ # Get cols to fit.
self._fitted = False self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) self._input_column_names = self._training_inputs.columns if len(self._training_indices) > 0: # self._clf.fit(self._training_inputs) self._fitted = True else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") if not self._fitted: raise PrimitiveNotFittedError("Primitive not fitted.") sk_inputs = inputs if self.hyperparams['use_semantic_types']: sk_inputs = inputs.iloc[:, self._training_indices] output_columns = [] if len(self._training_indices) > 0: sk_output = self._bkfilter(sk_inputs, low=self.hyperparams['low'], high=self.hyperparams['high'], K=self.hyperparams['K']) if sparse.issparse(sk_output): sk_output = sk_output.toarray() outputs = self._wrap_predictions(inputs, sk_output) if len(outputs.columns) == len(self._input_column_names): outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) # self._write(outputs) # self.logger.warning('produce was called3') return CallResult(outputs) @classmethod def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): """ Select columns to fit. Args: inputs: Container DataFrame hyperparams: d3m.metadata.hyperparams.Hyperparams Returns: list """ if not hyperparams['use_semantic_types']: return inputs, list(range(len(inputs.columns))) inputs_metadata = inputs.metadata def can_produce_column(column_index: int) -> bool: return cls._can_produce_column(inputs_metadata, column_index, hyperparams) use_columns = [] exclude_columns = [] # if hyperparams['columns_using_method'] == 'name': # inputs_cols = inputs.columns.values.tolist() # for i in range(len(inputs_cols)): # if inputs_cols[i] in hyperparams['use_columns_name']: # use_columns.append(i) # elif inputs_cols[i] in hyperparams['exclude_columns_name']: # exclude_columns.append(i) # else: use_columns=hyperparams['use_columns'] exclude_columns=hyperparams['exclude_columns'] columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, use_columns=use_columns, exclude_columns=exclude_columns, can_use_column=can_produce_column) return inputs.iloc[:, columns_to_produce], columns_to_produce # return columns_to_produce @classmethod def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: """ Output whether a column can be processed. 
Args: inputs_metadata: d3m.metadata.base.DataMetadata column_index: int Returns: bool """ column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) accepted_structural_types = (int, float, numpy.integer, numpy.float64) accepted_semantic_types = set() accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") if not issubclass(column_metadata['structural_type'], accepted_structural_types): return False semantic_types = set(column_metadata.get('semantic_types', [])) if len(semantic_types) == 0: cls.logger.warning("No semantic types found in column metadata") return False # Making sure all accepted_semantic_types are available in semantic_types if len(accepted_semantic_types - semantic_types) == 0: return True return False @classmethod def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: """ Updata metadata for selected columns. Args: inputs_metadata: metadata_base.DataMetadata outputs: Container Dataframe target_columns_metadata: list Returns: d3m.metadata.base.DataMetadata """ outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) for column_index, column_metadata in enumerate(target_columns_metadata): column_metadata.pop("structural_type", None) outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) return outputs_metadata def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: """ Wrap predictions into dataframe Args: inputs: Container Dataframe predictions: array-like data (n_samples, n_features) Returns: Dataframe """ outputs = d3m_dataframe(predictions, generate_metadata=True) target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) return outputs @classmethod def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): """ Add target columns metadata Args: outputs_metadata: metadata.base.DataMetadata hyperparams: d3m.metadata.hyperparams.Hyperparams Returns: List[OrderedDict] """ outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] target_columns_metadata: List[OrderedDict] = [] for column_index in range(outputs_length): column_name = "output_{}".format(column_index) column_metadata = OrderedDict() semantic_types = set() semantic_types.add(hyperparams["return_semantic_type"]) column_metadata['semantic_types'] = list(semantic_types) column_metadata["name"] = str(column_name) target_columns_metadata.append(column_metadata) return target_columns_metadata def _write(self, inputs:Inputs): inputs.to_csv(str(time.time())+'.csv') def _bkfilter(self, X, low, high, K): """ Perform BKFilter Args: X: slected rows to be performed K, low, high: Parameters of BKFilter Returns: Dataframe, results of BKFilter """ transformed_X = utils.pandas.DataFrame() for col in X.columns: cycle = sm.tsa.filters.bkfilter(X[col], low=low, high=high, K=K) cycle_df = utils.pandas.DataFrame(cycle) transformed_X = utils.pandas.concat([transformed_X,cycle_df], axis=1) return transformed_X
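# Illustrative sketch, not part of the primitive above: _bkfilter() is a thin
# per-column wrapper around statsmodels' Baxter-King bandpass filter. Below is
# the underlying call on a single synthetic quarterly series, using the
# quarterly settings cited in the class docstring (low=6, high=32, K=12); the
# series and function name are made up for the demo.
import numpy as np
import pandas as pd
import statsmodels.api as sm


def bk_cycle(series: pd.Series, low: int = 6, high: int = 32, K: int = 12) -> pd.Series:
    # Returns the cyclical component; the first and last K observations are dropped.
    return sm.tsa.filters.bkfilter(series, low=low, high=high, K=K)


# Example on a noisy sine wave standing in for a business-cycle series:
# t = np.arange(200)
# y = pd.Series(np.sin(2 * np.pi * t / 20) + 0.1 * np.random.randn(200))
# cycle = bk_cycle(y)  # length 200 - 2 * K = 176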
class LinearRegressionPrimitive( ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], GradientCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], SamplingCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): """ ------------- Inputs: DataFrame of features of shape: NxM, where N = samples and M = features. Outputs: DataFrame containing the target column of shape Nx1 or denormalized dataset. ------------- """ # Metadata __author__ = 'UBC DARPA D3M Team, Tony Joseph <*****@*****.**>' metadata = metadata_base.PrimitiveMetadata({ "id": "7288e169-5c2b-434a-96f8-cb2144e7f9cc", "version": config.VERSION, "name": "Bayesian Linear Regression", "description": "A bayesian linear regression", "python_path": "d3m.primitives.regression.linear_regression.UBC", "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, "algorithm_types": [ metadata_base.PrimitiveAlgorithmType.LINEAR_REGRESSION, ], "source": { "name": config.D3M_PERFORMER_TEAM, "contact": config.D3M_CONTACT, "uris": [config.REPOSITORY], }, "keywords": ['bayesian', 'regression'], "installation": [config.INSTALLATION], "hyperparams_to_tune": ['learning_rate', 'minibatch_size'] }) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, _verbose: int = 0) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed) self.hyperparams = hyperparams self._random_state = random_seed self._verbose = _verbose self._training_inputs: Inputs = None self._training_outputs: Outputs = None self.label_name_columns = None self._batch_size = hyperparams['minibatch_size'] self._use_gradient_fit = hyperparams['use_gradient_fit'] self._num_iterations = hyperparams['num_iterations'] self._learning_rate = hyperparams['learning_rate'] self._analytic_fit_threshold = hyperparams['analytic_fit_threshold'] self._weights_prior = hyperparams['weights_prior'] self._tune_prior_end_to_end = hyperparams['tune_prior_end_to_end'] self._fit_term_temperature = 0.0 self._weights = None # type: torch.autograd.Variable self._noise_variance = None self._weights_variance = None self._iterations_done = None # type: int self._has_finished = False self._new_training_data = True self._inputs = None self._outputs = None self._use_analytic_form = False # Is the model fit on data self._fitted = False def _curate_data(self, training_inputs, training_outputs, get_labels): # if self._training_inputs is None or self._training_outputs is None: if training_inputs is None: raise ValueError("Missing data.") # Get training data and labels data try: feature_columns_1 = training_inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/Attribute') except: feature_columns_1 = None try: feature_columns_2 = training_inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/FileName') except: feature_columns_2 = None # Remove columns if outputs present in inputs if len(feature_columns_2) >= 1: for fc_2 in feature_columns_2: try: feature_columns_1.remove(fc_2) except ValueError: pass # Get labels data if present in training input try: label_columns = training_inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/TrueTarget') except: label_columns = training_inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' ) # If no error but no label-columns found, force try SuggestedTarget if 
len(label_columns) == 0 or label_columns == None: label_columns = training_inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' ) # Remove columns if outputs present in inputs if len(label_columns) >= 1: for lbl_c in label_columns: try: feature_columns_1.remove(lbl_c) except ValueError: pass # Training Set feature_columns_1 = [int(fc) for fc in feature_columns_1] try: new_XTrain = (( training_inputs.iloc[:, feature_columns_1]).to_numpy()).astype( np.float) except ValueError: # Most likely Numpy ndarray series XTrain = training_inputs.iloc[:, feature_columns_1] XTrain_shape = XTrain.shape[0] XTrain = ((XTrain.iloc[:, -1]).to_numpy()) # Unpack new_XTrain = [] for arr in range(XTrain_shape): new_XTrain.append(XTrain[arr]) new_XTrain = np.array(new_XTrain) # del to save memory del XTrain # Training labels if get_labels: if training_outputs is None: raise ValueError("Missing data.") # Get labelled dataset try: label_columns = training_outputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/TrueTarget' ) except ValueError: label_columns = training_outputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' ) # If no error but no label-columns force try SuggestedTarget if len(label_columns) == 0 or label_columns == None: label_columns = training_outputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' ) YTrain = (( training_outputs.iloc[:, label_columns]).to_numpy()).astype( np.float) # Get label column names label_name_columns = [] label_name_columns_ = list(training_outputs.columns) for lbl_c in label_columns: label_name_columns.append(label_name_columns_[lbl_c]) self.label_name_columns = label_name_columns return new_XTrain, YTrain, feature_columns_1 return new_XTrain, feature_columns_1 def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: inputs, outputs, _ = self._curate_data(training_inputs=inputs, training_outputs=outputs, get_labels=True) N, P = inputs.shape if self._use_gradient_fit: self._use_analytic_form = False elif P < N and N / P < self._analytic_fit_threshold: self._use_analytic_form = True inputs_with_ones = np.insert(inputs, P, 1, axis=1) self._training_inputs = to_variable(inputs_with_ones, requires_grad=True) self._training_outputs = to_variable(outputs, requires_grad=True) self._new_training_data = True self._has_finished = False self._iterations_done = 0 self._converged_count = 0 self._best_rmse = np.inf def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ inputs: (num_inputs, D) numpy array outputs : numpy array of dimension (num_inputs) """ # Curate data XTest, feature_columns = self._curate_data(training_inputs=inputs, training_outputs=None, get_labels=False) XTest = self._offset_input(inputs=XTest) self._weights = refresh_node(self._weights) self._noise_variance = refresh_node(self._noise_variance) self._weights_variance = refresh_node(self._weights_variance) self._inputs = to_variable(XTest, requires_grad=True) mu = torch.mm(self._inputs, self._weights.unsqueeze(0).transpose(0, 1)).squeeze() reparameterized_normal = torch.distributions.normal.Normal( mu, self._noise_variance.expand(len(mu))) self._outputs = reparameterized_normal.rsample() self._outputs.reqiures_grad = True predictions = self._outputs.data.numpy() # Delete columns with path names of nested media files outputs = 
inputs.remove_columns(feature_columns) # Convert from ndarray from DataFrame predictions = container.DataFrame(predictions, generate_metadata=True) # Update Metadata for each feature vector column for col in range(predictions.shape[1]): col_dict = dict( predictions.metadata.query((metadata_base.ALL_ELEMENTS, col))) col_dict['structural_type'] = type(1.0) col_dict['name'] = self.label_name_columns[col] col_dict["semantic_types"] = ( "http://schema.org/Float", "https://metadata.datadrivendiscovery.org/types/PredictedTarget", ) predictions.metadata = predictions.metadata.update( (metadata_base.ALL_ELEMENTS, col), col_dict) # Rename Columns to match label columns predictions.columns = self.label_name_columns # Append to outputs outputs = outputs.append_columns(predictions) return base.CallResult(outputs) def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult: """ Runs gradient descent for ``timeout`` seconds or ``iterations`` iterations, whichever comes sooner, on log normal_density(self.weights * self.input - output, identity*self.noise_variance) + parameter_prior_primitives["weights"].score(self.weights) + parameter_prior_primitives["noise_variance"].score(noise_variance). """ if self._fitted: return base.CallResult(None) iterations = self._num_iterations if self._new_training_data: self._weights = torch.FloatTensor( np.random.randn(self._training_inputs.size()[1]) * 0.001) self._noise_variance = torch.ones(1) # this should be a matrix self._weights_variance = torch.ones(1) self._new_training_data = False elif self._has_finished: return CallResult(None, has_finished=self._has_finished, iterations_done=self._iterations_done) if self._use_analytic_form: self._analytic_fit(iterations=iterations) else: self._gradient_fit(timeout=timeout, iterations=iterations, batch_size=self._batch_size) self._fitted = True return CallResult(None) def _analytic_fit(self, *, iterations): train_x = self._training_inputs.data.numpy() train_y = self._training_outputs.data.numpy() cov_dim = self._training_inputs.shape[1] inv_covar = np.zeros([cov_dim, cov_dim]) if self._weights_prior is not None: # just the prior on weights minus offset inv_covar[:cov_dim - 1, :cov_dim - 1] = np.linalg.inv( self._weights_prior.get_params()['covariance']) # this expression is (X^T*X + Lambda*I)^-1*X^T*Y # i.e. 
it is the solution to the problem argmin_w(E_D(w) + E_w(w)) # where the first E_D(w) is the ML objective (least squares mvn) # and the second term E_w(w) is the regularizer, in this case Lambda/2*w^T*w w_sigma = np.dot( np.transpose(train_x), train_x) + inv_covar * float(self._noise_variance.data.numpy()[0]) w_mu = np.dot(np.dot(np.linalg.inv(w_sigma), np.transpose(train_x)), train_y) self._weights = torch.FloatTensor(w_mu.flatten()) self._weights_variance = torch.FloatTensor(w_sigma) self._iterations_done = 1 self._has_finished = True def _gradient_fit(self, *, timeout: float = None, iterations: int = 100, fit_threshold: float = 0, batch_size: int) -> None: if self._training_inputs is None or self._training_outputs is None: raise ValueError("Missing training data.") if timeout is None: timeout = np.inf if batch_size is None: batch_size = 1 x_batches = [] y_batches = [] # optionally do sampling with replacement for i in range(0, len(self._training_inputs), batch_size): x_batches.append(self._training_inputs[i:i + batch_size]) y_batches.append(self._training_outputs[i:i + batch_size]) num_batches = len(x_batches) start = time.time() # self._weights_variance = torch.Tensor() iter_count = 0 has_converged = False while iter_count < iterations and has_converged == False: iter_count += 1 batch_no = iter_count % num_batches grads = [ self._gradient_params_log_likelihood(input=training_input, output=training_output) for training_input, training_output in zip( x_batches[batch_no], y_batches[batch_no]) ] weights_grad = sum(grad[0] for grad in grads) * num_batches noise_grad = sum(grad[1] for grad in grads) * num_batches if self._weights_prior is not None: # TODO scale this by bz over total data weights_grad += torch.from_numpy( self._weights_prior.gradient_output(outputs=np.array( [self._weights.data.numpy()]), inputs=[])) self._weights.data += self._learning_rate * weights_grad * 1 / torch.norm( weights_grad) self._noise_variance.data += self._learning_rate * noise_grad * 1 / torch.norm( noise_grad) train_outputs = torch.mm( self._training_inputs, self._weights.unsqueeze(0).transpose(0, 1)).squeeze() train_y = self._training_outputs.data.numpy().flatten() rmse = mean_squared_error(train_outputs.data.numpy(), train_y) if rmse < self._best_rmse: self._converged_count = 0 self._best_rmse = rmse else: self._converged_count += 1 if self._converged_count > 1000: self._has_finished = True break self._iterations_done += iter_count def log_likelihoods(self, *, outputs: Outputs, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[ndarray]: """ input : D-length numpy ndarray output : float Calculates log(normal_density(self.weights * self.input - output, identity * self.noise_variance)) for a single input/output pair. """ result = np.array([ self._log_likelihood(output=to_variable(output), input=to_variable(input)).data.numpy() for input, output in zip(inputs, outputs) ]) return CallResult(result) def log_likelihood(self, *, outputs: Outputs, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[float]: inputs = self._offset_input(inputs=inputs) result = self.log_likelihoods(outputs=outputs, inputs=inputs, timeout=timeout, iterations=iterations) return CallResult(sum(result.value), has_finished=result.has_finished, iterations_done=result.iterations_done) def _log_likelihood( self, output: torch.autograd.Variable, input: torch.autograd.Variable) -> torch.autograd.Variable: """ All inputs are torch tensors (or variables if grad desired). 
input : D-length torch to_variable output : float """ expected_output = torch.dot(self._weights, input).unsqueeze(0) covariance = to_variable(self._noise_variance).view(1, 1) return log_mvn_likelihood(expected_output, covariance, output) def _gradient_params_log_likelihood( self, *, output: torch.autograd.Variable, input: torch.autograd.Variable ) -> Tuple[torch.autograd.Variable, torch.autograd.Variable, torch.autograd.Variable]: """ Output is ( D-length torch variable, 1-length torch variable ) """ self._weights = refresh_node(self._weights) self._noise_variance = refresh_node(self._noise_variance) log_likelihood = self._log_likelihood(output=output, input=input) log_likelihood.backward() return (self._weights.grad.data, self._noise_variance.grad.data) def _gradient_output_log_likelihood( self, *, output: ndarray, input: torch.autograd.Variable) -> torch.autograd.Variable: """ output is D-length torch variable """ output_var = to_variable(output) log_likelihood = self._log_likelihood(output=output_var, input=input) log_likelihood.backward() return output_var.grad def gradient_output(self, *, outputs: Outputs, inputs: Inputs) -> Gradients[Outputs]: # type: ignore """ Calculates grad_output log normal_density(self.weights * self.input - output, identity * self.noise_variance) for a single input/output pair. """ inputs = self._offset_input(inputs=inputs) outputs_vars = [ to_variable(output, requires_grad=True) for output in outputs ] inputs_vars = [to_variable(input) for input in inputs] grad = sum( self._gradient_output_log_likelihood(output=output, input=input) for (input, output) in zip(inputs_vars, outputs_vars)) return grad.data.numpy() def gradient_params(self, *, outputs: Outputs, inputs: Inputs) -> Gradients[Params]: # type: ignore """ Calculates grad_weights fit_term_temperature * log normal_density(self.weights * self.input - output, identity * self.noise_variance) for a single input/output pair. 
""" outputs_vars = [ to_variable(output, requires_grad=True) for output in outputs ] inputs_vars = [to_variable(input) for input in inputs] grads = [ self._gradient_params_log_likelihood(output=output, input=input) for (input, output) in zip(inputs_vars, outputs_vars) ] grad_weights = sum(grad[0] for grad in grads) grad_noise_variance = sum(grad[1] for grad in grads) return Params(weights=grad_weights, offset=grad_offset, noise_variance=grad_noise_variance) def _sample_once(self, *, inputs: Inputs) -> Outputs: """ input : NxD numpy ndarray outputs : N-length numpy ndarray """ if self._weights is None or self._noise_variance is None or self._weights_variance is None: raise ValueError("Params not set.") inputs = self._offset_input(inputs=inputs) weights = np.random.multivariate_normal( self._weights.detach().numpy(), self._weights_variance.detach().numpy()) # sample outputs output_means = [np.dot(weights, input) for input in inputs] outputs = np.random.normal(output_means, self._noise_variance.data[0]) return outputs def sample(self, *, inputs: Inputs, num_samples: int = 1, timeout: float = None, iterations: int = None) -> Sequence[Outputs]: """ input : num_inputs x D numpy ndarray outputs : num_predictions x num_inputs numpy ndarray """ return [self._sample_once(inputs=inputs) for _ in range(num_samples)] def backward( self, *, gradient_outputs: Gradients[Outputs], fine_tune: bool = False, fine_tune_learning_rate: float = 0.00001 ) -> Tuple[Gradients[Inputs], Gradients[Params]]: # type: ignore if self._inputs is None: raise Exception( 'Cannot call backpropagation before forward propagation. Call "produce" before "backprop".' ) else: if self._inputs.grad is not None: self._inputs.grad.data.zero_() self._outputs.backward(gradient=torch.Tensor(gradient_outputs)) # this is the gradients given by end to end loss weights_grad = self._weights.grad.data noise_grad = self._noise_variance.grad.data if fine_tune: # this is gradients given by the annealed local loss if self._fit_term_temperature != 0: # TODO use minibatches here training_grads = [ self._gradient_params_log_likelihood(output=output, input=input) for (input, output) in zip(self._training_inputs, self._training_outputs) ] weights_grad += self._fit_term_temperature * \ sum(grad[0] for grad in training_grads) noise_grad += self._fit_term_temperature * \ sum(grad[1] for grad in training_grads) # make local update with temperature if required # TODO add the score frmo the prior primitive here self._weights.data += weights_grad * 1 / torch.norm( weights_grad) self._noise_variance.data += noise_grad * 1 / torch.norm( noise_grad) self._weights = refresh_node(self._weights) self._noise_variance = refresh_node(self._noise_variance) grad_inputs = self._inputs.grad grad_params = Params(weights=ndarray(weights_grad[:-1]), offset=float(weights_grad[-1]), noise_variance=float(noise_grad[0]), weights_variance=ndarray( np.zeros(self._weights_variance.shape))) if self._tune_prior_end_to_end: # update priors parameters here if specified self._weights_prior.backward(gradient_outputs=grad['weights'], fine_tune=True) return grad_inputs, grad_params def set_fit_term_temperature(self, *, temperature: float = 0) -> None: self._fit_term_temperature = temperature def _offset_input(self, *, inputs: Inputs) -> Inputs: if inputs.shape[1] == self._weights.shape[0]: return inputs else: return np.insert(inputs, inputs.shape[1], 1, axis=1) def get_call_metadata(self) -> CallResult: return CallResult(None, has_finished=self._has_finished, 
iterations_done=self._iterations_done) def get_params(self) -> Params: return Params( weights=ndarray(self._weights[:-1].data.numpy()), offset=float(self._weights[-1].data.numpy()), noise_variance=float(self._noise_variance.data.numpy()[0]), weights_variance=ndarray(self._weights_variance.data.numpy()), target_names_=self.label_name_columns) def set_params(self, *, params: Params) -> None: full_weights = np.append(params['weights'], params['offset']) self._weights = to_variable(full_weights, requires_grad=True) self._weights.retain_grad() self._weights_variance = to_variable(params['weights_variance'], requires_grad=True) self._noise_variance = to_variable(params['noise_variance'], requires_grad=True) self.label_name_columns = params['target_names_'] self._fitted = True def __getstate__(self) -> dict: state = super().__getstate__() state['random_state'] = self._random_state return state def __setstate__(self, state: dict) -> None: super().__setstate__(state) self._random_state = state['random_state']
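# Illustrative sketch, not part of the primitive above: _analytic_fit() solves
# the regularized least-squares problem in closed form,
# w = (X^T X + sigma^2 * Lambda)^(-1) X^T y, where Lambda is the prior
# precision on the weights and the trailing bias column is left unregularized.
# A compact numpy version of that computation; `alpha` is a hypothetical scalar
# prior precision standing in for the primitive's weights_prior covariance.
import numpy as np


def analytic_bayes_linreg(X, y, noise_variance=1.0, alpha=1.0):
    Xb = np.hstack([X, np.ones((X.shape[0], 1))])     # append bias column, as set_training_data() does
    prior_precision = alpha * np.eye(Xb.shape[1])
    prior_precision[-1, -1] = 0.0                     # no penalty on the offset term
    A = Xb.T @ Xb + noise_variance * prior_precision  # mirrors w_sigma in _analytic_fit()
    w = np.linalg.solve(A, Xb.T @ y)                  # posterior mean of the weights
    return w[:-1], w[-1]                              # (weights, offset)


# Example:
# rng = np.random.default_rng(0)
# X = rng.normal(size=(50, 3)); y = X @ np.array([1.0, -2.0, 0.5]) + 0.3
# w, b = analytic_bayes_linreg(X, y)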
class ObjectDetectionRNPrimitive(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]): """ Primitive that utilizes RetinaNet, a convolutional neural network (CNN), for object detection. The methodology comes from "Focal Loss for Dense Object Detection" by Lin et al. 2017 (https://arxiv.org/abs/1708.02002). The code implementation is based off of the base library found at: https://github.com/fizyr/keras-retinanet. The primitive accepts a Dataset consisting of images, labels as input and returns a dataframe as output which include the bounding boxes for each object in each image. """ metadata = metadata_base.PrimitiveMetadata({ 'id': 'd921be1e-b158-4ab7-abb3-cb1b17f42639', 'version': __version__, 'name': 'retina_net', 'python_path': 'd3m.primitives.object_detection.retina_net.ObjectDetectionRN', 'keywords': [ 'object detection', 'convolutional neural network', 'digital image processing', 'RetinaNet' ], 'source': { 'name': __author__, 'contact': __contact__, 'uris': [ 'https://github.com/kungfuai/d3m-primitives', ], }, "installation": [{ "type": "PIP", "package": "cython", "version": "0.29.16" }, { "type": metadata_base.PrimitiveInstallationType.PIP, "package_uri": "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives" .format(git_commit=utils.current_git_commit( os.path.dirname(__file__)), ), }, { 'type': "FILE", 'key': "resnet50", 'file_uri': "http://public.datadrivendiscovery.org/ResNet-50-model.keras.h5", 'file_digest': "0128cdfa3963288110422e4c1a57afe76aa0d760eb706cda4353ef1432c31b9c" }], 'algorithm_types': [metadata_base.PrimitiveAlgorithmType.RETINANET], 'primitive_family': metadata_base.PrimitiveFamily.OBJECT_DETECTION, 'can_use_gpus': True }) def __init__(self, *, hyperparams: Hyperparams, volumes: typing.Dict[str, str] = None, random_seed: int = 0) -> None: super().__init__(hyperparams=hyperparams, volumes=volumes, random_seed=random_seed) self.image_paths = None self.annotations = None self.base_dir = None self.classes = None self.backbone = None self.y_true = None def get_params(self) -> Params: return Params(base_dir=self.base_dir, image_paths=self.image_paths, annotations=self.annotations, classes=self.classes) def set_params(self, *, params: Params) -> None: self.base_dir = params['base_dir'] self.image_paths = params['image_paths'] self.annotations = params['annotations'] self.classes = params['classes'] def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: """ Sets the primitive's training data and preprocesses the files for RetinaNet format. Parameters ---------- inputs: numpy ndarray of size (n_images, dimension) containing the d3m Index, image name, and bounding box for each image. Returns ------- No returns. Function is called by pipeline at runtime. 
""" # Prepare annotation file ## Generate image paths image_cols = inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/FileName') self.base_dir = [ inputs.metadata.query( (metadata_base.ALL_ELEMENTS, t))['location_base_uris'][0].replace('file:///', '/') for t in image_cols ] self.image_paths = np.array([[ os.path.join(self.base_dir, filename) for filename in inputs.iloc[:, col] ] for self.base_dir, col in zip(self.base_dir, image_cols)]).flatten() self.image_paths = pd.Series(self.image_paths) ## Arrange proper bounding coordinates bounding_coords = inputs.bounding_box.str.split(',', expand=True) bounding_coords = bounding_coords.drop( bounding_coords.columns[[2, 5, 6, 7]], axis=1) bounding_coords.columns = ['x1', 'y1', 'y2', 'x2'] bounding_coords = bounding_coords[['x1', 'y1', 'x2', 'y2']] ## Generate class names class_name = pd.Series(['class'] * inputs.shape[0]) ## Assemble annotation file self.annotations = pd.concat( [self.image_paths, bounding_coords, class_name], axis=1) self.annotations.columns = [ 'img_file', 'x1', 'y1', 'x2', 'y2', 'class_name' ] # Prepare ID file self.classes = pd.DataFrame({'class_name': ['class'], 'class_id': [0]}) def _create_callbacks(self, model, training_model, prediction_model): """ Creates the callbacks to use during training. Parameters ---------- model : The base model. training_model : The model that is used for training. prediction_model : The model that should be used for validation. validation_generator : The generator for creating validation data. Returns ------- callbacks : A list of callbacks used for training. """ callbacks = [] callbacks.append( keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)) return callbacks def _create_models(self, backbone_retinanet, num_classes, weights, freeze_backbone=False, lr=1e-5): """ Creates three models (model, training_model, prediction_model). Parameters ---------- backbone_retinanet : A function to call to create a retinanet model with a given backbone. num_classes : The number of classes to train. weights : The weights to load into the model. multi_gpu : The number of GPUs to use for training. freeze_backbone : If True, disables learning for the backbone. config : Config parameters, None indicates the default configuration. Returns ------- model : The base model. training_model : The training model. If multi_gpu=0, this is identical to model. prediction_model : The model wrapped with utility functions to perform object detection (applies regression values and performs NMS). """ modifier = freeze_model if freeze_backbone else None anchor_params = None num_anchors = None model = self._model_with_weights(backbone_retinanet( num_classes, num_anchors=num_anchors, modifier=modifier), weights=weights, skip_mismatch=True) training_model = model prediction_model = retinanet_bbox(model=model, anchor_params=anchor_params) training_model.compile(loss={ 'regression': losses.smooth_l1(), 'classification': losses.focal() }, optimizer=keras.optimizers.adam(lr=lr, clipnorm=0.001)) return model, training_model, prediction_model def _num_classes(self): """ Number of classes in the dataset. """ return max(self.classes.values()) + 1 def _model_with_weights(self, model, weights, skip_mismatch): """ Load weights for model. Parameters ---------- model : The model to load weights for. weights : The weights to load. 
skip_mismatch : If True, skips layers whose shape of weights doesn't match with the model. Returns ------- model : Model with loaded weights. """ if weights is not None: model.load_weights(weights, by_name=True, skip_mismatch=skip_mismatch) return model def _create_generator(self, annotations, classes, shuffle_groups): """ Create generator for evaluation. """ validation_generator = CSVGenerator(self.annotations, self.classes, self.base_dir, self.hyperparams['batch_size'], self.backbone.preprocess_image, shuffle_groups=False) return validation_generator def _fill_empty_predictions(self, empty_predictions_image_names, d3mIdx_image_mapping): """ D3M metrics evaluator needs at least one prediction per image. If RetinaNet does not return predictions for an image, this method creates a dummy empty prediction row to add to results_df for that missing image. TODO: DUMMY CONFIDENCE SCORES LOWER AVERAGE PRECISION. FIND A FIX. """ # Prepare D3M index empty_predictions_d3mIdx = [ d3mIdx_image_mapping.get(key) for key in empty_predictions_image_names ] empty_predictions_d3mIdx = [ item for sublist in empty_predictions_d3mIdx for item in sublist ] # Prepare dummy columns d3mIdx = empty_predictions_d3mIdx bounding_box = ["0,0,0,0,0,0,0,0"] * len(empty_predictions_d3mIdx) confidence = [float(0)] * len(empty_predictions_d3mIdx) empty_predictions_df = pd.DataFrame({ 'd3mIndex': d3mIdx, 'bounding_box': bounding_box, 'confidence': confidence }) return empty_predictions_df def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: """ Creates the image generators and then trains RetinaNet model on the image paths in the input dataframe column. Can choose to use validation generator. If no weight file is provided, the default is to use the ImageNet weights. """ # Create object that stores backbone information self.backbone = models.backbone(self.hyperparams['backbone']) # Create the generators train_generator = CSVGenerator(self.annotations, self.classes, self.base_dir, self.hyperparams['batch_size'], self.backbone.preprocess_image, shuffle_groups=False) # Running the model ## Assign weights if self.hyperparams['weights'] is False: weights = None else: weights = self.volumes[self.hyperparams['backbone']] ## Create model logger.info('Creating model...') model, training_model, prediction_model = self._create_models( backbone_retinanet=self.backbone.retinanet, num_classes=train_generator.num_classes(), weights=weights, freeze_backbone=self.hyperparams['freeze_backbone'], lr=self.hyperparams['learning_rate']) ### !!! vgg AND densenet BACKBONES CURRENTLY NOT IMPLEMENTED !!! ## Let the generator compute the backbone layer shapes using the actual backbone model # if 'vgg' in self.hyperparams['backbone'] or 'densenet' in self.hyperparams['backbone']: # train_generator.compute_shapes = make_shapes_callback(model) # if validation_generator: # validation_generator.compute_shapes = train_generator.compute_shapes ## Set up callbacks callbacks = self._create_callbacks( model, training_model, prediction_model, ) start_time = time.time() logger.info('Starting training...') training_model.fit_generator( generator=train_generator, steps_per_epoch=self.hyperparams['n_steps'], epochs=self.hyperparams['n_epochs'], verbose=1, callbacks=callbacks, ) training_model.save_weights(self.hyperparams['weights_path'] + 'model_weights.h5') logger.info( f'Training complete. Training took {time.time()-start_time} seconds.' 
) return CallResult(None) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Produce image detection predictions. Parameters ---------- inputs : numpy ndarray of size (n_images, dimension) containing the d3m Index, image name, and bounding box for each image. Returns ------- outputs : A d3m dataframe container with the d3m index, image name, bounding boxes as a string (8 coordinate format), and confidence scores. """ iou_threshold = 0.5 # Bounding box overlap threshold for false positive or true positive score_threshold = 0.05 # The score confidence threshold to use for detections max_detections = 100 # Maxmimum number of detections to use per image # Create object that stores backbone information backbone = models.backbone(self.hyperparams['backbone']) # Create the generators train_generator = CSVGenerator(self.annotations, self.classes, self.base_dir, self.hyperparams['batch_size'], backbone.preprocess_image, shuffle_groups=False) # Assign weights if self.hyperparams['weights'] is False: weights = None else: weights = self.volumes[self.hyperparams['backbone']] # Instantiate model model, training_model, prediction_model = self._create_models( backbone_retinanet=backbone.retinanet, num_classes=train_generator.num_classes(), weights=weights, freeze_backbone=self.hyperparams['freeze_backbone'], lr=self.hyperparams['learning_rate']) # Load model weights saved in fit training_model.load_weights(self.hyperparams['weights_path'] + 'model_weights.h5') # Convert training model to inference model inference_model = models.convert_model(training_model) # Generate image paths image_cols = inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/FileName') self.base_dir = [ inputs.metadata.query( (metadata_base.ALL_ELEMENTS, t))['location_base_uris'][0].replace('file:///', '/') for t in image_cols ] self.image_paths = np.array([[ os.path.join(self.base_dir, filename) for filename in inputs.iloc[:, col] ] for self.base_dir, col in zip(self.base_dir, image_cols)]).flatten() self.image_paths = pd.Series(self.image_paths) # Initialize output objects box_list = [] score_list = [] image_name_list = [] # Predict bounding boxes and confidence scores for each image image_list = [ x for i, x in enumerate(self.image_paths.tolist()) if self.image_paths.tolist().index(x) == i ] start_time = time.time() logger.info('Starting testing...') for i in image_list: image = read_image_bgr(i) # preprocess image for network image = preprocess_image(image) image, scale = resize_image(image) boxes, scores, labels = inference_model.predict_on_batch( tf.constant(np.expand_dims(image, axis=0), dtype=tf.float32)) # correct for image scale boxes /= scale for box, score in zip(boxes[0], scores[0]): if score < 0.5: break b = box.astype(int) box_list.append(b) score_list.append(score) image_name_list.append(i * len(b)) logger.info( f'Testing complete. Testing took {time.time()-start_time} seconds.' 
) ## Convert predicted boxes from a list of arrays to a list of strings boxes = np.array(box_list).tolist() boxes = list( map(lambda x: [x[0], x[1], x[0], x[3], x[2], x[3], x[2], x[1]], boxes)) # Convert to 8 coordinate format for D3M boxes = list(map(lambda x: ",".join(map(str, x)), boxes)) # Create mapping between image names and D3M index input_df = pd.DataFrame({ 'd3mIndex': inputs.d3mIndex, 'image': [os.path.basename(list) for list in self.image_paths] }) d3mIdx_image_mapping = input_df.set_index('image').T.to_dict('list') # Extract values for image name keys and get missing image predictions (if they exist) image_name_list = [os.path.basename(list) for list in image_name_list] d3mIdx = [d3mIdx_image_mapping.get(key) for key in image_name_list] empty_predictions_image_names = [ k for k, v in d3mIdx_image_mapping.items() if v not in d3mIdx ] d3mIdx = [item for sublist in d3mIdx for item in sublist] # Flatten list of lists ## Assemble in a Pandas DataFrame results = pd.DataFrame({ 'd3mIndex': d3mIdx, 'bounding_box': boxes, 'confidence': score_list }) # D3M metrics evaluator needs at least one prediction per image. If RetinaNet does not return # predictions for an image, create a dummy empty prediction row to add to results_df for that # missing image. if len(empty_predictions_image_names) != 0: # Create data frame of empty predictions for missing each image and concat with results. # Sort results_df. empty_predictions_df = self._fill_empty_predictions( empty_predictions_image_names, d3mIdx_image_mapping) results_df = pd.concat([results, empty_predictions_df ]).sort_values('d3mIndex') else: results_df = results # Convert to DataFrame container results_df = d3m_DataFrame(results_df) ## Assemble first output column ('d3mIndex) col_dict = dict( results_df.metadata.query((metadata_base.ALL_ELEMENTS, 0))) col_dict['structural_type'] = type("1") col_dict['name'] = 'd3mIndex' col_dict['semantic_types'] = ( 'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey') results_df.metadata = results_df.metadata.update( (metadata_base.ALL_ELEMENTS, 0), col_dict) ## Assemble second output column ('bounding_box') col_dict = dict( results_df.metadata.query((metadata_base.ALL_ELEMENTS, 1))) col_dict['structural_type'] = type("1") col_dict['name'] = 'bounding_box' col_dict['semantic_types'] = ( 'http://schema.org/Text', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget', 'https://metadata.datadrivendiscovery.org/types/BoundingPolygon') results_df.metadata = results_df.metadata.update( (metadata_base.ALL_ELEMENTS, 1), col_dict) ## Assemble third output column ('confidence') col_dict = dict( results_df.metadata.query((metadata_base.ALL_ELEMENTS, 2))) col_dict['structural_type'] = type("1") col_dict['name'] = 'confidence' col_dict['semantic_types'] = ( 'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Score') results_df.metadata = results_df.metadata.update( (metadata_base.ALL_ELEMENTS, 2), col_dict) return CallResult(results_df)
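# --- Illustrative example (not part of the primitive above) ---
# A minimal sketch of the box-format conversion used in produce() above:
# RetinaNet emits [x1, y1, x2, y2] corner boxes, while the D3M evaluator
# expects an 8-coordinate polygon string; the corner order below mirrors the
# conversion lambda in produce(). The sample box is illustrative only.
def to_d3m_polygon(box):
    x1, y1, x2, y2 = box
    corners = [x1, y1, x1, y2, x2, y2, x2, y1]
    return ",".join(str(int(c)) for c in corners)

print(to_d3m_polygon([10, 20, 110, 220]))  # -> "10,20,10,220,110,220,110,20"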
class EnrichDatesPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): """ Enriches dates by converting to seconds from a base time and computing Z scores. The results are appended to the existing dataset, and the original column is left in place for additional downstream processing. """ metadata = metadata_base.PrimitiveMetadata( { "id": "b1367f5b-bab1-4dfc-a1a9-6a56430e516a", "version": version.__version__, "name": "Enrich dates", "python_path": "d3m.primitives.data_transformation.enrich_dates.DistilEnrichDates", "source": { "name": "Distil", "contact": "mailto:[email protected]", "uris": [ "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/enrich_dates.py", "https://github.com/uncharted-distil/distil-primitives", ], }, "installation": [ CYTHON_DEP, { "type": metadata_base.PrimitiveInstallationType.PIP, "package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives" .format(git_commit=utils.current_git_commit( os.path.dirname(__file__)), ), }, ], "algorithm_types": [ metadata_base.PrimitiveAlgorithmType.ENCODE_BINARY, ], "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, }, ) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: logger.debug(f"Running {__name__}") outputs = inputs.copy() outputs = self._enrich_dates(outputs) logger.debug(f"\n{outputs}") return base.CallResult(outputs) def _enrich_dates(self, inputs: Inputs) -> Outputs: # determine columns we need to operate on cols = distil_utils.get_operating_columns( inputs, self.hyperparams["use_columns"], ("http://schema.org/DateTime", )) date_num = 0 for c in cols: try: # compute z scores for column members inputs_seconds = ( (pd.to_datetime(inputs.iloc[:, c]) - pd.to_datetime("2000-01-01")).dt.total_seconds().values) sec_mean = inputs_seconds.mean() sec_std = inputs_seconds.std() sec_val = 0.0 if sec_std != 0.0: sec_val = (inputs_seconds - sec_mean) / sec_std if self.hyperparams["replace"]: inputs.metadata = inputs.metadata.add_semantic_type( (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float") inputs.metadata = inputs.metadata.remove_semantic_type( (metadata_base.ALL_ELEMENTS, c), "http://schema.org/DateTime") inputs.metadata = inputs.metadata.update( (metadata_base.ALL_ELEMENTS, c), {"structural_type": float}) inputs[inputs.columns[c]] = sec_val else: # append the results and update semantic types result = container.DataFrame( {f"__date_{date_num}": sec_val}, generate_metadata=True) result.metadata = result.metadata.add_semantic_type( (metadata_base.ALL_ELEMENTS, 0), "http://schema.org/Float") inputs = inputs.append_columns(result) date_num += 1 except: continue return inputs
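# --- Illustrative example (not part of the primitive above) ---
# A minimal, standalone sketch of the per-column transform in _enrich_dates
# above: dates become seconds elapsed since a fixed epoch (2000-01-01) and are
# then z-scored; a zero-variance column maps to all zeros. Sample dates are
# illustrative only.
import numpy as np
import pandas as pd

def zscore_dates(dates: pd.Series) -> np.ndarray:
    seconds = (pd.to_datetime(dates) -
               pd.to_datetime("2000-01-01")).dt.total_seconds().values
    std = seconds.std()
    if std == 0.0:
        return np.zeros_like(seconds)
    return (seconds - seconds.mean()) / std

print(zscore_dates(pd.Series(["2001-01-01", "2002-01-01", "2003-01-01"])))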
class HighRankImputer(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): """ This primitive imputes a dataset in which data points are drawn from multiple subspaces, which in pratice means the data have mutiple groups/classes. In such cases, the data matrices are often of high-rank. In such cases, Sparse Factorization based Matrix Completion (SFMC) can outperform classical low-rank matrix completion methods. The optimization is solved via accelerated proximal alternating minimization (APALM). The NaNs in the input matrix will be regarded as missing entries. The algorithm will recover the missing entries and return the recovered matrix as output. The method can be used for collaborative filtering (recommendation system) and data preprocessing. """ metadata = metadata_base.PrimitiveMetadata({ 'id': 'e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93', 'version': __version__, 'name': "Matrix Completion via Sparse Factorization", 'keywords': [ 'Matrix completion', 'low-rank matrix', 'high-rank matrix', 'sparse factorization', ], 'source': { 'name': __author__, 'contact': 'mailto:[email protected]', 'uris': [ 'https://github.com/cyangcornell/d3m-primitives.git', ], }, 'installation': [{ 'type': 'PIP', 'package_uri': 'git+https://github.com/cyangcornell/d3m-primitives.git@{git_commit}#egg=pyglrm-d3m' .format( git_commit=utils.current_git_commit(os.path.dirname(__file__))) }], 'python_path': 'd3m.primitives.collaborative_filtering.high_rank_imputer.Cornell', 'algorithm_types': [ metadata_base.PrimitiveAlgorithmType. LOW_RANK_MATRIX_APPROXIMATIONS, ], 'primitive_family': metadata_base.PrimitiveFamily.COLLABORATIVE_FILTERING, }) def __init__(self, *, hyperparams: Hyperparams, docker_containers: Dict[str, DockerContainer] = None, _versbose: int = 0) -> None: super().__init__(hyperparams=hyperparams, docker_containers=docker_containers) self.d: int = hyperparams['d'] self.tol: float = hyperparams['tol'] self.maxiter: int = hyperparams['maxiter'] self.alpha: float = hyperparams['alpha'] self.beta: float = hyperparams['beta'] self._fitted = False self._CF = False def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: self._training_inputs = inputs self._training_outputs = outputs self._fitted = False self._keys = list(outputs) self._MC = False def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: if self._fitted: return Vt = self._training_inputs.values if Vt.shape[1] > 3: X_incomplete = self._training_inputs.copy() X = self._training_inputs.values.copy() X = X.T m0 = 1 n0 = 0 else: self._CF = True x_rating = self._training_inputs.copy() x_rating[self._training_outputs. 
columns[0]] = self._training_outputs.values X_incomplete = x_rating.pivot(index=x_rating.columns[0], columns=x_rating.columns[1], values=x_rating.columns[2]) X = X_incomplete.values.copy() m0, n0 = X.shape if m0 > n0: X = X.T tol = self.tol maxiter = self.maxiter m, n = X.shape M = np.ones([m, n]) M[np.isnan(X)] = 0 sr = M.sum() / m / n X[np.isnan(X)] = 0 if self.d == 0: if sr > 0.5: d = np.int(np.round(0.5 * min(m, n))) else: d = np.int(3 * np.round(sr * min(m, n))) else: d = self.d alpha = self.alpha * n / d beta = self.beta * np.sqrt(n / d) self._beta = beta self._d = d A = np.random.randn(m, d) Z = np.zeros((d, n)) rho = max(1.5 * np.sqrt(M.mean()), 0.5) iter = 0 cc = 0.5 while iter < self.maxiter: iter = iter + 1 # Z_new if iter == 1: Z = Z else: Z = Z_new + cc * (Z_new - Z_old) tau = rho * np.linalg.norm(np.dot(A.T, A), 2) G = Z - (-np.dot(A.T, np.multiply(M, X - np.dot(A, Z)))) / tau Z_new = np.maximum(0, G - beta / tau) + np.minimum( 0, G + beta / tau) # A_new if iter == 1: A = A else: A = A_new + cc * (A_new - A_old) kai = rho * np.linalg.norm(np.dot(Z_new, Z_new.T), 2) H = A + np.dot(np.multiply(M, X - np.dot(A, Z_new)), Z_new.T) / kai A_new = 1 / (alpha + kai) * H * kai # check convergence stopC = max( np.linalg.norm(Z_new - Z, 'fro') / np.linalg.norm(Z_new, 'fro'), np.linalg.norm(A_new - A, 'fro') / np.linalg.norm(A_new, 'fro')) isstopC = stopC < tol if isstopC: Z = Z_new A = A_new break Z_old = Z A_old = A Z = Z_new A = A_new #X_temp=np.multiply(X,M)+np.multiply(np.dot(A,Z),1-M) X_temp = np.dot(A, Z) if m0 > n0: X_temp = X_temp.T self._A = A #self._X=pd.DataFrame(X_temp,X_incomplete.index,X_incomplete.columns) self._X = container.DataFrame(X_temp, index=X_incomplete.index, columns=X_incomplete.columns) self._fitted = True return CallResult(None) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: testData = inputs if self._CF: Xp = self._X y_pred = np.zeros(testData.shape[0]) + Xp.values.mean() idr = testData[testData.columns[0]].isin(Xp.index) idc = testData[testData.columns[1]].isin(Xp.columns) dd = np.where(idr & idc == True) for i in dd[0]: tpc = testData.values[i, 1] # id_col=Xp.columns.get_loc(np.str(tpc)) id_col = Xp.columns.get_loc(tpc) tpr = testData.values[i, 0] id_row = Xp.index.get_loc(tpr) y_pred[i] = Xp.values[id_row, id_col] self._index = inputs.index outputs = container.DataFrame(y_pred, index=self._index, columns=self._keys) else: X = inputs.values.copy() tol = self.tol maxiter = self.maxiter X = X.T m, n = X.shape M = np.ones([m, n]) M[np.isnan(X)] = 0 sr = M.sum() / m / n X[np.isnan(X)] = 0 beta = self._beta d = self._d A = self._A Z = np.zeros((d, n)) rho = max(1.5 * np.sqrt(M.mean()), 0.5) iter = 0 cc = 0.5 while iter < self.maxiter: iter = iter + 1 # Z_new if iter == 1: Z = Z else: Z = Z_new + cc * (Z_new - Z_old) tau = rho * np.linalg.norm(np.dot(A.T, A), 2) G = Z - (-np.dot(A.T, np.multiply(M, X - np.dot(A, Z)))) / tau Z_new = np.maximum(0, G - beta / tau) + np.minimum( 0, G + beta / tau) # check convergence stopC = np.linalg.norm(Z_new - Z, 'fro') / np.linalg.norm( Z_new, 'fro') isstopC = stopC < tol if isstopC: Z = Z_new break Z_old = Z Z = Z_new #X_temp=np.multiply(X,M)+np.multiply(np.dot(A,Z),1-M) X_temp = np.dot(A, Z) X_temp = X_temp.T outputs = container.DataFrame(X_temp, index=testData.index, columns=testData.columns) outputs.metadata = inputs.metadata return CallResult(outputs) def get_params(self) -> Params: return Params(X=self._X) def set_params(self, *, params: Params) -> None: 
self._X = params['X']
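# --- Illustrative example (not part of the primitive above) ---
# A minimal sketch of the soft-thresholding (proximal) step used for the sparse
# factor Z inside fit() above: each entry of G is shrunk toward zero by
# beta / tau, which is what induces sparsity in the factorization X ≈ A Z.
# The sample values are illustrative only.
import numpy as np

def soft_threshold(G: np.ndarray, thresh: float) -> np.ndarray:
    """Entry-wise shrinkage: sign(g) * max(|g| - thresh, 0)."""
    return np.maximum(0.0, G - thresh) + np.minimum(0.0, G + thresh)

G = np.array([-2.0, -0.3, 0.0, 0.4, 1.5])
print(soft_threshold(G, 0.5))  # -> [-1.5  0.   0.   0.   1. ]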
class ObjectDetectionRNPrimitive(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]): """ Primitive that utilizes RetinaNet, a convolutional neural network (CNN), for object detection. The methodology comes from "Focal Loss for Dense Object Detection" by Lin et al. 2017 (https://arxiv.org/abs/1708.02002). The code implementation is based off of the base library found at: https://github.com/fizyr/keras-retinanet. The primitive accepts a Dataset consisting of images, labels as input and returns a dataframe as output which include the bounding boxes for each object in each image. """ metadata = metadata_base.PrimitiveMetadata({ 'id': 'd921be1e-b158-4ab7-abb3-cb1b17f42639', 'version': '0.1.0', 'name': 'retina_net', 'python_path': 'd3m.primitives.object_detection.retinanet_convolutional_neural_network', 'keywords': [ 'object detection', 'convolutional neural network', 'digital image processing', 'RetinaNet' ], 'source': { 'name': 'Sanjeev Namjoshi', 'contact': 'mailto:[email protected]', 'uris': ['https://github.com/NewKnowledge/object-detection'], }, 'installation': [ { 'type': 'PIP', 'package_uri': 'git+https://github.com/NewKnowledge/object-detection.git@{git_commit}#egg=object-detection' .format(git_commit=utils.current_git_commit( os.path.dirname(__file__)), ) }, { 'type': "FILE", 'key': "resnet50", 'file_uri': "http://public.datadrivendiscovery.org/ResNet-50-model.keras.h5", 'file_digest': "0128cdfa3963288110422e4c1a57afe76aa0d760eb706cda4353ef1432c31b9c" # TBD } ], #'algorithm_types': [metadata_base.PrimitiveAlgorithmType.RETINANET_CONVOLUTIONAL_NEURAL_NETWORK], 'algorithm_types': [metadata_base.PrimitiveAlgorithmType.CONVOLUTIONAL_NEURAL_NETWORK], #'primitive_family': metadata_base.PrimitiveFamily.OBJECT_DETECTION 'primitive_family': metadata_base.PrimitiveFamily.DIGITAL_IMAGE_PROCESSING, }) def __init__(self, *, hyperparams: Hyperparams, volumes: typing.Dict[str, str] = None) -> None: super().__init__(hyperparams=hyperparams, volumes=volumes) self.image_paths = None self.annotations = None self.base_dir = None self.classes = None self.backbone = None self.y_true = None self.workers = 1 self.multiprocessing = 1 self.max_queue_size = 10 def get_params(self) -> Params: return self._params def set_params(self, *, params: Params) -> None: self.params = params def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: """ Sets the primitive's training data and preprocesses the files for RetinaNet format. Parameters ---------- inputs: numpy ndarray of size (n_images, dimension) containing the d3m Index, image name, and bounding box for each image. Returns ------- No returns. Function is called by pipeline at runtime. 
""" # Prepare annotation file ## Generate image paths image_cols = inputs.metadata.get_columns_with_semantic_type( 'https://metadata.datadrivendiscovery.org/types/FileName') self.base_dir = [ inputs.metadata.query( (metadata_base.ALL_ELEMENTS, t))['location_base_uris'][0].replace('file:///', '/') for t in image_cols ] self.image_paths = np.array([[ os.path.join(self.base_dir, filename) for filename in inputs.iloc[:, col] ] for self.base_dir, col in zip(self.base_dir, image_cols)]).flatten() self.image_paths = pd.Series(self.image_paths) ## Arrange proper bounding coordinates bounding_coords = inputs.bounding_box.str.split(',', expand=True) bounding_coords = bounding_coords.drop( bounding_coords.columns[[2, 5, 6, 7]], axis=1) bounding_coords.columns = ['x1', 'y1', 'y2', 'x2'] bounding_coords = bounding_coords[['x1', 'y1', 'x2', 'y2']] ## Generate class names class_name = pd.Series(['class'] * inputs.shape[0]) ## Assemble annotation file self.annotations = pd.concat( [self.image_paths, bounding_coords, class_name], axis=1) self.annotations.columns = [ 'img_file', 'x1', 'y1', 'x2', 'y2', 'class_name' ] # Prepare ID file self.classes = pd.DataFrame({'class_name': ['class'], 'class_id': [0]}) def _create_callbacks(self, model, training_model, prediction_model): """ Creates the callbacks to use during training. Parameters ---------- model : The base model. training_model : The model that is used for training. prediction_model : The model that should be used for validation. validation_generator : The generator for creating validation data. Returns ------- callbacks : A list of callbacks used for training. """ callbacks = [] callbacks.append( keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)) return callbacks def _create_models(self, backbone_retinanet, num_classes, weights, freeze_backbone=False, lr=1e-5): """ Creates three models (model, training_model, prediction_model). Parameters ---------- backbone_retinanet : A function to call to create a retinanet model with a given backbone. num_classes : The number of classes to train. weights : The weights to load into the model. multi_gpu : The number of GPUs to use for training. freeze_backbone : If True, disables learning for the backbone. config : Config parameters, None indicates the default configuration. Returns ------- model : The base model. training_model : The training model. If multi_gpu=0, this is identical to model. prediction_model : The model wrapped with utility functions to perform object detection (applies regression values and performs NMS). """ modifier = freeze_model if freeze_backbone else None anchor_params = None num_anchors = None model = self._model_with_weights(backbone_retinanet( num_classes, num_anchors=num_anchors, modifier=modifier), weights=weights, skip_mismatch=True) training_model = model prediction_model = retinanet_bbox(model=model, anchor_params=anchor_params) training_model.compile(loss={ 'regression': losses.smooth_l1(), 'classification': losses.focal() }, optimizer=keras.optimizers.adam(lr=lr, clipnorm=0.001)) return model, training_model, prediction_model def _num_classes(self): """ Number of classes in the dataset. """ return max(self.classes.values()) + 1 def _model_with_weights(self, model, weights, skip_mismatch): """ Load weights for model. Parameters ---------- model : The model to load weights for. weights : The weights to load. 
skip_mismatch : If True, skips layers whose shape of weights doesn't match with the model. Returns ------- model : Model with loaded weights. """ if weights is not None: model.load_weights(weights, by_name=True, skip_mismatch=skip_mismatch) return model def _create_generator(self, annotations, classes, shuffle_groups): """ Create generator for evaluation. """ validation_generator = CSVGenerator(self.annotations, self.classes, self.base_dir, self.hyperparams['batch_size'], self.backbone.preprocess_image, shuffle_groups=False) return validation_generator def _evaluate_model(self, generator, model, iou_threshold, score_threshold, max_detections, save_path): """ Evaluate a given dataset using a given model. Parameters ---------- generator : The generator that represents the dataset to evaluate. model : The model to evaluate. iou_threshold : The threshold used to consider when a detection is positive or negative. score_threshold : The score confidence threshold to use for detections. max_detections : The maximum number of detections to use per image. save_path : The path to save images with visualized detections to. Returns ------- all_detections : A list containing the predicted boxes for each image in the generator. """ box_list = [] score_list = [] for i in range(generator.size()): raw_image = generator.load_image(i) image = generator.preprocess_image(raw_image.copy()) image, scale = generator.resize_image(image) if keras.backend.image_data_format() == 'channels_first': image = image.transpose((2, 0, 1)) # run network boxes, scores, labels = model.predict_on_batch( np.expand_dims(image, axis=0))[:3] # correct boxes for image scale boxes /= scale for box, score in zip(boxes[0], scores[0]): if score < 0.5: break b = box.astype(int) box_list.append(b) score_list.append(score) ### !!! SAVEPATH CURRENTLY NOT IMPLEMENTED !!! ### This optional feature can be added later, maybe for TA3, allowing images to be output with ### bounding boxes to a specified directory after evaluation. # if save_path is True: # draw_annotations(raw_image, generator.load_annotations(i), label_to_name = generator.label_to_name) # draw_detections(raw_image, image_boxes, image_scores, image_labels, label_to_name = generator.label_to_name, score_threshold = score_threshold) # cv2.imwrite(os.path.join(save_path, '{}.png'.format(i)), raw_image) return box_list, score_list def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: """ Creates the image generators and then trains RetinaNet model on the image paths in the input dataframe column. Can choose to use validation generator. If no weight file is provided, the default is to use the ImageNet weights. 
""" # Create object that stores backbone information self.backbone = models.backbone(self.hyperparams['backbone']) # Set up specific GPU # if self.hyperparams['gpu_id'] is not None: # setup_gpu(self.hyperparams['gpu_id']) # Create the generators train_generator = CSVGenerator(self.annotations, self.classes, self.base_dir, self.hyperparams['batch_size'], self.backbone.preprocess_image) # Running the model ## Assign weights if self.hyperparams['weights'] is False: weights = None else: weights = self.volumes[self.hyperparams['backbone']] ## Create model print('Creating model...', file=sys.__stdout__) model, self.training_model, prediction_model = self._create_models( backbone_retinanet=self.backbone.retinanet, num_classes=train_generator.num_classes(), weights=weights, freeze_backbone=self.hyperparams['freeze_backbone'], lr=self.hyperparams['learning_rate']) #print(model.summary(), file = sys.__stdout__) model.summary() ### !!! vgg AND densenet BACKBONES CURRENTLY NOT IMPLEMENTED !!! ## Let the generator compute the backbone layer shapes using the actual backbone model # if 'vgg' in self.hyperparams['backbone'] or 'densenet' in self.hyperparams['backbone']: # train_generator.compute_shapes = make_shapes_callback(model) # if validation_generator: # validation_generator.compute_shapes = train_generator.compute_shapes ## Set up callbacks callbacks = self._create_callbacks( model, self.training_model, prediction_model, ) start_time = time.time() print('Starting training...', file=sys.__stdout__) self.training_model.fit_generator( generator=train_generator, steps_per_epoch=self.hyperparams['n_steps'], epochs=self.hyperparams['n_epochs'], verbose=1, callbacks=callbacks, workers=self.workers, use_multiprocessing=self.multiprocessing, max_queue_size=self.max_queue_size) print( f'Training complete. Training took {time.time()-start_time} seconds.', file=sys.__stdout__) return CallResult(None) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Produce image detection predictions. Parameters ---------- inputs : numpy ndarray of size (n_images, dimension) containing the d3m Index, image name, and bounding box for each image. Returns ------- outputs : A d3m dataframe container with the d3m index, image name, bounding boxes as a string (8 coordinate format), and confidence scores. 
""" iou_threshold = 0.5 # Bounding box overlap threshold for false positive or true positive score_threshold = 0.05 # The score confidence threshold to use for detections max_detections = 100 # Maxmimum number of detections to use per image # create the generator generator = self._create_generator(self.annotations, self.classes, shuffle_groups=False) # Convert training model to inference model inference_model = models.convert_model(self.training_model) # Assemble output lists ## Generate predicted bounding boxes (8-coordinate format, list) boxes, scores = self._evaluate_model(generator, inference_model, iou_threshold, score_threshold, max_detections, self.hyperparams['output']) ## Convert predicted boxes from a list of arrays to a list of strings boxes = np.array(boxes).tolist() boxes = list(map(lambda x: ",".join(map(str, x)), boxes)) ## Generate list of image names and d3m indices corresponding to predicted bounding boxes img_list = [ os.path.basename(list) for list in self.annotations['img_file'].tolist() ] d3m_idx = inputs.d3mIndex.tolist() print(len(d3m_idx), file=sys.__stdout__) print(len(img_list), file=sys.__stdout__) print(len(boxes), file=sys.__stdout__) print(len(scores), file=sys.__stdout__) ## Assemble in a Pandas DataFrame results = pd.DataFrame({ 'd3mIndex': d3m_idx, 'image': img_list, 'bounding_box': boxes, 'confidence': scores }) # Convert to DataFrame container results_df = d3m_DataFrame(results) ## Assemble first output column ('d3mIndex) col_dict = dict( results_df.metadata.query((metadata_base.ALL_ELEMENTS, 0))) col_dict['structural_type'] = type("1") col_dict['name'] = 'd3mIndex' col_dict['semantic_types'] = ( 'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey') results_df.metadata = results_df.metadata.update( (metadata_base.ALL_ELEMENTS, 0), col_dict) ## Assemble second output column ('image') col_dict = dict( results_df.metadata.query((metadata_base.ALL_ELEMENTS, 1))) col_dict['structural_type'] = type("1") col_dict['name'] = 'image' col_dict['semantic_types'] = ( 'http://schema.org/Text', 'https://metadata.datadrivendiscovery.org/types/Attribute') results_df.metadata = results_df.metadata.update( (metadata_base.ALL_ELEMENTS, 1), col_dict) ## Assemble third output column ('bounding_box') col_dict = dict( results_df.metadata.query((metadata_base.ALL_ELEMENTS, 2))) col_dict['structural_type'] = type("1") col_dict['name'] = 'bounding_box' col_dict['semantic_types'] = ( 'http://schema.org/Text', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget', 'https://metadata.datadrivendiscovery.org/types/BoundingPolygon') results_df.metadata = results_df.metadata.update( (metadata_base.ALL_ELEMENTS, 2), col_dict) ## Assemble fourth output column ('confidence') col_dict = dict( results_df.metadata.query((metadata_base.ALL_ELEMENTS, 3))) col_dict['structural_type'] = type("1") col_dict['name'] = 'confidence' col_dict['semantic_types'] = ( 'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Score') results_df.metadata = results_df.metadata.update( (metadata_base.ALL_ELEMENTS, 3), col_dict) return CallResult(results_df)
class CSVReader(FeaturizationTransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): """ BBN D3M CSV Reader Primitive Arguments: """ __git_commit__ = utils.current_git_commit(os.path.dirname(__file__)) metadata = metadata_module.PrimitiveMetadata({ 'id': 'a771e153-67d7-4f69-b5f9-1a764e502a23', 'version': __version__, 'name': "CSV Reader", 'description': "BBN D3M CSV Reader Primitive.", 'keywords': [], 'source': { 'name': __author__, 'contact': 'mailto:[email protected]', 'uris': [ 'https://github.com/BBN-E/d3m-bbn-primitives/blob/{git_commit}/bbn_primitives/time_series/csv_reader.py' .format(git_commit=__git_commit__), 'https://github.com/BBN-E/d3m-bbn-primitives.git', ], }, 'installation': [{ 'type': 'UBUNTU', 'package': 'ffmpeg', 'version': '7:2.8.11-0', }, { 'type': 'PIP', 'package_uri': 'git+https://github.com/BBN-E/d3m-bbn-primitives.git@{git_commit}#egg={egg}' .format(git_commit=__git_commit__, egg='bbn_primitives'), }], 'python_path': 'd3m.primitives.data_preprocessing.csv_reader.CSVReader', 'algorithm_types': [metadata_module.PrimitiveAlgorithmType.DATA_CONVERSION ], #['DATA_CONVERSION'], # replaced 'AUDIO_MIXING' 'primitive_family': metadata_module.PrimitiveFamily.DATA_PREPROCESSING, #'algorithm_types': ['DATA_CONVERSION'], # TODO: replace by a new algorithm_type, e.g. ? #'primitive_family': 'DATA_PREPROCESSING', }) def __init__( self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: typing.Dict[str, DockerContainer] = None) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) self._metadata_lookup = None return def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Arguments: - inputs: [ num_samples, num_channels ] Returns: - [ num_samples ] """ with stopit.ThreadingTimeout(timeout) as timer: metadata_lookup = self.__class__._parse_metadata( metadata=inputs.metadata) if not metadata_lookup: return None outputs = Outputs() metadata = self.__class__._can_accept(self=self, method_name='produce', arguments={ 'inputs': inputs.metadata, }, hyperparams=self.hyperparams, outputs=outputs) csv_location_base_uris = inputs.metadata.query( metadata_lookup['location_base_uris'] ['selector'])['location_base_uris'][0] for idx, row in inputs[metadata_lookup['primary_resource_id'] ['selector'][0]].iterrows(): #for idx in range(len(inputs[metadata_lookup['primary_resource_id']['selector'][0]])): #row = inputs[metadata_lookup['primary_resource_id']['selector'][0]][idx] #d3mIndex = row[metadata_lookup['primary_key']['selector'][-1]] d3mIndex = row['d3mIndex'] csv_fn = row[metadata_lookup['csv_fn']['selector'][-1]] filename = os.path.join(csv_location_base_uris, csv_fn) filename = re.sub('^file://', '', filename) #csv_file= csv.load(filename) csv_file = pd.read_csv(filename, index_col=0) start = 0 end = len(csv_file) outputs.append(csv_file) metadata = metadata.update((idx, ), {'sampling_rate': 1}) metadata = metadata.update((), {'dimension': { 'length': len(outputs) }}) # Set metadata attribute. 
outputs.metadata = metadata if timer.state == timer.EXECUTED: return CallResult(outputs) else: raise TimeoutError('Reader exceeded time limit') @classmethod def can_accept( cls, *, method_name: str, arguments: typing.Dict[str, typing.Union[metadata_module.Metadata, type]] ) -> typing.Optional[metadata_module.DataMetadata]: output_metadata = super().can_accept(method_name=method_name, arguments=arguments, hyperparams=hyperparams) return cls._can_accept(self=cls, method_name=method_name, arguments=arguments, hyperparams=hyperparams, outputs=Outputs()) @classmethod def _can_accept( cls, *, self, method_name: str, arguments: typing.Dict[str, typing.Union[metadata_module.Metadata, type]], hyperparams: Hyperparams, outputs: Outputs) -> typing.Optional[metadata_module.DataMetadata]: #output_metadata = super().can_accept(method_name=method_name, arguments=arguments, hyperparams=hyperparams) if 'inputs' not in arguments: return output_metadata inputs_metadata = typing.cast(metadata_module.DataMetadata, arguments['inputs']) metadata_lookup = cls._parse_metadata(metadata=inputs_metadata) num_data = inputs_metadata.query(metadata_lookup['primary_resource_id'] ['selector'])['dimension']['length'] metadata = inputs_metadata.clear( { 'schema': metadata_module.CONTAINER_SCHEMA_VERSION, 'structural_type': Outputs, 'dimension': { 'length': num_data, } }, for_value=outputs, source=self ).update( (metadata_module.ALL_ELEMENTS, ), { 'structural_type': d3m_ndarray, 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Timeseries', ) }, source=self) return metadata @classmethod def _update_metadata_lookup(cls, metadata_lookup, key, selector): if key not in metadata_lookup: raise Exception('Updating unknown key %s' % key) metadata_lookup[key]['found'] = True metadata_lookup[key]['selector'] = selector @classmethod def _valid_metadata_lookup(cls, metadata_lookup): for k in metadata_lookup.keys(): if metadata_lookup[k][ 'required'] and not metadata_lookup[k]['found']: return False return True @classmethod def _init_metadata_lookup(cls): metadata_lookup = dict() metadata_lookup['primary_key'] = { 'required': True, 'found': False, 'selector': None, } metadata_lookup['primary_resource_id'] = { 'required': True, 'found': False, 'selector': None, } metadata_lookup['csv_fn'] = { 'required': True, 'found': False, 'selector': None, } metadata_lookup['location_base_uris'] = { 'required': True, 'found': False, 'selector': None, } return metadata_lookup @classmethod def _parse_metadata(cls, *, metadata: metadata_module.DataMetadata): flatten = lambda l: [item for sublist in l for item in sublist] mdlu = cls._init_metadata_lookup() num_res = metadata.query(())['dimension']['length'] resources = [str(x) for x in range(num_res - 1)] resources.append('learningData') primary_key = [[ (res_id, metadata_module.ALL_ELEMENTS, col_id) for col_id in range( metadata.query(( res_id, metadata_module.ALL_ELEMENTS))['dimension']['length']) if 'd3mIndex' == metadata.query((res_id, metadata_module.ALL_ELEMENTS, col_id))['name'] ] for res_id in resources] primary_key = flatten(primary_key) if len(primary_key) != 1: raise Exception('One primary key supported') cls._update_metadata_lookup(mdlu, 'primary_key', primary_key[0]) cls._update_metadata_lookup(mdlu, 'primary_resource_id', (primary_key[0][0], )) csv_res_type = 'https://metadata.datadrivendiscovery.org/types/Timeseries' primary_resource_cols = metadata.query( (mdlu['primary_resource_id']['selector'][0], metadata_module.ALL_ELEMENTS)) for col_id in 
range(primary_resource_cols['dimension']['length']): cmd = metadata.query((mdlu['primary_resource_id']['selector'][0], metadata_module.ALL_ELEMENTS, col_id)) if 'semantic_types' in cmd: st = cmd['semantic_types'] if 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' in st: # we already found primary key pass elif 'https://metadata.datadrivendiscovery.org/types/Attribute' in st: if 'foreign_key' in cmd and cmd['foreign_key'][ 'type'] == 'COLUMN': foreign_resource_id = cmd['foreign_key']['resource_id'] foreign_resource_md = metadata.query( (foreign_resource_id, )) foreign_col_selector = ( foreign_resource_id, metadata_module.ALL_ELEMENTS, cmd['foreign_key']['column_index']) foreign_col_md = metadata.query(foreign_col_selector) if csv_res_type in foreign_col_md['semantic_types'] and \ 'https://metadata.datadrivendiscovery.org/types/FileName' in foreign_col_md['semantic_types']: cls._update_metadata_lookup( mdlu, 'csv_fn', (mdlu['primary_resource_id']['selector'][0], metadata_module.ALL_ELEMENTS, col_id)) cls._update_metadata_lookup( mdlu, 'location_base_uris', foreign_col_selector) else: _logger.warning( 'Expected foreign resource of type %s and column of semantic type Filename' % (csv_res_type)) else: _logger.warning('Unexpected semantic type Attribute') elif 'https://metadata.datadrivendiscovery.org/types/InstanceWeight' in st: _logger.warning( 'Semantic type InstanceWeight recognized but unused in the current implementation' ) elif 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in st: _logger.info( 'Semantic type SuggestedTarget is ignored by this primitive' ) #else: #raise Exception('Semantic type(s) %s does not match any supported types' % (st)) return mdlu if cls._valid_metadata_lookup(mdlu) else None
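# --- Illustrative example (not part of the primitive above) ---
# A minimal sketch of the per-row file resolution performed in produce() above:
# the referenced CSV filename is joined to the collection's location_base_uris,
# the file:// scheme is stripped, and the file is read with pandas. The example
# path in the comment is hypothetical.
import os
import re
import pandas as pd

def load_referenced_csv(location_base_uri: str, csv_fn: str) -> pd.DataFrame:
    filename = os.path.join(location_base_uri, csv_fn)
    filename = re.sub('^file://', '', filename)  # strip the URI scheme
    return pd.read_csv(filename, index_col=0)

# Hypothetical usage, assuming a local timeseries collection:
# ts = load_referenced_csv('file:///data/timeseries/', '0001.csv')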
class DistilVertexNominationPrimitive(PrimitiveBase[container.List, container.DataFrame, Params, Hyperparams]): """ A primitive that uses random forest to solve vertext nomination problems. """ metadata = metadata_base.PrimitiveMetadata( { "id": "0130828c-1ac0-47a9-a167-f05bae5a3146", "version": version.__version__, "name": "VertexNomination", "python_path": "d3m.primitives.vertex_nomination.seeded_graph_matching.DistilVertexNomination", "source": { "name": "Distil", "contact": "mailto:[email protected]", "uris": [ "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/vertex_nomination.py", "https://github.com/uncharted-distil/distil-primitives", ], }, "installation": [ CYTHON_DEP, { "type": metadata_base.PrimitiveInstallationType.PIP, "package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives" .format(git_commit=utils.current_git_commit( os.path.dirname(__file__)), ), }, ], "algorithm_types": [ metadata_base.PrimitiveAlgorithmType.RANDOM_FOREST, ], "primitive_family": metadata_base.PrimitiveFamily.VERTEX_NOMINATION, }, ) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed) self._model = VertexNominationCV( target_metric=self.hyperparams["metric"], random_seed=random_seed) def set_training_data(self, *, inputs: container.List, outputs: container.DataFrame) -> None: self._inputs = inputs self._outputs = outputs self._target_col = outputs.columns[0] def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: logger.debug(f"Fitting {__name__}") X_train, y_train, U_train = self._inputs X_train = X_train.value y_train = y_train.squeeze() self._model.fit(X_train, y_train, U_train) return CallResult(None) def produce(self, *, inputs: container.List, timeout: float = None, iterations: int = None) -> CallResult[container.DataFrame]: logger.debug(f"Producing {__name__}") X_train, _, U = inputs X_train = X_train.value result = self._model.predict(X_train, U) # create dataframe to hold d3mIndex and result result_df = container.DataFrame({ X_train.index.name: X_train.index, self._target_col: result }) # mark the semantic types on the dataframe result_df.metadata = result_df.metadata.add_semantic_type( (metadata_base.ALL_ELEMENTS, 0), "https://metadata.datadrivendiscovery.org/types/PrimaryKey", ) result_df.metadata = result_df.metadata.add_semantic_type( (metadata_base.ALL_ELEMENTS, 1), "https://metadata.datadrivendiscovery.org/types/PredictedTarget", ) return base.CallResult(result_df) def get_params(self) -> Params: return Params(model=self._model, target_col=self._target_col) def set_params(self, *, params: Params) -> None: self._target_col = params["target_col"] self._model = params["model"] return
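# --- Illustrative example (not part of the primitive above) ---
# A minimal sketch of the output assembly in produce() above: the node index
# (named d3mIndex here as an assumption) becomes the first column and the
# model's predictions become the target column; the real primitive then marks
# them with PrimaryKey / PredictedTarget semantic types. Values are illustrative.
import pandas as pd

features = pd.DataFrame({"feat": [0.1, 0.7, 0.4]},
                        index=pd.Index([11, 12, 13], name="d3mIndex"))
predictions = [1, 0, 1]

result_df = pd.DataFrame({features.index.name: features.index,
                          "target": predictions})
print(result_df)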
class DeepAR(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): """ Primitive that applies a deep autoregressive forecasting algorithm for time series prediction. The implementation is based off of this paper: https://arxiv.org/pdf/1704.04110.pdf and is implemented in AWS's Sagemaker interface. Training inputs: 1) Feature dataframe, 2) Target dataframe Outputs: Dataframe with predictions for specific time series at specific future time instances Arguments: hyperparams {Hyperparams} -- D3M Hyperparameter object Keyword Arguments: random_seed {int} -- random seed (default: {0}) """ metadata = metadata_base.PrimitiveMetadata( { # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()". "id": "3410d709-0a13-4187-a1cb-159dd24b584b", "version": __version__, "name": "DeepAR", # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable. "keywords": [ "time series", "forecasting", "recurrent neural network", "autoregressive", ], "source": { "name": __author__, "contact": __contact__, "uris": [ # Unstructured URIs. "https://github.com/NewKnowledge/TimeSeries-D3M-Wrappers", ], }, # A list of dependencies in order. These can be Python packages, system packages, or Docker images. # Of course Python packages can also have their own dependencies, but sometimes it is necessary to # install a Python package first to be even able to run setup.py of another package. Or you have # a dependency which is not on PyPi. "installation": [ {"type": "PIP", "package": "cython", "version": "0.29.14"}, { "type": metadata_base.PrimitiveInstallationType.PIP, "package_uri": "git+https://github.com/NewKnowledge/TimeSeries-D3M-Wrappers.git@{git_commit}#egg=TimeSeriesD3MWrappers".format( git_commit=utils.current_git_commit(os.path.dirname(__file__)), ), }, ], # The same path the primitive is registered with entry points in setup.py. "python_path": "d3m.primitives.time_series_forecasting.lstm.DeepAR", # Choose these from a controlled vocabulary in the schema. If anything is missing which would # best describe the primitive, make a merge request. "algorithm_types": [ metadata_base.PrimitiveAlgorithmType.RECURRENT_NEURAL_NETWORK, ], "primitive_family": metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING, } ) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed) # set seed for reproducibility tf.random.set_seed(random_seed) self._is_fit = False self._new_train_data = False def get_params(self) -> Params: return self._params def set_params(self, *, params: Params) -> None: self._params = params def _drop_multiple_special_cols(self, col_list, col_type): """ private util function that creates list of duplicated special columns (for deletion) Arguments: col_list {List[int]} -- list of column indices col_type {str} -- D3M semantic type Returns: int or None -- first column idx in col_list if any column idxs are marked (else None) """ if len(col_list) == 0: return None elif len(col_list) > 1: logger.warn( f"""There are more than one {col_type} marked. 
This primitive will use the first and drop other {col_type}s.""" ) self._drop_cols += col_list[1:] if col_type != "target column": self._drop_cols_no_tgt += col_list[1:] return col_list[0] def _get_cols(self, input_metadata): """ private util function: get indices of important columns from metadata Arguments: input_metadata {D3M Metadata object} -- D3M Metadata object for input frame Raises: ValueError: If Target column is not of type 'Integer' or 'Float' """ self._drop_cols = [] self._drop_cols_no_tgt = [] # get target idx (first column by default) target_columns = input_metadata.list_columns_with_semantic_types( ( "https://metadata.datadrivendiscovery.org/types/SuggestedTarget", "https://metadata.datadrivendiscovery.org/types/TrueTarget", "https://metadata.datadrivendiscovery.org/types/Target", ) ) if len(target_columns) == 0: raise ValueError("At least one column must be marked as a target") self._target_column = self._drop_multiple_special_cols( target_columns, "target column" ) # get timestamp idx (first column by default) timestamp_columns = input_metadata.list_columns_with_semantic_types( ( "https://metadata.datadrivendiscovery.org/types/Time", "http://schema.org/DateTime", ) ) self._timestamp_column = self._drop_multiple_special_cols( timestamp_columns, "timestamp column" ) # get grouping idx and add suggested grouping keys to drop_cols list grouping_columns = input_metadata.list_columns_with_semantic_types( ("https://metadata.datadrivendiscovery.org/types/GroupingKey",) ) self._grouping_column = self._drop_multiple_special_cols( grouping_columns, "grouping column" ) suggested_grouping_columns = input_metadata.list_columns_with_semantic_types( ("https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey",) ) self._drop_cols += suggested_grouping_columns self._drop_cols_no_tgt += suggested_grouping_columns # get index_col (first index column by default) index_columns = input_metadata.list_columns_with_semantic_types( ("https://metadata.datadrivendiscovery.org/types/PrimaryKey",) ) self._index_column = self._drop_multiple_special_cols( index_columns, "index column" ) # determine whether targets are count data target_semantic_types = input_metadata.query_column_field( self._target_column, "semantic_types" ) if self.hyperparams["count_data"] is not None: self._count_data = self.hyperparams["count_data"] elif "http://schema.org/Integer" in target_semantic_types: if np.min(self._ts_frame.iloc[:, self._target_column]) > 0: self._count_data = True else: self._count_data = False elif "http://schema.org/Float" in target_semantic_types: self._count_data = False else: raise ValueError("Target column is not of type 'Integer' or 'Float'") #logger.info(f"count data: {self._count_data}") def _update_indices(self): """ private util function: subtract length of drop cols from each marked idx to account for smaller df """ length = len(self._drop_cols) if self._target_column is not None: self._target_column -= length if self._timestamp_column is not None: self._timestamp_column -= length if self._grouping_column is not None: self._grouping_column -= length if self._index_column is not None: self._index_column -= length self._cols_after_drop = self._ts_frame.shape[0] def _create_data_object_and_learner(self, val_split): """ private util function: creates (or updates) train ds object and learner Arguments: val_split {float} -- proportion of training data to withhold for validation """ # Create TimeSeries dataset objects #logger.info(self._ts_frame.head()) self._ts_object = TimeSeriesTrain( 
self._ts_frame, target_idx=self._target_column, timestamp_idx=self._timestamp_column, grouping_idx=self._grouping_column, index_col=self._index_column, count_data=self._count_data, negative_obs=self.hyperparams["negative_obs"], val_split=val_split, integer_timestamps=self._integer_timestamps, freq=self.freq, ) #logger.info(self._ts_object.data.head()) # Create learner self._learner = DeepARLearner( self._ts_object, emb_dim=self.hyperparams["emb_dim"], lstm_dim=self.hyperparams["lstm_dim"], dropout=self.hyperparams["dropout_rate"], lr=self.hyperparams["learning_rate"], batch_size=self.hyperparams["batch_size"], train_window=self.hyperparams["window_size"], verbose=0, ) # save weights so we can restart fitting from scratch (if desired by caller) self._learner.save_weights("model_initial_weights.h5") def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: """ Sets primitive's training data Arguments: inputs {Inputs} -- D3M dataframe containing attributes outputs {Outputs} -- D3M dataframe containing targets Raises: ValueError: If multiple columns are annotated with 'Time' or 'DateTime' metadata """ # save copy of train data so we don't predict for each row in training self._output_columns = outputs.columns self._train_data = inputs.copy() # combine inputs and outputs for internal TimeSeries object self._ts_frame = inputs.append_columns(outputs) # Parse cols needed for ts object self._get_cols(self._ts_frame.metadata) # drop cols if multiple special type columns if len(self._drop_cols) > 0: self._ts_frame = self._ts_frame.remove_columns(self._drop_cols) self._update_indices() # assumption is that integer timestamps are days (treated this way by DeepAR objects) if "http://schema.org/Integer" in self._ts_frame.metadata.query_column_field( self._timestamp_column, "semantic_types" ): self._integer_timestamps = True else: self._integer_timestamps = False # calculate frequency of time series g_col, t_col = ( self._ts_frame.columns[self._grouping_column], self._ts_frame.columns[self._timestamp_column], ) if self._grouping_column is None: time_col_sorted = np.sort(self._ts_frame[t_col]) self._min_train = time_col_sorted[0] self.freq = calculate_time_frequency(time_col_sorted[1] - self._min_train) # self._train_diff = int( # np.diff(np.sort(self._ts_frame.iloc[:, self._timestamp_column]))[0] # ) else: # assume frequency is the same across all time series self.freq = calculate_time_frequency( int( self._ts_frame.groupby(g_col)[t_col] .apply(lambda x: np.diff(np.sort(x))) .iloc[0][0] ) ) self._min_train = self._ts_frame.groupby(g_col)[t_col].agg("min").min() # Create TimeSeries dataset object and learner self._create_data_object_and_learner(self.hyperparams["val_split"]) # mark that new training data has been set self._new_train_data = True self._in_sample_preds = None def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: """ Fits DeepAR model using training data from set_training_data and hyperparameters Keyword Arguments: timeout {float} -- timeout, considered (default: {None}) iterations {int} -- iterations, considered (default: {None}) Returns: CallResult[None] """ # restore initial model weights if new training data if self._new_train_data: # only create new dataset object / model (w/out val) if new training data if iterations is not None: self._create_data_object_and_learner(0) self._learner.load_weights("model_initial_weights.h5") if iterations is None: iterations_set = False iterations = self.hyperparams["epochs"] validation = 
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """Fits DeepAR model using training data from set_training_data and hyperparameters

        Keyword Arguments:
            timeout {float} -- timeout, considered (default: {None})
            iterations {int} -- iterations, considered (default: {None})

        Returns:
            CallResult[None]
        """

        # restore initial model weights if new training data was set
        if self._new_train_data:
            # only create a new dataset object / model (without validation) when the
            # caller fixes the iteration count
            if iterations is not None:
                self._create_data_object_and_learner(0)
            self._learner.load_weights("model_initial_weights.h5")

        if iterations is None:
            iterations_set = False
            iterations = self.hyperparams["epochs"]
            validation = self.hyperparams["val_split"] > 0
        else:
            iterations_set = True
            validation = False

        # time training for 1 epoch so we can consider the timeout argument thoughtfully
        if timeout:
            logger.info(
                """Timing the fitting procedure for one epoch so we can consider timeout thoughtfully"""
            )
            start_time = time.time()
            _, iterations_completed = self._learner.fit(
                validation=validation,
                steps_per_epoch=self.hyperparams["steps_per_epoch"],
                epochs=1,
                stopping_patience=self.hyperparams["early_stopping_patience"],
                stopping_delta=self.hyperparams["early_stopping_delta"],
                tensorboard=False,
            )
            epoch_time_estimate = time.time() - start_time
            # subtract 1 for the epoch that already happened and 1 more to be safe
            timeout_epochs = timeout // epoch_time_estimate - 2
            iters = min(timeout_epochs, iterations)
        else:
            iters = iterations

        # normal fitting
        logger.info(f"Fitting for {iters} iterations")
        start_time = time.time()
        _, iterations_completed = self._learner.fit(
            validation=validation,
            steps_per_epoch=self.hyperparams["steps_per_epoch"],
            epochs=iters,
            stopping_patience=self.hyperparams["early_stopping_patience"],
            stopping_delta=self.hyperparams["early_stopping_delta"],
            tensorboard=False,
        )
        logger.info(
            f"Fit for {iterations_completed} epochs, took {time.time() - start_time}s"
        )

        # maintain primitive state (mark that training data has been used)
        self._new_train_data = False
        self._is_fit = True

        # use fitting history to set CallResult return values
        if iterations_set:
            has_finished = False
        elif iters < iterations:
            has_finished = False
        else:
            has_finished = self._is_fit

        return CallResult(
            None, has_finished=has_finished, iterations_done=iterations_completed
        )

    def _get_pred_intervals(self, df, keep_all=False):
        """private util function that retrieves unevenly spaced prediction intervals
        from a data frame

        Arguments:
            df {pandas df} -- df of predictions from which to extract prediction intervals

        Keyword Arguments:
            keep_all {bool} -- if True, take every interval slice, otherwise only
                take those given by the df

        Returns:
            pd Series -- series of intervals, indexed by group, granularity of 1 interval
        """

        # no grouping column
        if self._grouping_column is None:
            interval = discretize_time_difference(
                df.iloc[:, self._timestamp_column],
                self._min_train,
                self.freq,
                self._integer_timestamps,
            )
            if keep_all:
                interval = np.arange(min(interval), max(interval) + 1)
            return pd.Series([interval])

        # grouping column
        else:
            g_col, t_col = (
                df.columns[self._grouping_column],
                df.columns[self._timestamp_column],
            )
            all_intervals, groups = [], []
            for group, vals in df.groupby(g_col)[t_col]:
                interval = discretize_time_difference(
                    vals, self._min_train, self.freq, self._integer_timestamps
                )
                if keep_all:
                    interval = np.arange(min(interval), max(interval) + 1)
                all_intervals.append(interval)
                groups.append(group)
            return pd.Series(all_intervals, index=groups)
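    # Hedged sketch (comments only; values are hypothetical) of what
    # _get_pred_intervals returns when a grouping column exists: one entry per
    # group, holding the discretized offsets of the requested timesteps from the
    # start of training, e.g. for two series 'a' and 'b':
    #
    #   >>> pred_intervals = pd.Series([[30, 31], [30, 32]], index=["a", "b"])
    #
    # Series 'b' skips offset 31, which is why produce() below indexes the
    # prediction array at the actual offsets instead of taking the first
    # len(idxs) values.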
    def produce(
        self, *, inputs: Inputs, timeout: float = None, iterations: int = None
    ) -> CallResult[Outputs]:
        """Produce primitive's predictions for specific time series at specific
        future time instances
            * these specific timesteps / series are specified implicitly by the
              input dataset

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- (N, 2) dataframe with d3m_index and value for each
                prediction slice requested. prediction slice = specific horizon idx
                for specific series in specific regression
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        if len(self._drop_cols_no_tgt) > 0 and inputs.shape[1] != self._cols_after_drop:
            test_frame = inputs.remove_columns(self._drop_cols_no_tgt)
        else:
            test_frame = inputs.copy()

        # Create TimeSeriesTest object
        # train
        if self._train_data.equals(inputs):
            ts_test_object = TimeSeriesTest(self._ts_object)
            include_all_training = True
        # test
        else:
            ts_test_object = TimeSeriesTest(self._ts_object, test_frame)
            include_all_training = self.hyperparams["seed_predictions_with_all_data"]

        # get prediction slices
        pred_intervals = self._get_pred_intervals(test_frame)

        # make predictions with learner
        start_time = time.time()
        logger.info("Making predictions...")
        preds = self._learner.predict(
            ts_test_object, include_all_training=include_all_training
        )
        logger.info(
            f"Prediction took {time.time() - start_time}s. "
            f"Predictions array shape: {preds.shape}"
        )

        # append saved in-sample predictions to test predictions if not seeding
        # with all context
        if self._in_sample_preds is None:
            self._in_sample_preds = preds
        elif not self.hyperparams["seed_predictions_with_all_data"]:
            preds = np.concatenate((self._in_sample_preds, preds), axis=1)

        # slice predictions with learned intervals
        all_preds = []
        for p, idxs in zip(preds, pred_intervals.values):
            # all_preds.extend(p[: len(idxs)])  # this takes the first n predictions
            all_preds.extend([p[i] for i in idxs])  # this takes predictions at the actual indices
        flat_list = np.array([p for pred_list in all_preds for p in pred_list])

        # if np.isinf(all_preds).any():
        #     logger.debug(f"There are {np.isinf(all_preds).sum()} inf preds")
        # if np.isnan(all_preds).any():
        #     logger.debug(f"There are {np.isnan(all_preds).sum()} nan preds")
        # logger.debug(f"Max: {preds.max()}, Min: {preds.min()}")

        # fill nans with 0s in case the model predicted some
        # (it shouldn't need to -- this prevents an edge case)
        flat_list = np.nan_to_num(flat_list)

        # create output frame
        result_df = container.DataFrame(
            {self._ts_frame.columns[self._target_column]: flat_list},
            generate_metadata=True,
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        return CallResult(result_df, has_finished=self._is_fit)
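    # Minimal numpy sketch (comments only) of the slice-and-flatten step in
    # produce() above: each per-series prediction row p is indexed at the
    # requested (possibly uneven) offsets, then the selected per-step arrays are
    # flattened into one vector. Values here are hypothetical:
    #
    #   >>> import numpy as np
    #   >>> p = np.array([[1.0], [2.0], [3.0], [4.0]])  # one series' predictions
    #   >>> idxs = [0, 2]                               # uneven interval for that series
    #   >>> np.array([v for step in [p[i] for i in idxs] for v in step])
    #   array([1., 3.])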
    def produce_confidence_intervals(
        self, *, inputs: Inputs, timeout: float = None, iterations: int = None
    ) -> CallResult[Outputs]:
        """Produce confidence intervals for each series 'confidence_interval_horizon'
        periods into the future

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- Ex.
                series | timestep | mean | 0.05 | 0.95
                --------------------------------------
                   a   |    0     |  5   |  3   |  7
                   a   |    1     |  6   |  4   |  8
                   b   |    0     |  5   |  3   |  7
                   b   |    1     |  6   |  4   |  8
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        alpha = self.hyperparams["confidence_interval_alpha"]

        if len(self._drop_cols_no_tgt) > 0 and inputs.shape[1] != self._cols_after_drop:
            test_frame = inputs.remove_columns(self._drop_cols_no_tgt)
        else:
            test_frame = inputs.copy()

        # Create TimeSeriesTest object
        # train
        if self._train_data.equals(inputs):
            ts_test_object = TimeSeriesTest(self._ts_object)
            include_all_training = True
            horizon = 0
        # test
        else:
            ts_test_object = TimeSeriesTest(self._ts_object, test_frame)
            include_all_training = self.hyperparams["seed_predictions_with_all_data"]
            horizon = self.hyperparams["confidence_interval_horizon"]

        # make predictions with learner
        start_time = time.time()
        logger.info("Making predictions...")
        preds = self._learner.predict(
            ts_test_object,
            horizon=horizon,
            samples=self.hyperparams["confidence_interval_samples"],
            include_all_training=include_all_training,
            point_estimate=False,
        )
        logger.info(
            f"Prediction took {time.time() - start_time}s. "
            f"Predictions array shape: {preds.shape}"
        )

        # convert samples to percentiles; the 50th percentile (median) serves as
        # the point estimate reported in the 'mean' column
        means = np.percentile(preds, 50, axis=2).reshape(-1, 1)
        lowers = np.percentile(preds, alpha / 2 * 100, axis=2).reshape(-1, 1)
        uppers = np.percentile(preds, (1 - alpha / 2) * 100, axis=2).reshape(-1, 1)
        assert (lowers < means).all()
        assert (means < uppers).all()

        # convert to df
        if self._grouping_column is None:
            indices = np.repeat(self._output_columns[0], preds.shape[1])
        else:
            indices = np.repeat(
                test_frame[test_frame.columns[self._grouping_column]].unique(),
                preds.shape[1],
            )
        interval_df = pd.DataFrame(
            np.concatenate((means, lowers, uppers), axis=1),
            columns=["mean", str(alpha / 2), str(1 - alpha / 2)],
            index=indices,
        )

        # add index column
        interval_df["horizon_index"] = np.tile(
            np.arange(preds.shape[1]), len(interval_df.index.unique())
        )
        logger.debug(interval_df.head())

        # structure return df
        return CallResult(
            container.DataFrame(interval_df, generate_metadata=True),
            has_finished=self._is_fit,
        )
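    # Hedged sketch (comments only) of the percentile computation in
    # produce_confidence_intervals above: given a (series, horizon, samples)
    # array of forecast samples and alpha = 0.05, the interval bounds are the
    # empirical 2.5th / 97.5th percentiles over the sample axis:
    #
    #   >>> import numpy as np
    #   >>> rng = np.random.default_rng(0)
    #   >>> samples = rng.normal(5.0, 1.0, size=(2, 3, 1000))
    #   >>> alpha = 0.05
    #   >>> lowers = np.percentile(samples, alpha / 2 * 100, axis=2)
    #   >>> uppers = np.percentile(samples, (1 - alpha / 2) * 100, axis=2)
    #   >>> bool((lowers < uppers).all())
    #   True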