class RandomProjectionTimeSeriesFeaturization(
        UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    '''
    Timeseries collection featurization using random projection.

    Each input time series is reduced to a fixed-length numeric row (truncated
    or zero-padded to the length chosen in ``set_training_data``) and the rows
    are projected with scikit-learn's ``GaussianRandomProjection``.

    ``inputs`` is indexed positionally: ``inputs[0]`` supplies the d3mIndex
    values and ``inputs[1]`` the collection of per-series DataFrames.
    '''

    metadata = hyperparams.base.PrimitiveMetadata({
        "id": "dsbox.timeseries_featurization.random_projection",
        "version": config.VERSION,
        "name": "DSBox random projection timeseries featurization ",
        "description": "A simple timeseries featurization using random projection",
        "python_path": "d3m.primitives.feature_extraction.random_projection_timeseries_featurization.DSBOX",
        "primitive_family": "FEATURE_EXTRACTION",
        "algorithm_types": ["RANDOM_PROJECTION"],
        "source": {
            "name": config.D3M_PERFORMER_TEAM,
            "contact": config.D3M_CONTACT,
            "uris": [config.REPOSITORY]
        },
        ### Automatically generated
        # "primitive_code"
        # "original_python_path"
        # "schema"
        # "structural_type"
        ### Optional
        "keywords": ["feature_extraction", "timeseries"],
        "installation": [config.INSTALLATION],
        # "location_uris": [],
        "precondition": ["NO_MISSING_VALUES", "NO_CATEGORICAL_VALUES"],
        "effects": ["NO_JAGGED_VALUES"],
        # "hyperparams_to_tune": []
    })

    def __init__(self, *, hyperparams: Hyperparams) -> None:
        super().__init__(hyperparams=hyperparams)
        self.hyperparams = hyperparams
        self._model = None
        self._training_data = None
        # True once a column whose name contains 'value' has been located.
        self._value_found = False
        self._x_dim = 0  # x_dim : the number of timeseries in the training collection
        self._y_dim = 0  # y_dim : the (possibly truncated) length of each timeseries row
        # value_dimension : positional index of the column holding the values we want
        self._value_dimension = 0
        self._fitted = False

    def _series_values(self, series) -> np.ndarray:
        '''
        Return the 1-D sequence of values used to featurize one time series.

        When no 'value' column was identified and the series has several
        columns, all columns are stacked one after another (first column
        first). Otherwise only the column at ``self._value_dimension`` is used.
        '''
        if series.shape[1] > 1 and not self._value_found:
            # Single concat instead of growing a frame column by column.
            stacked = pd.concat(
                [series.iloc[:, j] for j in range(series.shape[1])])
            return stacked.values
        return series.iloc[:, self._value_dimension].values

    def _add_output_metadata(self, output_dataFrame, output_ndarray):
        '''
        Attach d3m column/table metadata to ``output_dataFrame`` and return it.

        Column 0 is described as the ``d3mIndex`` primary key. The projected
        columns are described only when the ``generate_metadata`` hyperparameter
        is truthy.
        '''
        # add d3mIndex metadata
        index_metadata = {
            "name": "d3mIndex",
            "structural_type": str,
            'semantic_types': (
                "https://metadata.datadrivendiscovery.org/types/TabularColumn",
                "https://metadata.datadrivendiscovery.org/types/PrimaryKey"),
        }
        output_dataFrame.metadata = output_dataFrame.metadata.update(
            metadata=index_metadata, selector=(mbase.ALL_ELEMENTS, 0))

        # add other metadata
        if self.hyperparams["generate_metadata"]:
            # Inspect the dtype rather than element [0][0] so an empty result
            # cannot raise IndexError.
            if np.issubdtype(output_ndarray.dtype, np.floating):
                metadata_each_column = {
                    "structural_type": float,
                    'semantic_types': (
                        "http://schema.org/Float",
                        'https://metadata.datadrivendiscovery.org/types/Attribute'),
                }
            else:
                metadata_each_column = {
                    "structural_type": int,
                    'semantic_types': (
                        "http://schema.org/Integer",
                        'https://metadata.datadrivendiscovery.org/types/Attribute'),
                }
            for each_column in range(1, output_dataFrame.shape[1]):
                output_dataFrame.metadata = output_dataFrame.metadata.update(
                    metadata=metadata_each_column,
                    selector=(mbase.ALL_ELEMENTS, each_column))

        # NOTE(review): the two table-level updates below are applied
        # regardless of `generate_metadata`; confirm that this matches the
        # intended nesting.
        # 2019.4.15: now d3m need to check also inside the query of
        # ((metadata_base.ALL_ELEMENTS,))
        # d3m require it to be frozen ordered dict
        metadata_dimension_columns = frozendict.FrozenOrderedDict({
            "name": "columns",
            "semantic_types": (
                "https://metadata.datadrivendiscovery.org/types/TabularColumn", ),
            "length": output_ndarray.shape[1]
        })
        metadata_all_elements = frozendict.FrozenOrderedDict(
            {"dimension": metadata_dimension_columns})
        output_dataFrame.metadata = output_dataFrame.metadata.update(
            metadata=metadata_all_elements, selector=(mbase.ALL_ELEMENTS, ))

        # in the case of further more restricted check, also add metadata
        # query of ()
        # d3m require it to be frozen ordered dict
        metadata_dimension_rows = frozendict.FrozenOrderedDict({
            "name": "rows",
            "semantic_types": (
                "https://metadata.datadrivendiscovery.org/types/TabularRow", ),
            "length": output_ndarray.shape[0]
        })
        metadata_all = frozendict.FrozenOrderedDict({
            "structural_type": d3m_DataFrame,
            "semantic_types": (
                "https://metadata.datadrivendiscovery.org/types/Table", ),
            "dimension": metadata_dimension_rows,
            "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/container.json"
        })
        output_dataFrame.metadata = output_dataFrame.metadata.update(
            metadata=metadata_all, selector=())
        return output_dataFrame

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        '''
        Project every input time series with the fitted model.

        Returns ``CallResult(None, True, 0)`` when the primitive has not been
        fitted. Otherwise returns a ``container.DataFrame`` whose first column
        is ``d3mIndex`` followed by the projected features.
        '''
        inputs_timeseries = inputs[1]
        inputs_d3mIndex = inputs[0]
        if not self._fitted:
            return CallResult(None, True, 0)

        # Rows start as zeros, so series shorter than y_dim end up zero-padded
        # and longer ones are truncated.
        X = np.zeros((len(inputs_timeseries), self._y_dim))
        for i, series in enumerate(inputs_timeseries):
            values = self._series_values(series)
            usable = min(len(values), self._y_dim)
            X[i, :usable] = values[:usable]

        # save the result to DataFrame format
        output_ndarray = self._model.transform(X)
        output_dataFrame = container.DataFrame(output_ndarray)
        # update the original index to be d3mIndex
        output_dataFrame = container.DataFrame(
            pd.concat([
                pd.DataFrame(inputs_d3mIndex, columns=['d3mIndex']),
                pd.DataFrame(output_dataFrame)
            ], axis=1))

        output_dataFrame = self._add_output_metadata(output_dataFrame,
                                                     output_ndarray)
        return CallResult(output_dataFrame, True, None)

    def set_training_data(self, *, inputs: Inputs) -> None:
        '''
        Store the training collection as a dense ``(x_dim, y_dim)`` array.

        Raises ``InvalidArgumentValueError`` unless ``inputs`` has exactly two
        elements. An empty time series collection is logged and ignored,
        leaving any previous state untouched.
        '''
        if len(inputs) != 2:
            raise InvalidArgumentValueError('Expecting two inputs')
        inputs_timeseries = inputs[1]
        if len(inputs_timeseries) == 0:
            _logger.warning(
                "Warning: Inputs timeseries data to timeseries_featurization primitive's length is 0."
            )
            return

        # New training data invalidates whatever was learnt from older data:
        # forget the previously detected value column and require a new fit().
        self._value_found = False
        self._value_dimension = 0
        self._fitted = False

        # update: now we need to get the whole shape of inputs to process
        lengths = [x.shape[0] for x in inputs_timeseries]
        widths = [x.shape[1] for x in inputs_timeseries]

        # The fixed-column approach of earlier versions could read the wrong
        # data (e.g. at dataset 66 it read "time" instead of "value"), so the
        # column names of the first series are scanned for 'value'. When
        # several names match, the last one wins. str() keeps non-string
        # column labels from raising TypeError.
        for i, name in enumerate(inputs_timeseries[0].columns.values):
            if 'value' in str(name):
                self._value_found = True
                self._value_dimension = i

        if len(set(widths)) != 1:
            _logger.warning("Warning: some csv file have different dimensions!")

        # Truncate all time series to the shortest time series. Without a
        # value column every column is stacked, so the row length is scaled by
        # the narrowest width to stay within every stacked series.
        if self._value_found:
            self._y_dim = min(lengths)
        else:
            self._y_dim = min(lengths) * min(widths)

        self._x_dim = len(inputs_timeseries)
        self._training_data = np.zeros((self._x_dim, self._y_dim))
        for i, series in enumerate(inputs_timeseries):
            self._training_data[i, :] = self._series_values(series)[:self._y_dim]

    def fit(self, *, timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        '''
        Fit the Gaussian random projection on the stored training data.

        The target dimensionality follows the Johnson-Lindenstrauss bound for
        the ``eps`` hyperparameter, capped at ``y_dim``.
        '''
        eps = self.hyperparams['eps']
        n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim,
                                                     eps=eps)
        _logger.info("[INFO] n_components is %s", n_components)
        if n_components > self._y_dim:
            # Default n_components == 'auto' fails. Need to explicitly assign
            # n_components
            model = GaussianRandomProjection(n_components=self._y_dim,
                                             random_state=self.random_seed)
            model.fit(self._training_data)
        else:
            try:
                model = GaussianRandomProjection(
                    eps=eps, random_state=self.random_seed)
                model.fit(self._training_data)
            except Exception:
                # Deliberate best-effort fallback; keep the traceback in the
                # log and keep the seed so the result stays reproducible.
                _logger.warning(
                    "[Warning] Using given eps value failed, will use default conditions.",
                    exc_info=True)
                model = GaussianRandomProjection(
                    random_state=self.random_seed)
                model.fit(self._training_data)

        self._model = model
        self._fitted = True
        return CallResult(None, has_finished=True)

    def get_params(self) -> Params:
        '''
        Return the state needed to rebuild this primitive with ``set_params``.

        Every key that ``set_params`` reads is always present, including when
        no model exists yet.
        '''
        if self._model is not None:
            return Params(y_dim=self._y_dim,
                          x_dim=self._x_dim,
                          value_found=self._value_found,
                          value_dimension=self._value_dimension,
                          projection_param=self._model.get_params(),
                          components_=getattr(self._model, 'components_',
                                              None))
        # An empty projection_param tells set_params that there is no model.
        # NOTE(review): assumes Params accepts components_=None, as the fitted
        # branch above can already produce it - confirm against Params.
        return Params(y_dim=0,
                      x_dim=0,
                      value_found=False,
                      value_dimension=0,
                      projection_param={},
                      components_=None)

    def set_params(self, *, params: Params) -> None:
        '''
        Restore the state produced by ``get_params``.

        A non-empty ``projection_param`` rebuilds the projection model and
        marks the primitive as fitted; otherwise it is left unfitted.
        '''
        self._y_dim = params['y_dim']
        self._x_dim = params['x_dim']
        self._value_found = params['value_found']
        self._value_dimension = params['value_dimension']
        self._model = None
        if params['projection_param']:
            self._model = GaussianRandomProjection()
            self._model.set_params(**params['projection_param'])
            self._model.components_ = params['components_']
            self._fitted = True
        else:
            self._fitted = False
# Experiment: score a decision tree on the raw features, then on Gaussian
# random projections of increasing dimensionality (1 .. number of features).
#
# NOTE(review): despite the "error_rate" names, every value computed below is
# the fraction of CORRECT predictions (matches / number of samples). The
# computation is left as it was; confirm whether the names or the formula
# reflect the intent.
error_rate_train_1 = np.zeros(np.shape(data1_X_train)[1])
error_rate_test_1 = np.zeros(np.shape(data1_X_train)[1])

DT1 = tree.DecisionTreeClassifier(criterion='gini',
                                  min_samples_leaf=5,
                                  max_depth=None)

# Baseline on the untransformed data. The tree is fitted once and the same
# fitted model scores both the training and the test split.
DT1.fit(data1_X_train, data1_y_train)
error_rate_train_DT_1 = sum(
    DT1.predict(data1_X_train) == data1_y_train) * 1.0 / data1_y_train.shape[0]
# Single pre-formatted argument: prints the same text whether or not
# print is a function.
print("error_rate_train_DT_1 {}".format(error_rate_train_DT_1))
error_rate_test_DT_1 = sum(
    DT1.predict(data1_X_test) == data1_y_test) * 1.0 / data1_y_test.shape[0]
print("error_rate_test_DT_1 {}".format(error_rate_test_DT_1))

for i in range(0, np.shape(data1_X_train)[1]):
    print(i)
    start_time = time.time()

    # Project onto i + 1 components; the projection is fitted on the training
    # split only and then re-used for the test split.
    grp.set_params(n_components=i + 1)
    data1_X_train_grp = grp.fit_transform(data1_X_train)
    data1_X_test_grp = grp.transform(data1_X_test)

    DT1.fit(data1_X_train_grp, data1_y_train)
    error_rate_train_1[i] = sum(
        DT1.predict(data1_X_train_grp) == data1_y_train) * 1.0 / \
        data1_y_train.shape[0]
    print("error_rate_train_1[%f] %s" % (i, error_rate_train_1[i]))
    error_rate_test_1[i] = sum(
        DT1.predict(data1_X_test_grp) == data1_y_test) * 1.0 / \
        data1_y_test.shape[0]
    print("error_rate_test_1[%f] %s" % (i, error_rate_test_1[i]))
    print("time consumed: {}".format(time.time() - start_time))
class RandomProjectionTimeSeriesFeaturization(
        FeaturizationPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    '''
    Timeseries collection featurization using random projection.

    The first column of every input time series is truncated to the length of
    the shortest training series and the resulting rows are projected with
    scikit-learn's ``GaussianRandomProjection``.
    '''

    metadata = PrimitiveMetadata({
        "id": "dsbox.timeseries_featurization.random_projection",
        "version": "v0.1.0",
        "name": "DSBox random projection timeseries featurization",
        "description": "A simple timeseries featurization using random projection",
        # NOTE(review): python_path and primitive_family below still carry
        # encoder values; they are left untouched because changing them
        # affects how the primitive is registered and looked up.
        "python_path": "d3m.primitives.dsbox.Encoder",
        "primitive_family": "DATA_CLEANING",
        "algorithm_types": ["ENCODE_ONE_HOT"],  # FIXME Need algorithm type
        "source": {
            "name": 'ISI',
            "uris": ['git+https://github.com/usc-isi-i2/dsbox-ta2']
        },
        ### Automatically generated
        # "primitive_code"
        # "original_python_path"
        # "schema"
        # "structural_type"
        ### Optional
        "keywords": ["feature_extraction", "timeseries"],
        # "installation": [ config.INSTALLATION ],
        # "location_uris": [],
        # "precondition": [],
        # "effects": [],
        # "hyperparams_to_tune": []
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: typing.Dict[str, str] = None) -> None:
        # NOTE(review): the base-class __init__ is not invoked here; confirm
        # whether FeaturizationPrimitiveBase expects super().__init__().
        self.hyperparams = hyperparams
        self.random_seed = random_seed
        self.docker_containers = docker_containers
        self._model = None
        self._training_data = None
        self._x_dim = 0  # number of training time series
        self._y_dim = 0  # length every series is truncated / padded to

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        '''
        Project the first column of every input series with the fitted model.

        Returns ``CallResult(None, True, 0)`` when there is no training data
        or no fitted model to project with.
        '''
        if (self._model is None or self._training_data is None
                or self._y_dim == 0):
            return CallResult(None, True, 0)

        # Rows start as zeros, so a series shorter than y_dim is zero-padded
        # instead of raising a shape-mismatch error.
        X = np.zeros((len(inputs), self._y_dim))
        for i, series in enumerate(inputs):
            values = series.iloc[:self._y_dim, 0]
            X[i, :len(values)] = values
        return CallResult(self._model.transform(X), True, 1)

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        '''
        Store the training collection as a dense ``(x_dim, y_dim)`` array.

        ``outputs`` is accepted for interface compatibility and not used.
        Empty ``inputs`` are ignored.
        '''
        if len(inputs) == 0:
            return
        # Truncate all time series to the shortest time series
        self._y_dim = min(x.shape[0] for x in inputs)
        self._x_dim = len(inputs)
        self._training_data = np.zeros((self._x_dim, self._y_dim))
        for i, series in enumerate(inputs):
            self._training_data[i, :] = series.iloc[:self._y_dim, 0]
        # A model fitted on older data may not match the new row length.
        self._model = None

    def fit(self, *, timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        '''
        Fit the Gaussian random projection on the stored training data.
        '''
        eps = self.hyperparams['eps']
        n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim,
                                                     eps=eps)
        if n_components > self._x_dim:
            self._model = GaussianRandomProjection(n_components=self._x_dim)
        elif n_components > self._y_dim:
            # n_components='auto' is rejected by scikit-learn when the
            # Johnson-Lindenstrauss bound exceeds the number of features
            # (y_dim), so cap the target dimension explicitly.
            self._model = GaussianRandomProjection(n_components=self._y_dim)
        else:
            self._model = GaussianRandomProjection(eps=eps)
        self._model.fit(self._training_data)
        # Honour the declared return type instead of returning None.
        return CallResult(None, True, 1)

    def get_params(self) -> Params:
        '''
        Return ``y_dim`` and the projection hyperparameters.

        The hyperparameters are stored under the single key ``''``.
        '''
        if self._model is not None:
            return Params(y_dim=self._y_dim,
                          projection_param={'': self._model.get_params()})
        return Params()

    def set_params(self, *, params: Params) -> None:
        '''
        Restore ``y_dim`` and the projection hyperparameters.

        Only constructor hyperparameters are restored: ``get_params`` does not
        save the fitted projection matrix or the training data, so ``produce``
        keeps returning an empty result until training data is set and ``fit``
        is called again.
        '''
        self._y_dim = params['y_dim']
        projection_param = params['projection_param']
        # Unwrap the {'': {...}} envelope written by get_params(); a flat
        # hyperparameter dict is accepted as well.
        if list(projection_param) == ['']:
            projection_param = projection_param['']
        self._model = GaussianRandomProjection()
        # scikit-learn's set_params takes keyword arguments, not a dict.
        self._model.set_params(**projection_param)