def make_models(X, y, y_bin):
    return dict(
        ols=LinearRegression().fit(X, y),
        lr_bin=LogisticRegression().fit(X, y_bin),
        lr_ovr=LogisticRegression(multi_class='ovr').fit(X, y),
        lr_mn=LogisticRegression(solver='lbfgs', multi_class='multinomial').fit(X, y),
        svc=SVC(kernel='linear').fit(X, y_bin),
        svr=SVR(kernel='linear').fit(X, y),
        dtc=DecisionTreeClassifier(max_depth=4).fit(X, y),
        dtr=DecisionTreeRegressor(max_depth=4).fit(X, y),
        rfc=RandomForestClassifier(n_estimators=3, max_depth=3, random_state=1).fit(X, y),
        rfr=RandomForestRegressor(n_estimators=3, max_depth=3, random_state=1).fit(X, y),
        gbc=GradientBoostingClassifier(n_estimators=3, max_depth=3, random_state=1).fit(X, y),
        gbr=GradientBoostingRegressor(n_estimators=3, max_depth=3, random_state=1).fit(X, y),
        abc=AdaBoostClassifier(algorithm='SAMME', n_estimators=3, random_state=1).fit(X, y),
        abc2=AdaBoostClassifier(algorithm='SAMME.R', n_estimators=3, random_state=1).fit(X, y),
        abc3=AdaBoostClassifier(algorithm='SAMME', n_estimators=3, random_state=1).fit(X, y_bin),
        abc4=AdaBoostClassifier(algorithm='SAMME.R', n_estimators=3, random_state=1).fit(X, y_bin),
        km=KMeans(1).fit(X),
        km2=KMeans(5).fit(X),
        pc1=PCA(1).fit(X),
        pc2=PCA(2).fit(X),
        pc3=PCA(2, whiten=True).fit(X),
        mlr1=MLPRegressor([2], 'relu').fit(X, y),
        mlr2=MLPRegressor([2, 1], 'tanh').fit(X, y),
        mlr3=MLPRegressor([2, 2, 2], 'identity').fit(X, y),
        mlc=MLPClassifier([2, 2], 'tanh').fit(X, y),
        mlc_bin=MLPClassifier([2, 2], 'identity').fit(X, y_bin),
        bin=Binarizer(0.5),
        mms=MinMaxScaler().fit(X),
        mas=MaxAbsScaler().fit(X),
        ss1=StandardScaler().fit(X),
        ss2=StandardScaler(with_mean=False).fit(X),
        ss3=StandardScaler(with_std=False).fit(X),
        n1=Normalizer('l1'),
        n2=Normalizer('l2'),
        n3=Normalizer('max'),
    )
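# A hedged usage sketch for make_models: the estimator imports come from the
# surrounding module, and the toy data below is invented for illustration.
# y is a small multiclass target and y_bin a binary one, as the fits above assume.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(30, 3)
y_demo = rng.randint(0, 3, size=30)
y_bin_demo = (y_demo == 0).astype(int)
models = make_models(X_demo, y_demo, y_bin_demo)
print(sorted(models))  # names of the fitted estimators and transformers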
class NormalizerImpl:
    def __init__(self, norm='l2', copy=True):
        self._hyperparams = {'norm': norm, 'copy': copy}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
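# Minimal round trip through NormalizerImpl, assuming SKLModel is bound to
# sklearn.preprocessing.Normalizer, as the 'norm'/'copy' hyperparams suggest.
import numpy as np
from sklearn.preprocessing import Normalizer as SKLModel

X = np.array([[3.0, 4.0], [1.0, 0.0]])
impl = NormalizerImpl(norm='l2')
print(impl.fit(X).transform(X))  # rows rescaled to unit L2 norm: [[0.6 0.8], [1. 0.]]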
def test_fit_transform():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    for obj in (StandardScaler(), Normalizer(), Binarizer()):
        X_transformed = obj.fit(X).transform(X)
        X_transformed2 = obj.fit_transform(X)
        assert_array_equal(X_transformed, X_transformed2)
def fit(self, X, y=None):
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is not None:
        self._sklearn_model.fit(X, y)
    else:
        self._sklearn_model.fit(X)
    return self
def test_normalizer_l1():
    rng = np.random.RandomState(0)
    X_dense = rng.randn(4, 5)
    X_sparse_unpruned = sparse.csr_matrix(X_dense)

    # set row number 3 to zero
    X_dense[3, :] = 0.0

    # set row number 3 to zero without pruning (can happen in real life)
    indptr_3 = X_sparse_unpruned.indptr[3]
    indptr_4 = X_sparse_unpruned.indptr[4]
    X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0

    # build the pruned variant using the regular constructor
    X_sparse_pruned = sparse.csr_matrix(X_dense)

    # check inputs that support the no-copy optim
    for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):
        normalizer = Normalizer(norm='l1', copy=True)
        X_norm = normalizer.transform(X)
        assert_true(X_norm is not X)
        X_norm1 = toarray(X_norm)

        normalizer = Normalizer(norm='l1', copy=False)
        X_norm = normalizer.transform(X)
        assert_true(X_norm is X)
        X_norm2 = toarray(X_norm)

        for X_norm in (X_norm1, X_norm2):
            row_sums = np.abs(X_norm).sum(axis=1)
            for i in range(3):
                assert_almost_equal(row_sums[i], 1.0)
            assert_almost_equal(row_sums[3], 0.0)

    # check input for which copy=False won't prevent a copy
    for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
        X = init(X_dense)
        # use the l1 norm this test is about, and drop the pointless chained
        # 'normalizer =' assignment the original carried
        X_norm = Normalizer(norm='l1', copy=False).transform(X)
        assert_true(X_norm is not X)
        assert_true(isinstance(X_norm, sparse.csr_matrix))

        X_norm = toarray(X_norm)
        # recompute row sums for this freshly normalized copy instead of
        # reusing the stale values from the previous loop
        row_sums = np.abs(X_norm).sum(axis=1)
        for i in range(3):
            assert_almost_equal(row_sums[i], 1.0)
        assert_almost_equal(la.norm(X_norm[3]), 0.0)
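# Worked check of the invariant the test asserts: after L1 normalization the
# absolute values of each nonzero row sum to 1, and all-zero rows stay zero.
import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[1.0, -3.0], [0.0, 0.0]])
X_l1 = Normalizer(norm='l1').fit_transform(X)
print(X_l1)                      # [[ 0.25 -0.75] [ 0.    0.  ]]
print(np.abs(X_l1).sum(axis=1))  # [1. 0.]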
class CreateNormalizer(CreateModel):
    def fit(self, data, args):
        self.model = Normalizer(norm="l2")
        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)
        return t.interval

    def test(self, data):
        assert self.model is not None
        return self.model.transform(data.X_test)

    def predict(self, data):
        with Timer() as t:
            self.predictions = self.test(data)
        data.learning_task = LearningTask.REGRESSION
        return t.interval
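# Hedged smoke run of the benchmark wrapper above. Timer, CreateModel and the
# data holder belong to the surrounding harness; the stand-ins below are
# invented for illustration, and only the Normalizer behaviour is real.
import time
from types import SimpleNamespace
import numpy as np

class Timer:  # stand-in for the harness Timer, only needed outside the harness
    def __enter__(self):
        self._t0 = time.perf_counter()
        return self
    def __exit__(self, *exc):
        self.interval = time.perf_counter() - self._t0

data = SimpleNamespace(X_train=np.random.rand(100, 5), y_train=np.zeros(100),
                       X_test=np.random.rand(10, 5))
bench = CreateNormalizer()
print("fit took %.6f s" % bench.fit(data, args=None))
print(bench.test(data).shape)  # (10, 5)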
def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
             docker_containers: Dict[str, DockerContainer] = None) -> None:
    super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                     docker_containers=docker_containers)

    self._clf = Normalizer(norm=self.hyperparams['norm'])
    self._inputs = None
    self._outputs = None
    self._training_inputs = None
    self._training_outputs = None
    self._target_names = None
    self._training_indices = None
    self._target_column_indices = None
    self._target_columns_metadata: List[OrderedDict] = None
    self._input_column_names = None
    self._fitted = False
class DFNormalizer(TransformerMixin):
    # Row-wise transformer? - Can be removed if so
    def __init__(self, norm='l2', copy=True):
        self.norm = norm
        self.copy = copy
        self.ss_ = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # assumes X is a DataFrame; pass the stored hyperparams through
        # instead of silently ignoring them
        self.ss_ = Normalizer(norm=self.norm, copy=self.copy)
        Xss = self.ss_.transform(X)
        Xscaled = pd.DataFrame(Xss, index=X.index, columns=X.columns)
        return Xscaled
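# Quick DataFrame round trip with DFNormalizer; the column names are invented.
import pandas as pd

df = pd.DataFrame([[3.0, 4.0], [6.0, 8.0]], columns=['a', 'b'])
print(DFNormalizer().fit(df).transform(df))  # both rows become [0.6, 0.8]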
'MaxAbsScaler': MaxAbsScaler(),
'MeanShift': MeanShift(),
'MinCovDet': MinCovDet(),
'MinMaxScaler': MinMaxScaler(),
'MiniBatchDictionaryLearning': MiniBatchDictionaryLearning(),
'MiniBatchKMeans': MiniBatchKMeans(),
'MiniBatchSparsePCA': MiniBatchSparsePCA(),
'MultiTaskElasticNet': MultiTaskElasticNet(),
'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
'MultiTaskLasso': MultiTaskLasso(),
'MultiTaskLassoCV': MultiTaskLassoCV(),
'MultinomialNB': MultinomialNB(),
'NMF': NMF(),
'NearestCentroid': NearestCentroid(),
'NearestNeighbors': NearestNeighbors(),
'Normalizer': Normalizer(),
'NuSVC': NuSVC(),
'NuSVR': NuSVR(),
'Nystroem': Nystroem(),
'OAS': OAS(),
'OneClassSVM': OneClassSVM(),
'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
'PCA': PCA(),
'PLSCanonical': PLSCanonical(),
'PLSRegression': PLSRegression(),
'PLSSVD': PLSSVD(),
'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
'Perceptron': Perceptron(),
'ProjectedGradientNMF': ProjectedGradientNMF(),
from sklearn import datasets
# public import path; sklearn.preprocessing.data is a private, deprecated module
from sklearn.preprocessing import Normalizer

iris = datasets.load_iris()
newX = Normalizer().fit_transform(iris.data)
print(iris.data)
print('==============')
print(newX)
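# For reference, the first iris row [5.1, 3.5, 1.4, 0.2] has an L2 norm of
# about 6.345, so its normalized form is roughly [0.803, 0.552, 0.221, 0.032].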
class MovingAverageTransform(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    A primitive that generates a moving average.

    The moving average is computed with the window size passed as a
    hyperparameter. The columns to average are also passed as a
    hyperparameter; the default is all value columns.
    """

    __author__ = "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.MOVING_AVERAGE_TRANSFORM,
        ],
        "name": "pandas.preprocessing.data.MovingAverageTransform",
        "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
        "python_path": "d3m.primitives.tods.timeseries_processing.transformation.moving_average_transform",
        "source": {
            'name': 'DATA Lab at Texas A&M University',
            'contact': 'mailto:[email protected]',
            'uris': [
                'https://gitlab.com/lhenry15/tods.git',
                'https://gitlab.com/lhenry15/tods/-/blob/mia/anomaly-primitives/anomaly_primitives/MovingAverageTransform.py'
            ]
        },
        "version": "0.0.1",
        "id": "ab8c90a6-d10e-49f1-8c5a-38884defc570",
        "hyperparams_to_tune": ['window_size'],
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                         docker_containers=docker_containers)

        self._clf = Normalizer(norm=self.hyperparams['norm'])
        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False

    def set_training_data(self, *, inputs: Inputs) -> None:
        self._inputs = inputs
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            self._inputs, self.hyperparams)
        # check for a missing selection before touching .columns
        if self._training_inputs is None:
            return CallResult(None)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        self.logger.info('Time Series Moving Average Primitive called')
        outputs = inputs
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            inputs, self.hyperparams)
        try:
            if self.hyperparams['use_columns'] == ():
                columns_to_calculate_moving_average = list(
                    set(inputs.columns) - set(['d3mIndex', 'timestamp', 'ground_truth']))
            else:
                columns_to_calculate_moving_average = self.hyperparams['use_columns']
            for column in self._training_indices:
                outputs[inputs.columns[column] + "_moving_average"] = (
                    inputs.iloc[:, column]).rolling(3, min_periods=1, center=True).mean()
        except Exception as e:
            self.logger.error("Error in calculating moving average: %s", e)
        self._update_metadata(outputs)
        return base.CallResult(outputs)

    def _update_metadata(self, outputs):
        outputs.metadata = outputs.metadata.generate(outputs)

    def get_params(self) -> Params:
        # the fitted and unfitted branches returned identical Params, so a
        # single return suffices
        return Params(input_column_names=self._input_column_names,
                      training_indices_=self._training_indices,
                      target_names_=self._target_names,
                      target_column_indices_=self._target_column_indices,
                      target_columns_metadata_=self._target_columns_metadata)

    def set_params(self, *, params: Params) -> None:
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']
        self._fitted = True

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=hyperparams['use_columns'],
            exclude_columns=hyperparams['exclude_columns'],
            can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int, hyperparams: Hyperparams) -> bool:
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add(
            "https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # make sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True
        return False

    @classmethod
    def _get_target_columns_metadata(
            cls, outputs_metadata: metadata_base.DataMetadata,
            hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # update semantic types and prepare them for predicted targets
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()  # was a list, but .add() below needs a set
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)
        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(
            cls, inputs_metadata: metadata_base.DataMetadata,
            outputs: Optional[Outputs],
            target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)
        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)
        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._copy_inputs_metadata(
            inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(
            inputs.metadata, outputs, target_columns_metadata)
        return outputs

    @classmethod
    def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata,
                              input_indices: List[int],
                              outputs_metadata: metadata_base.DataMetadata, hyperparams):
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS,))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in input_indices:
            column_name = inputs_metadata.query(
                (metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)

            column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        # if outputs has more columns than the indices, add the Attribute type
        # to all remaining columns
        if outputs_length > len(input_indices):
            for column_index in range(len(input_indices), outputs_length):
                column_metadata = OrderedDict()
                semantic_types = set()
                semantic_types.add(hyperparams["return_semantic_type"])
                column_name = "output_{}".format(column_index)
                column_metadata["semantic_types"] = list(semantic_types)
                column_metadata["name"] = str(column_name)
                target_columns_metadata.append(column_metadata)
        return target_columns_metadata
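# Standalone sketch of the windowed mean that produce() computes, outside the
# D3M machinery; the window of 3 with min_periods=1 and center=True mirrors
# the rolling() call above, and the input series is made up.
import pandas as pd

series = pd.Series([1.0, 2.0, 4.0, 8.0])
print(series.rolling(3, min_periods=1, center=True).mean())
# 0    1.500  (mean of 1, 2)
# 1    2.333  (mean of 1, 2, 4)
# 2    4.667  (mean of 2, 4, 8)
# 3    6.000  (mean of 4, 8)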
class SimpleExponentialSmoothing(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive wrapping simple exponential smoothing.

    `statsmodels documentation <https://www.statsmodels.org/stable/generated/statsmodels.tsa.holtwinters.SimpleExpSmoothing.html#statsmodels.tsa.holtwinters.SimpleExpSmoothing>`_
    """

    __author__ = "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.SIMPLE_EXPONENTIAL_SMOOTHING,
        ],
        "name": "statsmodels.preprocessing.data.SimpleExponentialSmoothing",
        "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
        "python_path": "d3m.primitives.tods.timeseries_processing.transformation.simple_exponential_smoothing",
        "source": {
            'name': 'DATA Lab at Texas A&M University',
            'contact': 'mailto:[email protected]',
            'uris': [
                'https://gitlab.com/lhenry15/tods.git',
                'https://gitlab.com/lhenry15/tods/-/blob/mia/anomaly-primitives/anomaly_primitives/SimpleExponentialSmoothing.py'
            ]
        },
        "version": "0.0.1",
        "id": "3e92984e-b7d1-4de0-9203-3a6093ddb38e",
        "hyperparams_to_tune": ['endog', 'use_columns'],
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                         docker_containers=docker_containers)

        self._clf = Normalizer(norm=self.hyperparams['norm'])
        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False

    def set_training_data(self, *, inputs: Inputs) -> None:
        self._inputs = inputs
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            self._inputs, self.hyperparams)
        # check for a missing selection before touching .columns
        if self._training_inputs is None:
            return CallResult(None)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        self.logger.info('Simple Exponential Smoothing Primitive called')
        outputs = inputs
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            inputs, self.hyperparams)
        try:
            if self.hyperparams['use_columns'] == ():
                columns_to_calculate_simple_exponential_smoothing = list(
                    set(inputs.columns) - set(['d3mIndex', 'timestamp', 'ground_truth']))
            else:
                columns_to_calculate_simple_exponential_smoothing = self.hyperparams['use_columns']
            for column in self._training_indices:
                outputs[inputs.columns[column] + "_simple_exponential_smoothing"] = SimpleExpSmoothing(
                    inputs.iloc[:, column]).fit(
                        smoothing_level=0.2, optimized=False).fittedvalues
        except Exception as e:
            self.logger.error("Error in calculating simple exponential smoothing: %s", e)
        self._update_metadata(outputs)
        return base.CallResult(outputs)

    def _update_metadata(self, outputs):
        outputs.metadata = outputs.metadata.generate(outputs)

    def get_params(self) -> Params:
        # the fitted and unfitted branches returned identical Params, so a
        # single return suffices
        return Params(input_column_names=self._input_column_names,
                      training_indices_=self._training_indices,
                      target_names_=self._target_names,
                      target_column_indices_=self._target_column_indices,
                      target_columns_metadata_=self._target_columns_metadata)

    def set_params(self, *, params: Params) -> None:
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']
        self._fitted = True

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=hyperparams['use_columns'],
            exclude_columns=hyperparams['exclude_columns'],
            can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int, hyperparams: Hyperparams) -> bool:
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add(
            "https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # make sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True
        return False

    @classmethod
    def _get_target_columns_metadata(
            cls, outputs_metadata: metadata_base.DataMetadata,
            hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # update semantic types and prepare them for predicted targets
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()  # was a list, but .add() below needs a set
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)
        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(
            cls, inputs_metadata: metadata_base.DataMetadata,
            outputs: Optional[Outputs],
            target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)
        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)
        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._copy_inputs_metadata(
            inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(
            inputs.metadata, outputs, target_columns_metadata)
        return outputs

    @classmethod
    def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata,
                              input_indices: List[int],
                              outputs_metadata: metadata_base.DataMetadata, hyperparams):
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS,))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in input_indices:
            column_name = inputs_metadata.query(
                (metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)

            column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        # if outputs has more columns than the indices, add the Attribute type
        # to all remaining columns
        if outputs_length > len(input_indices):
            for column_index in range(len(input_indices), outputs_length):
                column_metadata = OrderedDict()
                semantic_types = set()
                semantic_types.add(hyperparams["return_semantic_type"])
                column_name = "output_{}".format(column_index)
                column_metadata["semantic_types"] = list(semantic_types)
                column_metadata["name"] = str(column_name)
                target_columns_metadata.append(column_metadata)
        return target_columns_metadata
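# Standalone sketch of the statsmodels call that produce() makes, with the
# same fixed smoothing_level=0.2 and optimized=False; the input series is
# made up for illustration.
import pandas as pd
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

series = pd.Series([10.0, 12.0, 9.0, 11.0])
fitted = SimpleExpSmoothing(series).fit(smoothing_level=0.2, optimized=False)
print(fitted.fittedvalues)  # one-step-ahead smoothed values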