Example #1
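    # NOTE: assumed module-level imports for this snippet (not shown here):
    # numpy as np, plus DataSet, CategoricalVariable, Variable, and
    # DomainError from the surrounding package.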
    def transform_inputs_outputs(self, ds: DataSet, **kwargs):
        """Transform of data into inputs and outptus for a strategy

        Parameters
        ----------
        ds : `DataSet`
            Dataset with columns corresponding to the inputs and objectives of the domain.
        copy : bool, optional
            Copy the dataset internally. Defaults to True.
        transform_descriptors : bool, optional
            Transform the descriptors into continuous variables. Defaults to True.
        Returns
        -------
        inputs, outputs
            The input and output `DataSet` objects, respectively.
        """
        copy = kwargs.get("copy", True)
        transform_descriptors = kwargs.get("transform_descriptors", True)

        data_columns = ds.data_columns
        new_ds = ds.copy() if copy else ds

        # Determine input and output columns in dataset
        input_columns = []
        output_columns = []

        for variable in self.domain.input_variables:
            if isinstance(variable,
                          CategoricalVariable) and transform_descriptors:
                # Add descriptors to the dataset
                var_descriptor_names = variable.ds.data_columns
                if all(
                        np.isin(var_descriptor_names,
                                new_ds.columns.levels[0].to_list())):
                    # Make the descriptors columns a metadata column
                    column_list_1 = new_ds.columns.levels[0].to_list()
                    ix = [
                        column_list_1.index(d_name)
                        for d_name in var_descriptor_names
                    ]
                    column_codes_2 = list(new_ds.columns.codes[1])
                    ix_code = [
                        np.where(new_ds.columns.codes[0] == tmp_ix)[0][0]
                        for tmp_ix in ix
                    ]
                    for ixc in ix_code:
                        column_codes_2[ixc] = 0
                    # set_codes(..., inplace=True) was removed in recent
                    # pandas; assign the returned index instead
                    new_ds.columns = new_ds.columns.set_codes(
                        column_codes_2, level=1)
                else:
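                    # Descriptors are not already columns in the dataset:
                    # look them up by categorical level and join them on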
                    indices = new_ds[variable.name].values
                    descriptors = variable.ds.loc[indices]
                    descriptors.index = new_ds.index
                    new_ds = new_ds.join(descriptors, how="inner")

                # Make the original descriptors column a metadata column
                column_list_1 = new_ds.columns.levels[0].to_list()
                ix = column_list_1.index(variable.name)
                column_codes_2 = list(new_ds.columns.codes[1])
                ix_code = np.where(new_ds.columns.codes[0] == ix)[0][0]
                column_codes_2[ix_code] = 1
                new_ds.columns = new_ds.columns.set_codes(column_codes_2,
                                                          level=1)

                # add descriptors data columns to inputs
                input_columns.extend(var_descriptor_names)
            elif isinstance(variable, Variable):
                input_columns.append(variable.name)
            else:
                raise DomainError(
                    f"Variable {variable.name} is not a continuous or categorical variable."
                )

        for variable in self.domain.output_variables:
            if variable.name in data_columns and variable.is_objective:
                if isinstance(variable, CategoricalVariable):
                    raise DomainError(
                        "Output variables cannot be categorical variables currently."
                    )
                output_columns.append(variable.name)
                # Ensure continuous variables are floats
                new_ds[variable.name] = new_ds[variable.name].astype(float)
            else:
                raise DomainError(
                    f"Variable {variable.name} is not in the dataset.")

        if not output_columns:
            raise DomainError(
                "No output columns in the domain. Add at least one output column for optimisation."
            )

        # Return the inputs and outputs as separate datasets
        return new_ds[input_columns].copy(), new_ds[output_columns].copy()
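
A minimal usage sketch for this version (illustrative: `transform` is assumed to be an instance of the class defining this method, built from a `Domain`, and `ds` a `DataSet` whose columns match the domain):

    inputs, outputs = transform.transform_inputs_outputs(
        ds,
        copy=True,                   # work on an internal copy of ds
        transform_descriptors=True,  # expand categorical variables into descriptors
    )
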
Example #2
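    # NOTE: assumed module-level imports for this snippet (not shown here):
    # numpy as np, OneHotEncoder from sklearn.preprocessing, plus DataSet,
    # CategoricalVariable, ContinuousVariable, and DomainError from the
    # surrounding package.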
    def transform_inputs_outputs(self, ds: DataSet, **kwargs):
        """Transform of data into inputs and outptus for a strategy

        Parameters
        ----------
        ds : `DataSet`
            Dataset with columns corresponding to the inputs and objectives of the domain.
        copy : bool, optional
            Copy the dataset internally. Defaults to True.
        standardize_inputs : bool, optional
            Standardize all input continuous variables. Default is False.
        standardize_outputs : bool, optional
            Standardize all output continuous variables. Default is False.
        categorical_method : str, optional
            The method for transforming categorical variables: either
            "one-hot" or "descriptors". Defaults to "one-hot". Descriptors
            must be defined on the categorical variables for the latter.

        Returns
        -------
        inputs, outputs
            The input and output `DataSet` objects, respectively.
        """
        copy = kwargs.get("copy", True)
        categorical_method = kwargs.get("categorical_method", "one-hot")
        standardize_inputs = kwargs.get("standardize_inputs", False)
        standardize_outputs = kwargs.get("standardize_outputs", False)

        data_columns = ds.data_columns
        new_ds = ds.copy() if copy else ds

        # Determine input and output columns in dataset
        input_columns = []
        output_columns = []
        self.input_means, self.input_stds = {}, {}
        self.output_means, self.output_stds = {}, {}
        for variable in self.domain.input_variables:
            if (isinstance(variable, CategoricalVariable)
                    and categorical_method == "descriptors"):
                # Add descriptors to the dataset
                var_descriptor_names = variable.ds.data_columns
                if all(
                        np.isin(var_descriptor_names,
                                new_ds.columns.levels[0].to_list())):
                    # Make the descriptors columns a metadata column
                    column_list_1 = new_ds.columns.levels[0].to_list()
                    ix = [
                        column_list_1.index(d_name)
                        for d_name in var_descriptor_names
                    ]
                    column_codes_2 = list(new_ds.columns.codes[1])
                    ix_code = [
                        np.where(new_ds.columns.codes[0] == tmp_ix)[0][0]
                        for tmp_ix in ix
                    ]
                    for ixc in ix_code:
                        column_codes_2[ixc] = 0
                    new_ds.columns = new_ds.columns.set_codes(
                        column_codes_2, level=1)
                else:
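                    # Descriptors are not already columns in the dataset:
                    # look them up by categorical level and join them on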
                    indices = new_ds[variable.name].values
                    descriptors = variable.ds.loc[indices]
                    descriptors.index = new_ds.index
                    new_ds = new_ds.join(descriptors, how="inner")

                # Make the original descriptors column a metadata column
                column_list_1 = new_ds.columns.levels[0].to_list()
                ix = column_list_1.index(variable.name)
                column_codes_2 = list(new_ds.columns.codes[1])
                ix_code = np.where(new_ds.columns.codes[0] == ix)[0][0]
                column_codes_2[ix_code] = 1
                new_ds.columns = new_ds.columns.set_codes(column_codes_2,
                                                          level=1)

                # add descriptors data columns to inputs
                input_columns.extend(var_descriptor_names)
            elif (isinstance(variable, CategoricalVariable)
                  and categorical_method == "one-hot"):
                # Create one-hot encoding columns & insert to DataSet
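                # (e.g. levels ["THF", "EtOH"] yield 0/1 columns
                # "<name>_THF" and "<name>_EtOH")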
                enc = OneHotEncoder(categories=[variable.levels])
                values = np.atleast_2d(new_ds[variable.name].to_numpy()).T
                one_hot_values = enc.fit_transform(values).toarray()
                for loc, level in enumerate(variable.levels):
                    column_name = f"{variable.name}_{level}"
                    new_ds[column_name, "DATA"] = one_hot_values[:, loc]
                    input_columns.append(column_name)
                variable.enc = enc

                # Drop old categorical column, then write as metadata
                new_ds = new_ds.drop(variable.name, axis=1)
                new_ds[variable.name, "METADATA"] = values

            elif isinstance(variable, ContinuousVariable):
                if standardize_inputs:
                    values, mean, std = self.standardize_column(
                        new_ds[variable.name].astype(float))
                    self.input_means[variable.name] = mean
                    self.input_stds[variable.name] = std
                    new_ds[variable.name, "DATA"] = values
                input_columns.append(variable.name)
            else:
                raise DomainError(
                    f"Variable {variable.name} is not a continuous or categorical variable."
                )

        for variable in self.domain.output_variables:
            if variable.name in data_columns and variable.is_objective:
                if isinstance(variable, CategoricalVariable):
                    raise DomainError(
                        "Output variables cannot be categorical variables currently."
                    )
                if standardize_outputs:
                    values, mean, std = self.standardize_column(
                        new_ds[variable.name].astype(float))
                    self.output_means[variable.name] = mean
                    self.output_stds[variable.name] = std
                    new_ds[variable.name, "DATA"] = values
                output_columns.append(variable.name)
                # Ensure continuous variables are floats
                new_ds[variable.name] = new_ds[variable.name].astype(float)
            else:
                raise DomainError(
                    f"Variable {variable.name} is not in the dataset.")

        if not output_columns:
            raise DomainError(
                "No output columns in the domain. Add at least one output column for optimisation."
            )

        # Return the inputs and outputs as separate datasets
        return new_ds[input_columns].copy(), new_ds[output_columns].copy()
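
Both standardization branches above delegate to self.standardize_column, which is not shown here. A minimal sketch of what it could look like, inferred from how its three return values are used (an assumption, not the confirmed implementation):

    def standardize_column(self, X):
        # Scale a column to zero mean and unit variance, returning the
        # scaled values plus the mean/std needed to invert the transform
        mean, std = X.mean(), X.std()
        std = std if std > 1e-5 else 1e-5  # guard against near-zero variance
        return (X - mean) / std, mean, std

And a usage sketch for this version (illustrative names, as in the first example):

    inputs, outputs = transform.transform_inputs_outputs(
        ds,
        categorical_method="one-hot",  # or "descriptors"
        standardize_inputs=True,
        standardize_outputs=True,
    )
    # transform.input_means / input_stds (and the output equivalents) now
    # hold the statistics needed to un-standardize predictions later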