Example #1
    def get_model(self, X_shape=(1, 1), **kwargs):
        import h2oaicore.keras as keras
        from h2oaicore.models_utils import import_tensorflow
        tf = import_tensorflow()

        class SplitLayer(tf.keras.layers.Layer):
            def __init__(self, splits, **kwargs):
                super(SplitLayer, self).__init__(**kwargs)
                self.splits = splits

            def build(self, input_shape):
                # no weights to create; splitting is a stateless op
                pass

            def call(self, input, **kwargs):
                from h2oaicore.models_utils import import_tensorflow
                tf = import_tensorflow()
                return tf.split(input, self.splits, 1)

            def get_config(self):
                config = {'splits': self.splits}
                base_config = super(SplitLayer, self).get_config()
                return dict(list(base_config.items()) + list(config.items()))

        with keras.utils.CustomObjectScope({'SplitLayer': SplitLayer}):
            if self.model is None:
                self.did_get_model = True
                assert self.model_bytes is not None
                self.pre_get_model(X_shape=X_shape, **kwargs)
                # load_obj_bytes is an h2oaicore helper that deserializes the pickled model bytes
                self.model = load_obj_bytes(self.model_bytes)
            return self.model
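For reference, the CustomObjectScope pattern above is standard Keras: inside the scope, deserialization can resolve the custom layer class by name. A minimal standalone sketch of the same idea, with plain tf.keras standing in for h2oaicore.keras and a hypothetical model path:

    import tensorflow as tf
    from tensorflow.keras.utils import CustomObjectScope

    class SplitLayer(tf.keras.layers.Layer):
        def __init__(self, splits, **kwargs):
            super().__init__(**kwargs)
            self.splits = splits

        def call(self, inputs, **kwargs):
            return tf.split(inputs, self.splits, 1)

        def get_config(self):
            return {**super().get_config(), 'splits': self.splits}

    # Inside the scope, load_model can resolve 'SplitLayer' by name
    with CustomObjectScope({'SplitLayer': SplitLayer}):
        model = tf.keras.models.load_model('xnn_model.hdf5')  # hypothetical path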
Example #2
    def create_data(X: dt.Frame = None):
        # assumes module-level imports, as in DAI recipes:
        # datatable as dt, numpy as np, pandas as pd
        from h2oaicore.models_utils import import_tensorflow
        tf = import_tensorflow()
        mnist = tf.keras.datasets.mnist
        (train_images, train_labels), (test_images,
                                       test_labels) = mnist.load_data()

        train_images = train_images.reshape((len(train_images), -1))
        test_images = test_images.reshape((len(test_images), -1))

        train_data = pd.DataFrame(train_images)
        test_data = pd.DataFrame(test_images)

        train_data = train_data.add_prefix('b')
        test_data = test_data.add_prefix('b')

        train_data["number"] = train_labels
        test_data["number"] = test_labels

        # pixel values span 0-255, so uint8 (int8 would overflow)
        train_data = train_data.astype(np.uint8)
        test_data = test_data.astype(np.uint8)

        return {"mnist_train": train_data, "mnist_test": test_data}
Example #3
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        '''
        Basic Multi GPU computation example using TensorFlow library.
        Author: Aymeric Damien
        Project: https://github.com/aymericdamien/TensorFlow-Examples/
        '''

        from h2oaicore.systemutils import ngpus_vis  # number of GPUs visible to DAI
        assert ngpus_vis != 0, "Shouldn't be using/testing this recipe without GPUs"
        '''
        This tutorial requires your machine to have 1 GPU
        "/cpu:0": The CPU of your machine.
        "/gpu:0": The first GPU of your machine
        '''
        from h2oaicore.models_utils import import_tensorflow
        tf = import_tensorflow()
        import numpy as np
        import datetime

        # Processing Units logs
        log_device_placement = True

        # Number of matrix powers to compute (A^n, B^n)
        n = 3
        '''
        Example: compute A^n + B^n. The original tutorial times this on one
        and on two GPUs; only the single-GPU path survives below.
        Results on 8 cores with 2 GTX-980:
         * Single GPU computation time: 0:00:11.277449
         * Multi GPU computation time: 0:00:07.131701
        '''
        # Create random large matrix
        A = np.random.rand(10000, 10000).astype('float32')
        B = np.random.rand(10000, 10000).astype('float32')

        # Create a graph to store results
        c1 = []
        c2 = []

        def matpow(M, n):
            # Recursively compute M^n; for n < 1 return M unchanged (base case)
            if n < 1:
                return M
            else:
                return tf.matmul(M, matpow(M, n - 1))

        '''
        Single GPU computing
        '''
        with tf.device('/gpu:0'):
            # tf.placeholder/tf.Session are TF 1.x APIs (tf.compat.v1 under TF 2.x)
            a = tf.placeholder(tf.float32, [10000, 10000])
            b = tf.placeholder(tf.float32, [10000, 10000])
            # Compute A^n and B^n and store results in c1
            c1.append(matpow(a, n))
            c1.append(matpow(b, n))

        with tf.device('/gpu:0'):
            # Addition of all elements in c1, i.e. A^n + B^n
            # (named sum_op to avoid shadowing the built-in sum)
            sum_op = tf.add_n(c1)

        t1_1 = datetime.datetime.now()
        with tf.Session(config=tf.ConfigProto(
                log_device_placement=log_device_placement,
                allow_soft_placement=True)) as sess:
            # Run the op.
            sess.run(sum_op, {a: A, b: B})
        t2_1 = datetime.datetime.now()

        print("Single GPU computation time: " + str(t2_1 - t1_1))

        self.set_model_properties(model=[1],
                                  features=list(X.names),
                                  importances=([1.0] * len(list(X.names))),
                                  iterations=0)
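The placeholder/Session code above only runs on TensorFlow 1.x (or via tf.compat.v1). A minimal sketch of the same single-GPU timing under eager TF 2.x, assuming a GPU is visible:

    import datetime
    import numpy as np
    import tensorflow as tf  # assumes a TF 2.x build

    def matpow(M, n):
        # Recursively compute M^n
        return M if n < 1 else tf.matmul(M, matpow(M, n - 1))

    A = tf.constant(np.random.rand(10000, 10000).astype('float32'))
    B = tf.constant(np.random.rand(10000, 10000).astype('float32'))

    t1 = datetime.datetime.now()
    with tf.device('/GPU:0'):  # errors if no GPU is visible
        result = tf.add_n([matpow(A, 3), matpow(B, 3)])
    _ = result.numpy()  # force execution before stopping the clock
    print("Single GPU computation time:", datetime.datetime.now() - t1)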
Example #4
    def call(self, input, **kwargs):
        from h2oaicore.models_utils import import_tensorflow
        tf = import_tensorflow()
        return tf.split(input, self.splits, 1)
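As a quick check of what this call does: tf.split with an integer divides the tensor into that many equal slices along the given axis (axis 1 here).

    import tensorflow as tf

    x = tf.reshape(tf.range(12, dtype=tf.float32), (2, 6))
    pieces = tf.split(x, 3, 1)  # three tensors of shape (2, 2)
    print([p.shape.as_list() for p in pieces])  # [[2, 2], [2, 2], [2, 2]]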
Example #5
        def xnn_initialize(features,
                           ridge_functions=3,
                           arch=[20, 12],
                           learning_rate=0.01,
                           bg_samples=100,
                           beta1=0.9,
                           beta2=0.999,
                           dec=0.0,
                           ams=True,
                           bseed=None,
                           is_categorical=False):

            # NOTE: keras, logger, orig_cols and _mlp come from the enclosing
            # fit() scope; see Example #6, where this function is nested.
            #
            # Prepare model architecture
            #
            # Input to the network, our observation containing all the features
            input = keras.layers.Input(shape=(features, ), name='main_input')

            # Record current column names
            loggerinfo(logger, "XNN LOG")
            loggerdata(logger, "Feature list:")
            loggerdata(logger, str(orig_cols))

            # Input to ridge function number i is the dot product of our original input vector times coefficients
            ridge_input = keras.layers.Dense(ridge_functions,
                                             name="projection_layer",
                                             activation='linear')(input)

            ridge_networks = []
            # Each subnetwork uses only 1 neuron from the projection layer as input so we need to split it

            from h2oaicore.models_utils import import_tensorflow
            tf = import_tensorflow()

            class SplitLayer(tf.keras.layers.Layer):
                def __init__(self, splits, **kwargs):
                    super(SplitLayer, self).__init__(**kwargs)
                    self.splits = splits

                def build(self, input_shape):
                    pass

                def call(self, input, **kwargs):
                    from h2oaicore.models_utils import import_tensorflow
                    tf = import_tensorflow()
                    return tf.split(input, self.splits, 1)

                def get_config(self):
                    config = {'splits': self.splits}
                    base_config = super(SplitLayer, self).get_config()
                    return dict(
                        list(base_config.items()) + list(config.items()))

            ridge_inputs = SplitLayer(ridge_functions)(ridge_input)
            for i, ridge_input in enumerate(ridge_inputs):
                # Generate subnetwork i
                mlp = _mlp(ridge_input, i, arch)
                ridge_networks.append(mlp)

            added = keras.layers.Concatenate(
                name='concatenate_1')(ridge_networks)

            # Add the correct output layer for the problem
            if is_categorical:
                out = keras.layers.Dense(1,
                                         activation='sigmoid',
                                         input_shape=(ridge_functions, ),
                                         name='main_output')(added)
            else:
                out = keras.layers.Dense(1,
                                         activation='linear',
                                         input_shape=(ridge_functions, ),
                                         name='main_output')(added)

            model = keras.models.Model(inputs=input, outputs=out)

            # lr/decay are legacy Keras argument names (learning_rate in newer Keras)
            optimizer = keras.optimizers.Adam(lr=learning_rate,
                                              beta_1=beta1,
                                              beta_2=beta2,
                                              decay=dec,
                                              amsgrad=ams)

            # Use the correct loss for the problem
            if is_categorical:
                model.compile(loss={'main_output': 'binary_crossentropy'},
                              optimizer=optimizer)
            else:
                model.compile(loss={'main_output': 'mean_squared_error'},
                              optimizer=optimizer)

            return model
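To see why the split is needed: each ridge subnetwork must receive exactly one scalar from the projection layer. A tiny standalone check of the projection-then-split step, with plain tf.keras and made-up sizes:

    import tensorflow as tf

    features, ridge_functions = 6, 3
    inp = tf.keras.layers.Input(shape=(features,), name='main_input')
    proj = tf.keras.layers.Dense(ridge_functions, activation='linear')(inp)

    # A Lambda stands in for SplitLayer; each piece feeds one subnetwork
    pieces = tf.keras.layers.Lambda(
        lambda t: tf.split(t, ridge_functions, 1))(proj)
    print([p.shape.as_list() for p in pieces])  # three tensors of shape [None, 1]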
Example #6
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        # Get column names
        orig_cols = list(X.names)

        from h2oaicore.models_utils import import_tensorflow
        tf = import_tensorflow()
        import shap
        import scipy
        import pandas as pd

        self.setup_keras_session()

        import h2oaicore.keras as keras
        import matplotlib.pyplot as plt

        # uuid, os, np and user_dir() come from module-level imports in the recipe
        if not hasattr(self, 'save_model_path'):
            model_id = str(uuid.uuid4())[:8]
            self.save_model_path = os.path.join(user_dir(),
                                                "custom_xnn_model.hdf5")

        np.random.seed(self.random_state)

        my_init = keras.initializers.RandomUniform(seed=self.random_state)

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # define base model
        def xnn_initialize(features,
                           ridge_functions=3,
                           arch=[20, 12],
                           learning_rate=0.01,
                           bg_samples=100,
                           beta1=0.9,
                           beta2=0.999,
                           dec=0.0,
                           ams=True,
                           bseed=None,
                           is_categorical=False):

            #
            # Prepare model architecture
            #
            # Input to the network, our observation containing all the features
            input = keras.layers.Input(shape=(features, ), name='main_input')

            # Record current column names
            loggerinfo(logger, "XNN LOG")
            loggerdata(logger, "Feature list:")
            loggerdata(logger, str(orig_cols))

            # Input to ridge function number i is the dot product of our original input vector times coefficients
            ridge_input = keras.layers.Dense(ridge_functions,
                                             name="projection_layer",
                                             activation='linear')(input)

            ridge_networks = []
            # Each subnetwork uses only 1 neuron from the projection layer as input so we need to split it

            from h2oaicore.models_utils import import_tensorflow
            tf = import_tensorflow()

            class SplitLayer(tf.keras.layers.Layer):
                def __init__(self, splits, **kwargs):
                    super(SplitLayer, self).__init__(**kwargs)
                    self.splits = splits

                def build(self, input_shape):
                    pass

                def call(self, input, **kwargs):
                    from h2oaicore.models_utils import import_tensorflow
                    tf = import_tensorflow()
                    return tf.split(input, self.splits, 1)

                def get_config(self):
                    config = {'splits': self.splits}
                    base_config = super(SplitLayer, self).get_config()
                    return dict(
                        list(base_config.items()) + list(config.items()))

            ridge_inputs = SplitLayer(ridge_functions)(ridge_input)
            for i, ridge_input in enumerate(ridge_inputs):
                # Generate subnetwork i
                mlp = _mlp(ridge_input, i, arch)
                ridge_networks.append(mlp)

            added = keras.layers.Concatenate(
                name='concatenate_1')(ridge_networks)

            # Add the correct output layer for the problem
            if is_categorical:
                out = keras.layers.Dense(1,
                                         activation='sigmoid',
                                         input_shape=(ridge_functions, ),
                                         name='main_output')(added)
            else:
                out = keras.layers.Dense(1,
                                         activation='linear',
                                         input_shape=(ridge_functions, ),
                                         name='main_output')(added)

            model = keras.models.Model(inputs=input, outputs=out)

            optimizer = keras.optimizers.Adam(lr=learning_rate,
                                              beta_1=beta1,
                                              beta_2=beta2,
                                              decay=dec,
                                              amsgrad=ams)

            # Use the correct loss for the problem
            if is_categorical:
                model.compile(loss={'main_output': 'binary_crossentropy'},
                              optimizer=optimizer)
            else:
                model.compile(loss={'main_output': 'mean_squared_error'},
                              optimizer=optimizer)

            return model

        def _mlp(input, idx, arch=[20, 12], activation='relu'):
            # Set up a subnetwork

            # Hidden layers
            mlp = keras.layers.Dense(arch[0],
                                     activation=activation,
                                     name='mlp_{}_dense_0'.format(idx),
                                     kernel_initializer=my_init)(input)
            for i, layer in enumerate(arch[1:]):
                mlp = keras.layers.Dense(layer,
                                         activation=activation,
                                         name='mlp_{}_dense_{}'.format(
                                             idx, i + 1),
                                         kernel_initializer=my_init)(mlp)

            # Output of the MLP
            mlp = keras.layers.Dense(
                1,
                activation='linear',
                name='mlp_{}_dense_last'.format(idx),
                kernel_regularizer=keras.regularizers.l1(1e-3),
                kernel_initializer=my_init)(mlp)
            return mlp

        def get_shap(X, model):
            # Calculate the Shap values
            np.random.seed(24)
            bg_samples = min(X.shape[0], 1000)

            if isinstance(X, pd.DataFrame):
                background = X.iloc[np.random.choice(X.shape[0],
                                                     bg_samples,
                                                     replace=False)]
            else:
                background = X[np.random.choice(X.shape[0],
                                                bg_samples,
                                                replace=False)]

            # Explain predictions of the model on the subset
            explainer = shap.DeepExplainer(model, background)
            shap_values = explainer.shap_values(X)

            # Return the mean absolute value of each shap value for each dataset
            xnn_shap = np.abs(shap_values[0]).mean(axis=0)

            return xnn_shap

        # Initialize the xnn's
        features = X.shape[1]
        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()  # sklearn LabelEncoder, imported at module level
            lb.fit(self.labels)
            y = lb.transform(y)

            self.is_cat = True
            xnn1 = xnn_initialize(features=features,
                                  ridge_functions=features,
                                  arch=self.params["arch"],
                                  learning_rate=self.params["lr"],
                                  beta1=self.params["beta_1"],
                                  beta2=self.params["beta_1"],
                                  dec=self.params["decay"],
                                  ams=self.params["amsgrad"],
                                  is_categorical=self.is_cat)
            xnn = xnn_initialize(features=features,
                                 ridge_functions=features,
                                 arch=self.params["arch"],
                                 learning_rate=self.params["lr"],
                                 beta1=self.params["beta_1"],
                                 beta2=self.params["beta_1"],
                                 dec=self.params["decay"],
                                 ams=self.params["amsgrad"],
                                 is_categorical=self.is_cat)
        else:
            self.is_cat = False
            xnn1 = xnn_initialize(features=features,
                                  ridge_functions=features,
                                  arch=self.params["arch"],
                                  learning_rate=self.params["lr"],
                                  beta1=self.params["beta_1"],
                                  beta2=self.params["beta_1"],
                                  dec=self.params["decay"],
                                  ams=self.params["amsgrad"],
                                  is_categorical=self.is_cat)
            xnn = xnn_initialize(features=features,
                                 ridge_functions=features,
                                 arch=self.params["arch"],
                                 learning_rate=self.params["lr"],
                                 beta1=self.params["beta_1"],
                                 beta2=self.params["beta_1"],
                                 dec=self.params["decay"],
                                 ams=self.params["amsgrad"],
                                 is_categorical=self.is_cat)

        X = self.basic_impute(X)
        X = X.to_numpy()

        inputs = {'main_input': X}
        validation_set = 0
        verbose = 0

        # Train the neural network once with early stopping and a validation set
        history = keras.callbacks.History()
        es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')

        history = xnn1.fit(inputs,
                           y,
                           epochs=self.params["n_estimators"],
                           batch_size=self.params["batch_size"],
                           validation_split=0.3,
                           verbose=verbose,
                           callbacks=[history, es])

        # Train again on the full data
        number_of_epochs_it_ran = len(history.history['loss'])

        xnn.fit(inputs,
                y,
                epochs=number_of_epochs_it_ran,
                batch_size=self.params["batch_size"],
                validation_split=0.0,
                verbose=verbose)

        # Get the mean absolute Shapley values
        importances = np.array(get_shap(X, xnn))

        int_output = {}
        int_weights = {}
        int_bias = {}
        int_input = {}

        original_activations = {}

        x_labels = list(map(lambda x: 'x' + str(x), range(features)))

        intermediate_output = []

        # Record and plot the projection weights
        #
        weight_list = []
        for layer in xnn.layers:

            layer_name = layer.get_config()['name']
            if layer_name != "main_input":
                print(layer_name)
                weights = layer.get_weights()

                # Record the biases (layers without a bias raise IndexError)
                try:
                    bias = layer.get_weights()[1]
                    int_bias[layer_name] = bias
                except IndexError:
                    print("No bias")

                # Record outputs for the test set
                intermediate_layer_model = keras.models.Model(
                    inputs=xnn.input, outputs=xnn.get_layer(layer_name).output)

                # Record the outputs from the training set
                if self.is_cat and (layer_name == 'main_output'):
                    original_activations[layer_name] = scipy.special.logit(
                        intermediate_layer_model.predict(X))
                    original_activations[
                        layer_name +
                        "_p"] = intermediate_layer_model.predict(X)
                else:
                    original_activations[
                        layer_name] = intermediate_layer_model.predict(X)

                # Record other weights, inputs, and outputs
                int_weights[layer_name] = weights
                int_input[layer_name] = layer.input
                int_output[layer_name] = layer.output

            # Plot the projection layers
            if "projection_layer" in layer.get_config()['name']:

                # print(layer.get_config()['name'])

                # Record the weights for each projection layer
                weights = [np.transpose(layer.get_weights()[0])]

                weight_list2 = []
                for i, weight in enumerate(weights[0]):
                    weight_list.append(weight)
                    weight_list2.append(
                        list(np.reshape(weight, (1, features))[0]))

                    # Plot weights
                    plt.bar(orig_cols,
                            abs(np.reshape(weight, (1, features))[0]),
                            1,
                            color="blue")
                    plt.ylabel("Coefficient value")
                    plt.title("Projection Layer Weights {}".format(i),
                              fontdict={'fontsize': 10})
                    plt.xticks(rotation=90)
                    # save before show(); show() can clear the active figure
                    plt.savefig(os.path.join(
                        tmp_folder, 'projection_layer_' + str(i) + '.png'),
                                bbox_inches="tight")
                    plt.show()
                    plt.clf()

            if "main_output" in layer.get_config()['name']:
                weights_main = layer.get_weights()
                print(weights_main)

        pd.DataFrame(weight_list2).to_csv(os.path.join(tmp_folder,
                                                       "projection_data.csv"),
                                          index=False)

        intermediate_output = []

        for feature_num in range(features):
            intermediate_layer_model = keras.models.Model(
                inputs=xnn.input,
                outputs=xnn.get_layer('mlp_' + str(feature_num) +
                                      '_dense_last').output)
            intermediate_output.append(intermediate_layer_model.predict(X))

        # Record and plot the ridge functions
        ridge_x = []
        ridge_y = []
        for weight_number in range(len(weight_list)):
            ridge_x.append(
                list(
                    sum(X[:, ii] * weight_list[weight_number][ii]
                        for ii in range(features))))
            ridge_y.append(list(intermediate_output[weight_number]))

            plt.plot(
                sum(X[:, ii] * weight_list[weight_number][ii]
                    for ii in range(features)),
                intermediate_output[weight_number], 'o')
            plt.xlabel("Input")
            plt.ylabel("Subnetwork " + str(weight_number))
            plt.title("Ridge Function {}".format(i), fontdict={'fontsize': 10})
            plt.show()
            plt.savefig(
                os.path.join(tmp_folder,
                             'ridge_' + str(weight_number) + '.png'))
            plt.clf()

        # Output the ridge function importance.
        # NOTE: `weights` still holds the last weights recorded in the loop above,
        # i.e. those of the main_output layer that scales each ridge function.
        weights2 = np.array([item[0] for item in list(weights)[0]])

        output_activations = np.abs(
            np.array([
                item * weights2
                for item in list(original_activations["concatenate_1"])
            ])).mean(axis=0)
        loggerinfo(logger, str(output_activations))
        pd.DataFrame(output_activations).to_csv(os.path.join(
            tmp_folder, "ridge_weights.csv"),
                                                index=False)

        plt.bar(x_labels, output_activations, 1, color="blue")
        plt.xlabel("Ridge function number")
        plt.ylabel("Feature importance")
        plt.title("Ridge function importance", fontdict={'fontsize': 10})
        plt.savefig(os.path.join(tmp_folder, 'Ridge_function_importance.png'))
        plt.show()

        pd.DataFrame(ridge_y).applymap(lambda x: x[0]).to_csv(os.path.join(
            tmp_folder, "ridge_y.csv"),
                                                              index=False)
        pd.DataFrame(ridge_x).to_csv(os.path.join(tmp_folder, "ridge_x.csv"),
                                     index=False)

        pd.DataFrame(orig_cols).to_csv(os.path.join(tmp_folder,
                                                    "input_columns.csv"),
                                       index=False)

        self.set_model_properties(model=xnn,
                                  features=orig_cols,
                                  importances=importances.tolist(),
                                  iterations=self.params['n_estimators'])
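The get_shap helper above follows the standard shap.DeepExplainer pattern. A self-contained sketch of the same idea on a toy Keras model (the model and data here are illustrative, not part of the recipe; DeepExplainer's TF support varies by shap/TF version):

    import numpy as np
    import shap
    import tensorflow as tf

    # Toy regression model standing in for the fitted XNN
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation='relu', input_shape=(5,)),
        tf.keras.layers.Dense(1, activation='linear'),
    ])

    X = np.random.rand(200, 5).astype('float32')

    # The background sample anchors the expected value, as in get_shap above
    background = X[np.random.choice(X.shape[0], 100, replace=False)]
    explainer = shap.DeepExplainer(model, background)
    shap_values = explainer.shap_values(X)

    # Mean absolute SHAP value per feature = global importance
    importances = np.abs(shap_values[0]).mean(axis=0)
    print(importances.shape)  # (5,)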
Example #7
    def create_data():

        import os
        import numpy as np
        import datatable as dt
        import pandas as pd
        from h2oaicore.systemutils import config
        from h2oaicore.models_utils import import_tensorflow
        tf = import_tensorflow()
        # tensorflow is imported first because aif360 requires it
        from aif360.datasets import BinaryLabelDataset
        from aif360.algorithms.preprocessing.reweighing import Reweighing

        """
        Update the below as needed
        """
        #########
        #########
        #########
        # Path to the data
        folder_path = 'tmp/'
        # Data file
        data_file = 'housing_train_proc.csv'
        full_data_file = folder_path + data_file

        if not os.path.isfile(full_data_file):
            # for testing, just return something
            if config.hard_asserts:
                return dt.Frame(np.array([[1, 2, 3], [4, 5, 6]]))
            else:
                return []

        train = pd.read_csv(full_data_file)

        validation_test_files = ['housing_test_proc.csv']

        validation_split = [0.6, 0.8]

        # Target column
        target = 'high_priced'
        favorable_label = 0
        unfavorable_label = 1

        # protected_group_info = [[protected group name 1, privileged level, unprivileged level],
        #                         [protected group name 2, privileged level, unprivileged level]]
        # The protected group columns need to be binary
        protected_group_info = [['hispanic', 0, 1], ['black', 0, 1]]
        #########
        #########
        #########

        # Set up protected group info
        protected_groups = [group_info[0] for group_info in protected_group_info]

        dataset_orig = BinaryLabelDataset(df=train, label_names=[target], favorable_label=favorable_label,
                                          unfavorable_label=unfavorable_label,
                                          protected_attribute_names=protected_groups)

        privileged_groups = []
        unprivileged_groups = []
        for protected_group in protected_group_info:
            privileged_groups_dict = {}
            unprivileged_groups_dict = {}
            privileged_groups_dict[protected_group[0]] = protected_group[1]
            unprivileged_groups_dict[protected_group[0]] = protected_group[2]
            privileged_groups.append(privileged_groups_dict)
            unprivileged_groups.append(unprivileged_groups_dict)

        # Fit weights on the full dataset to be used on the external test set, if given
        RW_full = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
        RW_full.fit(dataset_orig)

        # Split the original data into train, validation, and test if applicable
        if len(validation_split) == 1:
            dataset_orig_train, dataset_orig_valid = dataset_orig.split(validation_split, shuffle=True)
        elif len(validation_split) == 2:
            dataset_orig_train_valid, dataset_orig_test = dataset_orig.split([validation_split[1]], shuffle=True)
            # Fit the weights on both the validation and test set for the test set split
            RW_train_valid = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
            RW_train_valid.fit(dataset_orig_train_valid)
            dataset_orig_train, dataset_orig_valid = dataset_orig_train_valid.split(
                [validation_split[0] / (validation_split[1])], shuffle=True)
        else:
            dataset_orig_train = dataset_orig

        # Fit weights on the training set only    
        RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
        RW.fit(dataset_orig_train)
        dataset_transf_train = RW.transform(dataset_orig_train)

        # Add the weights to the training set
        train_df = pd.DataFrame(dataset_transf_train.features, columns=dataset_transf_train.feature_names)
        train_df[target] = dataset_transf_train.labels.ravel()
        train_df['weights'] = dataset_transf_train.instance_weights.ravel()

        # Collect the output datasets to return
        dataset_dict = {}
        dataset_dict[data_file.split('.')[0] + "_rw_train.csv"] = train_df

        # Add weights to the validation split (if a validation split was specified)
        if len(validation_split) >= 1:
            dataset_transf_valid = RW.transform(dataset_orig_valid)
            valid_df = pd.DataFrame(dataset_transf_valid.features, columns=dataset_transf_valid.feature_names)
            valid_df[target] = dataset_transf_valid.labels.ravel()
            valid_df['weights'] = dataset_transf_valid.instance_weights.ravel()
            dataset_dict[data_file.split('.')[0] + "_rw_validation.csv"] = valid_df

        # Add weights to the test split (if a test split was specified)
        if len(validation_split) >= 2:
            dataset_transf_test = RW_train_valid.transform(dataset_orig_test)
            test_df = pd.DataFrame(dataset_transf_test.features, columns=dataset_transf_test.feature_names)
            test_df[target] = dataset_transf_test.labels.ravel()
            test_df['weights'] = dataset_transf_test.instance_weights.ravel()
            dataset_dict[data_file.split('.')[0] + "_rw_test.csv"] = test_df

        # Add weights to the validation/test files (if provided)
        for valid_file in validation_test_files:
            valid = pd.read_csv(folder_path + valid_file)
            dataset_valid_orig = BinaryLabelDataset(df=valid, label_names=[target], favorable_label=favorable_label,
                                                    unfavorable_label=unfavorable_label,
                                                    protected_attribute_names=protected_groups)
            dataset_transf_valid = RW_full.transform(dataset_valid_orig)

            valid_df = pd.DataFrame(dataset_transf_valid.features, columns=dataset_transf_valid.feature_names)
            valid_df[target] = dataset_transf_valid.labels.ravel()
            valid_df['weights'] = dataset_transf_valid.instance_weights.ravel()

            dataset_dict[valid_file.split('.')[0] + "_rw_transformed.csv"] = valid_df

        return dataset_dict
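To verify that the reweighing had the intended effect, a short sketch using aif360's BinaryLabelDatasetMetric on the datasets built above (the metric API is real aif360; the variables are the ones from this example):

    from aif360.metrics import BinaryLabelDatasetMetric

    # Before reweighing: weights are uniform, so group base rates can differ
    metric_orig = BinaryLabelDatasetMetric(dataset_orig_train,
                                           unprivileged_groups=unprivileged_groups,
                                           privileged_groups=privileged_groups)
    print("mean difference before:", metric_orig.mean_difference())

    # After reweighing: instance weights should push the difference toward 0
    metric_transf = BinaryLabelDatasetMetric(dataset_transf_train,
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)
    print("mean difference after:", metric_transf.mean_difference())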