示例#1
0
    def setUp(self):
        """Load the ARFF fixture and precompute the shared helper values.

        The working directory is switched to the directory containing this
        test module so the relative ``datasets/dataset.arff`` path resolves;
        the previous directory is remembered in ``self.cwd``.

        Attributes set for the tests: ``attribute_types``, ``categorical``,
        ``categorical_transformed``, ``X``, ``X_transformed``, ``y``, ``mf``
        and ``helpers``.
        """
        self.cwd = os.getcwd()
        # Make the path absolute first: when ``__file__`` is a bare relative
        # file name its dirname is '' and ``os.chdir('')`` raises.
        tests_dir = os.path.dirname(os.path.abspath(__file__))
        os.chdir(tests_dir)

        decoder = arff.ArffDecoder()
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class
        # An attribute whose declared type is a list (of values) is treated
        # as nominal; everything else is treated as numeric.
        self.attribute_types = [
            'nominal' if isinstance(type_, list) else 'numeric'
            for name, type_ in dataset['attributes'][:-1]]
        self.categorical = [attribute == 'nominal'
                            for attribute in self.attribute_types]

        data = np.array(dataset['data'], dtype=np.float64)
        X = data[:, :-1]
        y = data[:, -1].reshape((-1,))

        # Preprocessing pipeline: one-hot encode -> impute -> scale -> dense.
        ohe = OneHotEncoder(self.categorical)
        X_transformed = ohe.fit_transform(X)
        imp = Imputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        standard_scaler = StandardScaler()
        X_transformed = standard_scaler.fit_transform(X_transformed)
        X_transformed = X_transformed.todense()

        # Transform the array which indicates the categorical metafeatures
        # NOTE(review): this assumes the encoder puts all encoded categorical
        # columns before the numerical ones -- confirm against OneHotEncoder.
        number_numerical = np.sum(~np.array(self.categorical))
        categorical_transformed = [True] * (X_transformed.shape[1] -
                                            number_numerical) + \
                                  [False] * number_numerical
        self.categorical_transformed = categorical_transformed

        self.X = X
        self.X_transformed = X_transformed
        self.y = y
        self.mf = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value(
            "PCA", self.helpers["PCA"](self.X_transformed, self.y))
        self.helpers.set_value(
            "MissingValues",
            self.helpers["MissingValues"](self.X, self.y, self.categorical))
        self.helpers.set_value(
            "NumSymbols",
            self.helpers["NumSymbols"](self.X, self.y, self.categorical))
        self.helpers.set_value(
            "ClassOccurences",
            self.helpers["ClassOccurences"](self.X, self.y))
        self.helpers.set_value(
            "Skewnesses",
            self.helpers["Skewnesses"](self.X_transformed, self.y,
                                       self.categorical_transformed))
        self.helpers.set_value(
            "Kurtosisses",
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                        self.categorical_transformed))
示例#2
0
    def perform1HotEncoding(self):
        """One-hot encode the loaded splits in place.

        Columns whose feature type is 'categorical' are encoded; 'binary'
        columns are encoded as well when ``self.info['has_missing']`` is
        truthy. The encoder is fitted on ``X_train`` and applied to
        ``X_valid`` / ``X_test`` when those keys exist. Sparse output is
        forced once the predicted RAM usage exceeds 1000 (the prediction
        divided by 1024 twice). The fitted encoder is stored in
        ``self.encoder_`` (``None`` when no column needed encoding) and
        ``self.info['is_sparse']`` is updated after encoding.

        Raises:
            ValueError: if no ``data`` attribute exists, or if ``encoder_``
                is already set on the instance.
        """
        if not hasattr(self, "data"):
            raise ValueError("perform1HotEncoding can only be called when "
                             "data is loaded")
        if hasattr(self, "encoder_"):
            raise ValueError("perform1HotEncoding can only be called on "
                             "non-encoded data.")
        self.encoder_ = None

        use_sparse = bool(self.info['is_sparse'] == 1)
        if self.info['has_missing']:
            encoded_types = ['categorical', 'binary']
        else:
            encoded_types = ['categorical']

        lowered_types = [feat_type.lower() for feat_type in self.feat_type]
        encoding_mask = [kind in encoded_types for kind in lowered_types]
        categorical = [kind == 'categorical' for kind in lowered_types]

        ram_estimate = data_util.predict_RAM_usage(self.data['X_train'],
                                                   categorical)
        predicted_RAM_usage = float(ram_estimate) / 1024 / 1024
        if predicted_RAM_usage > 1000:
            use_sparse = True

        # Nothing to encode: leave the data and ``self.info`` untouched.
        if not any(encoding_mask):
            return

        encoder = OneHotEncoder(categorical_features=encoding_mask,
                                dtype=np.float32,
                                sparse=use_sparse)
        self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
        for split in ('X_valid', 'X_test'):
            if split in self.data:
                self.data[split] = encoder.transform(self.data[split])

        # The encoder may still hand back a sparse matrix; densify every
        # split when dense output was requested.
        if not use_sparse and scipy.sparse.issparse(self.data['X_train']):
            for split in ('X_train', 'X_valid', 'X_test'):
                if split in self.data:
                    self.data[split] = self.data[split].todense()

        self.encoder_ = encoder
        self.info['is_sparse'] = 1 if use_sparse else 0
    def test_transform_with_unknown_value(self):
        """Transform data containing values that were not seen during fit.

        The encoder is fitted on two columns holding 0..5 and then applied
        to data containing the unseen values 6 and 7, once with dense and
        once with sparse (CSR) inputs. The test pins the sum of the encoded
        output to 5 for the dense case and 3 for the sparse case.
        """
        # Dense input
        train_dense = np.array(((0, 1, 2, 3, 4, 5),
                                (0, 1, 2, 3, 4, 5))).transpose()
        dense_encoder = OneHotEncoder()
        dense_encoder.fit(train_dense)
        unseen_dense = np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose()
        dense_result = dense_encoder.transform(unseen_dense).todense()
        self.assertEqual(5, np.sum(dense_result))

        # Sparse input
        train_sparse = scipy.sparse.csr_matrix(
            np.array(((0, 1, 2, 3, 4, 5), (0, 1, 2, 3, 4, 5))).transpose())
        sparse_encoder = OneHotEncoder()
        sparse_encoder.fit(train_sparse)
        unseen_sparse = scipy.sparse.csr_matrix(
            np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose())
        sparse_result = sparse_encoder.transform(unseen_sparse).todense()
        self.assertEqual(3, np.sum(sparse_result))
    def perform1HotEncoding(self):
        """One-hot encode the loaded splits in place.

        Columns whose feature type is 'categorical' are encoded; 'binary'
        columns are encoded as well when ``self.info['has_missing']`` is
        truthy. The encoder is fitted on ``X_train`` and applied to
        ``X_valid`` / ``X_test`` when those keys exist. Sparse output is
        forced once the predicted RAM usage exceeds 1000 (the prediction
        divided by 1024 twice). After encoding, the fitted encoder is
        assigned to ``self.encoder`` and ``self.info['is_sparse']`` is
        updated.

        Raises:
            ValueError: if no ``data`` attribute exists, or if an encoder
                has already been stored on the instance.
        """
        if not hasattr(self, 'data'):
            raise ValueError('perform1HotEncoding can only be called when '
                             'data is loaded')
        # The method stores its encoder under ``_encoder`` / ``encoder``, so
        # those are the attributes that show a previous encoding run. The
        # old ``encoder_`` name is still honoured so that objects rejected
        # before keep being rejected.
        # NOTE(review): ``encoder`` is presumably a property backed by
        # ``_encoder`` -- confirm against the class definition.
        already_encoded = (
            hasattr(self, 'encoder_') or
            getattr(self, '_encoder', None) is not None or
            getattr(self, 'encoder', None) is not None
        )
        if already_encoded:
            raise ValueError('perform1HotEncoding can only be called on '
                             'non-encoded data.')
        self._encoder = None

        sparse = self.info['is_sparse'] == 1
        has_missing = bool(self.info['has_missing'])

        to_encode = ['categorical']
        if has_missing:
            to_encode += ['binary']
        encoding_mask = [feat_type.lower() in to_encode
                         for feat_type in self.feat_type]

        categorical = [feat_type.lower() == 'categorical'
                       for feat_type in self.feat_type]

        predicted_RAM_usage = float(predict_RAM_usage(
            self.data['X_train'], categorical)) / 1024 / 1024

        if predicted_RAM_usage > 1000:
            sparse = True

        if any(encoding_mask):
            encoder = OneHotEncoder(categorical_features=encoding_mask,
                                    dtype=np.float32,
                                    sparse=sparse)
            self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
            if 'X_valid' in self.data:
                self.data['X_valid'] = encoder.transform(self.data['X_valid'])
            if 'X_test' in self.data:
                self.data['X_test'] = encoder.transform(self.data['X_test'])

            # The encoder may still return a sparse matrix; densify every
            # split when dense output was requested.
            if not sparse and scipy.sparse.issparse(self.data['X_train']):
                self.data['X_train'] = self.data['X_train'].todense()
                if 'X_valid' in self.data:
                    self.data['X_valid'] = self.data['X_valid'].todense()
                if 'X_test' in self.data:
                    self.data['X_test'] = self.data['X_test'].todense()

            self.encoder = encoder
            self.info['is_sparse'] = 1 if sparse else 0
    def perform_hot_encoding(self):
        """One-hot encode the loaded splits in place.

        Columns whose feature type is 'categorical' are encoded; 'binary'
        columns are encoded as well when ``self.info['has_missing']`` is
        truthy. The encoder is fitted on ``X_train`` only and then applied
        to ``X_valid`` / ``X_test`` when present, so all splits share one
        encoding. Sparse output is forced once the predicted RAM usage
        exceeds 1000 (the prediction divided by 1024 ** 2). After encoding,
        the fitted encoder is stored in ``self._encoder`` and
        ``self.info['is_sparse']`` is updated.

        Raises:
            ValueError: if ``_data`` is missing or ``None``, or if
                ``_encoder`` already holds an encoder.
        """
        # ``getattr`` with a default covers both "attribute missing" and
        # "attribute is None"; the previous ``not hasattr(...) and ...``
        # test could only raise AttributeError or evaluate to False.
        if getattr(self, '_data', None) is None:
            raise ValueError('perform1HotEncoding can only be called when '
                             'data is loaded')
        if getattr(self, '_encoder', None) is not None:
            raise ValueError('perform1HotEncoding can only be called on '
                             'non-encoded data.')

        sparse = self.info['is_sparse'] == 1
        has_missing = bool(self.info['has_missing'])

        to_encode = ['categorical']
        if has_missing:
            to_encode += ['binary']
        encoding_mask = [feat_type.lower() in to_encode
                         for feat_type in self._feat_type]

        categorical = [feat_type.lower() == 'categorical'
                       for feat_type in self._feat_type]

        # NOTE(review): ``self.data`` is presumably a property exposing
        # ``self._data`` -- confirm against the class definition.
        predicted_RAM_usage = float(
            predict_RAM_usage(self.data['X_train'], categorical)) / pow(1024, 2)

        if predicted_RAM_usage > 1000:
            sparse = True

        if any(encoding_mask):
            encoder = OneHotEncoder(categorical_features=encoding_mask,
                                    dtype=np.float32,
                                    sparse=sparse)

            densify = False
            for split in ('X_train', 'X_valid', 'X_test'):
                if split not in self.data:
                    continue
                if split == 'X_train':
                    # Fit on the training split only; refitting on the
                    # other splits would give each its own encoding.
                    self.data[split] = encoder.fit_transform(self.data[split])
                    # Decided once from the training split and then applied
                    # to every following split.
                    densify = (not sparse and
                               scipy.sparse.issparse(self.data[split]))
                else:
                    self.data[split] = encoder.transform(self.data[split])
                if densify:
                    self.data[split] = self.data[split].todense()

            self._encoder = encoder
            self.info['is_sparse'] = 1 if sparse else 0
    def fit_then_transform_dense(self, expected, input,
                                 categorical_features='all',
                                 minimum_fraction=None):
        """Check dense encoder output via both fitting paths.

        Builds the encoder with ``sparse=False`` and asserts that
        ``fit_transform`` as well as ``fit`` followed by ``transform``
        return a ``np.ndarray`` almost equal to ``expected``. Every call
        receives its own copy of ``input``.
        """
        encoder_kwargs = dict(categorical_features=categorical_features,
                              sparse=False,
                              minimum_fraction=minimum_fraction)

        # Path 1: fit and transform in a single call.
        one_shot_encoder = OneHotEncoder(**encoder_kwargs)
        one_shot = one_shot_encoder.fit_transform(input.copy())
        self.assertIsInstance(one_shot, np.ndarray)
        assert_array_almost_equal(expected, one_shot)

        # Path 2: fit first, transform afterwards.
        two_step_encoder = OneHotEncoder(**encoder_kwargs)
        two_step_encoder.fit(input.copy())
        two_step = two_step_encoder.transform(input.copy())
        self.assertIsInstance(two_step, np.ndarray)
        assert_array_almost_equal(expected, two_step)
    def fit_then_transform(self, expected, input, categorical_features='all',
                           minimum_fraction=None):
        """Check sparse encoder output via both fitting paths.

        Asserts that ``fit_transform`` as well as ``fit`` followed by
        ``transform`` return a ``scipy.sparse.csr_matrix`` whose dense form
        is almost equal to ``expected``. Every call receives its own copy
        of ``input``.
        """
        encoder_kwargs = dict(categorical_features=categorical_features,
                              minimum_fraction=minimum_fraction)

        # Path 1: fit and transform in a single call.
        one_shot_encoder = OneHotEncoder(**encoder_kwargs)
        one_shot = one_shot_encoder.fit_transform(input.copy())
        self.assertIsInstance(one_shot, scipy.sparse.csr_matrix)
        assert_array_almost_equal(expected.astype(float), one_shot.todense())

        # Path 2: fit first, transform afterwards.
        two_step_encoder = OneHotEncoder(**encoder_kwargs)
        two_step_encoder.fit(input.copy())
        two_step = two_step_encoder.transform(input.copy())
        self.assertIsInstance(two_step, scipy.sparse.csr_matrix)
        assert_array_almost_equal(expected, two_step.todense())
示例#8
0
def _preprocess_for_npy_metafeatures(X, y, categorical, densify_threshold):
    """One-hot encode, impute, scale and shuffle the data.

    Returns a tuple ``(X_transformed, y_transformed,
    categorical_transformed)`` where the last element flags which columns
    of ``X_transformed`` are treated as categorical.
    """
    # TODO make sure this is done as efficient as possible (no copy for
    # sparse matrices because of wrong sparse format)
    input_is_sparse = scipy.sparse.issparse(X)
    ohe = OneHotEncoder(categorical_features=categorical, sparse=True)
    X_transformed = ohe.fit_transform(X)
    imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
    X_transformed = imputer.fit_transform(X_transformed)
    standard_scaler = StandardScaler(copy=False)
    X_transformed = standard_scaler.fit_transform(X_transformed)

    # Transform the array which indicates the categorical metafeatures
    # NOTE(review): this assumes the encoder puts all encoded categorical
    # columns before the numerical ones -- confirm against OneHotEncoder.
    number_numerical = np.sum(~np.array(categorical))
    categorical_transformed = [True] * (X_transformed.shape[1] -
                                        number_numerical) + \
                              [False] * number_numerical

    # Densify the transformed matrix, but only when the caller passed dense
    # data and the dense copy stays below the threshold (in megabytes).
    if not input_is_sparse and scipy.sparse.issparse(X_transformed):
        bytes_per_float = X_transformed.dtype.itemsize
        num_elements = X_transformed.shape[0] * X_transformed.shape[1]
        megabytes_required = num_elements * bytes_per_float / 1000 / 1000
        if megabytes_required < densify_threshold:
            X_transformed = X_transformed.todense()

    # This is not only important for datasets which are somehow
    # sorted in a strange way, but also prevents lda from failing in
    # some cases.
    X_transformed = check_array(X_transformed,
                                force_all_finite=True,
                                accept_sparse='csr')
    rs = np.random.RandomState(42)
    indices = np.arange(X_transformed.shape[0])
    rs.shuffle(indices)
    # TODO Shuffle inplace
    # Because this is advanced indexing, a copy of the data is returned!!!
    X_transformed = X_transformed[indices]
    y_transformed = y[indices]

    return X_transformed, y_transformed, categorical_transformed


def calculate_all_metafeatures(X, y, categorical, dataset_name,
        calculate=None, dont_calculate=None, densify_threshold=1000):
    """Calculate all metafeatures.

    Both registries (``helper_functions`` and ``metafeatures``) are cleared
    first. Metafeatures listed in ``npy_metafeatures`` are computed on a
    one-hot encoded, imputed, scaled and shuffled copy of the data, which
    is built once on first use; all others are computed on the raw inputs.
    A helper-function dependency is evaluated on demand, with the same data
    as the metafeature that needs it, and its value is also stored in the
    result.

    Parameters
    ----------
    X, y, categorical
        Passed to every metafeature / helper function; ``categorical`` is
        additionally used as the encoder's ``categorical_features``.
    dataset_name
        Used in log messages and passed to ``DatasetMetafeatures``.
    calculate
        If not None, only metafeatures whose name is in this container are
        calculated.
    dont_calculate
        If not None, metafeatures whose name is in this container are
        skipped.
    densify_threshold
        Size limit in megabytes below which the transformed matrix is
        converted to a dense matrix (only when ``X`` itself is not sparse).

    Returns
    -------
    DatasetMetafeatures
        Built from ``dataset_name`` and the dict of computed values.

    Raises
    ------
    NotImplementedError
        If a dependency is registered both as metafeature and as helper.
    ValueError
        If a dependency is registered nowhere, or if a metafeature depends
        on another metafeature that ``calculate`` / ``dont_calculate``
        exclude (this used to loop forever).
    """
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None
    categorical_transformed = None

    def _is_excluded(feature_name):
        # Same filter the main loop applies to every popped name.
        if calculate is not None and feature_name not in calculate:
            return True
        return dont_calculate is not None and feature_name in dont_calculate

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if _is_excluded(name):
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                X_transformed, y_transformed, categorical_transformed = \
                    _preprocess_for_npy_metafeatures(
                        X, y, categorical, densify_threshold)

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                if _is_excluded(dependency):
                    # The dependency would never be calculated, so
                    # re-queueing ``name`` would never terminate.
                    raise ValueError(
                        "Metafeature %s depends on %s, which is excluded "
                        "from calculation." % (name, dependency))
                # Retry this metafeature once the rest of the queue, which
                # contains the dependency, has been processed.
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name,
                    name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_