Example #1
def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
    """do a post request to url with data, file content of
    file_dictionary and sending file_elements as files"""

    data = {} if data is None else data
    data['api_key'] = config.apikey
    if file_elements is None:
        file_elements = {}
    if file_dictionary is not None:
        for key, path in file_dictionary.items():
            path = os.path.abspath(path)
            if os.path.exists(path):
                try:
                    if key == 'dataset':
                        # check that the arff file is valid
                        decoder = arff.ArffDecoder()
                        with io.open(path, encoding='utf8') as fh:
                            decoder.decode(fh, encode_nominal=True)
                except arff.ArffException:
                    raise ValueError("The file you have provided is not a valid arff file.")

                file_elements[key] = open(path, 'rb')

            else:
                raise ValueError("File doesn't exist")

    # Using requests.post sets header 'Accept-encoding' automatically to
    # 'gzip,deflate'
    response = requests.post(url, data=data, files=file_elements)
    if response.status_code != 200:
        raise _parse_server_exception(response, url=url)
    if 'Content-Encoding' not in response.headers or \
            response.headers['Content-Encoding'] != 'gzip':
        warnings.warn('Received uncompressed content from OpenML for %s.' % url)
    return response.text
Example #2
 def read_data(self, filename):
     file_ = codecs.open(filename, 'rb', 'utf-8')
     decoder = arff.ArffDecoder()
     dataset = decoder.decode(file_.readlines(), encode_nominal=True)
     file_.close()
     data = dataset['data']
     return self.normalize_data(np.mat(data))
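Most examples in this collection go through the same call: ArffDecoder().decode accepts an ARFF string (or an iterable of lines) and returns a dict with the keys 'description', 'relation', 'attributes' and 'data'. Below is a minimal, self-contained sketch of what encode_nominal=True does, assuming the arff module is the liac-arff package; the relation and values are made up for illustration.

import arff

ARFF_TEXT = '''@RELATION demo
@ATTRIBUTE width NUMERIC
@ATTRIBUTE class {yes,no}
@DATA
1.0,yes
2.5,no
'''

decoded = arff.ArffDecoder().decode(ARFF_TEXT, encode_nominal=True)
print(decoded['relation'])  # 'demo'
# with encode_nominal=True, nominal values are replaced by their index
# in the declared value list: 'yes' -> 0, 'no' -> 1
print(decoded['data'])      # [[1.0, 0], [2.5, 1]]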
Example #3
def load_arff_data(filepath, is_regression=True):
    from sklearn.model_selection import train_test_split

    with open(filepath, 'r') as f:
        decoder = arff.ArffDecoder()
        d = decoder.decode(f, encode_nominal=True)

    data = np.array(d['data'])
    X = data[:, :-1]
    y = data[:, -1]

    rng = np.random.RandomState(0)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2,
        random_state=rng)  # TODO: Test size should be an arg
    n_dim = X_train.shape[1]
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]
    n_class = 1 if is_regression else len(np.unique(y_train))

    data = {
        'x_train': X_train,
        'y_train': y_train,
        'n_class': n_class,
        'n_dim': n_dim,
        'n_train': n_train,
        'x_test': X_test,
        'y_test': y_test,
        'n_test': n_test,
        'is_sparse': False
    }

    return data
Example #4
def scrape_data():
    # decode the .arff data and change the text labels into numerical values
    decoder = arff.ArffDecoder()
    data = decoder.decode(file, encode_nominal=True)

    # split the raw data into data and labels
    vals = [val[0:-1] for val in data['data']]
    labels = [label[-1] for label in data['data']]

    for i, label in enumerate(labels):
        if label != 0:
            labels[i] = 1

    # split the labels and data into training and validation sets
    training_data = vals[0:int(.9 * len(vals))]
    training_labels = labels[0:int(.9 * len(vals))]
    validation_data = vals[int(.9 * len(vals)):]
    validation_labels = labels[int(.9 * len(vals)):]

    print(training_labels)

    # flatten labels with one hot encoding
    training_labels = to_categorical(training_labels, 5)
    validation_labels = to_categorical(validation_labels, 5)

    # save all arrays with numpy
    np.save('saved-files/vals', np.asarray(vals))
    np.save('saved-files/labels', np.asarray(labels))
    np.save('saved-files/training_data', np.asarray(training_data))
    np.save('saved-files/validation_data', np.asarray(validation_data))
    np.save('saved-files/training_labels', np.asarray(training_labels))
    np.save('saved-files/validation_labels', np.asarray(validation_labels))
Example #5
    def test_encode_adding_quotes_with_spaces(self):
        # regression tests for https://github.com/renatopp/liac-arff/issues/87
        encoder = self.get_encoder()

        # \u3000 corresponds to an ideographic space. It should be treated as
        # a space.
        fixture = {
            'relation': 'name',
            'attributes': [('A', 'STRING'), ('B', 'STRING')],
            'data': [['a', 'b'], ['b\u3000e', 'a']],
        }
        expected_data = """@RELATION name

@ATTRIBUTE A STRING
@ATTRIBUTE B STRING

@DATA
a,b
'b\u3000e',a
"""
        arff_data = encoder.encode(fixture)
        self.assertEqual(arff_data, expected_data)

        decoder = arff.ArffDecoder()
        arff_object = decoder.decode(arff_data)
        self.assertEqual(arff_object['data'], fixture['data'])
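The test above exercises the encoder/decoder round trip; the same pattern works outside a test harness. A short sketch under the same liac-arff assumption, with an illustrative fixture:

import arff

obj = {
    'relation': 'name',
    'attributes': [('A', 'STRING'), ('B', 'STRING')],
    'data': [['a', 'b'], ['b e', 'a']],
}
# encode to ARFF text, then parse it back and compare the rows
text = arff.ArffEncoder().encode(obj)
restored = arff.ArffDecoder().decode(text)
assert restored['data'] == obj['data']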
Example #6
def meta_train_data_transformed(request):
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    attribute_types = [
        'numeric' if type(type_) != list else 'nominal'
        for name, type_ in dataset['attributes'][:-1]]
    categorical = {i: True if attribute == 'nominal' else False
                   for i, attribute in enumerate(attribute_types)}

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    logger = logging.getLogger('Meta')
    meta_features.helper_functions.set_value(
        "MissingValues",
        meta_features.helper_functions["MissingValues"](X, y, logger, categorical),
    )
    meta_features.helper_functions.set_value(
        "NumSymbols",
        meta_features.helper_functions["NumSymbols"](X, y, logger, categorical),
    )
    meta_features.helper_functions.set_value(
        "ClassOccurences",
        meta_features.helper_functions["ClassOccurences"](X, y, logger),
    )

    DPP = FeatTypeSplit(feat_type={
        col: 'categorical' if category else 'numerical' for col, category in categorical.items()
    })
    X_transformed = DPP.fit_transform(X)

    number_numerical = np.sum(~np.array(list(categorical.values())))
    categorical_transformed = {i: True if i < (X_transformed.shape[1] - number_numerical) else False
                               for i in range(X_transformed.shape[1])}

    # pre-compute values for transformed inputs
    meta_features.helper_functions.set_value(
        "PCA", meta_features.helper_functions["PCA"](X_transformed, y, logger),
    )
    meta_features.helper_functions.set_value(
        "Skewnesses", meta_features.helper_functions["Skewnesses"](
            X_transformed, y, logger, categorical_transformed),
    )
    meta_features.helper_functions.set_value(
        "Kurtosisses", meta_features.helper_functions["Kurtosisses"](
            X_transformed, y, logger, categorical_transformed)
    )

    if request.param == 'numpy':
        return X_transformed, y, categorical_transformed
    elif request.param == 'pandas':
        return pd.DataFrame(X_transformed), y, categorical_transformed
    else:
        raise ValueError(request.param)
Example #7
    def return_arff(self):
        filename = self.directory

        with io.open(filename) as fh:
            decoder = arff.ArffDecoder()
            return decoder.decode(fh,
                                  encode_nominal=True,
                                  return_type=arff.DENSE)
Example #8
 def read_data(self, filename):
     file_ = codecs.open(filename, 'rb', 'utf-8')
     decoder = arff.ArffDecoder()
     dataset = decoder.decode(file_.readlines(), encode_nominal=True)
     file_.close()
     self.__data = dataset['data']
     if self.__data is not None and self.__data[0] is not None:
         self.__dim_size = len(self.__data[0])
Example #9
    def setUp(self):
        self.cwd = os.getcwd()
        tests_dir = __file__
        os.chdir(os.path.dirname(tests_dir))

        decoder = arff.ArffDecoder()
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class
        self.attribute_types = [
            'numeric' if type(type_) != list else 'nominal'
            for name, type_ in dataset['attributes'][:-1]]
        self.categorical = [True if attribute == 'nominal' else False
                            for attribute in self.attribute_types]

        data = np.array(dataset['data'], dtype=np.float64)
        X = data[:, :-1]
        y = data[:, -1].reshape((-1,))

        ohe = OneHotEncoder(self.categorical)
        X_transformed = ohe.fit_transform(X)
        imp = Imputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        center = not scipy.sparse.isspmatrix(X_transformed)
        standard_scaler = StandardScaler(with_mean=center)
        X_transformed = standard_scaler.fit_transform(X_transformed)
        X_transformed = X_transformed.todense()

        # Transform the array which indicates the categorical metafeatures
        number_numerical = np.sum(~np.array(self.categorical))
        categorical_transformed = [True] * (X_transformed.shape[1] -
                                            number_numerical) + \
                                  [False] * number_numerical
        self.categorical_transformed = categorical_transformed

        self.X = X
        self.X_transformed = X_transformed
        self.y = y
        self.mf = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value("PCA", self.helpers["PCA"]
            (self.X_transformed, self.y))
        self.helpers.set_value("MissingValues", self.helpers[
            "MissingValues"](self.X, self.y, self.categorical))
        self.helpers.set_value("NumSymbols", self.helpers["NumSymbols"](
            self.X, self.y, self.categorical))
        self.helpers.set_value("ClassOccurences",
                               self.helpers["ClassOccurences"](self.X, self.y))
        self.helpers.set_value("Skewnesses",
            self.helpers["Skewnesses"](self.X_transformed, self.y,
                                       self.categorical_transformed))
        self.helpers.set_value("Kurtosisses",
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                        self.categorical_transformed))
Example #10
def load_arff_data(filename):
    with open(filename) as f:
        decoder = arff.ArffDecoder()
        arff_obj = decoder.decode(f, encode_nominal=True)
        # feat_num = len([v for v in arff_obj['attributes'] if v[0] != 'class'])
        data = np.array(arff_obj['data'])
        X = data[:, :-1]
        y = data[:, -1]

        return X, y
Example #11
def arff2df(filepath):
    decoder = arff.ArffDecoder()
    with open(filepath) as arff_file:
        decoded_arff = decoder.decode(arff_file, return_type=arff.LOD)
    data = decoded_arff['data']
    column_names = list(map(lambda x: x[0], decoded_arff['attributes']))
    df = pd.DataFrame.from_records(data,
                                   columns=list(range(len(column_names))))
    df = df.fillna(0)
    df.columns = column_names
    return df
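return_type=arff.LOD makes decode return each data row as a dict that maps attribute index to value, which is why arff2df builds the frame from records and fills the missing entries with 0. A hedged sketch of how the return types differ on sparse-format rows, again assuming liac-arff:

import arff

SPARSE_ARFF = '''@RELATION t
@ATTRIBUTE a NUMERIC
@ATTRIBUTE b NUMERIC
@DATA
{0 1.0}
{1 2.0}
'''

lod = arff.ArffDecoder().decode(SPARSE_ARFF, return_type=arff.LOD)
print(lod['data'])    # [{0: 1.0}, {1: 2.0}] -- unspecified entries are omitted

dense = arff.ArffDecoder().decode(SPARSE_ARFF, return_type=arff.DENSE)
print(dense['data'])  # [[1.0, 0], [0, 2.0]] -- unspecified entries filled with zero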
Example #12
def load_arff_data(filepath):

    with open(filepath, 'r') as f:
        decoder = arff.ArffDecoder()
        d = decoder.decode(f, encode_nominal=True)
    # tvas: We are assuming the target/dependent is the last column
    data = np.array(d['data'])
    X = data[:, :-1]
    y = data[:, -1]

    return X, y
Example #13
 def read_data(self, filename):
     """
     Read data from file.
     :param filename: filename
     :return: normalized data
     """
     file_ = codecs.open(filename, 'rb', 'utf-8')
     decoder = arff.ArffDecoder()
     dataset = decoder.decode(file_.readlines(), encode_nominal=True)
     file_.close()
     data = dataset['data']
     return self.normalize_data(np.mat(data))
Example #14
def sparse_data():
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    attribute_types = [
        'numeric' if type(type_) != list else 'nominal'
        for name, type_ in dataset['attributes'][:-1]
    ]
    categorical = {
        i: True if attribute == 'nominal' else False
        for i, attribute in enumerate(attribute_types)
    }

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1, ))

    # First, swap NaNs and zeros, because when converting an encoded
    # dense matrix to sparse, the values which are encoded to zero are lost
    X_sparse = X.copy()
    NaNs = ~np.isfinite(X_sparse)
    X_sparse[NaNs] = 0
    X_sparse = sparse.csr_matrix(X_sparse)

    X = X_sparse
    mf = meta_features.metafeatures
    helpers = meta_features.helper_functions
    logger = logging.getLogger()
    # Precompute some helper functions
    helpers.set_value(
        "MissingValues",
        helpers["MissingValues"](X, y, logger, categorical),
    )
    mf.set_value(
        "NumberOfMissingValues",
        mf["NumberOfMissingValues"](X, y, logger, categorical),
    )
    helpers.set_value(
        "NumSymbols",
        helpers["NumSymbols"](X, y, logger, categorical),
    )
    helpers.set_value(
        "ClassOccurences",
        helpers["ClassOccurences"](X, y, logger),
    )
    return X, y, categorical
Example #15
 def test_date(self):
     file_ = '''@RELATION employee
     @ATTRIBUTE Name STRING
     @ATTRIBUTE start_date DATE
     @ATTRIBUTE end_date DATE '%Y/%m/%dT%H:%M:%S'
     @ATTRIBUTE simple_date DATE '%Y/%m/%d'        
     @DATA
     Lulu,'2011-05-20T12:34:56','2014/06/21T12:34:56','2018/03/04'
     Daisy,'2012-09-30T12:34:56','2015/11/21T12:34:56','2018/03/04'
     Brie,'2013-05-01T12:34:56','2016/12/21T12:34:56','2018/03/04'
     '''
     decoder = arff.ArffDecoder()
     d = decoder.decode(file_, encode_nominal=True)
     reconstituted = arff.dumps(d)
     decoder2 = arff.ArffDecoder()
     d2 = decoder2.decode(reconstituted, encode_nominal=True)
     self.assertEqual(d['data'][1][1], d2['data'][1][1])
     self.assertEqual(d['data'][1][2], d2['data'][1][2])
     self.assertEqual(d['data'][1][3], d2['data'][1][3])
     self.assertEqual(d['data'][2][1], d2['data'][2][1])
     self.assertEqual(d['data'][2][2], d2['data'][2][2])
     self.assertEqual(d['data'][2][3], d2['data'][2][3])
Example #16
    def retrieve_class_labels_for_dataset(self, dataset):
        """Reads the datasets arff to determine the class-labels, and returns those.
        If the task has no class labels (for example a regression problem) it returns None."""
        # TODO improve performance, currently reads the whole file
        # Should make a method that only reads the attributes
        arffFileName = dataset.data_file
        with open(arffFileName) as fh:
            arffData = arff.ArffDecoder().decode(fh)

        dataAttributes = dict(arffData['attributes'])
        if 'class' in dataAttributes:
            return dataAttributes['class']
        else:
            return None
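retrieve_class_labels_for_dataset relies on the shape of the decoded 'attributes' entry: a list of (name, type) tuples in which a nominal attribute's type is the list of its values. A small sketch of that lookup, liac-arff assumed and attribute names made up:

import arff

ARFF_TEXT = '''@RELATION iris
@ATTRIBUTE sepal_length NUMERIC
@ATTRIBUTE class {setosa,versicolor,virginica}
@DATA
5.1,setosa
'''

decoded = arff.ArffDecoder().decode(ARFF_TEXT)
attributes = dict(decoded['attributes'])
print(attributes['class'])         # ['setosa', 'versicolor', 'virginica']
print(attributes['sepal_length'])  # 'NUMERIC'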
Example #17
    def read_data(self, filename):
        """
        Read data from file.

        :param filename: Name of the file to read
        :return: no return
        """
        file_ = codecs.open(filename, 'rb', 'utf-8')
        decoder = arff.ArffDecoder()
        dataset = decoder.decode(file_.readlines(), encode_nominal=True)
        file_.close()
        self.__data = dataset['data']
        if self.__data is not None and self.__data[0] is not None:
            self.__dim_size = len(self.__data[0])
Example #18
    def test_get_online_dataset_arff(self):
        dataset_id = 100  # Australian
        # lazy loading not used as arff file is checked.
        dataset = openml.datasets.get_dataset(dataset_id)
        decoder = arff.ArffDecoder()
        # check if the arff from the dataset is
        # the same as the arff from _get_arff function
        d_format = dataset.format.lower()

        self.assertEqual(
            dataset._get_arff(d_format),
            decoder.decode(
                _get_online_dataset_arff(dataset_id),
                encode_nominal=True,
                return_type=arff.DENSE if d_format == 'arff' else arff.COO),
            "ARFF files are not equal")
Example #19
def scrape_data():
    # decode the arff data and convert the text labels to binary values
    decoder = arff.ArffDecoder()
    data = decoder.decode(file, encode_nominal=True)

    # split the raw data into data and labels
    vals = [val[0: -1] for val in data['data']]
    labels = [label[-1] for label in data['data']]

    for i, label in enumerate(labels):
        if label != 0:
            labels[i] = 1

    # split the labels and data into training and validation sets
    training_data = vals[0: int(.9 * len(vals))]
    training_labels = labels[0: int(.9 * len(vals))]
    validation_data = vals[int(.9 * len(vals)):]
    validation_labels = labels[int(.9 * len(vals)):]

    a = np.asarray(training_data, dtype=float)
    scaler = preprocessing.StandardScaler().fit(a)
    training_data = scaler.transform(a)

    b = np.asarray(validation_data, dtype=float)
    scaler = preprocessing.StandardScaler().fit(b)
    validation_data = scaler.transform(b)
    # print(training_labels)

    # convert the original class vectors into one-hot encoding
    training_labels = to_categorical(training_labels, 5)
    validation_labels = to_categorical(validation_labels, 5)

    # save all arrays with numpy
    np.save('saved-files/vals', np.asarray(vals))
    np.save('saved-files/labels', np.asarray(labels))
    np.save('saved-files/training_data', np.asarray(training_data))
    np.save('saved-files/validation_data', np.asarray(validation_data))
    np.save('saved-files/training_labels', np.asarray(training_labels))
    np.save('saved-files/validation_labels', np.asarray(validation_labels))
Example #20
def meta_train_data(request):
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    attribute_types = [
        'numeric' if type(type_) != list else 'nominal'
        for name, type_ in dataset['attributes'][:-1]
    ]

    categorical = {
        i: True if attribute == 'nominal' else False
        for i, attribute in enumerate(attribute_types)
    }

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1, ))

    logger = logging.getLogger('Meta')
    meta_features.helper_functions.set_value(
        "MissingValues",
        meta_features.helper_functions["MissingValues"](X, y, logger,
                                                        categorical),
    )
    meta_features.helper_functions.set_value(
        "NumSymbols",
        meta_features.helper_functions["NumSymbols"](X, y, logger,
                                                     categorical),
    )
    meta_features.helper_functions.set_value(
        "ClassOccurences",
        meta_features.helper_functions["ClassOccurences"](X, y, logger),
    )
    if request.param == 'numpy':
        return X, y, categorical
    elif request.param == 'pandas':
        return pd.DataFrame(X), y, categorical
    else:
        raise ValueError(request.param)
Example #21
    def publish(self):
        """Publish the dataset on the OpenML server.

        Upload the dataset description and dataset content to openml.

        Returns
        -------
        dataset_id: int
            Id of the dataset uploaded to the server.
        """
        file_elements = {'description': self._to_xml()}

        # the arff dataset string is available
        if self._dataset is not None:
            file_elements['dataset'] = self._dataset
        else:
            # the path to the arff dataset is given
            if self.data_file is not None:
                path = os.path.abspath(self.data_file)
                if os.path.exists(path):
                    try:
                        with io.open(path, encoding='utf8') as fh:
                            # check if arff is valid
                            decoder = arff.ArffDecoder()
                            decoder.decode(fh, encode_nominal=True)
                    except arff.ArffException:
                        raise ValueError("The file you have provided is not "
                                         "a valid arff file.")

                    with open(path, 'rb') as fp:
                        file_elements['dataset'] = fp.read()
            else:
                if self.url is None:
                    raise ValueError("No url/path to the data file was given")

        return_value = openml._api_calls._perform_api_call(
            "data/",
            'post',
            file_elements=file_elements,
        )
        response = xmltodict.parse(return_value)
        self.dataset_id = int(response['oml:upload_data_set']['oml:id'])
        return self.dataset_id
Example #22
 def detect_dialect(self):
     # The arff package loads an arff file into a dict with the keys:
     #   description (description of dataset)
     #   relation (name of dataset)
     #   attributes (list of tuples with name and type of attribute)
     #   data (list with the data rows)
     decoder = arff.ArffDecoder()
     if self.contents is None:
         file = open(self.path, 'r')
         weka = decoder.decode(file)
         file.close()
     else:
         weka = decoder.decode(self.decoded_contents)
         # The decoded contents are no longer needed and should not waste memory
         self.decoded_contents = None
     self.name = weka['relation']
     self.description = weka['description']
     # Attribute types are either 'REAL', 'INTEGER', 'NUMERIC' or a list of values (NOMINAL???)
     self.attributes = weka['attributes']
     self.data = weka['data']
Example #23
    def _get_file_elements(self) -> Dict:
        """ Adds the 'dataset' to file elements. """
        file_elements = {}
        path = None if self.data_file is None else os.path.abspath(
            self.data_file)

        if self._dataset is not None:
            file_elements['dataset'] = self._dataset
        elif path is not None and os.path.exists(path):
            with open(path, 'rb') as fp:
                file_elements['dataset'] = fp.read()
            try:
                dataset_utf8 = str(file_elements['dataset'], 'utf8')
                arff.ArffDecoder().decode(dataset_utf8, encode_nominal=True)
            except arff.ArffException:
                raise ValueError(
                    "The file you have provided is not a valid arff file.")
        elif self.url is None:
            raise ValueError("No valid url/path to the data file was given.")
        return file_elements
Example #24
def _load_arff(filename, target):
    with open(filename) as fh:
        decoder = arff.ArffDecoder()
        arff_object = decoder.decode(fh, encode_nominal=True)

    dataset_name = arff_object['relation']
    attributes = arff_object['attributes']
    data = arff_object['data']

    if isinstance(data, list):
        data = np.array(data)
    elif isinstance(data, tuple):
        data = sparse.coo_matrix(data)
    else:
        raise ValueError('arff returned unknown data format of type %s' %
                         str(type(data)))

    target_attribute = -1
    for i, attribute in enumerate(attributes):
        if attribute[0] == target:
            target_attribute = i
            break

    if target_attribute < 0:
        raise ValueError(
            'Target feature %s not found. Available features '
            'are: %s' %
            (target, str([attribute[0] for attribute in attributes])))

    y = data[:, target_attribute]
    X = data[:, np.arange(data.shape[1]) != target_attribute]

    # Do not add the target to the feat_type list
    feat_type = [
        'Categorical' if type(attribute[1]) in (list, tuple) else 'Numerical'
        for attribute in attributes[:-1]
    ]

    return X, y, dataset_name, feat_type
Example #25
    def retrieve_class_labels(self, target_name='class'):
        """Reads the datasets arff to determine the class-labels.

        If the task has no class labels (for example a regression problem)
        it returns None. Necessary because the data returned by get_data
        only contains the indices of the classes, while OpenML needs the real
        classname when uploading the results of a run.

        Parameters
        ----------
        target_name : str
            Name of the target attribute

        Returns
        -------
        list
        """

        # TODO improve performance, currently reads the whole file
        # Should make a method that only reads the attributes
        arffFileName = self.data_file

        if self.format.lower() == 'arff':
            return_type = arff.DENSE
        elif self.format.lower() == 'sparse_arff':
            return_type = arff.COO
        else:
            raise ValueError('Unknown data format %s' % self.format)

        with io.open(arffFileName, encoding='utf8') as fh:
            arffData = arff.ArffDecoder().decode(fh, return_type=return_type)

        dataAttributes = dict(arffData['attributes'])
        if target_name in dataAttributes:
            return dataAttributes[target_name]
        else:
            return None
Example #26
def sparse_data_transformed():
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    attribute_types = [
        'numeric' if type(type_) != list else 'nominal'
        for name, type_ in dataset['attributes'][:-1]
    ]
    categorical = {
        i: True if attribute == 'nominal' else False
        for i, attribute in enumerate(attribute_types)
    }

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1, ))

    # First, swap NaNs and zeros, because when converting an encoded
    # dense matrix to sparse, the values which are encoded to zero are lost
    X_sparse = X.copy()
    NaNs = ~np.isfinite(X_sparse)
    X_sparse[NaNs] = 0
    X_sparse = sparse.csr_matrix(X_sparse)

    ohe = FeatTypeSplit(
        feat_type={
            col: 'categorical' if category else 'numerical'
            for col, category in categorical.items()
        })
    X_transformed = X_sparse.copy()
    X_transformed = ohe.fit_transform(X_transformed)
    imp = SimpleImputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    standard_scaler = StandardScaler(with_mean=False)
    X_transformed = standard_scaler.fit_transform(X_transformed)

    # Transform the array which indicates the categorical metafeatures
    number_numerical = np.sum(~np.array(list(categorical.values())))
    categorical_transformed = {
        i: True if i < (X_transformed.shape[1] - number_numerical) else False
        for i in range(X_transformed.shape[1])
    }

    X = X_sparse
    mf = meta_features.metafeatures
    helpers = meta_features.helper_functions
    logger = logging.getLogger()

    # Precompute some helper functions
    helpers.set_value(
        "PCA",
        helpers["PCA"](X_transformed, y, logger),
    )
    helpers.set_value(
        "MissingValues",
        helpers["MissingValues"](X, y, logger, categorical),
    )
    mf.set_value(
        "NumberOfMissingValues",
        mf["NumberOfMissingValues"](X, y, logger, categorical),
    )
    helpers.set_value(
        "NumSymbols",
        helpers["NumSymbols"](X, y, logger, categorical),
    )
    helpers.set_value(
        "ClassOccurences",
        helpers["ClassOccurences"](X, y, logger),
    )
    helpers.set_value(
        "Skewnesses",
        helpers["Skewnesses"](X_transformed, y, logger,
                              categorical_transformed),
    )
    helpers.set_value(
        "Kurtosisses",
        helpers["Kurtosisses"](X_transformed, y, logger,
                               categorical_transformed),
    )
    return X_transformed, y, categorical_transformed
Example #27
 def get_decoder(self):
     decoder = arff.ArffDecoder()
     return decoder
Example #28
 def get_decoder(self, conversors):
     decoder = arff.ArffDecoder()
     decoder._conversors = conversors
     return decoder
Example #29
    model.fit(x_train, y_train)
    predictionsLR = model.predict(x_test)
    a1 = accuracy_score(y_test, predictionsLR)
    return a1


# In[155]:


df_all = pd.DataFrame()
files_to_read = [("Combined1.arff", "FileName"), ("Combined2.arff", "FileName"),
                 ("Combined3.arff", "FileName"), ("Combined5a.arff", "FileName"),
                 ("Combined5b.arff", "FileName")]
for (file, file_data) in files_to_read:
    with open(file) as f:
        decoder = arff.ArffDecoder()
        datadictionary = decoder.decode(f, encode_nominal=True, return_type=arff.LOD)
        data = datadictionary['data']
        df1 = pd.DataFrame(data)
        # df[3094] = np.where(df[3094]==1.0, 'human', 'worm')
        df1 = df1.replace(-np.inf, np.nan)
        df1.fillna(df1.mean(), inplace=True)
        b = extract_metafeature(df1)
        b = pd.DataFrame(b)
        df_all = df_all.append(b)
        print(df_all)


# In[156]:
Example #30
    def setUp(self):
        self.cwd = os.getcwd()
        tests_dir = __file__
        os.chdir(os.path.dirname(tests_dir))

        decoder = arff.ArffDecoder()
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class
        self.attribute_types = [
            'numeric' if type(type_) != list else 'nominal'
            for name, type_ in dataset['attributes'][:-1]
        ]
        self.categorical = [
            True if attribute == 'nominal' else False
            for attribute in self.attribute_types
        ]

        data = np.array(dataset['data'], dtype=np.float64)
        X = data[:, :-1]
        y = data[:, -1].reshape((-1, ))

        # First, swap NaNs and zeros, because when converting an encoded
        # dense matrix to sparse, the values which are encoded to zero are lost
        X_sparse = X.copy()
        NaNs = ~np.isfinite(X_sparse)
        X_sparse[NaNs] = 0
        X_sparse = sparse.csr_matrix(X_sparse)

        ohe = OneHotEncoder(self.categorical)
        X_transformed = X_sparse.copy()
        X_transformed = ohe.fit_transform(X_transformed)
        imp = SimpleImputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        standard_scaler = StandardScaler(with_mean=False)
        X_transformed = standard_scaler.fit_transform(X_transformed)

        # Transform the array which indicates the categorical metafeatures
        number_numerical = np.sum(~np.array(self.categorical))
        categorical_transformed = [True] * (X_transformed.shape[1] -
                                            number_numerical) + \
                                  [False] * number_numerical
        self.categorical_transformed = categorical_transformed

        self.X = X_sparse
        self.X_transformed = X_transformed
        self.y = y
        self.mf = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value("PCA", self.helpers["PCA"](self.X_transformed,
                                                          self.y))
        self.helpers.set_value(
            "MissingValues", self.helpers["MissingValues"](self.X, self.y,
                                                           self.categorical))
        self.mf.set_value(
            "NumberOfMissingValues",
            self.mf["NumberOfMissingValues"](self.X, self.y, self.categorical))
        self.helpers.set_value(
            "NumSymbols", self.helpers["NumSymbols"](self.X, self.y,
                                                     self.categorical))
        self.helpers.set_value("ClassOccurences",
                               self.helpers["ClassOccurences"](self.X, self.y))
        self.helpers.set_value(
            "Skewnesses",
            self.helpers["Skewnesses"](self.X_transformed, self.y,
                                       self.categorical_transformed))
        self.helpers.set_value(
            "Kurtosisses",
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                        self.categorical_transformed))