def test_create_dataset_list(self):
    data = [
        ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
        ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
        ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
        ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
        ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes'],
        ['f', 'rainy', 65.0, 70.0, 'TRUE', 'no'],
        ['g', 'overcast', 64.0, 65.0, 'TRUE', 'yes'],
        ['h', 'sunny', 72.0, 95.0, 'FALSE', 'no'],
        ['i', 'sunny', 69.0, 70.0, 'FALSE', 'yes'],
        ['j', 'rainy', 75.0, 80.0, 'FALSE', 'yes'],
        ['k', 'sunny', 75.0, 70.0, 'TRUE', 'yes'],
        ['l', 'overcast', 72.0, 90.0, 'TRUE', 'yes'],
        ['m', 'overcast', 81.0, 75.0, 'FALSE', 'yes'],
        ['n', 'rainy', 71.0, 91.0, 'TRUE', 'no'],
    ]
    attributes = [
        ('rnd_str', 'STRING'),
        ('outlook', ['sunny', 'overcast', 'rainy']),
        ('temperature', 'REAL'),
        ('humidity', 'REAL'),
        ('windy', ['TRUE', 'FALSE']),
        ('play', ['yes', 'no']),
    ]
    dataset = create_dataset(
        name="%s-ModifiedWeather" % self._get_sentinel(),
        description=(
            'Testing dataset upload when the data is a list of lists'
        ),
        creator='OpenML test',
        contributor=None,
        collection_date='21-09-2018',
        language='English',
        licence='MIT',
        default_target_attribute='play',
        row_id_attribute=None,
        ignore_attribute=None,
        citation='None',
        attributes=attributes,
        data=data,
        version_label='test',
        original_data_url='http://openml.github.io/openml-python',
        paper_url='http://openml.github.io/openml-python',
    )
    upload_did = dataset.publish()
    self.assertEqual(
        _get_online_dataset_arff(upload_did),
        dataset._dataset,
        "Uploaded ARFF does not match original one",
    )
    self.assertEqual(
        _get_online_dataset_format(upload_did),
        'arff',
        "Wrong format for dataset",
    )
def create_openml_dataset(df: pd.DataFrame, da: 'DataAnnotation') -> OpenMLDataset:
    collection_date = (
        None if da.collection_date is None
        else da.collection_date.strftime("%d-%m-%Y")
    )
    for column in df.columns:
        if df[column].dtype.name == 'category':
            # OpenML Python requires categorical values to be strings.
            df[column] = df[column].astype(str).astype('category')
    return create_dataset(
        name=da.name,
        description=da.description,
        creator=da.creator or None,
        contributor=da.contributor or None,
        collection_date=collection_date,
        language=da.language or None,
        licence=da.licence or None,
        default_target_attribute=da.target_column or None,
        row_id_attribute=da.id_column or None,
        citation=da.citation or None,
        ignore_attribute=da.ignore_columns or None,
        attributes='auto',
        data=df,
        # version_label
        original_data_url=da.data_url or None,
        paper_url=da.paper_url or None)
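# Minimal, self-contained sketch (not part of the original file) of the
# category-to-string conversion performed above: OpenML expects categorical
# values as strings, so a numeric Categorical column is re-cast before upload.
import pandas as pd

example = pd.DataFrame({'grade': pd.Categorical([1, 2, 3])})
example['grade'] = example['grade'].astype(str).astype('category')
print(example['grade'].cat.categories)  # Index(['1', '2', '3'], dtype='object')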
def test_create_dataset_numpy(self):
    data = np.array(
        [
            [1, 2, 3],
            [1.2, 2.5, 3.8],
            [2, 5, 8],
            [0, 1, 0],
        ]
    ).T
    attributes = [
        ('col_{}'.format(i), 'REAL') for i in range(data.shape[1])
    ]
    dataset = create_dataset(
        name='%s-NumPy_testing_dataset' % self._get_sentinel(),
        description='Synthetic dataset created from a NumPy array',
        creator='OpenML tester',
        contributor=None,
        collection_date='01-01-2018',
        language='English',
        licence='MIT',
        default_target_attribute='col_{}'.format(data.shape[1] - 1),
        row_id_attribute=None,
        ignore_attribute=None,
        citation='None',
        attributes=attributes,
        data=data,
        version_label='test',
        original_data_url='http://openml.github.io/openml-python',
        paper_url='http://openml.github.io/openml-python',
    )
    upload_did = dataset.publish()
    self.assertEqual(
        _get_online_dataset_arff(upload_did),
        dataset._dataset,
        "Uploaded ARFF does not match original one",
    )
    self.assertEqual(
        _get_online_dataset_format(upload_did),
        'arff',
        "Wrong format for dataset",
    )
diabetes_dataset = create_dataset(
    # The name of the dataset (needs to be unique).
    # Must not be longer than 128 characters and only contain
    # a-z, A-Z, 0-9 and the following special characters: _\-\.(),
    name=name,
    # Textual description of the dataset.
    description=description,
    # The person who created the dataset.
    creator="Bradley Efron, Trevor Hastie, "
            "Iain Johnstone and Robert Tibshirani",
    # People who contributed to the current version of the dataset.
    contributor=None,
    # The date the data was originally collected, given by the uploader.
    collection_date='09-01-2012',
    # Language in which the data is represented.
    # Starts with 1 upper case letter, rest lower case, e.g. 'English'.
    language='English',
    # License under which the data is/will be distributed.
    licence='BSD (from scikit-learn)',
    # Name of the target. Can also have multiple values (comma-separated).
    default_target_attribute='class',
    # The attribute that represents the row-id column, if present in the
    # dataset.
    row_id_attribute=None,
    # Attributes that should be excluded in modelling, such as identifiers
    # and indexes.
    ignore_attribute=None,
    # How to cite the paper.
    citation=citation,
    # Attributes of the data.
    attributes=attributes,
    data=data,
    # A version label which is provided by the user.
    version_label='test',
    original_data_url=(
        'http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html'
    ),
    paper_url=paper_url,
)
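# Sketch of the follow-up step (assumed here, mirroring the publish/verify
# pattern used in the tests above): publish() uploads the generated ARFF to the
# OpenML server, and the returned handle is what the tests pass to the
# _get_online_* helpers to check that the upload round-trips correctly.
upload_did = diabetes_dataset.publish()
print(upload_did)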
def test_create_dataset_sparse(self):
    # Test the scipy.sparse.coo_matrix representation.
    sparse_data = scipy.sparse.coo_matrix((
        [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
    ))
    column_names = [
        ('input1', 'REAL'),
        ('input2', 'REAL'),
        ('y', 'REAL'),
    ]
    xor_dataset = create_dataset(
        name="%s-XOR" % self._get_sentinel(),
        description='Dataset representing the XOR operation',
        creator=None,
        contributor=None,
        collection_date=None,
        language='English',
        licence=None,
        default_target_attribute='y',
        row_id_attribute=None,
        ignore_attribute=None,
        citation=None,
        attributes=column_names,
        data=sparse_data,
        version_label='test',
    )
    upload_did = xor_dataset.publish()
    self.assertEqual(
        _get_online_dataset_arff(upload_did),
        xor_dataset._dataset,
        "Uploaded ARFF does not match original one",
    )
    self.assertEqual(
        _get_online_dataset_format(upload_did),
        'sparse_arff',
        "Wrong format for dataset",
    )

    # Test the list-of-dicts sparse representation.
    sparse_data = [
        {0: 0.0},
        {1: 1.0, 2: 1.0},
        {0: 1.0, 2: 1.0},
        {0: 1.0, 1: 1.0},
    ]
    xor_dataset = create_dataset(
        name="%s-XOR" % self._get_sentinel(),
        description='Dataset representing the XOR operation',
        creator=None,
        contributor=None,
        collection_date=None,
        language='English',
        licence=None,
        default_target_attribute='y',
        row_id_attribute=None,
        ignore_attribute=None,
        citation=None,
        attributes=column_names,
        data=sparse_data,
        version_label='test',
    )
    upload_did = xor_dataset.publish()
    self.assertEqual(
        _get_online_dataset_arff(upload_did),
        xor_dataset._dataset,
        "Uploaded ARFF does not match original one",
    )
    self.assertEqual(
        _get_online_dataset_format(upload_did),
        'sparse_arff',
        "Wrong format for dataset",
    )
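# Sketch (not part of the test) making the two sparse encodings above concrete:
# the coo_matrix and the list of dicts describe the same 4x3 XOR table, where
# each dict maps column index -> value for one row and missing entries are zero.
import scipy.sparse

dense = scipy.sparse.coo_matrix((
    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
)).toarray()
print(dense)
# [[0. 0. 0.]
#  [0. 1. 1.]
#  [1. 0. 1.]
#  [1. 1. 0.]]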