def test_create_dataset_list(self):

        data = [
            ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
            ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
            ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
            ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
            ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes'],
            ['f', 'rainy', 65.0, 70.0, 'TRUE', 'no'],
            ['g', 'overcast', 64.0, 65.0, 'TRUE', 'yes'],
            ['h', 'sunny', 72.0, 95.0, 'FALSE', 'no'],
            ['i', 'sunny', 69.0, 70.0, 'FALSE', 'yes'],
            ['j', 'rainy', 75.0, 80.0, 'FALSE', 'yes'],
            ['k', 'sunny', 75.0, 70.0, 'TRUE', 'yes'],
            ['l', 'overcast', 72.0, 90.0, 'TRUE', 'yes'],
            ['m', 'overcast', 81.0, 75.0, 'FALSE', 'yes'],
            ['n', 'rainy', 71.0, 91.0, 'TRUE', 'no'],
        ]

        attributes = [
            ('rnd_str', 'STRING'),
            ('outlook', ['sunny', 'overcast', 'rainy']),
            ('temperature', 'REAL'),
            ('humidity', 'REAL'),
            ('windy', ['TRUE', 'FALSE']),
            ('play', ['yes', 'no']),
        ]

        dataset = create_dataset(
            name="%s-ModifiedWeather" % self._get_sentinel(),
            description=(
                'Testing dataset upload when the data is a list of lists'
            ),
            creator='OpenML test',
            contributor=None,
            collection_date='21-09-2018',
            language='English',
            licence='MIT',
            default_target_attribute='play',
            row_id_attribute=None,
            ignore_attribute=None,
            citation='None',
            attributes=attributes,
            data=data,
            version_label='test',
            original_data_url='http://openml.github.io/openml-python',
            paper_url='http://openml.github.io/openml-python'
        )

        upload_did = dataset.publish()
        self.assertEqual(
            _get_online_dataset_arff(upload_did),
            dataset._dataset,
            "Uploaded ARFF does not match original one"
        )
        self.assertEqual(
            _get_online_dataset_format(upload_did),
            'arff',
            "Wrong format for dataset"
        )
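Publishing in these tests only works once a server URL and API key are configured. A minimal sketch, assuming the OpenML test server and a placeholder key (both values are illustrative, not taken from the tests above):

import openml

# Point the client at the OpenML test server so throwaway test datasets do
# not end up on openml.org. The URL is an assumption; check the
# openml-python docs for the current test-server endpoint.
openml.config.server = 'https://test.openml.org/api/v1/xml'

# A personal API key is required for publishing; 'YOUR_API_KEY' is a
# placeholder, not a real credential.
openml.config.apikey = 'YOUR_API_KEY'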
def create_openml_dataset(df: pd.DataFrame,
                          da: 'DataAnnotation') -> OpenMLDataset:
    collection_date = (None if da.collection_date is None
                       else da.collection_date.strftime("%d-%m-%Y"))

    for column in df.columns:
        if df[column].dtype.name == 'category':
            # OpenML Python requires categorical values to be strings.
            df[column] = df[column].astype(str).astype('category')

    return create_dataset(
        name=da.name,
        description=da.description,
        creator=da.creator or None,
        contributor=da.contributor or None,
        collection_date=collection_date,
        language=da.language or None,
        licence=da.licence or None,
        default_target_attribute=da.target_column or None,
        row_id_attribute=da.id_column or None,
        citation=da.citation or None,
        ignore_attribute=da.ignore_columns or None,
        attributes='auto',
        data=df,
        # version_label
        original_data_url=da.data_url or None,
        paper_url=da.paper_url or None)
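A minimal usage sketch for the helper above. The project's `DataAnnotation` class is not shown here, so a hypothetical stand-in dataclass with the same field names is assumed purely for illustration:

import datetime
from dataclasses import dataclass
from typing import List, Optional

import pandas as pd


# Hypothetical stand-in for the project's DataAnnotation; only the fields
# read by create_openml_dataset above are included.
@dataclass
class DataAnnotation:
    name: str
    description: str
    creator: str = ''
    contributor: str = ''
    collection_date: Optional[datetime.date] = None
    language: str = 'English'
    licence: str = ''
    target_column: str = ''
    id_column: str = ''
    citation: str = ''
    ignore_columns: Optional[List[str]] = None
    data_url: str = ''
    paper_url: str = ''


df = pd.DataFrame({
    'outlook': pd.Categorical(['sunny', 'rainy', 'overcast']),
    'temperature': [85.0, 70.0, 83.0],
    'play': pd.Categorical(['no', 'yes', 'yes']),
})

annotation = DataAnnotation(
    name='toy-weather',
    description='Tiny weather sample for illustration',
    creator='OpenML test',
    collection_date=datetime.date(2018, 9, 21),
    target_column='play',
)

dataset = create_openml_dataset(df, annotation)
dataset.publish()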
    def test_create_dataset_numpy(self):

        data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T

        attributes = [('col_{}'.format(i), 'REAL')
                      for i in range(data.shape[1])]

        dataset = create_dataset(
            name='%s-NumPy_testing_dataset' % self._get_sentinel(),
            description='Synthetic dataset created from a NumPy array',
            creator='OpenML tester',
            contributor=None,
            collection_date='01-01-2018',
            language='English',
            licence='MIT',
            default_target_attribute='col_{}'.format(data.shape[1] - 1),
            row_id_attribute=None,
            ignore_attribute=None,
            citation='None',
            attributes=attributes,
            data=data,
            version_label='test',
            original_data_url='http://openml.github.io/openml-python',
            paper_url='http://openml.github.io/openml-python')

        upload_did = dataset.publish()

        self.assertEqual(_get_online_dataset_arff(upload_did),
                         dataset._dataset,
                         "Uploaded arff does not match original one")
        self.assertEqual(_get_online_dataset_format(upload_did), 'arff',
                         "Wrong format for dataset")
diabetes_dataset = create_dataset(
    # The name of the dataset (needs to be unique).
    # Must not be longer than 128 characters and only contain
    # a-z, A-Z, 0-9 and the following special characters: _\-\.(),
    name=name,
    # Textual description of the dataset.
    description=description,
    # The person who created the dataset.
    creator="Bradley Efron, Trevor Hastie, "
    "Iain Johnstone and Robert Tibshirani",
    # People who contributed to the current version of the dataset.
    contributor=None,
    # The date the data was originally collected, given by the uploader.
    collection_date='09-01-2012',
    # Language in which the data is represented.
    # Starts with 1 upper case letter, rest lower case, e.g. 'English'.
    language='English',
    # License under which the data is/will be distributed.
    licence='BSD (from scikit-learn)',
    # Name of the target. Can also have multiple values (comma-separated).
    default_target_attribute='class',
    # The attribute that represents the row-id column, if present in the
    # dataset.
    row_id_attribute=None,
    # Attributes that should be excluded in modelling, such as identifiers and
    # indexes.
    ignore_attribute=None,
    # How to cite the paper.
    citation=citation,
    # Attributes of the data
    attributes=attributes,
    data=data,
    # A version label which is provided by the user.
    version_label='test',
    original_data_url=(
        'http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html'),
    paper_url=paper_url,
)
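The example above relies on name, description, citation, attributes, data, and paper_url being defined beforehand. A sketch of one plausible way to prepare them from scikit-learn's diabetes data; the literal strings (name, citation, paper_url) are illustrative rather than taken from the original tutorial:

import numpy as np
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()

name = 'Diabetes(scikit-learn)'
description = diabetes['DESCR']
citation = (
    'Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani '
    '(2004), "Least Angle Regression", Annals of Statistics'
)
paper_url = 'https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf'

# Feature columns plus the regression target as the final 'class' column.
attributes = [(col, 'REAL') for col in diabetes['feature_names']] + \
             [('class', 'REAL')]
data = np.concatenate(
    (diabetes['data'], diabetes['target'].reshape((-1, 1))),
    axis=1,
)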
    def test_create_dataset_sparse(self):

        # test the scipy.sparse.coo_matrix
        sparse_data = scipy.sparse.coo_matrix((
            [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
            ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
        ))

        column_names = [
            ('input1', 'REAL'),
            ('input2', 'REAL'),
            ('y', 'REAL'),
        ]

        xor_dataset = create_dataset(
            name="%s-XOR" % self._get_sentinel(),
            description='Dataset representing the XOR operation',
            creator=None,
            contributor=None,
            collection_date=None,
            language='English',
            licence=None,
            default_target_attribute='y',
            row_id_attribute=None,
            ignore_attribute=None,
            citation=None,
            attributes=column_names,
            data=sparse_data,
            version_label='test',
        )

        upload_did = xor_dataset.publish()
        self.assertEqual(
            _get_online_dataset_arff(upload_did),
            xor_dataset._dataset,
            "Uploaded ARFF does not match original one"
        )
        self.assertEqual(
            _get_online_dataset_format(upload_did),
            'sparse_arff',
            "Wrong format for dataset"
        )

        # test the list of dicts sparse representation
        sparse_data = [
            {0: 0.0},
            {1: 1.0, 2: 1.0},
            {0: 1.0, 2: 1.0},
            {0: 1.0, 1: 1.0}
        ]

        xor_dataset = create_dataset(
            name="%s-XOR" % self._get_sentinel(),
            description='Dataset representing the XOR operation',
            creator=None,
            contributor=None,
            collection_date=None,
            language='English',
            licence=None,
            default_target_attribute='y',
            row_id_attribute=None,
            ignore_attribute=None,
            citation=None,
            attributes=column_names,
            data=sparse_data,
            version_label='test',
        )

        upload_did = xor_dataset.publish()
        self.assertEqual(
            _get_online_dataset_arff(upload_did),
            xor_dataset._dataset,
            "Uploaded ARFF does not match original one"
        )
        self.assertEqual(
            _get_online_dataset_format(upload_did),
            'sparse_arff',
            "Wrong format for dataset"
        )
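Both sparse inputs above describe the same matrix: in the list-of-dicts form, each dict is one row mapping column index to value, and omitted entries are treated as zeros. A small sketch of how the coo_matrix could be converted into that representation (the helper name is made up for illustration):

import scipy.sparse


def coo_to_dict_rows(matrix: scipy.sparse.coo_matrix) -> list:
    """Convert a COO matrix into the list-of-dicts row representation."""
    rows = [{} for _ in range(matrix.shape[0])]
    for row, col, value in zip(matrix.row, matrix.col, matrix.data):
        # Cast to built-in types so the result matches the literal above.
        rows[int(row)][int(col)] = float(value)
    return rows


sparse_data = scipy.sparse.coo_matrix((
    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
))
print(coo_to_dict_rows(sparse_data))
# [{0: 0.0}, {1: 1.0, 2: 1.0}, {0: 1.0, 2: 1.0}, {0: 1.0, 1: 1.0}]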