def test_create_dataset_list(self):
        """Publish a list-of-lists dataset and verify what is stored online.

        After publishing, the ARFF fetched for the returned dataset id must
        equal ``dataset._dataset`` and the reported format must be
        ``'arff'``.
        """
        # One (name, type) entry per column of a row; a list as type gives
        # the values that column may take.
        weather_attributes = [
            ('rnd_str', 'STRING'),
            ('outlook', ['sunny', 'overcast', 'rainy']),
            ('temperature', 'REAL'),
            ('humidity', 'REAL'),
            ('windy', ['TRUE', 'FALSE']),
            ('play', ['yes', 'no']),
        ]

        weather_rows = [
            ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
            ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
            ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
            ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
            ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes'],
            ['f', 'rainy', 65.0, 70.0, 'TRUE', 'no'],
            ['g', 'overcast', 64.0, 65.0, 'TRUE', 'yes'],
            ['h', 'sunny', 72.0, 95.0, 'FALSE', 'no'],
            ['i', 'sunny', 69.0, 70.0, 'FALSE', 'yes'],
            ['j', 'rainy', 75.0, 80.0, 'FALSE', 'yes'],
            ['k', 'sunny', 75.0, 70.0, 'TRUE', 'yes'],
            ['l', 'overcast', 72.0, 90.0, 'TRUE', 'yes'],
            ['m', 'overcast', 81.0, 75.0, 'FALSE', 'yes'],
            ['n', 'rainy', 71.0, 91.0, 'TRUE', 'no'],
        ]

        # Descriptive metadata, kept apart from the actual payload.
        metadata = dict(
            name="%s-ModifiedWeather" % self._get_sentinel(),
            description=(
                'Testing dataset upload when the data is a list of lists'
            ),
            creator='OpenML test',
            contributor=None,
            collection_date='21-09-2018',
            language='English',
            licence='MIT',
            default_target_attribute='play',
            row_id_attribute=None,
            ignore_attribute=None,
            citation='None',
            version_label='test',
            original_data_url='http://openml.github.io/openml-python',
            paper_url='http://openml.github.io/openml-python',
        )

        weather = create_dataset(
            attributes=weather_attributes,
            data=weather_rows,
            **metadata
        )
        online_id = weather.publish()

        online_arff = _get_online_dataset_arff(online_id)
        self.assertEqual(
            online_arff,
            weather._dataset,
            "Uploaded ARFF does not match original one",
        )

        online_format = _get_online_dataset_format(online_id)
        self.assertEqual(online_format, 'arff', "Wrong format for dataset")
    def test_create_dataset_numpy(self):
        """Publish a dataset built from a NumPy array and verify the upload.

        All columns are declared ``'REAL'`` and the last one is used as the
        default target.  The ARFF fetched for the returned dataset id must
        equal ``dataset._dataset`` and the format must be ``'arff'``.
        """
        # Four lists of three values, transposed: shape (3, 4).
        matrix = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T
        n_columns = matrix.shape[1]

        column_names = ['col_{}'.format(idx) for idx in range(n_columns)]
        real_attributes = [(column, 'REAL') for column in column_names]

        metadata = dict(
            name='%s-NumPy_testing_dataset' % self._get_sentinel(),
            description='Synthetic dataset created from a NumPy array',
            creator='OpenML tester',
            contributor=None,
            collection_date='01-01-2018',
            language='English',
            licence='MIT',
            default_target_attribute='col_{}'.format(n_columns - 1),
            row_id_attribute=None,
            ignore_attribute=None,
            citation='None',
            version_label='test',
            original_data_url='http://openml.github.io/openml-python',
            paper_url='http://openml.github.io/openml-python',
        )
        numpy_dataset = create_dataset(
            attributes=real_attributes, data=matrix, **metadata
        )

        online_id = numpy_dataset.publish()

        online_arff = _get_online_dataset_arff(online_id)
        self.assertEqual(
            online_arff,
            numpy_dataset._dataset,
            "Uploaded arff does not match original one",
        )

        online_format = _get_online_dataset_format(online_id)
        self.assertEqual(online_format, 'arff', "Wrong format for dataset")
 def test_create_dataset_row_id_attribute_inference(self):
     """Check how ``row_id_attribute`` is resolved for DataFrame input.

     Each combination of an explicit ``row_id_attribute`` (``None`` or
     ``'integer'``) and a DataFrame index name (``None`` or
     ``'index_name'``) is turned into a dataset and published.  An
     explicit value is expected to be kept; without one, the index name
     (if any) is expected to be used.  When the index is named, the data
     fetched back is expected to hold one extra column.
     """
     # Metadata shared by every dataset created below.
     fixed_kwargs = dict(
         name='%s-pandas_testing_dataset' % self._get_sentinel(),
         description='Synthetic dataset created from a Pandas DataFrame',
         creator='OpenML tester',
         contributor=None,
         collection_date='01-01-2018',
         language='English',
         licence='MIT',
         default_target_attribute='target',
         ignore_attribute=None,
         citation='None',
         attributes='auto',
         version_label='test',
         original_data_url='http://openml.github.io/openml-python',
         paper_url='http://openml.github.io/openml-python',
     )

     frame = pd.DataFrame(
         [['a', 1, 0],
          ['b', 2, 1],
          ['c', 3, 0],
          ['d', 4, 1],
          ['e', 5, 0]],
         columns=['rnd_str', 'integer', 'target'],
     )

     # (row_id_attribute passed in, DataFrame index name, expected row id)
     scenarios = [
         (None, None, None),
         (None, 'index_name', 'index_name'),
         ('integer', None, 'integer'),
         ('integer', 'index_name', 'integer'),
     ]

     for requested_id, index_label, resolved_id in scenarios:
         # The same frame is reused; only its index name changes.
         frame.index.name = index_label
         dataset = openml.datasets.functions.create_dataset(
             data=frame,
             row_id_attribute=requested_id,
             **fixed_kwargs
         )
         self.assertEqual(dataset.row_id_attribute, resolved_id)

         online_id = dataset.publish()
         online_arff = arff.loads(_get_online_dataset_arff(online_id))
         online_rows = np.array(online_arff['data'], dtype=object)

         # A named index is added to the data as an additional column.
         if index_label is None:
             expected_shape = (5, 3)
         else:
             expected_shape = (5, 4)
         self.assertEqual(online_rows.shape, expected_shape)
    def test_get_online_dataset_arff(self):
        """Compare ``dataset._get_arff`` with the decoded online ARFF.

        Dataset 100 (Australian) is fetched, and the result of its
        ``_get_arff`` call must equal the online ARFF text decoded with
        nominal encoding enabled.
        """
        australian_id = 100
        # Fetched without lazy loading, since the ARFF file is inspected.
        dataset = openml.datasets.get_dataset(australian_id)
        decoder = arff.ArffDecoder()
        data_format = dataset.format.lower()

        # Dense layout for plain 'arff'; coordinate layout otherwise.
        if data_format == 'arff':
            layout = arff.DENSE
        else:
            layout = arff.COO

        local_arff = dataset._get_arff(data_format)
        online_arff = decoder.decode(
            _get_online_dataset_arff(australian_id),
            encode_nominal=True,
            return_type=layout,
        )
        self.assertEqual(local_arff, online_arff, "ARFF files are not equal")
    def test_create_dataset_numpy(self):
        """Round-trip a NumPy-backed dataset through publish and download.

        Every column is typed ``'REAL'``; the last column serves as default
        target.  The downloaded ARFF must equal ``dataset._dataset`` and
        the reported format must be ``'arff'``.
        """
        # NOTE(review): a method with this same name is defined earlier in
        # this file; if both end up in one class, only this later
        # definition is kept -- confirm which one is intended.
        source_rows = [
            [1, 2, 3],
            [1.2, 2.5, 3.8],
            [2, 5, 8],
            [0, 1, 0],
        ]
        # Transposed, so the array has shape (3, 4).
        values = np.array(source_rows).T
        last_column = values.shape[1] - 1

        attribute_spec = []
        for position in range(values.shape[1]):
            attribute_spec.append(('col_{}'.format(position), 'REAL'))

        synthetic = create_dataset(
            name='%s-NumPy_testing_dataset' % self._get_sentinel(),
            description='Synthetic dataset created from a NumPy array',
            creator='OpenML tester',
            contributor=None,
            collection_date='01-01-2018',
            language='English',
            licence='MIT',
            default_target_attribute='col_{}'.format(last_column),
            row_id_attribute=None,
            ignore_attribute=None,
            citation='None',
            attributes=attribute_spec,
            data=values,
            version_label='test',
            original_data_url='http://openml.github.io/openml-python',
            paper_url='http://openml.github.io/openml-python',
        )

        published_id = synthetic.publish()

        downloaded_arff = _get_online_dataset_arff(published_id)
        self.assertEqual(downloaded_arff, synthetic._dataset,
                         "Uploaded arff does not match original one")

        downloaded_format = _get_online_dataset_format(published_id)
        self.assertEqual(downloaded_format, 'arff',
                         "Wrong format for dataset")
    def test_get_online_dataset_arff(self):
        """Check the dataset's own ARFF against the online ARFF text.

        Uses dataset 100 (Australian): the value returned by
        ``dataset._get_arff`` must equal the online ARFF decoded with
        ``encode_nominal=True``.
        """
        # NOTE(review): a method with this same name is defined earlier in
        # this file; if both end up in one class, only this later
        # definition is kept -- confirm which one is intended.
        australian = 100
        dataset = openml.datasets.get_dataset(australian)
        decoder = arff.ArffDecoder()
        fmt = dataset.format.lower()

        # Anything other than plain 'arff' is decoded in COO layout.
        return_type = arff.COO
        if fmt == 'arff':
            return_type = arff.DENSE

        from_dataset = dataset._get_arff(fmt)
        from_server = decoder.decode(
            _get_online_dataset_arff(australian),
            encode_nominal=True,
            return_type=return_type,
        )
        self.assertEqual(from_dataset, from_server,
                         "ARFF files are not equal")
    def test_create_dataset_pandas(self):
        """Publish datasets built from pandas objects and verify the uploads.

        Three cases are exercised, each with a default target that is an
        actual column of the data being uploaded:

        * a dense ``DataFrame`` with category/bool columns and inferred
          attributes;
        * a ``SparseDataFrame`` built from a scipy COO matrix, which must be
          reported online in ``'sparse_arff'`` format;
        * a ``DataFrame`` whose inferred attributes are replaced by an
          explicit mapping, which must show up in the downloaded ARFF.

        In every case the downloaded ARFF must equal ``dataset._dataset``.
        """
        data = [
            ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
            ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
            ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
            ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
            ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes']
        ]
        column_names = ['rnd_str', 'outlook', 'temperature', 'humidity',
                        'windy', 'play']
        df = pd.DataFrame(data, columns=column_names)
        # enforce the type of each column
        df['outlook'] = df['outlook'].astype('category')
        # NOTE(review): casting the strings 'TRUE'/'FALSE' to bool looks
        # like it yields True for every row (non-empty strings are truthy);
        # harmless for the round-trip check below, but confirm intent.
        df['windy'] = df['windy'].astype('bool')
        df['play'] = df['play'].astype('category')

        # Meta-information shared by all three uploads.
        shared_metadata = dict(
            name='%s-pandas_testing_dataset' % self._get_sentinel(),
            creator='OpenML tester',
            contributor=None,
            collection_date='01-01-2018',
            language='English',
            licence='MIT',
            row_id_attribute=None,
            ignore_attribute=None,
            citation='None',
            version_label='test',
            original_data_url='http://openml.github.io/openml-python',
            paper_url='http://openml.github.io/openml-python',
        )

        dataset = openml.datasets.functions.create_dataset(
            description='Synthetic dataset created from a Pandas DataFrame',
            default_target_attribute='play',
            attributes='auto',
            data=df,
            **shared_metadata
        )
        upload_did = dataset.publish()
        self.assertEqual(
            _get_online_dataset_arff(upload_did),
            dataset._dataset,
            "Uploaded ARFF does not match original one"
        )

        # Check that SparseDataFrame are supported properly
        sparse_data = scipy.sparse.coo_matrix((
            [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
            ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
        ))
        column_names = ['input1', 'input2', 'y']
        df = pd.SparseDataFrame(sparse_data, columns=column_names)
        description = 'Synthetic dataset created from a Pandas SparseDataFrame'
        dataset = openml.datasets.functions.create_dataset(
            description=description,
            # The target has to be one of this frame's own columns; 'play'
            # only exists in the dense frame above.
            default_target_attribute='y',
            attributes='auto',
            data=df,
            **shared_metadata
        )
        upload_did = dataset.publish()
        self.assertEqual(
            _get_online_dataset_arff(upload_did),
            dataset._dataset,
            "Uploaded ARFF does not match original one"
        )
        self.assertEqual(
            _get_online_dataset_format(upload_did),
            'sparse_arff',
            "Wrong format for dataset"
        )

        # Check that we can overwrite the attributes
        data = [['a'], ['b'], ['c'], ['d'], ['e']]
        column_names = ['rnd_str']
        df = pd.DataFrame(data, columns=column_names)
        df['rnd_str'] = df['rnd_str'].astype('category')
        # More categories than actually occur in the column.
        attributes = {'rnd_str': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}
        dataset = openml.datasets.functions.create_dataset(
            # The description is carried over from the sparse case above,
            # exactly as before.
            description=description,
            # 'rnd_str' is the only column of this frame.
            default_target_attribute='rnd_str',
            attributes=attributes,
            data=df,
            **shared_metadata
        )
        upload_did = dataset.publish()
        downloaded_data = _get_online_dataset_arff(upload_did)
        self.assertEqual(
            downloaded_data,
            dataset._dataset,
            "Uploaded ARFF does not match original one"
        )
        # assertIn reports both operands on failure, unlike assertTrue.
        self.assertIn(
            '@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}', downloaded_data)
    def test_create_dataset_sparse(self):
        """Publish the XOR dataset from two sparse inputs and verify both.

        The same data is supplied first as a ``scipy.sparse.coo_matrix``
        and then as a list of ``{column: value}`` dicts.  For each upload
        the downloaded ARFF must equal ``dataset._dataset`` and the format
        reported online must be ``'sparse_arff'``.
        """
        coo_payload = scipy.sparse.coo_matrix((
            [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
            ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
        ))

        xor_attributes = [
            (column, 'REAL') for column in ('input1', 'input2', 'y')
        ]

        def publish_and_verify(payload):
            """Create, publish and check one XOR dataset for ``payload``."""
            xor_dataset = create_dataset(
                name="%s-XOR" % self._get_sentinel(),
                description='Dataset representing the XOR operation',
                creator=None,
                contributor=None,
                collection_date=None,
                language='English',
                licence=None,
                default_target_attribute='y',
                row_id_attribute=None,
                ignore_attribute=None,
                citation=None,
                attributes=xor_attributes,
                data=payload,
                version_label='test',
            )

            online_id = xor_dataset.publish()

            online_arff = _get_online_dataset_arff(online_id)
            self.assertEqual(
                online_arff,
                xor_dataset._dataset,
                "Uploaded ARFF does not match original one",
            )

            online_format = _get_online_dataset_format(online_id)
            self.assertEqual(
                online_format,
                'sparse_arff',
                "Wrong format for dataset",
            )

        # test the scipy.sparse.coo_matrix
        publish_and_verify(coo_payload)

        # test the list of dicts sparse representation
        dict_payload = [
            {0: 0.0},
            {1: 1.0, 2: 1.0},
            {0: 1.0, 2: 1.0},
            {0: 1.0, 1: 1.0},
        ]
        publish_and_verify(dict_payload)