Example #1
    def test_arffreader_performance(self):

        attributes = "\n".join(
            [f"@attribute {i} {{1,2}}" for i in range(1000)])
        data_line = ",".join(["1"] * 1000)
        data_lines = "\n".join([data_line] * 700)

        arff = f"{attributes}\n@data\n{data_lines}".split('\n')

        reader = ArffReader()
        time = timeit.timeit(lambda: list(reader.filter(arff)), number=1)

        #.045 was my final time; the 0.45 bound leaves ~10x headroom for slower machines
        self.assertLess(time, 0.45)
Example #2
    def test_no_lazy_encoding_no_header_indexes_sparse(self):
        lines = [
            "@relation news20",
            "@attribute a numeric",
            "@attribute b numeric",
            "@attribute c {class_B, class_C, class_D}",
            "@data",
            "{0 1,1 2,2 class_B}",
            "{0 2,1 3,2 class_C}",
        ]

        expected = [{
            0: 1,
            1: 2,
            2: (0, 1, 0, 0)
        }, {
            0: 2,
            1: 3,
            2: (0, 0, 1, 0)
        }]

        actual = list(
            ArffReader(lazy_encoding=False,
                       header_indexing=False).filter(lines))

        self.assertEqual(expected, actual)
        self.assertIsInstance(actual[0], dict)
        self.assertIsInstance(actual[1], dict)
Example #3
    def test_sparse_with_spaces_after_comma(self):
        lines = [
            "@relation news20",
            "@attribute a numeric",
            "@attribute b numeric",
            "@attribute c {class_B, class_C, class_D}",
            "@data",
            "{0 2, 1 3}",
            "{0 1, 1 1,2 class_B}",
            "{1 1}",
            "{0 1, 2 class_D}",
        ]

        expected = [{
            0: 2,
            1: 3,
            2: (1, 0, 0, 0)
        }, {
            0: 1,
            1: 1,
            2: (0, 1, 0, 0)
        }, {
            1: 1,
            2: (1, 0, 0, 0)
        }, {
            0: 1,
            2: (0, 0, 0, 1)
        }]

        self.assertEqual(expected, list(ArffReader().filter(lines)))
Example #4
    def test_dense_with_strings(self):
        lines = [
            "@relation news20", "@attribute a string", "@attribute b string",
            "@attribute c {0, class_B, class_C, class_D}", "@data",
            "1,2,class_B", "2,3,0"
        ]

        expected = [['1', '2', (0, 1, 0, 0)], ['2', '3', (1, 0, 0, 0)]]

        self.assertEqual(expected, list(ArffReader().filter(lines)))
Example #5
    def test_dense_with_empty_lines(self):
        lines = [
            "@relation news20", "@attribute A numeric", "@attribute B numeric",
            "@attribute C {0, class_B, class_C, class_D}", "@data", "", "",
            "1,2,class_B", "2,3,0", ""
        ]

        expected = [[1, 2, (0, 1, 0, 0)], [2, 3, (1, 0, 0, 0)]]

        self.assertEqual(expected, list(ArffReader().filter(lines)))
Example #6
    def test_dense_with_comments(self):
        lines = [
            "%This is a comment", "@relation news20", "@attribute a numeric",
            "@attribute b numeric",
            "@attribute c {0, class_B, class_C, class_D}", "@data",
            "1,2,class_B", "2,3,0"
        ]

        expected = [[1, 2, (0, 1, 0, 0)], [2, 3, (1, 0, 0, 0)]]

        self.assertEqual(expected, list(ArffReader().filter(lines)))
Example #7
    def test_leading_and_trailing_comments(self):
        lines = [
            "%", "%", "@relation news20", "@attribute a string",
            "@attribute b string",
            "@attribute c {0, class_B, class_C, class_D}", "@data",
            "1,2,class_B", "2,3,0", "%"
        ]

        expected = [['1', '2', (0, 1, 0, 0)], ['2', '3', (1, 0, 0, 0)]]

        self.assertEqual(expected, list(ArffReader().filter(lines)))
Example #8
    def test_sans_data(self):
        lines = [
            "@relation news20",
            "@attribute a numeric",
            "@attribute B numeric",
            "@attribute c {0, class_B, class_C, class_D}",
            "@data",
        ]

        expected = []

        self.assertEqual(expected, list(ArffReader().filter(lines)))
Example #9
    def read(self) -> Iterable[Tuple[Any, Any]]:
        """Read and parse the openml source."""
        try:
            dataset_description = self._get_dataset_description(self._data_id)

            if dataset_description['status'] == 'deactivated':
                raise CobaException(
                    f"Openml {self._data_id} has been deactivated. This is often due to flags on the data."
                )

            feature_descriptions = self._get_feature_descriptions(
                self._data_id)
            task_descriptions = self._get_task_descriptions(self._data_id)

            is_ignore = lambda r: (r['is_ignore'] == 'true'
                                   or r['is_row_identifier'] == 'true'
                                   or r['data_type'] not in ['numeric', 'nominal'])

            ignore = [
                self._name_cleaning(f['name']) for f in feature_descriptions
                if is_ignore(f)
            ]
            target = self._name_cleaning(
                self._get_target_for_problem_type(task_descriptions))

            if target in ignore: ignore.remove(target)

            def row_has_missing_values(row):
                row_values = row._values.values() if isinstance(
                    row, SparseWithMeta) else row._values
                return "?" in row_values or "" in row_values

            source = ListSource(
                self._get_dataset_lines(dataset_description["file_id"], None))
            reader = ArffReader(cat_as_str=self._cat_as_str)
            drop = Drop(drop_cols=ignore, drop_row=row_has_missing_values)
            structure = Structure([None, target])

            return Pipes.join(source, reader, drop, structure).read()

        except KeyboardInterrupt:
            #we don't want to clear the cache in the case of a KeyboardInterrupt
            raise

        except CobaException:
            #we don't want to clear the cache if it is an error we know about (the original raise should clear if needed)
            raise

        except Exception:
            #if something unexpected went wrong clear the cache just in case it was corrupted somehow
            self._clear_cache()
            raise
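
For context, read() above builds its whole pipeline with Pipes.join and only consumes it at the end. Below is a minimal sketch of that composition pattern, assuming ListSource and Pipes are importable from coba.pipes alongside ArffReader (as the surrounding snippets suggest):

    from coba.pipes import ArffReader, ListSource, Pipes

    lines  = ["@relation r", "@attribute a numeric", "@data", "1", "2"]
    source = ListSource(lines)                              # raw ARFF lines held in memory
    rows   = list(Pipes.join(source, ArffReader()).read())  # compose stages, then read once
    print(rows)                                             # expected: [[1], [2]]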
Example #10
    def test_all_good_tipes_do_not_raise_exception(self):
        lines = [
            "@relation news20",
            "@ATTRIBUTE a numeric",
            "@ATTRIBUTE b integer",
            "@ATTRIBUTE c real",
            "@attribute d    date",
            "@attribute e   {class_B, class_C, class_D}",
            "@attribute f relational",
            "@data",
        ]

        list(ArffReader().filter(lines))
Example #11
    def test_dense_with_missing_value(self):
        lines = [
            "@relation news20",
            "@attribute a numeric",
            "@attribute B numeric",
            "@attribute c {class_B, class_C, class_D}",
            "@data",
            "1,2,class_B",
            "2,3,?",
        ]

        expected = [[1, 2, (1, 0, 0)], [2, 3, None]]

        self.assertEqual(expected, list(ArffReader().filter(lines)))
Example #12
    def test_str_as_cat(self):
        lines = [
            "@relation news20",
            "@attribute A numeric",
            "@attribute C {0, class_B, class_C, class_D}",
            "@data",
            "1,class_B",
            "2,0",
        ]

        expected = [[1, "class_B"], [2, "0"]]

        self.assertEqual(expected,
                         list(ArffReader(cat_as_str=True).filter(lines)))
Example #13
    def test_headers_with_quotes_and_pct(self):
        lines = [
            "@relation news20",
            "@attribute 'a%3' string",
            "@attribute 'b%4' string",
            "@attribute 'c%5' {class_B, class_C, class_D}",
            "@data",
            "1,2,class_B",
            "2,3,class_C",
        ]

        expected = [['1', '2', (1, 0, 0)], ['2', '3', (0, 1, 0)]]

        self.assertEqual(expected, list(ArffReader().filter(lines)))
Example #14
    def test_dense_with_spaces_after_commas(self):
        lines = [
            "@relation news20",
            "@attribute a numeric",
            "@attribute B numeric",
            "@attribute c {class_B, class_C, class_D}",
            "@data",
            "1,  2,  class_B",
            "2,  3,  class_C",
        ]

        expected = [[1, 2, (1, 0, 0)], [2, 3, (0, 1, 0)]]

        self.assertEqual(expected, list(ArffReader().filter(lines)))
Example #15
    def test_quotes_from_hell_dense_cat_as_str_true_bad_categories(self):
        lines = [
            "@relation news20",
            "@attribute 'A  a' numeric",
            "@attribute '\"' {0, \"class'B\", '\"class_C\"', 'class\",D'}",
            "@attribute '\'' {0, \"class'B\", '\"class_C\"', 'class\",D'}",
            "@attribute ',' {0, \"class'B\", '\"class_C\"', 'class\",D'}",
            "@data",
            "1,    'class\'B', '\"class_C\"', 'class\",G'",
        ]

        with self.assertRaises(CobaException):
            list(
                ArffReader(cat_as_str=True, lazy_encoding=False).filter(lines))
Example #16
    def test_skip_encoding(self):
        lines = [
            "@relation news20",
            "@attribute a numeric",
            "@attribute B numeric",
            "@attribute c {class_B, class_C, class_D}",
            "@data",
            "1,  2,  class_B",
            "2,  3,  class_C",
        ]

        expected = [['1', '2', 'class_B'], ['2', '3', 'class_C']]

        self.assertEqual(expected,
                         list(ArffReader(skip_encoding=True).filter(lines)))
Example #17
    def test_tab_delimieted_attributes(self):
        lines = [
            "@relation news20",
            "@attribute a	string",
            '@attribute b	string',
            "@attribute c	{class_B, class_C, class_D}",
            "@data",
            "1,2,class_B",
        ]

        expected = [
            ['1', '2', (1, 0, 0)],
        ]

        self.assertEqual(expected, list(ArffReader().filter(lines)))
Example #18
    def test_spaces_in_attribute_name(self):
        lines = [
            "@relation news20",
            "@attribute 'a a' string",
            '@attribute "b b" string',
            "@attribute 'c c' {class_B, class_C, class_D}",
            "@data",
            "1,2,class_B",
        ]

        expected = [
            ['1', '2', (1, 0, 0)],
        ]

        self.assertEqual(expected, list(ArffReader().filter(lines)))
Example #19
    def test_capitalized_attribute(self):
        lines = [
            "@relation news20",
            "@ATTRIBUTE 'a\\'a' numeric",
            "@attribute 'b b' string",
            "@attribute 'c c' {class_B, class_C, class_D}",
            "@data",
            "1,2,class_B",
        ]

        expected = [
            [1, '2', (1, 0, 0)],
        ]

        self.assertEqual(expected, list(ArffReader().filter(lines)))
Example #20
    def test_bad_tipe_raises_exception(self):
        lines = [
            "@relation news20",
            "@ATTRIBUTE 'a\\'a' numeric",
            "@attribute 'b b' abcd",
            "@attribute 'c c' {class_B, class_C, class_D}",
            "@data",
            "1,2,class_B",
        ]

        with self.assertRaises(CobaException) as ex:
            list(ArffReader().filter(lines))

        self.assertEqual(
            'An unrecognized encoding was found in the arff attributes: abcd.',
            str(ex.exception))
Example #21
    def test_bad_class_labels_throws_exception(self):
        lines = [
            "@relation news20",
            "@attribute 'a' string",
            "@attribute 'b' string",
            "@attribute 'c' {class_B, class_C, class_D}",
            "@data",
            "1,2,class_A",
        ]

        with self.assertRaises(CobaException) as e:
            list(list(ArffReader().filter(lines))[0])

        self.assertIn(
            "We were unable to find one of the categorical values in the arff data.",
            str(e.exception))
Example #22
    def test_max_unknown_sparse_elements(self):
        lines = [
            "@relation news20",
            "@attribute A numeric",
            "@attribute C {0, class_B, class_C, class_D}",
            "@data",
            "{0 2,1 3,2 4}",
        ]

        with self.assertRaises(CobaException) as e:
            list(ArffReader().filter(lines))

        self.assertEqual(
            str(e.exception),
            "We were unable to parse line 0 in a way that matched the expected attributes."
        )
Example #23
    def test_quotes_with_csv(self):
        lines = [
            "@relation news20",
            "@attribute 'value' numeric",
            "@attribute 'class' {'0','1'}",
            "@data",
            "1,'0'",
        ]

        expected = [[1, (1, 0)]]

        items = list(ArffReader(cat_as_str=False).filter(lines))

        self.assertEqual(expected, items)
        self.assertEqual(1, items[0]['value'])
        self.assertEqual((1, 0), items[0]['class'])
Example #24
    def __init__(self,
                 source: Union[str, Source[Iterable[str]]],
                 cat_as_str: bool = False,
                 skip_encoding: bool = False,
                 lazy_encoding: bool = True,
                 header_indexing: bool = True) -> None:
        """Instantiate an ArffSource.

        Args:
            source: The data source. Accepts either a string representing the source location or another Source.
            cat_as_str: Indicates that categorical features should be encoded as strings rather than one-hot encoded.
            skip_encoding: Indicates that features should not be encoded (this means all features will be strings).
            lazy_encoding: Indicates that features should be encoded lazily (this can save time if rows will be dropped).
            header_indexing: Indicates that header data should be preserved so rows can be indexed by header name. 
        """
        source = UrlSource(source) if isinstance(source, str) else source
        reader = ArffReader(cat_as_str, skip_encoding, lazy_encoding,
                            header_indexing)
        self._source = Pipes.join(source, reader)
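
A hypothetical usage sketch for the constructor above. The file path is illustrative, and it assumes ArffSource exposes a read() that delegates to the joined self._source pipeline:

    # "weather.arff" is an assumed path; cat_as_str=True keeps categoricals as strings.
    source = ArffSource("weather.arff", cat_as_str=True)
    for row in source.read():  # assumed to delegate to self._source.read()
        print(row)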
Example #25
    def test_quotes_from_hell_dense_cat_as_str_true_good_categories(self):
        lines = [
            "@relation news20",
            "@attribute 'A  a' numeric",
            "@attribute '\"' {0, \"class'B\", '\"class_C\"', 'class\",D'}",
            "@attribute '\'' {0, \"class'B\", '\"class_C\"', 'class\",D'}",
            "@attribute ',' {0, \"class'B\", '\"class_C\"', 'class\",D'}",
            "@data",
            "1,    'class\'B', '\"class_C\"', 'class\",D'",
        ]

        expected = [[1, "class'B", '"class_C"', 'class",D']]

        items = list(ArffReader(cat_as_str=True).filter(lines))

        self.assertEqual(expected, items)
        self.assertEqual(1, items[0]['A  a'])
        self.assertEqual("class'B", items[0]['"'])
        self.assertEqual('"class_C"', items[0]["'"])
        self.assertEqual('class",D', items[0][","])
Example #26
    def test_no_lazy_encoding_no_header_indexes_dense(self):
        lines = [
            "@relation news20",
            "@attribute a numeric",
            "@attribute b numeric",
            "@attribute c {class_B, class_C, class_D}",
            "@data",
            "1,2,class_B",
            "2,3,class_C",
        ]

        expected = [[1, 2, (1, 0, 0)], [2, 3, (0, 1, 0)]]

        actual = list(
            ArffReader(lazy_encoding=False,
                       header_indexing=False).filter(lines))

        self.assertEqual(expected, actual)
        self.assertIsInstance(actual[0], list)
        self.assertIsInstance(actual[1], list)
Example #27
    def test_quotes_from_hell_dense_cat_as_str_false(self):
        lines = [
            "@relation news20",
            "@attribute 'A  a' numeric",
            "@attribute '\"' {0, \"class'B\", '\"class_C\"', 'class\",D', 'class\\',E', 'class\\'   ,F'}",
            "@attribute '\'' {0, \"class'B\", '\"class_C\"', 'class\",D', 'class\\',E', 'class\\'   ,F'}",
            "@attribute ','  {0, \"class'B\", '\"class_C\"', 'class\",D', 'class\\',E', 'class\\'   ,F'}",
            "@data",
            "1,    'class\\'B', '\"class_C\"', 'class\",D'",
        ]

        expected = [[
            1, (0, 1, 0, 0, 0, 0), (0, 0, 1, 0, 0, 0), (0, 0, 0, 1, 0, 0)
        ]]

        items = list(ArffReader(cat_as_str=False).filter(lines))

        self.assertEqual(expected, items)
        self.assertEqual(1, items[0]['A  a'])
        self.assertEqual((0, 1, 0, 0, 0, 0), items[0]['"'])
        self.assertEqual((0, 0, 1, 0, 0, 0), items[0]["'"])
        self.assertEqual((0, 0, 0, 1, 0, 0), items[0][","])
Example #28
    def test_sparse_categorical_0_value(self):

        #this is a bug in ARFF, it is not uncommon for the first class value in an ARFF class list
        #to be dropped from the actual data because it is encoded as 0. Therefore our ARFF reader
        #automatically adds a 0 value to all categorical one-hot encoders to protect against this.
        #Below is what a dataset with this bug would look like, there is no class_B, instead all
        #class_B's are encoded as 0.
        lines = [
            "@relation news20",
            "@attribute a numeric",
            "@attribute b numeric",
            "@attribute c {class_B, class_C, class_D}",
            "@data",
            "{0 2,1 3}",
            "{0 1,1 1,2 class_C}",
            "{1 1}",
            "{0 1,2 class_D}",
        ]

        expected = [{
            0: 2,
            1: 3,
            2: (1, 0, 0, 0)
        }, {
            0: 1,
            1: 1,
            2: (0, 0, 1, 0)
        }, {
            1: 1,
            2: (1, 0, 0, 0)
        }, {
            0: 1,
            2: (0, 0, 0, 1)
        }]

        self.assertEqual(expected, list(ArffReader().filter(lines)))
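
To make the safeguard described in the comment concrete, here is a toy sketch (not the library's actual encoder) of a one-hot encoding padded with an implicit leading "0" slot, so a sparse row that omits the attribute still decodes cleanly:

    # Hypothetical padded encoder: index 0 is reserved for the omitted/0 value.
    classes = ["0", "class_B", "class_C", "class_D"]
    onehot  = lambda value: tuple(int(c == value) for c in classes)

    assert onehot("0")       == (1, 0, 0, 0)  # an omitted sparse value falls back here
    assert onehot("class_C") == (0, 0, 1, 0)  # matches the expected rows above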
Example #29
    def read(self) -> Tuple[Sequence[Sequence[Any]], Sequence[Any]]:

        #placing some of these at the top would cause circular references
        from coba.encodings import Encoder, NumericEncoder, OneHotEncoder, StringEncoder
        from coba.pipes import ArffReader, CsvReader, Encode, Flatten, Transpose

        d_key = None
        t_key = None
        o_key = None

        try:
            data_id = self._data_id
            md5_checksum = self._md5_checksum

            d_key = f'https://www.openml.org/api/v1/json/data/{data_id}'
            t_key = f'https://www.openml.org/api/v1/json/data/features/{data_id}'

            d_bytes = self._query(d_key, "descr")
            d_object = json.loads(
                d_bytes.decode('utf-8'))["data_set_description"]

            if d_object['status'] == 'deactivated':
                raise Exception(
                    f"Openml {data_id} has been deactivated. This is often due to flags on the data."
                )

            t_bytes = self._query(t_key, "types")
            t_object = json.loads(
                t_bytes.decode('utf-8'))["data_features"]["feature"]

            headers: List[str] = []
            encoders: List[Encoder] = []
            ignored: List[bool] = []
            target: str = ""

            for tipe in t_object:

                headers.append(tipe['name'].lower())
                ignored.append(tipe['is_ignore'] == 'true'
                               or tipe['is_row_identifier'] == 'true')

                if tipe['is_target'] == 'true':
                    target = tipe['name'].lower()

                if tipe['data_type'] == 'numeric':
                    encoders.append(NumericEncoder())
                elif tipe['data_type'] == 'nominal':
                    encoders.append(OneHotEncoder(singular_if_binary=True))
                else:
                    encoders.append(StringEncoder())

            if target == "" or isinstance(encoders[headers.index(target)],
                                          NumericEncoder):
                target = self._get_classification_target(data_id)

            ignored[headers.index(target)] = False
            encoders[headers.index(target)] = StringEncoder()

            csv_url = f"http://www.openml.org/data/v1/get_csv/{d_object['file_id']}"
            arff_url = f"http://www.openml.org/data/v1/download/{d_object['file_id']}"

            try:
                if csv_url in CobaConfig.Cacher or arff_url not in CobaConfig.Cacher:
                    o_key = csv_url
                    o_bytes = self._query(o_key, "obser", md5_checksum)
                    file_rows = list(CsvReader().filter(
                        o_bytes.decode('utf-8').splitlines()))
                else:
                    o_key = arff_url
                    o_bytes = self._query(o_key, "obser", md5_checksum)
                    file_rows = list(
                        ArffReader(skip_encoding=[target]).filter(
                            o_bytes.decode('utf-8').splitlines()))
            except Exception:  #a bare except here would also swallow KeyboardInterrupt
                if o_key == csv_url:
                    o_key = arff_url
                    o_bytes = self._query(o_key, "obser", md5_checksum)
                    file_rows = list(
                        ArffReader(skip_encoding=[target]).filter(
                            o_bytes.decode('utf-8').splitlines()))
                else:
                    o_key = csv_url
                    o_bytes = self._query(o_key, "obser", md5_checksum)
                    file_rows = list(CsvReader().filter(
                        o_bytes.decode('utf-8').splitlines()))

            is_sparse_data = isinstance(file_rows[0], tuple) and len(
                file_rows[0]) == 2

            if is_sparse_data:
                file_headers = [
                    header.lower() for header in file_rows.pop(0)[1]
                ]
            else:
                file_headers = [header.lower() for header in file_rows.pop(0)]

            file_cols = list(Transpose().filter(file_rows))

            for ignored_header in compress(headers, ignored):
                if ignored_header in file_headers:
                    file_cols.pop(file_headers.index(ignored_header))
                    file_headers.remove(ignored_header)

            file_encoders = [
                encoders[headers.index(file_header)]
                for file_header in file_headers
            ]

            file_cols = list(Encode(file_encoders).filter(file_cols))
            label_col = file_cols.pop(file_headers.index(target))
            feature_rows = list(Transpose().filter(
                Flatten().filter(file_cols)))

            #we only cache after all the data has been successfully loaded
            for key, payload in [(d_key, d_bytes), (t_key, t_bytes),
                                 (o_key, o_bytes)]:
                if key not in CobaConfig.Cacher:
                    CobaConfig.Cacher.put(key, payload)

            if is_sparse_data:
                dense_label_col = ['0'] * len(feature_rows)

                for index, value in zip(label_col[0], label_col[1]):
                    dense_label_col[index] = value
            else:
                dense_label_col = list(label_col)

            return feature_rows, dense_label_col

        except KeyboardInterrupt:
            #we don't want to clear the cache in the case of a KeyboardInterrupt
            raise

        except Exception:
            #if something went wrong we want to clear the
            #cache just in case it was corrupted somehow
            for k in [d_key, t_key, o_key]:
                if k is not None: CobaConfig.Cacher.rmv(k)

            raise
Example #30
    def __init__(self, source: Union[str, Source[Iterable[str]]],
                 label_column: Union[str, int]) -> None:
        super().__init__(ArffReader(skip_encoding=[label_column]), source,
                         label_column)