def test_arffreader_performance(self):
    """Parsing a 1000-attribute, 700-row dense ARFF file should finish well under the bound."""
    n_attributes, n_rows = 1000, 700

    header = [f"@attribute {i} {{1,2}}" for i in range(n_attributes)]
    row = ",".join(["1"] * n_attributes)
    arff = header + ["@data"] + [row] * n_rows

    reader = ArffReader()
    elapsed = timeit.timeit(lambda: list(reader.filter(arff)), number=1)

    # Observed ~0.045s locally; assert a 10x cushion to avoid flakiness on slow machines.
    self.assertLess(elapsed, 0.45)
def test_no_lazy_encoding_no_header_indexes_sparse(self):
    """With lazy encoding and header indexing disabled, sparse rows decode to plain dicts."""
    src = [
        "@relation news20",
        "@attribute a numeric",
        "@attribute b numeric",
        "@attribute c {class_B, class_C, class_D}",
        "@data",
        "{0 1,1 2,2 class_B}",
        "{0 2,1 3,2 class_C}",
    ]

    want = [
        {0: 1, 1: 2, 2: (0, 1, 0, 0)},
        {0: 2, 1: 3, 2: (0, 0, 1, 0)},
    ]

    rows = list(ArffReader(lazy_encoding=False, header_indexing=False).filter(src))

    self.assertEqual(want, rows)
    for row in rows:
        self.assertIsInstance(row, dict)
def test_sparse_with_spaces_after_comma(self):
    """Sparse data lines may have spaces after the separating commas."""
    src = [
        "@relation news20",
        "@attribute a numeric",
        "@attribute b numeric",
        "@attribute c {class_B, class_C, class_D}",
        "@data",
        "{0 2, 1 3}",
        "{0 1, 1 1,2 class_B}",
        "{1 1}",
        "{0 1, 2 class_D}",
    ]

    want = [
        {0: 2, 1: 3, 2: (1, 0, 0, 0)},
        {0: 1, 1: 1, 2: (0, 1, 0, 0)},
        {1: 1, 2: (1, 0, 0, 0)},
        {0: 1, 2: (0, 0, 0, 1)},
    ]

    self.assertEqual(want, list(ArffReader().filter(src)))
def test_dense_with_strings(self):
    """String attributes remain strings while the nominal column is one-hot encoded."""
    src = [
        "@relation news20",
        "@attribute a string",
        "@attribute b string",
        "@attribute c {0, class_B, class_C, class_D}",
        "@data",
        "1,2,class_B",
        "2,3,0",
    ]

    want = [
        ['1', '2', (0, 1, 0, 0)],
        ['2', '3', (1, 0, 0, 0)],
    ]

    self.assertEqual(want, list(ArffReader().filter(src)))
def test_dense_with_empty_lines(self):
    """Blank lines before, between, and after data rows are skipped."""
    src = [
        "@relation news20",
        "@attribute A numeric",
        "@attribute B numeric",
        "@attribute C {0, class_B, class_C, class_D}",
        "@data",
        "",
        "",
        "1,2,class_B",
        "2,3,0",
        "",
    ]

    want = [
        [1, 2, (0, 1, 0, 0)],
        [2, 3, (1, 0, 0, 0)],
    ]

    self.assertEqual(want, list(ArffReader().filter(src)))
def test_dense_with_comments(self):
    """A leading %-comment line is ignored by the reader."""
    src = [
        "%This is a comment",
        "@relation news20",
        "@attribute a numeric",
        "@attribute b numeric",
        "@attribute c {0, class_B, class_C, class_D}",
        "@data",
        "1,2,class_B",
        "2,3,0",
    ]

    want = [
        [1, 2, (0, 1, 0, 0)],
        [2, 3, (1, 0, 0, 0)],
    ]

    self.assertEqual(want, list(ArffReader().filter(src)))
def test_leading_and_trailing_comments(self):
    """Comment lines at both the top and the bottom of the file are ignored."""
    src = [
        "%",
        "%",
        "@relation news20",
        "@attribute a string",
        "@attribute b string",
        "@attribute c {0, class_B, class_C, class_D}",
        "@data",
        "1,2,class_B",
        "2,3,0",
        "%",
    ]

    want = [
        ['1', '2', (0, 1, 0, 0)],
        ['2', '3', (1, 0, 0, 0)],
    ]

    self.assertEqual(want, list(ArffReader().filter(src)))
def test_sans_data(self):
    """A header with no data rows yields an empty result."""
    src = [
        "@relation news20",
        "@attribute a numeric",
        "@attribute B numeric",
        "@attribute c {0, class_B, class_C, class_D}",
        "@data",
    ]

    self.assertEqual([], list(ArffReader().filter(src)))
def read(self) -> Iterable[Tuple[Any, Any]]:
    """Read and parse the openml source.

    Returns:
        An iterable of (features, label) pairs produced by the joined pipe.

    Raises:
        CobaException: If the openml dataset has been deactivated.

    On any unexpected error the local cache is cleared (in case it was
    corrupted) before the exception is re-raised. Known errors
    (CobaException) and KeyboardInterrupt leave the cache intact.
    """
    try:
        dataset_description = self._get_dataset_description(self._data_id)

        if dataset_description['status'] == 'deactivated':
            raise CobaException(
                f"Openml {self._data_id} has been deactivated. This is often due to flags on the data."
            )

        feature_descriptions = self._get_feature_descriptions(self._data_id)
        task_descriptions = self._get_task_descriptions(self._data_id)

        # A feature is dropped when openml flags it, when it is a row id,
        # or when its type is something we can't encode.
        def is_ignore(r):
            return (r['is_ignore'] == 'true'
                    or r['is_row_identifier'] == 'true'
                    or r['data_type'] not in ['numeric', 'nominal'])

        ignore = [
            self._name_cleaning(f['name'])
            for f in feature_descriptions if is_ignore(f)
        ]
        target = self._name_cleaning(
            self._get_target_for_problem_type(task_descriptions))

        # Never drop the column we are trying to predict.
        if target in ignore:
            ignore.remove(target)

        def row_has_missing_values(row):
            # Sparse rows store their values in a dict; dense rows in a list.
            row_values = row._values.values() if isinstance(
                row, SparseWithMeta) else row._values
            return "?" in row_values or "" in row_values

        source = ListSource(
            self._get_dataset_lines(dataset_description["file_id"], None))
        reader = ArffReader(cat_as_str=self._cat_as_str)
        drop = Drop(drop_cols=ignore, drop_row=row_has_missing_values)
        structure = Structure([None, target])

        return Pipes.join(source, reader, drop, structure).read()

    except KeyboardInterrupt:
        #we don't want to clear the cache in the case of a KeyboardInterrupt
        raise

    except CobaException:
        #we don't want to clear the cache if it is an error we know about (the original raise should clear if needed)
        raise

    except Exception:
        #if something unexpected went wrong clear the cache just in case it was corrupted somehow
        self._clear_cache()
        raise
def test_all_good_tipes_do_not_raise_exception(self):
    """Every supported attribute type parses without raising."""
    src = [
        "@relation news20",
        "@ATTRIBUTE a numeric",
        "@ATTRIBUTE b integer",
        "@ATTRIBUTE c real",
        "@attribute d date",
        "@attribute e {class_B, class_C, class_D}",
        "@attribute f relational",
        "@data",
    ]

    # Materializing the filter is enough; no rows are expected.
    list(ArffReader().filter(src))
def test_dense_with_missing_value(self):
    """A '?' in a dense row decodes to None for that column."""
    src = [
        "@relation news20",
        "@attribute a numeric",
        "@attribute B numeric",
        "@attribute c {class_B, class_C, class_D}",
        "@data",
        "1,2,class_B",
        "2,3,?",
    ]

    want = [
        [1, 2, (1, 0, 0)],
        [2, 3, None],
    ]

    self.assertEqual(want, list(ArffReader().filter(src)))
def test_str_as_cat(self):
    """cat_as_str=True keeps categorical values as their raw strings."""
    src = [
        "@relation news20",
        "@attribute A numeric",
        "@attribute C {0, class_B, class_C, class_D}",
        "@data",
        "1,class_B",
        "2,0",
    ]

    want = [
        [1, "class_B"],
        [2, "0"],
    ]

    self.assertEqual(want, list(ArffReader(cat_as_str=True).filter(src)))
def test_headers_with_quotes_and_pct(self):
    """Quoted attribute names containing '%' are not treated as comments."""
    src = [
        "@relation news20",
        "@attribute 'a%3' string",
        "@attribute 'b%4' string",
        "@attribute 'c%5' {class_B, class_C, class_D}",
        "@data",
        "1,2,class_B",
        "2,3,class_C",
    ]

    want = [
        ['1', '2', (1, 0, 0)],
        ['2', '3', (0, 1, 0)],
    ]

    self.assertEqual(want, list(ArffReader().filter(src)))
def test_dense_with_spaces_after_commas(self):
    """Dense data rows may have spaces after the separating commas."""
    src = [
        "@relation news20",
        "@attribute a numeric",
        "@attribute B numeric",
        "@attribute c {class_B, class_C, class_D}",
        "@data",
        "1, 2, class_B",
        "2, 3, class_C",
    ]

    want = [
        [1, 2, (1, 0, 0)],
        [2, 3, (0, 1, 0)],
    ]

    self.assertEqual(want, list(ArffReader().filter(src)))
def test_quotes_from_hell_dense_cat_as_str_true_bad_categories(self):
    """A value outside the declared categories raises CobaException (eager encoding)."""
    src = [
        "@relation news20",
        "@attribute 'A a' numeric",
        "@attribute '\"' {0, \"class'B\", '\"class_C\"', 'class\",D'}",
        "@attribute '\'' {0, \"class'B\", '\"class_C\"', 'class\",D'}",
        "@attribute ',' {0, \"class'B\", '\"class_C\"', 'class\",D'}",
        "@data",
        "1, 'class\'B', '\"class_C\"', 'class\",G'",  # 'class",G' is not a declared category
    ]

    reader = ArffReader(cat_as_str=True, lazy_encoding=False)

    with self.assertRaises(CobaException):
        list(reader.filter(src))
def test_skip_encoding(self):
    """skip_encoding=True leaves every value as its raw string."""
    src = [
        "@relation news20",
        "@attribute a numeric",
        "@attribute B numeric",
        "@attribute c {class_B, class_C, class_D}",
        "@data",
        "1, 2, class_B",
        "2, 3, class_C",
    ]

    want = [
        ['1', '2', 'class_B'],
        ['2', '3', 'class_C'],
    ]

    self.assertEqual(want, list(ArffReader(skip_encoding=True).filter(src)))
def test_tab_delimieted_attributes(self):
    """Attribute declarations parse regardless of delimiter whitespace."""
    src = [
        "@relation news20",
        "@attribute a string",
        '@attribute b string',
        "@attribute c {class_B, class_C, class_D}",
        "@data",
        "1,2,class_B",
    ]

    self.assertEqual([['1', '2', (1, 0, 0)]], list(ArffReader().filter(src)))
def test_spaces_in_attribute_name(self):
    """Attribute names containing spaces work with both quote styles."""
    src = [
        "@relation news20",
        "@attribute 'a a' string",
        '@attribute "b b" string',
        "@attribute 'c c' {class_B, class_C, class_D}",
        "@data",
        "1,2,class_B",
    ]

    self.assertEqual([['1', '2', (1, 0, 0)]], list(ArffReader().filter(src)))
def test_capitalized_attribute(self):
    """The @ATTRIBUTE keyword is accepted case-insensitively."""
    src = [
        "@relation news20",
        "@ATTRIBUTE 'a\\'a' numeric",
        "@attribute 'b b' string",
        "@attribute 'c c' {class_B, class_C, class_D}",
        "@data",
        "1,2,class_B",
    ]

    self.assertEqual([[1, '2', (1, 0, 0)]], list(ArffReader().filter(src)))
def test_bad_tipe_raises_exception(self):
    """An unknown attribute type raises CobaException with a clear message."""
    src = [
        "@relation news20",
        "@ATTRIBUTE 'a\\'a' numeric",
        "@attribute 'b b' abcd",
        "@attribute 'c c' {class_B, class_C, class_D}",
        "@data",
        "1,2,class_B",
    ]

    with self.assertRaises(CobaException) as ex:
        list(ArffReader().filter(src))

    self.assertEqual(
        'An unrecognized encoding was found in the arff attributes: abcd.',
        str(ex.exception))
def test_bad_class_labels_throws_exception(self):
    """Materializing a row with an undeclared class label raises CobaException."""
    src = [
        "@relation news20",
        "@attribute 'a' string",
        "@attribute 'b' string",
        "@attribute 'c' {class_B, class_C, class_D}",
        "@data",
        "1,2,class_A",
    ]

    with self.assertRaises(CobaException) as e:
        # The row must be materialized to trigger the (lazy) encoding error.
        rows = list(ArffReader().filter(src))
        list(rows[0])

    self.assertIn(
        "We were unable to find one of the categorical values in the arff data.",
        str(e.exception))
def test_max_unknown_sparse_elements(self):
    """A sparse row with more elements than declared attributes raises CobaException."""
    src = [
        "@relation news20",
        "@attribute A numeric",
        "@attribute C {0, class_B, class_C, class_D}",
        "@data",
        "{0 2,1 3,2 4}",
    ]

    with self.assertRaises(CobaException) as e:
        list(ArffReader().filter(src))

    self.assertEqual(
        str(e.exception),
        "We were unable to parse line 0 in a way that matched the expected attributes."
    )
def test_quotes_with_csv(self):
    """Quoted category values match quoted data values; rows index by header name."""
    src = [
        "@relation news20",
        "@attribute 'value' numeric",
        "@attribute 'class' {'0','1'}",
        "@data",
        "1,'0'",
    ]

    rows = list(ArffReader(cat_as_str=False).filter(src))

    self.assertEqual([[1, (1, 0)]], rows)
    self.assertEqual(1, rows[0]['value'])
    self.assertEqual((1, 0), rows[0]['class'])
def __init__(self,
             source: Union[str, Source[Iterable[str]]],
             cat_as_str: bool = False,
             skip_encoding: bool = False,
             lazy_encoding: bool = True,
             header_indexing: bool = True) -> None:
    """Instantiate an ArffSource.

    Args:
        source: The data source. Accepts either a string representing the source location or another Source.
        cat_as_str: Indicates that categorical features should be encoded as a string rather than one hot encoded.
        skip_encoding: Indicates that features should not be encoded (this means all features will be strings).
        lazy_encoding: Indicates that features should be encoded lazily (this can save time if rows will be dropped).
        header_indexing: Indicates that header data should be preserved so rows can be indexed by header name.
    """
    # A bare string is treated as a URL to fetch; anything else is already a Source.
    if isinstance(source, str):
        source = UrlSource(source)

    reader = ArffReader(cat_as_str, skip_encoding, lazy_encoding, header_indexing)

    self._source = Pipes.join(source, reader)
def test_quotes_from_hell_dense_cat_as_str_true_good_categories(self):
    """Heavily quoted/escaped attribute names and category values round-trip as strings."""
    src = [
        "@relation news20",
        "@attribute 'A a' numeric",
        "@attribute '\"' {0, \"class'B\", '\"class_C\"', 'class\",D'}",
        "@attribute '\'' {0, \"class'B\", '\"class_C\"', 'class\",D'}",
        "@attribute ',' {0, \"class'B\", '\"class_C\"', 'class\",D'}",
        "@data",
        "1, 'class\'B', '\"class_C\"', 'class\",D'",
    ]

    rows = list(ArffReader(cat_as_str=True).filter(src))

    self.assertEqual([[1, "class'B", '"class_C"', 'class",D']], rows)

    # Each column is also addressable by its (quoted) header name.
    for header, value in [('A a', 1), ('"', "class'B"), ("'", '"class_C"'), (',', 'class",D')]:
        self.assertEqual(value, rows[0][header])
def test_no_lazy_encoding_no_header_indexes_dense(self):
    """With lazy encoding and header indexing disabled, dense rows decode to plain lists."""
    src = [
        "@relation news20",
        "@attribute a numeric",
        "@attribute b numeric",
        "@attribute c {class_B, class_C, class_D}",
        "@data",
        "1,2,class_B",
        "2,3,class_C",
    ]

    want = [
        [1, 2, (1, 0, 0)],
        [2, 3, (0, 1, 0)],
    ]

    rows = list(ArffReader(lazy_encoding=False, header_indexing=False).filter(src))

    self.assertEqual(want, rows)
    for row in rows:
        self.assertIsInstance(row, list)
def test_quotes_from_hell_dense_cat_as_str_false(self):
    """Heavily quoted/escaped category values one-hot encode and index by header name."""
    src = [
        "@relation news20",
        "@attribute 'A a' numeric",
        "@attribute '\"' {0, \"class'B\", '\"class_C\"', 'class\",D', 'class\\',E', 'class\\' ,F'}",
        "@attribute '\'' {0, \"class'B\", '\"class_C\"', 'class\",D', 'class\\',E', 'class\\' ,F'}",
        "@attribute ',' {0, \"class'B\", '\"class_C\"', 'class\",D', 'class\\',E', 'class\\' ,F'}",
        "@data",
        "1, 'class\\'B', '\"class_C\"', 'class\",D'",
    ]

    rows = list(ArffReader(cat_as_str=False).filter(src))

    self.assertEqual(
        [[1, (0, 1, 0, 0, 0, 0), (0, 0, 1, 0, 0, 0), (0, 0, 0, 1, 0, 0)]],
        rows)

    # Each column is also addressable by its (quoted) header name.
    for header, value in [('A a', 1),
                          ('"', (0, 1, 0, 0, 0, 0)),
                          ("'", (0, 0, 1, 0, 0, 0)),
                          (',', (0, 0, 0, 1, 0, 0))]:
        self.assertEqual(value, rows[0][header])
def test_sparse_categorical_0_value(self):
    """Guard against the ARFF sparse-format quirk around the first class value.

    In sparse ARFF it is common for the first class value to be missing from
    the data because it is encoded as 0 (and sparse rows omit zeros). Our
    reader therefore adds an implicit 0 value to every categorical one-hot
    encoder. The lines below simulate such a dataset: class_B never appears
    because every class_B was written as an omitted 0.
    """
    src = [
        "@relation news20",
        "@attribute a numeric",
        "@attribute b numeric",
        "@attribute c {class_B, class_C, class_D}",
        "@data",
        "{0 2,1 3}",
        "{0 1,1 1,2 class_C}",
        "{1 1}",
        "{0 1,2 class_D}",
    ]

    want = [
        {0: 2, 1: 3, 2: (1, 0, 0, 0)},
        {0: 1, 1: 1, 2: (0, 0, 1, 0)},
        {1: 1, 2: (1, 0, 0, 0)},
        {0: 1, 2: (0, 0, 0, 1)},
    ]

    self.assertEqual(want, list(ArffReader().filter(src)))
def read(self) -> Tuple[Sequence[Sequence[Any]], Sequence[Any]]:
    """Download, decode, and encode an openml dataset.

    Returns:
        A (feature_rows, dense_label_col) pair where labels are string encoded.

    Responses are cached only after everything loads successfully; on any
    unexpected failure all three cache keys are removed in case the cache
    was corrupted. KeyboardInterrupt leaves the cache untouched.
    """
    #placing some of these at the top would cause circular references
    from coba.encodings import Encoder, NumericEncoder, OneHotEncoder, StringEncoder
    from coba.pipes import ArffReader, CsvReader, Encode, Flatten, Transpose

    d_key = None
    t_key = None
    o_key = None

    try:
        data_id = self._data_id
        md5_checksum = self._md5_checksum

        d_key = f'https://www.openml.org/api/v1/json/data/{data_id}'
        t_key = f'https://www.openml.org/api/v1/json/data/features/{data_id}'

        d_bytes = self._query(d_key, "descr")
        d_object = json.loads(
            d_bytes.decode('utf-8'))["data_set_description"]

        if d_object['status'] == 'deactivated':
            raise Exception(
                f"Openml {data_id} has been deactivated. This is often due to flags on the data."
            )

        t_bytes = self._query(t_key, "types")
        t_object = json.loads(
            t_bytes.decode('utf-8'))["data_features"]["feature"]

        headers: List[str] = []
        encoders: List[Encoder] = []
        ignored: List[bool] = []
        target: str = ""

        # Build one parallel entry per feature: lowercased name, whether it
        # should be dropped, and the encoder implied by its data type.
        for tipe in t_object:
            headers.append(tipe['name'].lower())
            ignored.append(tipe['is_ignore'] == 'true'
                           or tipe['is_row_identifier'] == 'true')

            if tipe['is_target'] == 'true':
                target = tipe['name'].lower()

            if tipe['data_type'] == 'numeric':
                encoders.append(NumericEncoder())
            elif tipe['data_type'] == 'nominal':
                encoders.append(OneHotEncoder(singular_if_binary=True))
            else:
                encoders.append(StringEncoder())

        # Fall back to the classification task's target when openml gives no
        # target or a numeric (i.e. regression) one.
        if target == "" or isinstance(encoders[headers.index(target)],
                                      NumericEncoder):
            target = self._get_classification_target(data_id)

        # The target column is never dropped and is kept as strings.
        ignored[headers.index(target)] = False
        encoders[headers.index(target)] = StringEncoder()

        csv_url = f"http://www.openml.org/data/v1/get_csv/{d_object['file_id']}"
        arff_url = f"http://www.openml.org/data/v1/download/{d_object['file_id']}"

        # Prefer whichever format is already cached (csv by default); if that
        # format fails to download/parse, fall back to the other one.
        try:
            if csv_url in CobaConfig.Cacher or arff_url not in CobaConfig.Cacher:
                o_key = csv_url
                o_bytes = self._query(o_key, "obser", md5_checksum)
                file_rows = list(CsvReader().filter(
                    o_bytes.decode('utf-8').splitlines()))
            else:
                o_key = arff_url
                o_bytes = self._query(o_key, "obser", md5_checksum)
                file_rows = list(
                    ArffReader(skip_encoding=[target]).filter(
                        o_bytes.decode('utf-8').splitlines()))
        except Exception:
            # NOTE: previously a bare `except:` which also caught
            # KeyboardInterrupt/SystemExit and triggered a full re-download
            # instead of aborting (contradicting the outer handler's intent).
            if o_key == csv_url:
                o_key = arff_url
                o_bytes = self._query(o_key, "obser", md5_checksum)
                file_rows = list(
                    ArffReader(skip_encoding=[target]).filter(
                        o_bytes.decode('utf-8').splitlines()))
            else:
                o_key = csv_url
                o_bytes = self._query(o_key, "obser", md5_checksum)
                file_rows = list(CsvReader().filter(
                    o_bytes.decode('utf-8').splitlines()))

        # Sparse readers emit (indices, values) tuples rather than flat rows.
        is_sparse_data = isinstance(file_rows[0], tuple) and len(
            file_rows[0]) == 2

        if is_sparse_data:
            file_headers = [
                header.lower() for header in file_rows.pop(0)[1]
            ]
        else:
            file_headers = [header.lower() for header in file_rows.pop(0)]

        file_cols = list(Transpose().filter(file_rows))

        # Drop every ignored column that actually appears in the file.
        for ignored_header in compress(headers, ignored):
            if ignored_header in file_headers:
                file_cols.pop(file_headers.index(ignored_header))
                file_headers.remove(ignored_header)

        file_encoders = [
            encoders[headers.index(file_header)]
            for file_header in file_headers
        ]

        file_cols = list(Encode(file_encoders).filter(file_cols))
        label_col = file_cols.pop(file_headers.index(target))
        feature_rows = list(Transpose().filter(
            Flatten().filter(file_cols)))

        #we only cache after all the data has been successfully loaded
        for key, payload in [(d_key, d_bytes), (t_key, t_bytes),
                             (o_key, o_bytes)]:
            if key not in CobaConfig.Cacher:
                CobaConfig.Cacher.put(key, payload)

        if is_sparse_data:
            # Expand the sparse label column; omitted entries mean '0'.
            dense_label_col = ['0'] * len(feature_rows)

            for index, value in zip(label_col[0], label_col[1]):
                dense_label_col[index] = value
        else:
            dense_label_col = list(label_col)

        return feature_rows, dense_label_col

    except KeyboardInterrupt:
        #we don't want to clear the cache in the case of a KeyboardInterrupt
        raise

    except Exception:
        #if something went wrong we want to clear the
        #cache just in case it was corrupted somehow
        for k in [d_key, t_key, o_key]:
            if k is not None:
                CobaConfig.Cacher.rmv(k)
        raise
def __init__(self, source: Union[str, Source[Iterable[str]]],
             label_column: Union[str, int]) -> None:
    """Instantiate the source.

    Args:
        source: The data source. Accepts either a string representing the source location or another Source.
        label_column: The header name or index which identifies the label column.

    The label column is excluded from encoding (passed to ArffReader via
    skip_encoding) — presumably so the superclass can handle the label's
    encoding itself; confirm against the base-class contract.
    """
    super().__init__(ArffReader(skip_encoding=[label_column]), source,
                     label_column)