def test_unite_records():
    """Lists of dicts unify into one Record; '' or None values yield Option fields."""
    balances = [{'name': 'Alice', 'balance': 100},
                {'name': 'Bob', 'balance': ''}]
    assert discover(balances) == 2 * Record([['balance', Option(int64)],
                                             ['name', string]])

    with_none = [{'name': 'Alice', 's': 'foo'},
                 {'name': 'Bob', 's': None}]
    assert discover(with_none) == 2 * Record([['name', string],
                                              ['s', Option(string)]])

    with_two_nones = [{'name': 'Alice', 's': 'foo', 'f': 1.0},
                      {'name': 'Bob', 's': None, 'f': None}]
    assert discover(with_two_nones) == 2 * Record([['f', Option(float64)],
                                                   ['name', string],
                                                   ['s', Option(string)]])
def test_discover_mixed():
    """Rows mixing ints and floats discover as a Tuple, or as a float array
    when the int/float positions differ between rows."""
    int_type = discover(1)
    float_type = discover(1.0)

    # Every row has the same (int, int, float, float) layout.
    uniform_rows = [[1, 2, 1.0, 2.0]] * 10
    expected_tuple = 10 * Tuple([int_type, int_type, float_type, float_type])
    assert dshape(discover(uniform_rows)) == expected_tuple

    # Alternating layouts: each column sees both ints and floats.
    alternating_rows = [[1, 2, 1.0, 2.0], [1.0, 2.0, 1, 2]] * 5
    expected_array = 10 * (4 * float_type)
    assert dshape(discover(alternating_rows)) == expected_array
def test_unite_tuples():
    """Rows with '' placeholders unify into a Tuple with Option columns."""
    three_wide = [[1, 1, 'hello'],
                  [1, '', ''],
                  [1, 1, 'hello']]
    expected_three = 3 * Tuple([int64, Option(int64), Option(string)])
    assert discover(three_wide) == expected_three

    four_wide = [[1, 1, 'hello', 1],
                 [1, '', '', 1],
                 [1, 1, 'hello', 1]]
    expected_four = 3 * Tuple([int64, Option(int64), Option(string), int64])
    assert discover(four_wide) == expected_four
def test_unite_tuples():
    """Rows with "" placeholders unify into a Tuple with Option columns."""
    cases = [
        (
            [[1, 1, "hello"], [1, "", ""], [1, 1, "hello"]],
            [int64, Option(int64), Option(string)],
        ),
        (
            [[1, 1, "hello", 1], [1, "", "", 1], [1, 1, "hello", 1]],
            [int64, Option(int64), Option(string), int64],
        ),
    ]
    for rows, column_types in cases:
        assert discover(rows) == 3 * Tuple(column_types)
def test_discover_mock():
    """discover() on a Mock instance raises NotImplementedError."""
    try:
        from unittest.mock import Mock
    except ImportError:
        from mock import Mock

    # This used to segfault because we were sending mocks into numpy
    fake = Mock()
    with pytest.raises(NotImplementedError):
        discover(fake)
def test_unite_records():
    """Lists of dicts unify into one Record; "" or None values yield Option fields."""
    cases = [
        (
            [{"name": "Alice", "balance": 100}, {"name": "Bob", "balance": ""}],
            [["balance", Option(int64)], ["name", string]],
        ),
        (
            [{"name": "Alice", "s": "foo"}, {"name": "Bob", "s": None}],
            [["name", string], ["s", Option(string)]],
        ),
        (
            [{"name": "Alice", "s": "foo", "f": 1.0},
             {"name": "Bob", "s": None, "f": None}],
            [["f", Option(float64)], ["name", string], ["s", Option(string)]],
        ),
    ]
    for rows, fields in cases:
        assert discover(rows) == 2 * Record(fields)
def test_unite_records():
    """Lists of dicts unify into one Record; '' or None values yield Option fields."""
    # Empty string in an otherwise integer field.
    rows = [{'name': 'Alice', 'balance': 100}, {'name': 'Bob', 'balance': ''}]
    want = 2 * Record([['balance', Option(int64)], ['name', string]])
    assert discover(rows) == want

    # None in a string field.
    rows = [{'name': 'Alice', 's': 'foo'}, {'name': 'Bob', 's': None}]
    want = 2 * Record([['name', string], ['s', Option(string)]])
    assert discover(rows) == want

    # None in both a string field and a float field.
    rows = [{'name': 'Alice', 's': 'foo', 'f': 1.0},
            {'name': 'Bob', 's': None, 'f': None}]
    want = 2 * Record([['f', Option(float64)],
                       ['name', string],
                       ['s', Option(string)]])
    assert discover(rows) == want
def test_list_of_dicts_difference():
    """Keys present in only some dicts become optional fields of the Record."""
    records = [{'name': 'Alice', 'amount': 100},
               {'name': 'Bob', 'house_color': 'blue'}]
    expected = dshape('2 * {amount: ?int64, house_color: ?string, name: string}')
    assert discover(records) == expected
def __init__(self, path, mode='rt', schema=None, dshape=None, open=open,
             nrows_discovery=50):
    """Set up the descriptor, discovering the datashape from the file if needed.

    Parameters
    ----------
    path : str
        File location; stored as given and as an absolute path.
    mode : str
        Stored on the instance unchanged.
    schema, dshape : optional
        Passed through ``datashape.dshape`` when truthy. If only one is
        given, the other is derived from it where possible.
    open : callable
        Used to open ``path`` for discovery; also stored as ``self.open``.
    nrows_discovery : int
        Stop index handed to ``islice`` when sampling lines for discovery.

    Raises
    ------
    ValueError
        If neither ``schema`` nor ``dshape`` is given and the file cannot
        be opened.
    """
    self.path = path
    self._abspath = os.path.abspath(path)
    self.mode = mode
    self.open = open

    if dshape:
        dshape = datashape.dshape(dshape)
    if schema:
        schema = datashape.dshape(schema)
    if dshape and not schema and isdimension(dshape[0]):
        schema = dshape.subshape[0]
    if schema and not dshape:
        dshape = var * schema

    if not schema and not dshape:
        # ``except Exception`` (not a bare ``except``) so KeyboardInterrupt
        # and SystemExit are not masked as a ValueError.
        try:
            f = open(self.path, 'r')
        except Exception:
            raise ValueError('No schema detected')
        # Close the handle even when a line fails to parse.
        try:
            # NOTE(review): the islice start of 1 skips the first line of
            # the file -- confirm this is intended for line-delimited JSON.
            data = list(map(json.loads, islice(f, 1, nrows_discovery)))
        finally:
            f.close()
        dshape = discover(data)
        schema = dshape.subshape[0]

    # Initially the array is not loaded (is this necessary?)
    self._cache_arr = None

    self._schema = schema
    self._dshape = dshape
def test_integrative():
    """Numeric strings inside a list of dicts are discovered as int64."""
    people = [
        {'name': 'Alice', 'amount': '100'},
        {'name': 'Bob', 'amount': '200'},
        {'name': 'Charlie', 'amount': '300'},
    ]
    expected = dshape('3 * {amount: int64, name: string}')
    assert dshape(discover(people)) == expected
def test_discover_array_like():
    """An object exposing only ``shape`` and ``dtype`` is discoverable."""

    class MyArray(object):
        def __init__(self, shape, dtype):
            self.shape = shape
            self.dtype = dtype

    array_like = MyArray((4, 3), "f4")
    assert discover(array_like) == dshape("4 * 3 * float32")
def __init__(self, path, mode='rt', schema=None, dshape=None, open=open,
             nrows_discovery=50):
    """Set up the descriptor, discovering the datashape from the file if needed.

    Parameters
    ----------
    path : str
        File location; stored on the instance.
    mode : str
        Stored on the instance unchanged.
    schema, dshape : optional
        Passed through ``datashape.dshape`` when truthy. If only one is
        given, the other is derived from it where possible.
    open : callable
        Used to open ``path`` for discovery; also stored as ``self.open``.
    nrows_discovery : int
        Stop index handed to ``islice`` when sampling lines for discovery.

    Raises
    ------
    ValueError
        If neither ``schema`` nor ``dshape`` is given and the file cannot
        be opened.
    """
    self.path = path
    self.mode = mode
    self.open = open

    if dshape:
        dshape = datashape.dshape(dshape)
    if schema:
        schema = datashape.dshape(schema)
    if dshape and not schema and isdimension(dshape[0]):
        schema = dshape.subshape[0]
    if schema and not dshape:
        dshape = var * schema

    if not schema and not dshape:
        # ``except Exception`` (not a bare ``except``) so KeyboardInterrupt
        # and SystemExit are not masked as a ValueError.
        try:
            f = open(self.path, 'r')
        except Exception:
            raise ValueError('No schema detected')
        # Close the handle even when a line fails to parse.
        try:
            # NOTE(review): the islice start of 1 skips the first line of
            # the file -- confirm this is intended for line-delimited JSON.
            data = list(map(json.loads, islice(f, 1, nrows_discovery)))
        finally:
            f.close()
        dshape = discover(data)
        schema = dshape.subshape[0]

    # Initially the array is not loaded (is this necessary?)
    self._cache_arr = None

    self._schema = schema
    self._dshape = dshape
def __init__(self, path, mode='rt', schema=None, dshape=None, open=open,
             **kwargs):
    """Set up the descriptor, discovering the datashape from the file if needed.

    Parameters
    ----------
    path : str
        File location; stored as given and as an absolute path.
    mode : str
        Stored on the instance unchanged.
    schema, dshape : optional
        Passed through ``datashape.dshape`` when truthy. When only
        ``dshape`` is given and its first element is a dimension, the
        schema is taken from ``dshape.subarray(1)``.
    open : callable
        Used to open ``path`` for discovery; also stored as ``self.open``.
    **kwargs
        Accepted but not used by this constructor.

    Raises
    ------
    ValueError
        If neither ``schema`` nor ``dshape`` is given and the file cannot
        be opened.
    """
    self.path = path
    self._abspath = os.path.abspath(path)
    self.mode = mode
    self.open = open

    if dshape:
        dshape = datashape.dshape(dshape)
    if schema:
        schema = datashape.dshape(schema)
    if dshape and not schema and isdimension(dshape[0]):
        schema = dshape.subarray(1)

    if not schema and not dshape:
        # ``except Exception`` (not a bare ``except``) so KeyboardInterrupt
        # and SystemExit are not masked as a ValueError.
        try:
            f = open(self.path, 'r')
        except Exception:
            raise ValueError('No schema detected')
        # Close the handle even when parsing or discovery fails.
        try:
            dshape = discover(json.load(f))
        finally:
            f.close()

    # Initially the array is not loaded (is this necessary?)
    self._cache_arr = None

    self._schema = schema
    self._dshape = dshape
def test_timedelta_strings():
    """'<count> <unit>' strings discover as TimeDelta with that unit."""
    samples = ("1 day", "-2 hours", "3 seconds", "1 microsecond", "1003 milliseconds")
    for text in samples:
        unit = text.split()[1]
        assert discover(text) == TimeDelta(unit=unit)

    # An unknown unit is rejected by the TimeDelta constructor itself.
    with pytest.raises(ValueError):
        TimeDelta(unit="buzz light-years")
def test_timedelta_strings():
    """'<count> <unit>' strings discover as TimeDelta with that unit."""
    for text in ("1 day", "-2 hours", "3 seconds", "1 microsecond",
                 "1003 milliseconds"):
        _, unit = text.split()[:2]
        expected = TimeDelta(unit=unit)
        assert discover(text) == expected

    # An unknown unit is rejected by the TimeDelta constructor itself.
    with pytest.raises(ValueError):
        TimeDelta(unit='buzz light-years')
def test_discover_array_like():
    """Discovering a shape/dtype duck type works and emits exactly one
    DeprecationWarning whose message mentions the class name."""

    class MyArray(object):
        def __init__(self, shape, dtype):
            self.shape = shape
            self.dtype = dtype

    array_like = MyArray((4, 3), 'f4')
    with catch_warnings(record=True) as caught:
        # Record every warning, including ones filtered by default.
        simplefilter('always')
        assert discover(array_like) == dshape('4 * 3 * float32')

    assert len(caught) == 1
    warning = caught[0]
    assert issubclass(warning.category, DeprecationWarning)
    assert 'MyArray' in str(warning.message)
def test_simple():
    """Python scalars map to their basic datashape types."""
    # A list of pairs rather than a dict: 3, 3.0 and True collide as dict keys.
    cases = [
        (3, int64),
        (3.0, float64),
        (3.0 + 1j, complex128),
        ("Hello", string),
        (True, bool_),
        (None, null),
    ]
    for value, expected in cases:
        assert discover(value) == expected
def test_time_string():
    """Clock-only strings discover as time_; date plus clock as datetime_."""
    time_strings = ("12:00:01", "12:00:01.000", "12:00:01.123456", "12:00:01.1234")
    for text in time_strings:
        assert discover(text) == time_

    datetime_strings = ("10-10-01T12:00:01", "10-10-01 12:00:01")
    for text in datetime_strings:
        assert discover(text) == datetime_
def test_time_string():
    """Clock-only strings discover as time_; date plus clock as datetime_."""
    cases = [
        ('12:00:01', time_),
        ('12:00:01.000', time_),
        ('12:00:01.123456', time_),
        ('12:00:01.1234', time_),
        ('10-10-01T12:00:01', datetime_),
        ('10-10-01 12:00:01', datetime_),
    ]
    for text, expected in cases:
        assert discover(text) == expected
def test_simple():
    """Python scalars map to their basic datashape types."""
    an_int, a_float, a_complex = 3, 3.0, 3.0 + 1j
    assert discover(an_int) == int64
    assert discover(a_float) == float64
    assert discover(a_complex) == complex128

    text, flag, nothing = 'Hello', True, None
    assert discover(text) == string
    assert discover(flag) == bool_
    assert discover(nothing) == null
def test_datetime():
    """A range of datetime string formats, and a datetime object, discover
    as datetime_."""
    samples = (
        "1991-02-03 04:05:06",
        "11/12/1822 06:47:26.00",
        "1822-11-12T06:47:26",
        "Fri Dec 19 15:10:11 1997",
        "Friday, November 11, 2005 17:56:21",
        "1982-2-20 5:02:00",
        "20030331 05:59:59.9",
        "Jul 6 2030  5:55PM",
        "1994-10-20 T 11:15",
        "2013-03-04T14:38:05.123",
        datetime(2014, 1, 1, 12, 1, 1),
        # Disabled in the original test:
        # "15MAR1985:14:15:22",
        # "201303041438"
    )
    for sample in samples:
        assert discover(sample) == datetime_
def test_letters_only_strings():
    """Day names and other purely alphabetic words stay plain strings."""
    weekdays = ["sunday", "monday", "tuesday", "wednesday", "thursday",
                "friday", "saturday"]
    other_words = ["a", "b", "now", "yesterday", "tonight"]
    for word in weekdays + other_words:
        assert discover(word) == string
def discover_csv(path, encoding=DEFAULT_ENCODING, nrows_discovery=50,
                 header=None, dialect=None, types=None, columns=None,
                 typehints=None):
    """Discover the datashape of a CSV file.

    The first ``nrows_discovery`` rows are read with pandas, every column
    as object dtype. Column types are discovered from those rows unless
    ``types`` is given. Column names come from ``columns``, else from the
    header row when ``header`` is truthy, else are generated as ``_0``,
    ``_1``, ... Entries in ``typehints`` (a mapping keyed by column name)
    override the per-column types.

    Returns the ``dshape`` of a ``Record`` pairing column names with types.

    Raises ``ValueError`` when the discovered row type is neither a Tuple
    nor a Fixed-length array of a Unit type.
    """
    compression = {'gz': 'gzip', 'bz2': 'bz2'}.get(ext(path))
    reader = pd.read_csv(path,
                         dtype='O',
                         encoding=encoding,
                         chunksize=nrows_discovery,
                         compression=compression,
                         header=0 if header else None,
                         **clean_dialect(dialect))
    df = reader.get_chunk()

    if not types:
        # Missing cells become '' before the rows are handed to discover().
        rows = df.fillna('').to_records(index=False).tolist()
        rowtype = discover(rows).subshape[0]
        if isinstance(rowtype[0], Tuple):
            unpacked = [unpack(t) for t in rowtype[0].dshapes]
            # Columns discovered as null are treated as strings.
            no_nulls = [string if t == null else t for t in unpacked]
            types = [safely_option(t) for t in no_nulls]
        elif isinstance(rowtype[0], Fixed) and isinstance(rowtype[1], Unit):
            # Homogeneous row: repeat the single element type per column.
            types = int(rowtype[0]) * [rowtype[1]]
        else:
            raise ValueError("Could not discover schema from data.\n"
                             "Please specify schema.")

    if not columns:
        if header:
            columns = list(df.columns)
        else:
            columns = ['_%d' % i for i in range(len(types))]

    if typehints:
        types = [typehints.get(name, t) for name, t in zip(columns, types)]

    fields = list(zip(columns, types))
    return dshape(Record(fields))
def discover_csv(path, encoding=DEFAULT_ENCODING, nrows_discovery=50,
                 header=None, dialect=None, types=None, columns=None,
                 typehints=None):
    """Discover the datashape of a CSV file.

    The first ``nrows_discovery`` rows are read with pandas, every column
    as object dtype. Column types are discovered from those rows unless
    ``types`` is given. Column names come from ``columns``, else from the
    header row when ``header`` is truthy, else are generated as ``_0``,
    ``_1``, ... Entries in ``typehints`` (a mapping keyed by column name)
    override the per-column types.

    Returns the ``dshape`` of a ``Record`` pairing column names with types.

    Raises ``ValueError`` when the discovered row type is neither a Tuple
    nor a Fixed-length array of a Unit type.
    """
    codec = {'gz': 'gzip', 'bz2': 'bz2'}.get(ext(path))
    chunks = pd.read_csv(path, dtype='O', encoding=encoding,
                         chunksize=nrows_discovery, compression=codec,
                         header=0 if header else None,
                         **clean_dialect(dialect))
    df = chunks.get_chunk()

    if not types:
        # Missing cells become '' before the rows are handed to discover().
        filled = df.fillna('')
        sample_rows = filled.to_records(index=False).tolist()
        rowtype = discover(sample_rows).subshape[0]

        if isinstance(rowtype[0], Tuple):
            types = rowtype[0].dshapes
            types = [unpack(t) for t in types]
            # Columns discovered as null are treated as strings.
            types = [string if t == null else t for t in types]
            types = [safely_option(t) for t in types]
        elif isinstance(rowtype[0], Fixed) and isinstance(rowtype[1], Unit):
            # Homogeneous row: repeat the single element type per column.
            width = int(rowtype[0])
            types = width * [rowtype[1]]
        else:
            raise ValueError("Could not discover schema from data.\n"
                             "Please specify schema.")

    if not columns:
        columns = (list(df.columns) if header
                   else ['_%d' % i for i in range(len(types))])

    if typehints:
        types = [typehints.get(c, t) for c, t in zip(columns, types)]

    return dshape(Record(list(zip(columns, types))))
def test_string():
    """Strings spelling a number or boolean discover like the value itself."""
    pairs = [('1', 1), ('1.0', 1.0), ('True', True), ('true', True)]
    for text, value in pairs:
        assert discover(text) == discover(value)
def test_timedelta():
    """timedelta objects built from 1 to 7 positional arguments discover
    as timedelta_."""
    for nargs in range(1, 8):
        # Arguments count down from 10: (10,), (10, 9), (10, 9, 8), ...
        args = range(10, 10 - nargs, -1)
        assert discover(timedelta(*args)) == timedelta_
def test_date():
    """An ISO date string and a date object both discover as date_."""
    for value in ("2014-01-01", date(2014, 1, 1)):
        assert discover(value) == date_
def test_string():
    """Strings spelling a number or boolean discover like the value itself."""
    int_shape = discover(1)
    float_shape = discover(1.0)
    bool_shape = discover(True)

    assert discover("1") == int_shape
    assert discover("1.0") == float_shape
    assert discover("True") == bool_shape
    assert discover("true") == bool_shape
def test_set():
    """A one-element set discovers as a length-1 array of the element type."""
    singleton = {1}
    element_type = discover(1)
    assert discover(singleton) == 1 * element_type
def test_time():
    """A datetime.time object discovers as time_."""
    one_past_noon = time(12, 0, 1)
    assert discover(one_past_noon) == time_
def test_string_that_looks_like_date():
    """A date-like string in this format is discovered as a plain string."""
    # GH 91
    date_like = "31-DEC-99 12.00.00.000000000"
    assert discover(date_like) == string
def test_single_space_string_is_not_date():
    """A lone space character discovers as string."""
    blank = ' '
    assert discover(blank) == string
def test_date():
    """An ISO date string and a date object both discover as date_."""
    as_text = '2014-01-01'
    as_object = date(2014, 1, 1)
    assert discover(as_text) == date_
    assert discover(as_object) == date_
def test_frozenset():
    """A one-element frozenset discovers as a length-1 array of the element type."""
    singleton = frozenset([1])
    element_type = discover(1)
    assert discover(singleton) == 1 * element_type
def test_record():
    """A dict discovers as a Record of per-value discovered types."""
    person = {'name': 'Alice', 'amount': 100}
    expected = Record([['amount', discover(100)],
                       ['name', discover('Alice')]])
    assert discover(person) == expected
def test_list_of_dicts_difference():
    """Keys present in only some dicts become optional fields of the Record."""
    alice = {"name": "Alice", "amount": 100}
    bob = {"name": "Bob", "house_color": "blue"}

    discovered = discover([alice, bob])

    assert discovered == dshape(
        "2 * {amount: ?int64, house_color: ?string, name: string}"
    )
def test_nested_complex_record_type():
    """A numpy array with a nested structured dtype discovers as nested records."""
    inner_fields = [("c", "int64", 2), ("d", "float64")]
    nested_dtype = np.dtype([("a", "U7"), ("b", inner_fields)])
    zeros = np.zeros(5, nested_dtype)

    expected = dshape("5 * {a: string[7, 'U32'], b: {c: 2 * int64, d: float64}}")
    assert discover(zeros) == expected
def test_nested_complex_record_type():
    """A numpy array with a nested structured dtype discovers as nested records."""
    record_dtype = np.dtype([
        ('a', 'U7'),
        ('b', [('c', 'int64', 2),
               ('d', 'float64')]),
    ])
    arr = np.zeros(5, record_dtype)
    shape_text = "5 * {a: string[7, 'U32'], b: {c: 2 * int64, d: float64}}"
    assert discover(arr) == dshape(shape_text)
def test_discover_empty_sequence(seq):
    """The supplied sequence discovers as ``var * string``.

    NOTE(review): ``seq`` is presumably provided by a parametrize decorator
    or fixture outside this view -- confirm.
    """
    expected = var * string
    assert discover(seq) == expected
def test_list():
    """A homogeneous list discovers as a fixed-length array of its element type."""
    for items, sample in (([1, 2, 3], 1), ([1.0, 2.0, 3.0], 1.0)):
        assert discover(items) == 3 * discover(sample)
def test_heterogeneous_ordered_container():
    """A mixed-type tuple discovers as a Tuple of element-wise types."""
    pair = ("Hello", 1)
    expected = Tuple([discover(item) for item in pair])
    assert discover(pair) == expected
def test_discover_undiscoverable():
    """discover() raises NotImplementedError for an arbitrary plain object."""

    class MyClass(object):
        pass

    instance = MyClass()
    with pytest.raises(NotImplementedError):
        discover(instance)
def test_record():
    """A dict discovers as a Record of per-value discovered types."""
    amount_type = discover(100)
    name_type = discover("Alice")
    expected = Record([["amount", amount_type], ["name", name_type]])

    assert discover({"name": "Alice", "amount": 100}) == expected
def test_discover_bytes():
    """A bytes literal discovers as ``String('A')``."""
    payload = b'abcdefg'
    expected = String('A')
    assert discover(payload) == expected
def test_letters_only_strings():
    """Day names and other purely alphabetic words stay plain strings."""
    words = [
        'sunday', 'monday', 'tuesday', 'wednesday',
        'thursday', 'friday', 'saturday',
        'a', 'b', 'now', 'yesterday', 'tonight',
    ]
    for word in words:
        discovered = discover(word)
        assert discovered == string
def __init__(self, path, mode='rt', schema=None, columns=None, types=None,
             typehints=None, dialect=None, header=None, open=open,
             nrows_discovery=50, **kwargs):
    """Set up a CSV descriptor, discovering dialect, header and schema as needed.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    mode : str
        Must refer to an existing file when it contains ``'r'``; a schema
        is required when it contains ``'w'``.
    schema : optional
        Used as given when supplied; otherwise built from ``columns`` and
        ``types`` (discovered if missing) for non-writable modes.
    columns, types : optional
        Column names and per-column types; discovered when not supplied.
    typehints : mapping, optional
        Per-column type overrides keyed by column name.
    dialect, header : optional
        Passed to / discovered via ``discover_dialect`` and ``has_header``
        from a sample of the file.
    open : callable
        Used to open the file; also stored as ``self.open``.
    nrows_discovery : int
        Stop index handed to ``islice`` when sampling rows for discovery.
    **kwargs
        Extra dialect options; ``sep`` is copied to ``delimiter``.

    Raises
    ------
    ValueError
        If the file is missing in a read mode, if no schema is given for a
        writable mode, or if no schema can be discovered from the data.
    """
    if 'r' in mode and os.path.isfile(path) is not True:
        raise ValueError('CSV file "%s" does not exist' % path)
    if not schema and 'w' in mode:
        raise ValueError('Please specify schema for writable CSV file')

    self.path = path
    self.mode = mode
    self.open = open

    if os.path.exists(path) and mode != 'w':
        f = self.open(path)
        try:
            sample = f.read(16384)
        finally:
            # Some openers return objects without close(); tolerate that.
            try:
                f.close()
            except AttributeError:
                pass
    else:
        sample = ''

    # Pandas uses sep instead of delimiter.
    # Lets support that too
    if 'sep' in kwargs:
        kwargs['delimiter'] = kwargs['sep']

    dialect = discover_dialect(sample, dialect, **kwargs)
    assert dialect
    if header is None:
        header = has_header(sample)

    if not schema and 'w' not in mode:
        if not types:
            with open(self.path) as f:
                # The islice start of 1 skips the first row of the file.
                data = list(it.islice(csv.reader(f, **dialect),
                                      1, nrows_discovery))
            types = discover(data)
            rowtype = types.subshape[0]
            if isinstance(rowtype[0], Tuple):
                types = types.subshape[0][0].dshapes
                types = [unpack(t) for t in types]
                # Columns discovered as null are treated as strings.
                types = [string if t == null else t for t in types]
                # Every non-string column is made optional.
                types = [t if isinstance(t, Option) or t == string
                         else Option(t)
                         for t in types]
            elif (isinstance(rowtype[0], Fixed) and
                    isinstance(rowtype[1], CType)):
                types = int(rowtype[0]) * [rowtype[1]]
            else:
                # The original built this exception without raising it, so
                # a bogus schema was assembled from the raw datashape.
                raise ValueError("Could not discover schema from data.\n"
                                 "Please specify schema.")
        if not columns:
            if header:
                with open(self.path) as f:
                    columns = next(csv.reader([next(f)], **dialect))
            else:
                columns = ['_%d' % i for i in range(len(types))]
        if typehints:
            types = [typehints.get(c, t) for c, t in zip(columns, types)]

        schema = dshape(Record(list(zip(columns, types))))

    self._schema = schema
    self.header = header
    self.dialect = dialect
def test_heterogeneous_ordered_container():
    """A mixed-type tuple discovers as a Tuple of element-wise types."""
    text_type = discover('Hello')
    int_type = discover(1)
    assert discover(('Hello', 1)) == Tuple([text_type, int_type])