예제 #1
0
def test_unite_records():
    """Records with blank or ``None`` values unify into Option-typed fields.

    Each case feeds two dicts to ``discover`` and checks that the field
    holding an empty string or ``None`` in the second dict is expected to
    come back wrapped in ``Option``.
    """
    # An empty string next to an integer -> Option(int64).
    with_blank_balance = [
        {'name': 'Alice', 'balance': 100},
        {'name': 'Bob', 'balance': ''},
    ]
    found = discover(with_blank_balance)
    assert found == 2 * Record([['balance', Option(int64)],
                                ['name', string]])

    # None next to a string -> Option(string).
    with_none_string = [
        {'name': 'Alice', 's': 'foo'},
        {'name': 'Bob', 's': None},
    ]
    found = discover(with_none_string)
    assert found == 2 * Record([['name', string],
                                ['s', Option(string)]])

    # None next to both a string and a float.
    with_none_float = [
        {'name': 'Alice', 's': 'foo', 'f': 1.0},
        {'name': 'Bob', 's': None, 'f': None},
    ]
    found = discover(with_none_float)
    assert found == 2 * Record([['f', Option(float64)],
                                ['name', string],
                                ['s', Option(string)]])
def test_discover_mixed():
    """Rows mixing ints and floats: Tuple rows vs. a uniform float grid."""
    int_shape = discover(1)
    float_shape = discover(1.0)

    # Identical rows: each column keeps its own type, so rows are Tuples.
    exp = 10 * Tuple([int_shape, int_shape, float_shape, float_shape])
    uniform_rows = [[1, 2, 1.0, 2.0]] * 10
    assert dshape(discover(uniform_rows)) == exp

    # Columns that see both ints and floats are expected to come out float.
    exp = 10 * (4 * float_shape)
    alternating_rows = [[1, 2, 1.0, 2.0], [1.0, 2.0, 1, 2]] * 5
    assert dshape(discover(alternating_rows)) == exp
예제 #3
0
def test_unite_tuples():
    """Empty strings in a column make that tuple position optional."""
    three_wide = [[1, 1, 'hello'], [1, '', ''], [1, 1, 'hello']]
    found = discover(three_wide)
    assert found == 3 * Tuple([int64, Option(int64), Option(string)])

    # Same data with a trailing, always-present integer column.
    four_wide = [[1, 1, 'hello', 1], [1, '', '', 1], [1, 1, 'hello', 1]]
    found = discover(four_wide)
    assert found == 3 * Tuple([int64, Option(int64), Option(string), int64])
def test_unite_tuples():
    """Check that blank entries turn tuple positions into Option types."""
    rows = [[1, 1, "hello"], [1, "", ""], [1, 1, "hello"]]
    actual = discover(rows)
    expected = 3 * Tuple([int64, Option(int64), Option(string)])
    assert actual == expected

    # A fourth column without blanks is expected to stay a plain int64.
    rows = [[1, 1, "hello", 1], [1, "", "", 1], [1, 1, "hello", 1]]
    actual = discover(rows)
    expected = 3 * Tuple([int64, Option(int64), Option(string), int64])
    assert actual == expected
예제 #5
0
def test_discover_mixed():
    """Discover nested lists whose columns mix integers and floats."""
    as_int, as_float = discover(1), discover(1.0)

    # Ten copies of one row: per-column types are preserved in a Tuple.
    exp = 10 * Tuple([as_int, as_int, as_float, as_float])
    discovered = discover([[1, 2, 1.0, 2.0]] * 10)
    assert dshape(discovered) == exp

    # Rows alternate int/float per column: expected shape is all floats.
    exp = 10 * (4 * as_float)
    discovered = discover([[1, 2, 1.0, 2.0], [1.0, 2.0, 1, 2]] * 5)
    assert dshape(discovered) == exp
예제 #6
0
def test_discover_mock():
    """``discover`` on a Mock must raise NotImplementedError.

    Per the original note, passing mocks through to numpy used to segfault.
    """
    try:
        from unittest.mock import Mock as MockClass
    except ImportError:
        # Fall back to the standalone ``mock`` package.
        from mock import Mock as MockClass

    with pytest.raises(NotImplementedError):
        discover(MockClass())
예제 #7
0
def test_unite_tuples():
    """Tuple rows with empty-string cells unify into Option positions."""
    # (input rows, expected per-row tuple types), checked in order.
    narrow = [[1, 1, 'hello'],
              [1, '', ''],
              [1, 1, 'hello']]
    result = discover(narrow)
    assert result == 3 * Tuple([int64, Option(int64), Option(string)])

    wide = [[1, 1, 'hello', 1],
            [1, '', '', 1],
            [1, 1, 'hello', 1]]
    result = discover(wide)
    assert result == 3 * Tuple([int64, Option(int64), Option(string), int64])
def test_unite_records():
    """Dict rows with blank/None values yield Option fields when united."""
    # Blank string in an otherwise integer field.
    rows = [{"name": "Alice", "balance": 100}, {"name": "Bob", "balance": ""}]
    actual = discover(rows)
    assert actual == 2 * Record([["balance", Option(int64)], ["name", string]])

    # None in an otherwise string field.
    rows = [{"name": "Alice", "s": "foo"}, {"name": "Bob", "s": None}]
    actual = discover(rows)
    assert actual == 2 * Record([["name", string], ["s", Option(string)]])

    # None in both a string field and a float field.
    rows = [
        {"name": "Alice", "s": "foo", "f": 1.0},
        {"name": "Bob", "s": None, "f": None},
    ]
    actual = discover(rows)
    assert actual == 2 * Record(
        [["f", Option(float64)], ["name", string], ["s", Option(string)]]
    )
예제 #9
0
def test_unite_records():
    """Uniting records: missing-like values make a field optional."""
    alice = {'name': 'Alice', 'balance': 100}
    bob = {'name': 'Bob', 'balance': ''}
    discovered = discover([alice, bob])
    assert discovered == 2 * Record([['balance', Option(int64)],
                                     ['name', string]])

    alice = {'name': 'Alice', 's': 'foo'}
    bob = {'name': 'Bob', 's': None}
    discovered = discover([alice, bob])
    assert discovered == 2 * Record([['name', string],
                                     ['s', Option(string)]])

    alice = {'name': 'Alice', 's': 'foo', 'f': 1.0}
    bob = {'name': 'Bob', 's': None, 'f': None}
    discovered = discover([alice, bob])
    assert discovered == 2 * Record([['f', Option(float64)],
                                     ['name', string],
                                     ['s', Option(string)]])
예제 #10
0
def test_list_of_dicts_difference():
    """Keys present in only one dict are expected to become optional fields."""
    records = [
        {'name': 'Alice', 'amount': 100},
        {'name': 'Bob', 'house_color': 'blue'},
    ]
    discovered = discover(records)
    # ``name`` appears in both dicts; the other two keys appear in one each.
    wanted = dshape('2 * {amount: ?int64, house_color: ?string, name: string}')
    assert discovered == wanted
예제 #11
0
파일: json.py 프로젝트: vitan/blaze
    def __init__(self, path, mode='rt', schema=None, dshape=None, open=open,
                 nrows_discovery=50):
        """Store file settings and settle on a schema/dshape pair.

        When neither ``schema`` nor ``dshape`` is supplied, lines of the file
        are parsed with ``json.loads`` and handed to ``discover``.

        Parameters
        ----------
        path : str
            File location; stored as given and as an absolute path.
        mode : str
            Stored on the instance; not otherwise used here.
        schema, dshape : optional
            Passed through ``datashape.dshape`` when truthy.  A missing one
            is derived from the other where possible.
        open : callable
            Opener, called as ``open(path, 'r')`` during discovery.
        nrows_discovery : int
            Stop index given to ``islice`` when sampling lines.

        Raises
        ------
        ValueError
            If discovery is needed and the file cannot be opened.
        """
        self.path = path
        self._abspath = os.path.abspath(path)
        self.mode = mode
        self.open = open
        if dshape:
            dshape = datashape.dshape(dshape)
        if schema:
            schema = datashape.dshape(schema)
        if dshape and not schema and isdimension(dshape[0]):
            schema = dshape.subshape[0]
        if schema and not dshape:
            dshape = var * schema

        if not schema and not dshape:
            try:
                f = open(self.path, 'r')
            except Exception:
                # ``Exception`` rather than a bare except, so that
                # KeyboardInterrupt/SystemExit are not turned into ValueError.
                raise ValueError('No schema detected')
            try:
                # NOTE(review): islice starts at index 1, so the first line
                # of the file is never sampled -- confirm this is intended.
                data = list(map(json.loads,
                                islice(f, 1, nrows_discovery)))
            finally:
                # Close the handle even when a line fails to parse.
                f.close()
            dshape = discover(data)
            schema = dshape.subshape[0]
        # Initially the array is not loaded (is this necessary?)
        self._cache_arr = None

        self._schema = schema
        self._dshape = dshape
예제 #12
0
def test_integrative():
    """Numeric-looking string amounts are expected to be discovered as int64."""
    people = ['Alice', 'Bob', 'Charlie']
    amounts = ['100', '200', '300']
    data = [{'name': who, 'amount': amt} for who, amt in zip(people, amounts)]

    discovered = dshape(discover(data))
    assert discovered == dshape('3 * {amount: int64, name: string}')
def test_discover_array_like():
    """An object exposing only ``shape`` and ``dtype`` is discoverable."""

    class MyArray(object):
        # Minimal stand-in carrying just the two attributes set below.
        def __init__(self, shape, dtype):
            self.shape, self.dtype = shape, dtype

    array_like = MyArray((4, 3), "f4")
    assert discover(array_like) == dshape("4 * 3 * float32")
예제 #14
0
파일: json.py 프로젝트: B-Rich/blaze
    def __init__(self, path, mode='rt', schema=None, dshape=None, open=open,
                 nrows_discovery=50):
        """Store file settings and settle on a schema/dshape pair.

        When neither ``schema`` nor ``dshape`` is supplied, lines of the file
        are parsed with ``json.loads`` and handed to ``discover``.

        Parameters
        ----------
        path : str
            File location, stored as given.
        mode : str
            Stored on the instance; not otherwise used here.
        schema, dshape : optional
            Passed through ``datashape.dshape`` when truthy.  A missing one
            is derived from the other where possible.
        open : callable
            Opener, called as ``open(path, 'r')`` during discovery.
        nrows_discovery : int
            Stop index given to ``islice`` when sampling lines.

        Raises
        ------
        ValueError
            If discovery is needed and the file cannot be opened.
        """
        self.path = path
        self.mode = mode
        self.open = open
        if dshape:
            dshape = datashape.dshape(dshape)
        if schema:
            schema = datashape.dshape(schema)
        if dshape and not schema and isdimension(dshape[0]):
            schema = dshape.subshape[0]
        if schema and not dshape:
            dshape = var * schema

        if not schema and not dshape:
            try:
                f = open(self.path, 'r')
            except Exception:
                # ``Exception`` rather than a bare except, so that
                # KeyboardInterrupt/SystemExit are not turned into ValueError.
                raise ValueError('No schema detected')
            try:
                # NOTE(review): islice starts at index 1, so the first line
                # of the file is never sampled -- confirm this is intended.
                data = list(map(json.loads,
                                islice(f, 1, nrows_discovery)))
            finally:
                # Close the handle even when a line fails to parse.
                f.close()
            dshape = discover(data)
            schema = dshape.subshape[0]
        # Initially the array is not loaded (is this necessary?)
        self._cache_arr = None

        self._schema = schema
        self._dshape = dshape
예제 #15
0
파일: json.py 프로젝트: vitan/blaze
    def __init__(self, path, mode='rt', schema=None, dshape=None, open=open,
            **kwargs):
        """Store file settings and determine the datashape.

        When neither ``schema`` nor ``dshape`` is supplied, the whole file is
        parsed with ``json.load`` and handed to ``discover``; ``schema`` is
        left as given (falsy) in that path.

        Parameters
        ----------
        path : str
            File location; stored as given and as an absolute path.
        mode : str
            Stored on the instance; not otherwise used here.
        schema, dshape : optional
            Passed through ``datashape.dshape`` when truthy.
        open : callable
            Opener, called as ``open(path, 'r')`` during discovery.
        **kwargs
            Accepted but not used in this method.

        Raises
        ------
        ValueError
            If discovery is needed and the file cannot be opened.
        """
        self.path = path
        self._abspath = os.path.abspath(path)
        self.mode = mode
        self.open = open
        if dshape:
            dshape = datashape.dshape(dshape)
        if schema:
            schema = datashape.dshape(schema)
        if dshape and not schema and isdimension(dshape[0]):
            schema = dshape.subarray(1)

        if not schema and not dshape:
            try:
                f = open(self.path, 'r')
            except Exception:
                # ``Exception`` rather than a bare except, so that
                # KeyboardInterrupt/SystemExit are not turned into ValueError.
                raise ValueError('No schema detected')
            try:
                dshape = discover(json.load(f))
            finally:
                # Close the handle even when parsing or discovery fails.
                f.close()
        # Initially the array is not loaded (is this necessary?)
        self._cache_arr = None

        self._schema = schema
        self._dshape = dshape
def test_timedelta_strings():
    """Strings like "<number> <unit>" discover as TimeDelta with that unit."""
    samples = ("1 day", "-2 hours", "3 seconds", "1 microsecond", "1003 milliseconds")
    for text in samples:
        # The unit is the second whitespace-separated token of the sample.
        discovered = discover(text)
        assert discovered == TimeDelta(unit=text.split()[1])

    # An unknown unit must be rejected by TimeDelta itself.
    with pytest.raises(ValueError):
        TimeDelta(unit="buzz light-years")
예제 #17
0
def test_timedelta_strings():
    """Check timedelta-looking strings and an invalid TimeDelta unit."""
    for ts in ("1 day", "-2 hours", "3 seconds", "1 microsecond",
               "1003 milliseconds"):
        found = discover(ts)
        # Expected unit is the word following the number.
        _, unit = ts.split()
        assert found == TimeDelta(unit=unit)

    with pytest.raises(ValueError):
        TimeDelta(unit='buzz light-years')
예제 #18
0
def test_discover_array_like():
    """Array-like discovery works but emits one DeprecationWarning.

    The warning message is expected to mention the class name ``MyArray``.
    """

    class MyArray(object):
        def __init__(self, shape, dtype):
            self.shape, self.dtype = shape, dtype

    with catch_warnings(record=True) as caught:
        # Record every warning, including ones normally shown only once.
        simplefilter('always')
        assert discover(MyArray((4, 3), 'f4')) == dshape('4 * 3 * float32')

    assert len(caught) == 1
    only_warning = caught[0]
    assert issubclass(only_warning.category, DeprecationWarning)
    assert 'MyArray' in str(only_warning.message)
def test_simple():
    """Scalar Python values map to the expected basic datashape types."""
    cases = [
        (3, int64),
        (3.0, float64),
        (3.0 + 1j, complex128),
        ("Hello", string),
        (True, bool_),
        (None, null),
    ]
    for value, expected in cases:
        assert discover(value) == expected
def test_time_string():
    """Time-only strings give ``time_``; date-plus-time strings ``datetime_``."""
    time_only = ("12:00:01", "12:00:01.000", "12:00:01.123456", "12:00:01.1234")
    for text in time_only:
        assert discover(text) == time_

    # Both "T" and a space are used as the date/time separator.
    with_date = ("10-10-01T12:00:01", "10-10-01 12:00:01")
    for text in with_date:
        assert discover(text) == datetime_
예제 #21
0
def test_time_string():
    """Discover time and datetime types from string input."""
    expectations = [
        ('12:00:01', time_),
        ('12:00:01.000', time_),
        ('12:00:01.123456', time_),
        ('12:00:01.1234', time_),
        ('10-10-01T12:00:01', datetime_),
        ('10-10-01 12:00:01', datetime_),
    ]
    for text, expected in expectations:
        assert discover(text) == expected
예제 #22
0
def test_simple():
    """Check ``discover`` on one value of each basic Python scalar kind."""
    an_int, a_float, a_complex = 3, 3.0, 3.0 + 1j
    assert discover(an_int) == int64
    assert discover(a_float) == float64
    assert discover(a_complex) == complex128

    a_str, a_bool, nothing = 'Hello', True, None
    assert discover(a_str) == string
    assert discover(a_bool) == bool_
    assert discover(nothing) == null
예제 #23
0
def test_datetime():
    """Assorted datetime string formats and a datetime object give ``datetime_``."""
    text_inputs = [
        "1991-02-03 04:05:06",
        "11/12/1822 06:47:26.00",
        "1822-11-12T06:47:26",
        "Fri Dec 19 15:10:11 1997",
        "Friday, November 11, 2005 17:56:21",
        "1982-2-20 5:02:00",
        "20030331 05:59:59.9",
        "Jul  6 2030  5:55PM",
        "1994-10-20 T 11:15",
        "2013-03-04T14:38:05.123",
    ]
    # Formats disabled in the original test and therefore not checked:
    #   "15MAR1985:14:15:22", "201303041438"
    candidates = text_inputs + [datetime(2014, 1, 1, 12, 1, 1)]
    for candidate in candidates:
        assert discover(candidate) == datetime_
def test_letters_only_strings():
    """Purely alphabetic words, even date-like ones, stay plain strings."""
    weekdays = ("sunday", "monday", "tuesday", "wednesday", "thursday",
                "friday", "saturday")
    others = ("a", "b", "now", "yesterday", "tonight")
    for word in weekdays + others:
        assert discover(word) == string
예제 #25
0
def discover_csv(path,
                 encoding=DEFAULT_ENCODING,
                 nrows_discovery=50,
                 header=None,
                 dialect=None,
                 types=None,
                 columns=None,
                 typehints=None):
    """Build a record datashape from the first chunk of a CSV file.

    The first ``nrows_discovery`` rows are read with pandas as objects.
    Column types come from ``types`` when given, otherwise from ``discover``
    on the sampled rows.  Column names come from ``columns`` when given,
    otherwise from the header row (if ``header`` is truthy) or generated
    ``_0, _1, ...`` names.  ``typehints`` maps column names to types that
    override the chosen ones.

    Raises ``ValueError`` when the discovered row type is neither a Tuple
    nor a Fixed dimension over a Unit.
    """
    # Compression is chosen from the file extension; None for other files.
    compression = {'gz': 'gzip', 'bz2': 'bz2'}.get(ext(path))
    header_row = 0 if header else None
    reader = pd.read_csv(path,
                         dtype='O',
                         encoding=encoding,
                         chunksize=nrows_discovery,
                         compression=compression,
                         header=header_row,
                         **clean_dialect(dialect))
    df = reader.get_chunk()

    if not types:
        # Missing cells become empty strings before discovery.
        sampled_rows = df.fillna('').to_records(index=False).tolist()
        rowtype = discover(sampled_rows).subshape[0]
        if isinstance(rowtype[0], Tuple):
            unpacked = [unpack(t) for t in rowtype[0].dshapes]
            without_null = [string if t == null else t for t in unpacked]
            types = [safely_option(t) for t in without_null]
        elif isinstance(rowtype[0], Fixed) and isinstance(rowtype[1], Unit):
            # Homogeneous row: repeat the single unit type once per column.
            types = int(rowtype[0]) * [rowtype[1]]
        else:
            raise ValueError("Could not discover schema from data.\n"
                             "Please specify schema.")

    if not columns:
        columns = (list(df.columns) if header
                   else ['_%d' % i for i in range(len(types))])

    if typehints:
        types = [typehints.get(name, typ) for name, typ in zip(columns, types)]

    return dshape(Record(list(zip(columns, types))))
예제 #26
0
파일: csv.py 프로젝트: Casolt/blaze
def discover_csv(path, encoding=DEFAULT_ENCODING, nrows_discovery=50,
        header=None, dialect=None, types=None, columns=None,
        typehints=None):
    """Discover the record datashape of a CSV file from its first chunk.

    Reads up to ``nrows_discovery`` rows via pandas (all columns as objects),
    then resolves column types (``types`` or discovered) and column names
    (``columns``, the header row, or ``_0, _1, ...``).  Entries of
    ``typehints`` keyed by column name replace the resolved types.

    Raises ``ValueError`` if no schema can be derived from the sample.
    """
    read_kwargs = dict(
        dtype='O',
        encoding=encoding,
        chunksize=nrows_discovery,
        # None (no compression) unless the extension is gz or bz2.
        compression={'gz': 'gzip', 'bz2': 'bz2'}.get(ext(path)),
        header=0 if header else None,
    )
    df = pd.read_csv(path, **dict(read_kwargs, **clean_dialect(dialect))).get_chunk() \
        if False else pd.read_csv(path,
                                  dtype=read_kwargs['dtype'],
                                  encoding=read_kwargs['encoding'],
                                  chunksize=read_kwargs['chunksize'],
                                  compression=read_kwargs['compression'],
                                  header=read_kwargs['header'],
                                  **clean_dialect(dialect)).get_chunk()

    if not types:
        # NaN cells are replaced by '' so discovery sees plain Python values.
        filled = df.fillna('')
        L = filled.to_records(index=False).tolist()
        rowtype = discover(L).subshape[0]
        if isinstance(rowtype[0], Tuple):
            types = [unpack(t) for t in rowtype[0].dshapes]
            types = [string if t == null else t for t in types]
            types = [safely_option(t) for t in types]
        elif isinstance(rowtype[0], Fixed) and isinstance(rowtype[1], Unit):
            width = int(rowtype[0])
            types = width * [rowtype[1]]
        else:
            raise ValueError("Could not discover schema from data.\n"
                    "Please specify schema.")

    if not columns:
        if header:
            columns = list(df.columns)
        else:
            columns = ['_%d' % i for i in range(len(types))]

    if typehints:
        types = [typehints.get(c, t) for c, t in zip(columns, types)]

    fields = list(zip(columns, types))
    return dshape(Record(fields))
예제 #27
0
def test_string():
    """Strings spelling a number or boolean discover like the value itself."""
    pairs = (('1', 1), ('1.0', 1.0), ('True', True), ('true', True))
    for text, value in pairs:
        assert discover(text) == discover(value)
def test_timedelta():
    """``timedelta`` objects built from 1 to 7 positional args give ``timedelta_``."""
    for nargs in range(1, 8):
        # Positional arguments count down from 10: (10,), (10, 9), ...
        delta = timedelta(*range(10, 10 - nargs, -1))
        assert discover(delta) == timedelta_
def test_date():
    """Both a date-formatted string and a ``date`` object give ``date_``."""
    for candidate_factory in (lambda: "2014-01-01", lambda: date(2014, 1, 1)):
        assert discover(candidate_factory()) == date_
def test_string():
    """Discovering a value's string spelling matches discovering the value."""
    # Numeric spellings.
    assert discover("1") == discover(1)
    assert discover("1.0") == discover(1.0)

    # Boolean spellings, capitalized and lower-case.
    for spelling in ("True", "true"):
        assert discover(spelling) == discover(True)
def test_set():
    """A one-element set discovers as a length-1 collection of its item type."""
    singleton = {1}
    assert discover(singleton) == 1 * discover(1)
예제 #32
0
def test_timedelta():
    """Every ``timedelta`` built here is expected to discover as ``timedelta_``."""
    # Argument tuples of growing length: (10,), (10, 9), ... up to 7 values.
    arg_sets = (range(10, 10 - count, -1) for count in range(1, 8))
    for args in arg_sets:
        assert discover(timedelta(*args)) == timedelta_
예제 #33
0
def test_time():
    """A ``time(12, 0, 1)`` value discovers as ``time_``."""
    moment = time(12, 0, 1)
    assert discover(moment) == time_
예제 #34
0
def test_string_that_looks_like_date():
    """A date-like string with dotted time parts must stay a string (GH 91)."""
    lookalike = "31-DEC-99 12.00.00.000000000"
    assert discover(lookalike) == string
예제 #35
0
def test_single_space_string_is_not_date():
    """A string holding one space discovers as ``string``."""
    single_space = ' '
    assert discover(single_space) == string
예제 #36
0
def test_date():
    """Date given as text or as a ``date`` object discovers as ``date_``."""
    as_text = '2014-01-01'
    assert discover(as_text) == date_

    as_object = date(2014, 1, 1)
    assert discover(as_object) == date_
예제 #37
0
def test_frozenset():
    """A one-element frozenset discovers as 1 * the element's type."""
    frozen = frozenset([1])
    assert discover(frozen) == 1 * discover(1)
예제 #38
0
def test_record():
    """A dict discovers as a Record whose fields are listed by sorted name."""
    person = {'name': 'Alice', 'amount': 100}
    found = discover(person)
    assert found == Record([['amount', discover(100)],
                            ['name', discover('Alice')]])
def test_list_of_dicts_difference():
    """Dicts with different key sets: unshared keys become optional fields."""
    alice = {"name": "Alice", "amount": 100}
    bob = {"name": "Bob", "house_color": "blue"}
    result = discover([alice, bob])
    assert result == dshape("2 * {amount: ?int64, house_color: ?string, name: string}")
def test_nested_complex_record_type():
    """A numpy structured array with a nested record field is discoverable."""
    inner = [("c", "int64", 2), ("d", "float64")]
    record_dtype = np.dtype([("a", "U7"), ("b", inner)])
    zeros = np.zeros(5, record_dtype)
    expected = "5 * {a: string[7, 'U32'], b: {c: 2 * int64, d: float64}}"
    assert discover(zeros) == dshape(expected)
예제 #41
0
def test_set():
    """Discovering a set holding one int gives 1 * the int's type."""
    item_shape_source = 1
    container = set([item_shape_source])
    assert discover(container) == 1 * discover(1)
예제 #42
0
def test_nested_complex_record_type():
    """Discover a structured numpy array whose field ``b`` is itself a record."""
    fields = [
        ('a', 'U7'),
        ('b', [('c', 'int64', 2), ('d', 'float64')]),
    ]
    x = np.zeros(5, np.dtype(fields))
    discovered = discover(x)
    assert discovered == dshape(
        "5 * {a: string[7, 'U32'], b: {c: 2 * int64, d: float64}}")
예제 #43
0
def test_discover_empty_sequence(seq):
    """``seq`` is expected to discover as ``var * string``.

    NOTE(review): ``seq`` is presumably supplied by a fixture or
    parametrization that is not visible here -- confirm.
    """
    discovered = discover(seq)
    assert discovered == var * string
def test_list():
    """A homogeneous list discovers as its length times the element type."""
    for values in ([1, 2, 3], [1.0, 2.0, 3.0]):
        # The expected element type is taken from the first item.
        assert discover(values) == 3 * discover(values[0])
예제 #45
0
def test_list():
    """Lists of three ints / three floats discover as 3 * the scalar type."""
    ints = [1, 2, 3]
    assert discover(ints) == 3 * discover(1)

    floats = [1.0, 2.0, 3.0]
    assert discover(floats) == 3 * discover(1.0)
def test_heterogeneous_ordered_container():
    """A mixed-type tuple discovers as a Tuple of its element types."""
    pair = ("Hello", 1)
    found = discover(pair)
    assert found == Tuple([discover("Hello"), discover(1)])
예제 #47
0
def test_discover_undiscoverable():
    """An instance of an empty user class raises NotImplementedError."""

    class MyClass(object):
        """Empty class with nothing for ``discover`` to inspect."""

    with pytest.raises(NotImplementedError):
        discover(MyClass())
def test_record():
    """Discovering a dict yields a Record of the discovered field types."""
    found = discover({"name": "Alice", "amount": 100})
    amount_field = ["amount", discover(100)]
    name_field = ["name", discover("Alice")]
    assert found == Record([amount_field, name_field])
예제 #49
0
def test_discover_bytes():
    """A bytes literal discovers as ``String('A')``."""
    assert discover(b'abcdefg') == String('A')
def test_time():
    """Discovering ``time(12, 0, 1)`` gives ``time_``."""
    discovered = discover(time(12, 0, 1))
    assert discovered == time_
예제 #51
0
def test_letters_only_strings():
    """Alphabetic words such as day names discover as ``string``."""
    words = [
        'sunday', 'monday', 'tuesday', 'wednesday', 'thursday',
        'friday', 'saturday',
        'a', 'b',
        'now', 'yesterday', 'tonight',
    ]
    for word in words:
        assert discover(word) == string
예제 #52
0
파일: csv.py 프로젝트: holdenk/blaze
    def __init__(self, path, mode='rt',
            schema=None, columns=None, types=None, typehints=None,
            dialect=None, header=None, open=open, nrows_discovery=50,
            **kwargs):
        """Validate arguments, sniff the CSV dialect and settle on a schema.

        Parameters
        ----------
        path : str
            CSV file location.
        mode : str
            File mode.  Modes containing ``'r'`` require an existing file;
            modes containing ``'w'`` require ``schema``.
        schema : optional
            Used as-is when given; otherwise built from ``columns``/``types``.
        columns : list, optional
            Column names.  Defaults to the header row, or ``_0, _1, ...``.
        types : list, optional
            Column types.  Discovered from sampled rows when omitted.
        typehints : dict, optional
            Column name -> type overrides applied to the chosen types.
        dialect : optional
            Passed to ``discover_dialect`` along with ``kwargs``.
        header : bool, optional
            Whether the file has a header row; sniffed when ``None``.
        open : callable
            Opener used to read the file.
        nrows_discovery : int
            Stop index given to ``islice`` when sampling rows.
        **kwargs
            Extra dialect options; ``sep`` is accepted as an alias for
            ``delimiter``.

        Raises
        ------
        ValueError
            If the file is missing in a read mode, if a writable file has no
            schema, or if no schema can be discovered from the data.
        """
        if 'r' in mode and os.path.isfile(path) is not True:
            raise ValueError('CSV file "%s" does not exist' % path)
        if not schema and 'w' in mode:
            raise ValueError('Please specify schema for writable CSV file')
        self.path = path
        self.mode = mode
        self.open = open

        if os.path.exists(path) and mode != 'w':
            f = self.open(path)
            sample = f.read(16384)
            try:
                f.close()
            except AttributeError:
                # Best effort: the opener's result may lack ``close``.
                pass
        else:
            sample = ''

        # Pandas uses sep instead of delimiter.
        # Lets support that too
        if 'sep' in kwargs:
            kwargs['delimiter'] = kwargs['sep']

        dialect = discover_dialect(sample, dialect, **kwargs)
        assert dialect
        if header is None:
            header = has_header(sample)

        if not schema and 'w' not in mode:
            if not types:
                with open(self.path) as f:
                    # NOTE(review): the first row is always skipped, whether
                    # or not a header was detected -- confirm intended.
                    data = list(it.islice(csv.reader(f, **dialect),
                                          1, nrows_discovery))
                discovered = discover(data)
                rowtype = discovered.subshape[0]
                if isinstance(rowtype[0], Tuple):
                    types = [unpack(t) for t in rowtype[0].dshapes]
                    types = [string if t == null else t
                             for t in types]
                    # Wrap in Option unless already optional or a string.
                    types = [t if isinstance(t, Option) or t == string
                             else Option(t)
                             for t in types]
                elif (isinstance(rowtype[0], Fixed) and
                      isinstance(rowtype[1], CType)):
                    types = int(rowtype[0]) * [rowtype[1]]
                else:
                    # The original built this error without raising it, so
                    # the failure went unreported.
                    raise ValueError("Could not discover schema from data.\n"
                                     "Please specify schema.")
            if not columns:
                if header:
                    with open(self.path) as f:
                        columns = next(csv.reader([next(f)], **dialect))
                else:
                    columns = ['_%d' % i for i in range(len(types))]
            if typehints:
                types = [typehints.get(c, t) for c, t in zip(columns, types)]

            schema = dshape(Record(list(zip(columns, types))))

        self._schema = schema

        self.header = header
        self.dialect = dialect
예제 #53
0
def test_heterogeneous_ordered_container():
    """A (str, int) tuple discovers as a Tuple of the two element types."""
    found = discover(('Hello', 1))
    element_shapes = [discover('Hello'), discover(1)]
    assert found == Tuple(element_shapes)