Example #1
 def test_json_date_parse(self):
     a = nd.parse_json('var * date', '["2012-03-17", "1922-12-30"]')
     self.assertEqual(nd.as_py(a), [date(2012, 3, 17), date(1922, 12, 30)])
     self.assertRaises(ValueError, nd.parse_json, 'var * date',
                       '["2012-03-17T17:00:15-0600", "1922-12-30 Thursday"]')
     a = nd.parse_json('var * date',
                       '["2012-06-17T17:00:15-0600", "1921-12-30 Thursday"]',
                       ectx=nd.eval_context(errmode='nocheck'))
     self.assertEqual(nd.as_py(a), [date(2012, 6, 17), date(1921, 12, 30)])
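
A note on Example #1 (repeated verbatim as Example #2 below): it exercises both the strict and the relaxed date-parsing paths. The following is a condensed, standalone sketch of the same calls, assuming only that dynd-python is importable:

from datetime import date
from dynd import nd

# Strict parsing: plain ISO dates convert cleanly.
a = nd.parse_json('var * date', '["2012-03-17", "1922-12-30"]')
assert nd.as_py(a) == [date(2012, 3, 17), date(1922, 12, 30)]

# Trailing time-zone or weekday text raises ValueError unless error
# checking is relaxed through the evaluation context.
b = nd.parse_json('var * date',
                  '["2012-06-17T17:00:15-0600", "1921-12-30 Thursday"]',
                  ectx=nd.eval_context(errmode='nocheck'))
assert nd.as_py(b) == [date(2012, 6, 17), date(1921, 12, 30)]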
Example #2
 def test_json_date_parse(self):
     a = nd.parse_json('var * date', '["2012-03-17", "1922-12-30"]')
     self.assertEqual(nd.as_py(a), [date(2012, 3, 17), date(1922, 12, 30)])
     self.assertRaises(ValueError, nd.parse_json, 'var * date',
                       '["2012-03-17T17:00:15-0600", "1922-12-30 Thursday"]')
     a = nd.parse_json('var * date',
                       '["2012-06-17T17:00:15-0600", "1921-12-30 Thursday"]',
                       ectx=nd.eval_context(errmode='nocheck'))
     self.assertEqual(nd.as_py(a), [date(2012, 6, 17), date(1921, 12, 30)])
Example #3
 def test_struct(self):
     a = nd.parse_json('{x:int32, y:string, z:float32}',
                       '{"x":20, "y":"testing one two three", "z":-3.25}')
     self.assertEqual(nd.type_of(a),
                      ndt.type('{x:int32, y:string, z:float32}'))
     self.assertEqual(nd.type_of(a[...]),
                      ndt.type('{x:int32, y:string, z:float32}'))
     self.assertEqual(nd.type_of(a[0]), ndt.int32)
     self.assertEqual(nd.type_of(a[1]), ndt.string)
     self.assertEqual(nd.type_of(a[2]), ndt.float32)
     self.assertEqual(nd.type_of(a[-3]), ndt.int32)
     self.assertEqual(nd.type_of(a[-2]), ndt.string)
     self.assertEqual(nd.type_of(a[-1]), ndt.float32)
     self.assertEqual(
         nd.type_of(a[1:]),
         ndt.make_struct([ndt.string, ndt.float32], ['y', 'z']))
     self.assertEqual(nd.type_of(a[::-2]),
                      ndt.make_struct([ndt.float32, ndt.int32], ['z', 'x']))
     self.assertEqual(nd.as_py(a[0]), 20)
     self.assertEqual(nd.as_py(a[1]), "testing one two three")
     self.assertEqual(nd.as_py(a[2]), -3.25)
     self.assertEqual(nd.as_py(a[1:]), {
         'y': 'testing one two three',
         'z': -3.25
     })
     self.assertEqual(nd.as_py(a[::-2]), {'x': 20, 'z': -3.25})
Example #4
 def dynd_arr(self):
     """Downloads the data and returns a local in-memory nd.array"""
     from ..io.client import requests
     # TODO: Need binary serialization
     j = requests.get_remote_json(self.url)
     tp = ndt.type(str(self.dshape))
     return nd.parse_json(tp, j)
Example #5
 def dynd_arr(self):
     """Downloads the data and returns a local in-memory nd.array"""
     from ..io.client import requests
     # TODO: Need binary serialization
     j = requests.get_remote_json(self.url)
     tp = ndt.type(str(self.dshape))
     return nd.parse_json(tp, j)
Example #6
 def _arr_cache(self):
     if self._cache_arr is not None:
         return self._cache_arr
     with open(self.path, mode=self.mode) as jsonfile:
         # This will read everything in-memory (but a memmap approach
         # is in the works)
         self._cache_arr = nd.parse_json(self.schema, jsonfile.read())
     return self._cache_arr
Example #7
 def _arr_cache(self):
     if self._cache_arr is not None:
         return self._cache_arr
     with open(self.filename) as jsonfile:
         # This will read everything in-memory (but a memmap approach
         # is in the works)
         self._cache_arr = nd.parse_json(
             self.schema, jsonfile.read())
     return self._cache_arr
Example #8
File: json.py Project: dalejung/blaze
 def _iterchunks(self, blen=100):
     f = self.open(self.path)
     for chunk in partition_all(blen, f):
         text = "[" + ",\r\n".join(chunk) + "]"
         dshape = str(len(chunk)) + " * " + self.schema
         yield nd.parse_json(dshape, text)
     try:
         f.close()
     except AttributeError:
         pass
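
Example #8 (and the two near-identical examples that follow) builds each chunk by joining blen JSON lines into a JSON array and prefixing the record schema with a fixed-dimension count. A minimal sketch of that idea, assuming toolz's partition_all and a hypothetical record schema:

from dynd import nd
from toolz import partition_all

schema = '{x: int32, y: string}'   # hypothetical record schema
lines = ['{"x": 1, "y": "a"}', '{"x": 2, "y": "b"}', '{"x": 3, "y": "c"}']

for chunk in partition_all(2, lines):
    text = '[' + ',\r\n'.join(chunk) + ']'
    dshape = str(len(chunk)) + ' * ' + schema   # e.g. '2 * {x: int32, y: string}'
    print(nd.as_py(nd.parse_json(dshape, text)))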
Example #9
File: json.py Project: vitan/blaze
 def _chunks(self, blen=100):
     f = self.open(self.path)
     for chunk in partition_all(blen, f):
         text = '[' + ',\r\n'.join(chunk) + ']'
         dshape = str(len(chunk) * self.schema)
         yield nd.parse_json(dshape, text)
     try:
         f.close()
     except AttributeError:
         pass
Example #10
File: json.py Project: B-Rich/blaze
 def _chunks(self, blen=100):
     f = self.open(self.path)
     for chunk in partition_all(blen, f):
         text = '[' + ',\r\n'.join(chunk) + ']'
         dshape = str(len(chunk) * self.schema)
         yield nd.parse_json(dshape, text)
     try:
         f.close()
     except AttributeError:
         pass
Example #11
def load_json_directory_array(root, array_name):
    # Load the datashape
    dsfile = root + ".datashape"
    if not path.isfile(dsfile):
        raise Exception("No datashape file found for array %s" % array_name)
    with open(dsfile) as f:
        dt = ndt.type(f.read())

    # Scan for JSON files, assuming they're just #.json
    # Sort them numerically
    files = sorted([(int(path.splitext(path.basename(x))[0]), x)
                    for x in glob.glob(path.join(root, "*.json"))])
    files = [x[1] for x in files]
    # Make an array with an extra fixed dimension, then
    # read a JSON file into each element of that array
    dt = ndt.make_fixed_dim(len(files), dt)
    arr = nd.empty(dt)
    for i, fname in enumerate(files):
        nd.parse_json(arr[i], nd.memmap(fname))
    arr.flag_as_immutable()
    return array(arr)
Example #12
 def _arr_cache(self):
     if self._cache_arr is not None:
         return self._cache_arr
     jsonfile = self.open(self.path)
     # This will read everything in-memory (but a memmap approach
     # is in the works)
     self._cache_arr = nd.parse_json(str(self.dshape), jsonfile.read())
     try:
         jsonfile.close()
     except AttributeError:
         pass
     return self._cache_arr
Example #13
File: json.py Project: B-Rich/blaze
 def _arr_cache(self):
     if self._cache_arr is not None:
         return self._cache_arr
     jsonfile = self.open(self.path)
     # This will read everything in-memory (but a memmap approach
     # is in the works)
     self._cache_arr = nd.parse_json(str(self.dshape), jsonfile.read())
     try:
         jsonfile.close()
     except AttributeError:
         pass
     return self._cache_arr
Example #14
File: json.py Project: B-Rich/blaze
 def _arr_cache(self):
     if self._cache_arr is not None:
         return self._cache_arr
     jsonfile = self.open(self.path)
     # This will read everything in-memory (but a memmap approach
     # is in the works)
     text = '[' + ', '.join(jsonfile) + ']'
     try:
         jsonfile.close()
     except AttributeError:
         pass
     self._cache_arr = nd.parse_json(str(self.dshape), text)
     return self._cache_arr
Example #15
 def _arr_cache(self):
     if self._cache_arr is not None:
         return self._cache_arr
     jsonfile = self.open(self.path)
     # This will read everything in-memory (but a memmap approach
     # is in the works)
     text = '[' + ', '.join(jsonfile) + ']'
     try:
         jsonfile.close()
     except AttributeError:
         pass
     self._cache_arr = nd.parse_json(str(self.dshape), text)
     return self._cache_arr
Example #16
def load_json_file_array(root, array_name):
    # Load the datashape
    dsfile = root + ".datashape"
    if not path.isfile(dsfile):
        dsfile = path.dirname(root) + ".datashape"
        if not path.isfile(dsfile):
            raise Exception("No datashape file found for array %s" % array_name)
    with open(dsfile) as f:
        dt = ndt.type(f.read())

    # Load the JSON
    # TODO: Add stream support to parse_json for compressed JSON, etc.
    arr = nd.parse_json(dt, nd.memmap(root + ".json"))
    return array(arr)
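
Example #16 hands a memory-mapped file to parse_json instead of reading it into a Python string first. A minimal sketch of that call, with a placeholder path and datashape:

from dynd import nd, ndt

dt = ndt.type('var * {x: int32}')                # placeholder datashape
arr = nd.parse_json(dt, nd.memmap('data.json'))  # 'data.json' is a placeholder path
print(nd.as_py(arr))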
Example #17
def load_json_file_array(root, array_name):
    # Load the datashape
    dsfile = root + '.datashape'
    if not path.isfile(dsfile):
        dsfile = path.dirname(root) + '.datashape'
        if not path.isfile(dsfile):
            raise Exception('No datashape file found for array %s' % array_name)
    with open(dsfile) as f:
        dt = nd.dtype(f.read())

    # Load the JSON
    with open(root + '.json') as f:
        # TODO: Add stream support to parse_json for compressed JSON, etc.
        arr = nd.parse_json(dt, f.read())
    return arr
Example #18
def load_json_file_list_array(root, array_name):
    # Load the datashape
    dsfile = root + '.datashape'
    if not path.isfile(dsfile):
        raise Exception('No datashape file found for array %s' % array_name)
    with open(dsfile) as f:
        dt = ndt.type(f.read())

    # Scan for JSON files -- no assumption on file suffix

    #open list of files and load into python list
    files = root + '.files'
    with open(files) as f:
        l_files = [fs.strip() for fs in f]

    # Make an array with an extra fixed dimension, then
    # read a JSON file into each element of that array
    dt = ndt.make_fixed_dim(len(l_files), dt)
    arr = nd.empty(dt)
    for i, fname in enumerate(l_files):
        with open(fname) as f:
            nd.parse_json(arr[i], f.read())
    arr.flag_as_immutable()
    return array(arr)
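
Examples #11 and #18 use the other calling form of parse_json: the first argument is a pre-allocated element of a fixed-dimension array, and the JSON document is parsed into it in place. A small sketch of that pattern with made-up documents:

from dynd import nd, ndt

docs = ['{"x": 1}', '{"x": 2}']                  # made-up JSON documents
dt = ndt.make_fixed_dim(len(docs), ndt.type('{x: int32}'))
arr = nd.empty(dt)
for i, doc in enumerate(docs):
    nd.parse_json(arr[i], doc)                   # parse into the existing element
arr.flag_as_immutable()
print(nd.as_py(arr))                             # -> [{'x': 1}, {'x': 2}]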
Example #19
 def test_simple_computed_column(self):
     def computed_col(dst, src):
         for d, s in zip(dst, src):
             d.fullname = nd.as_py(s.firstname) + ' ' + nd.as_py(s.lastname)
             d.firstname = s.firstname
             d.lastname = s.lastname
             d.country = s.country
     a = nd.parse_json('2 * {firstname: string, lastname: string, country: string}',
                     """[{"firstname":"Mike", "lastname":"Myers", "country":"Canada"},
                     {"firstname":"Seth", "lastname":"Green", "country":"USA"}]""")
     b = nd.elwise_map([a], computed_col, ndt.type(
                     '{fullname: string, firstname: string, lastname: string, country: string}'))
     self.assertEqual(nd.as_py(b.fullname), ['Mike Myers', 'Seth Green'])
     self.assertEqual(nd.as_py(b.firstname), ['Mike', 'Seth'])
     self.assertEqual(nd.as_py(b.lastname), ['Myers', 'Green'])
     self.assertEqual(nd.as_py(b.country), ['Canada', 'USA'])
Example #20
 def test_struct(self):
     a = nd.parse_json('{x:int32, y:string, z:float32}',
                     '{"x":20, "y":"testing one two three", "z":-3.25}')
     self.assertEqual(nd.type_of(a), ndt.type('{x:int32, y:string, z:float32}'))
     self.assertEqual(nd.type_of(a[...]), ndt.type('{x:int32, y:string, z:float32}'))
     self.assertEqual(nd.type_of(a[0]), ndt.int32)
     self.assertEqual(nd.type_of(a[1]), ndt.string)
     self.assertEqual(nd.type_of(a[2]), ndt.float32)
     self.assertEqual(nd.type_of(a[-3]), ndt.int32)
     self.assertEqual(nd.type_of(a[-2]), ndt.string)
     self.assertEqual(nd.type_of(a[-1]), ndt.float32)
     self.assertEqual(nd.type_of(a[1:]), ndt.make_struct([ndt.string, ndt.float32], ['y', 'z']))
     self.assertEqual(nd.type_of(a[::-2]), ndt.make_struct([ndt.float32, ndt.int32], ['z', 'x']))
     self.assertEqual(nd.as_py(a[0]), 20)
     self.assertEqual(nd.as_py(a[1]), "testing one two three")
     self.assertEqual(nd.as_py(a[2]), -3.25)
     self.assertEqual(nd.as_py(a[1:]), {'y':'testing one two three', 'z':-3.25})
     self.assertEqual(nd.as_py(a[::-2]), {'x':20, 'z':-3.25})
Example #21
def load_blaze_array(conf, dir):
    """Loads a blaze array from the catalog configuration and catalog path"""
    # This is a temporary hack, need to transition to using the
    # deferred data descriptors for various formats.
    fsdir = conf.get_fsdir(dir)
    if not path.isfile(fsdir + '.array'):
        raise RuntimeError('Could not find blaze array description file %r' %
                           (fsdir + '.array'))
    with open(fsdir + '.array') as f:
        arrmeta = yaml.load(f)
    tp = arrmeta['type']
    imp = arrmeta['import']
    ds_str = arrmeta.get('datashape')  # optional. HDF5 does not need that.

    if tp == 'csv':
        with open(fsdir + '.csv', 'r') as f:
            rd = csv.reader(f)
            if imp.get('headers', False):
                # Skip the header line
                next(rd)
            dat = list(rd)
        arr = nd.array(dat, ndt.type(ds_str))[:]
        return blaze.array(arr)
    elif tp == 'json':
        arr = nd.parse_json(ds_str, nd.memmap(fsdir + '.json'))
        return blaze.array(arr)
    elif tp == 'hdf5':
        import tables as tb
        from blaze.datadescriptor import HDF5DataDescriptor
        fname = fsdir + '.h5'  # XXX .h5 assumed for HDF5
        with tb.open_file(fname, 'r') as f:
            dp = imp.get('datapath')  # specifies a path in HDF5
            try:
                dparr = f.get_node(f.root, dp, 'Leaf')
            except tb.NoSuchNodeError:
                raise RuntimeError('HDF5 file does not have a dataset in %r' %
                                   dp)
            dd = HDF5DataDescriptor(fname, dp)
        return blaze.array(dd)
    elif tp == 'npy':
        import numpy as np
        use_memmap = imp.get('memmap', False)
        if use_memmap:
            arr = np.load(fsdir + '.npy', 'r')
        else:
            arr = np.load(fsdir + '.npy')
        arr = nd.array(arr)
        arr = blaze.array(arr)
        ds = datashape.dshape(ds_str)
        if not compatible_array_dshape(arr, ds):
            raise RuntimeError(
                ('NPY file for blaze catalog path %r ' +
                 'has the wrong datashape (%r instead of ' + '%r)') %
                (dir, arr.dshape, ds))
        return arr
    elif tp == 'py':
        ds = datashape.dshape(ds_str)
        # The script is run with the following globals,
        # and should put the loaded array in a global
        # called 'result'.
        gbl = {
            'catconf': conf,  # Catalog configuration object
            'impdata': imp,  # Import data from the .array file
            'catpath': dir,  # Catalog path
            'fspath': fsdir,  # Equivalent filesystem path
            'dshape': ds  # Datashape the result should have
        }
        if py2help.PY2:
            execfile(fsdir + '.py', gbl, gbl)
        else:
            with open(fsdir + '.py') as f:
                code = compile(f.read(), fsdir + '.py', 'exec')
                exec(code, gbl, gbl)
        arr = gbl.get('result', None)
        if arr is None:
            raise RuntimeError(
                ('Script for blaze catalog path %r did not ' +
                 'return anything in "result" variable') % (dir))
        elif not isinstance(arr, blaze.Array):
            raise RuntimeError(
                ('Script for blaze catalog path %r returned ' +
                 'wrong type of object (%r instead of ' + 'blaze.Array)') %
                (dir, type(arr)))
        if not compatible_array_dshape(arr, ds):
            raise RuntimeError(
                ('Script for blaze catalog path %r returned ' +
                 'array with wrong datashape (%r instead of ' + '%r)') %
                (dir, arr.dshape, ds))
        return arr
    else:
        raise ValueError(
            ('Unsupported array type %r from ' + 'blaze catalog entry %r') %
            (tp, dir))
Example #22
def load_blaze_array(conf, dir):
    """Loads a blaze array from the catalog configuration and catalog path"""
    # This is a temporary hack, need to transition to using the
    # deferred data descriptors for various formats.
    fsdir = conf.get_fsdir(dir)
    if not path.isfile(fsdir + '.array'):
        raise RuntimeError('Could not find blaze array description file %r'
                           % (fsdir + '.array'))
    with open(fsdir + '.array') as f:
        arrmeta = yaml.load(f)
    tp = arrmeta['type']
    imp = arrmeta['import']
    ds_str = arrmeta.get('datashape')  # optional. HDF5 does not need that.

    if tp == 'csv':
        with open(fsdir + '.csv', 'r') as f:
            rd = csv.reader(f)
            if imp.get('headers', False):
                # Skip the header line
                next(rd)
            dat = list(rd)
        arr = nd.array(dat, ndt.type(ds_str))[:]
        return blaze.array(arr)
    elif tp == 'json':
        arr = nd.parse_json(ds_str, nd.memmap(fsdir + '.json'))
        return blaze.array(arr)
    elif tp == 'hdf5':
        import tables as tb
        from blaze.datadescriptor import HDF5_DDesc
        fname = fsdir + '.h5'   # XXX .h5 assumed for HDF5
        with tb.open_file(fname, 'r') as f:
            dp = imp.get('datapath')  # specifies a path in HDF5
            try:
                dparr = f.get_node(f.root, dp, 'Leaf')
            except tb.NoSuchNodeError:
                raise RuntimeError(
                    'HDF5 file does not have a dataset in %r' % dp)
            dd = HDF5_DDesc(fname, dp)
        return blaze.array(dd)
    elif tp == 'npy':
        import numpy as np
        use_memmap = imp.get('memmap', False)
        if use_memmap:
            arr = np.load(fsdir + '.npy', 'r')
        else:
            arr = np.load(fsdir + '.npy')
        arr = nd.array(arr)
        arr = blaze.array(arr)
        ds = datashape.dshape(ds_str)
        if not matches_datashape_pattern(arr.dshape, ds):
            raise RuntimeError(('NPY file for blaze catalog path %r ' +
                                'has the wrong datashape (%r instead of ' +
                                '%r)') % (dir, arr.dshape, ds))
        return arr
    elif tp == 'py':
        ds = datashape.dshape(ds_str)
        # The script is run with the following globals,
        # and should put the loaded array in a global
        # called 'result'.
        gbl = {'catconf': conf,  # Catalog configuration object
               'impdata': imp,   # Import data from the .array file
               'catpath': dir,   # Catalog path
               'fspath': fsdir,  # Equivalent filesystem path
               'dshape': ds      # Datashape the result should have
               }
        if py2help.PY2:
            execfile(fsdir + '.py', gbl, gbl)
        else:
            with open(fsdir + '.py') as f:
                code = compile(f.read(), fsdir + '.py', 'exec')
                exec(code, gbl, gbl)
        arr = gbl.get('result', None)
        if arr is None:
            raise RuntimeError(('Script for blaze catalog path %r did not ' +
                                'return anything in "result" variable')
                               % (dir))
        elif not isinstance(arr, blaze.Array):
            raise RuntimeError(('Script for blaze catalog path %r returned ' +
                                'wrong type of object (%r instead of ' +
                                'blaze.Array)') % (dir, type(arr)))
        if not matches_datashape_pattern(arr.dshape, ds):
            raise RuntimeError(('Script for blaze catalog path %r returned ' +
                                'array with wrong datashape (%r instead of ' +
                                '%r)') % (dir, arr.dshape, ds))
        return arr
    else:
        raise ValueError(('Unsupported array type %r from ' +
                          'blaze catalog entry %r')
                         % (tp, dir))
Example #23
File: rarray.py Project: imclab/blaze
 def get_dynd(self):
     """Downloads the data and returns a local in-memory nd.array"""
     j = requests.get_remote_json(self.url)
     tp = ndt.type(str(self.dshape))
     return nd.parse_json(tp, j)
Example #24
 def test_basic(self):
     dd = JSON(self.filename, 'r', dshape=self.dshape)
     self.assertEqual(list(dd),
                      [nd.as_py(nd.parse_json(self.dshape,
                          json.dumps(self.data)))])
Example #25
 def _chunks(self, blen=100):
     with self.open(self.path) as f:
         for chunk in partition_all(blen, f):
             text = '[' + ',\r\n'.join(chunk) + ']'
             dshape = str(len(chunk) * self.schema)
             yield nd.parse_json(dshape, text)
Example #26
 def _chunks(self, blen=100):
     with self.open(self.path) as f:
         for chunk in partition_all(blen, f):
             text = '[' + ',\r\n'.join(chunk) + ']'
             dshape = str(len(chunk) * self.schema)
             yield nd.parse_json(dshape, text)
Example #27
File: rarray.py Project: dreamfrog/blaze
 def get_dynd(self):
     """Downloads the data and returns a local in-memory nd.array"""
     j = requests.get_remote_json(self.url)
     return nd.parse_json(self.dtype, j)