Example #1
    def process_temp_bundle(self, ds_name, path):
        """
        Merge the temp bundle into the main bundle for the specified
        data source.

        Parameters
        ----------
        ds_name : str
            Name of the data source whose bundle should be updated.
        path : str
            Path to the archive containing the temporary bundle.

        Returns
        -------
        None
        """
        tmp_bundle = extract_bundle(path)
        bundle_folder = get_data_source_folder(ds_name)
        ensure_directory(bundle_folder)
        if os.listdir(bundle_folder):
            zsource = bcolz.ctable(rootdir=tmp_bundle, mode='r')
            ztarget = bcolz.ctable(rootdir=bundle_folder, mode='r')
            merge_bundles(zsource, ztarget)

        else:
            os.rename(tmp_bundle, bundle_folder)

Example #2
File: bcolz.py  Project: quasiben/odo
def resource_bcolz(uri, dshape=None, expected_dshape=None, **kwargs):
    if os.path.exists(uri):
        try:
            return ctable(rootdir=uri)
        except IOError:  # __rootdirs__ doesn't exist because we aren't a ctable
            return carray(rootdir=uri)
    else:
        if not dshape:
            raise ValueError("Must specify either existing bcolz directory or"
                             " valid datashape")
        dshape = datashape.dshape(dshape)

        dt = datashape.to_numpy_dtype(dshape)
        shape_tail = tuple(map(int, dshape.shape[1:]))  # tail of shape
        if dshape.shape[0] == datashape.var:
            shape = (0,) + shape_tail
        else:
            shape = (int(dshape.shape[0]),) + shape_tail

        x = np.empty(shape=shape, dtype=dt)

        kwargs = keyfilter(keywords.__contains__, kwargs)
        expectedlen = kwargs.pop('expectedlen',
                                 int(expected_dshape[0])
                                 if expected_dshape is not None and
                                 isinstance(expected_dshape[0], datashape.Fixed)
                                 else None)

        if datashape.predicates.isrecord(dshape.measure):
            return ctable(x, rootdir=uri, expectedlen=expectedlen, **kwargs)
        else:
            return carray(x, rootdir=uri, expectedlen=expectedlen, **kwargs)
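Note: the following is a minimal standalone sketch (not part of the example above) of what the two branches construct, using only bcolz and numpy; the rootdir paths are hypothetical.

import numpy as np
import bcolz

# Record-like measure -> an empty on-disk ctable (one column per field).
rec_dt = np.dtype([('name', 'S8'), ('amount', 'i8')])
ct = bcolz.ctable(np.empty(0, dtype=rec_dt), rootdir='example_table.bcolz', mode='w')

# Scalar measure -> an empty on-disk carray.
ca = bcolz.carray(np.empty((0, 3), dtype='f8'), rootdir='example_array.bcolz', mode='w')

ct.append([(b'apples', 1), (b'pears', 7)])
ct.flush()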
Example #3
    def process_temp_bundle(self, ds_name, path):
        """
        Merge the temp bundle into the main bundle for the specified
        data source.

        Parameters
        ----------
        ds_name : str
            Name of the data source whose bundle should be updated.
        path : str
            Path to the archive containing the temporary bundle.

        Returns
        -------
        None
        """
        tmp_bundle = extract_bundle(path)
        bundle_folder = get_data_source_folder(ds_name)
        ensure_directory(bundle_folder)
        if os.listdir(bundle_folder):
            zsource = bcolz.ctable(rootdir=tmp_bundle, mode='r')
            ztarget = bcolz.ctable(rootdir=bundle_folder, mode='a')
            ztarget.append(zsource)

        else:
            shutil.rmtree(bundle_folder, ignore_errors=True)
            os.rename(tmp_bundle, bundle_folder)
Example #4
File: bcolz.py  Project: chdoig/blaze
def into(a, b, types=None, **kwargs):
    if isinstance(b[0], (tuple, list)):
        if not types:
            types=[None] * len(b[0])
        return ctable([into(np.ndarray(0), c2, dtype=dt)
                        for (c2, dt) in zip(zip(*b), types)],
                      **kwargs)
    else:
        return ctable([into(np.ndarray(0), b, dtype=types)],
                      **kwargs)
Example #5
def resource_bcolz(rootdir, **kwargs):
    if os.path.exists(rootdir):
        kwargs = keyfilter(keywords(ctable).__contains__, kwargs)
        return ctable(rootdir=rootdir, **kwargs)
    else:
        if 'dshape' in kwargs:
            dtype = to_numpy_dtype(kwargs['dshape'])
            kwargs = keyfilter(keywords(ctable).__contains__, kwargs)
            return ctable(np.empty(0, dtype), rootdir=rootdir, **kwargs)
        else:
            raise ValueError("File does not exist and no `dshape=` given")
Example #6
File: bcolz.py  Project: leolujuyi/blaze
def resource_bcolz(rootdir, **kwargs):
    if os.path.exists(rootdir):
        kwargs = keyfilter(carray_keywords.__contains__, kwargs)
        return ctable(rootdir=rootdir, **kwargs)
    else:
        if 'dshape' in kwargs:
            dtype = to_numpy_dtype(kwargs['dshape'])
            kwargs = keyfilter(carray_keywords.__contains__, kwargs)
            return ctable(np.empty(0, dtype), rootdir=rootdir, **kwargs)
        else:
            raise ValueError("File does not exist and no `dshape=` given")
Example #7
def into(a, b, names=None, types=None, **kwargs):

    if isinstance(b[0], (tuple, list)):
        if not types:
            types=[None] * len(b[0])
        return ctable([into(np.ndarray(0), c2, dtype=dt)
                        for (c2, dt) in zip(zip(*b), types)], names,
                      **kwargs)
    else:
        if not names:
            names =[None] * len(b)
        arr = into(np.ndarray(0), b, dtype=np.dtype(list(zip(names, types))))
        return ctable(arr, names, **kwargs)
Example #8
File: bcolz.py  Project: leolujuyi/blaze
def into(a, b, names=None, types=None, **kwargs):

    if isinstance(b[0], (tuple, list)):
        if not types:
            types=[None] * len(b[0])
        return ctable([into(np.ndarray(0), c2, dtype=dt)
                        for (c2, dt) in zip(zip(*b), types)], names,
                      **kwargs)
    else:
        if not names:
            names =[None] * len(b)
        arr = into(np.ndarray(0), b, dtype=np.dtype(list(zip(names, types))))
        return ctable(arr, names, **kwargs)
Example #9
    def _init_ctable(self, path):
        """
        Create empty ctable for given path.

        Parameters:
        -----------
        path : string
            The path to rootdir of the new ctable.
        """
        # Only create the subdir on container creation.
        sid_dirname = os.path.dirname(path)
        os.makedirs(sid_dirname)
        initial_array = np.empty(0, np.uint32)
        table = ctable(
            rootdir=path,
            columns=[
                initial_array,
                initial_array,
                initial_array,
                initial_array,
                initial_array,
            ],
            names=[
                'open',
                'high',
                'low',
                'close',
                'volume'
            ],
            expectedlen=self._expectedlen,
            mode='w',
        )
        table.flush()
        return table
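Note: a small standalone sketch (not from the original project) showing how an OHLCV ctable laid out like the one above can be appended to and read back; the rootdir is hypothetical.

import numpy as np
import bcolz

initial_array = np.empty(0, np.uint32)
table = bcolz.ctable(
    rootdir='sid_1.bcolz',
    columns=[initial_array] * 5,
    names=['open', 'high', 'low', 'close', 'volume'],
    mode='w',
)

# Append one bar (values already scaled to unsigned ints) and persist it.
table.append([(1000, 1010, 995, 1005, 12345)])
table.flush()

reopened = bcolz.ctable(rootdir='sid_1.bcolz', mode='r')
print(reopened['close'][:])  # -> [1005]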
Example #10
File: bcolz.py  Project: jreback/into
def resource_bcolz(uri, dshape=None, **kwargs):
    if os.path.exists(uri):
        return ctable(rootdir=uri)
    else:
        if not dshape:
            raise ValueError("Must specify either existing bcolz directory or"
                    "valid datashape")
        dshape = datashape.dshape(dshape)

        dt = datashape.to_numpy_dtype(dshape)
        x = np.empty(shape=(0,), dtype=dt)

        if datashape.predicates.isrecord(dshape.measure):
            return ctable(x, rootdir=uri, **keyfilter(keywords.__contains__, kwargs))
        else:
            return carray(x, rootdir=uri, **keyfilter(keywords.__contains__, kwargs))
Example #11
 def to_bundle(self, base_data_path, bcolz_data_path, metas, dtypes):
     base_data = []
     for i in range(len(metas)):
         base_data.append(np.array((), dtype=dtypes[i]))

     # Create an empty on-disk table with one column per meta field.
     data = bcolz.ctable(base_data, rootdir=bcolz_data_path,
                         mode='w', names=metas)

     files = os.listdir(base_data_path)
     line_map = {}
     begin_line = 0
     for file in files:
         print("load " + file)
         a = LocalFileToDataFrame(os.path.join(base_data_path, file), file)
         if a is not None:
             for meta in metas:
                 a[meta] = change(a, meta)

             for index in a.index:
                 data.append(list(a.loc[index]))
             # Record which rows of the table came from this file.
             end_line = begin_line + a.index.size - 1
             line_map.update({file: [begin_line, end_line]})
             begin_line = end_line + 1

     data.attrs['line_map'] = line_map
Example #12
 def to_bcolz(security_list, data, unit):
     # shape: (num, dims, length)
     for i, security in enumerate(security_list):
         if unit == '1d':
             names = [
                 'date', 'factor', 'open', 'high', 'low', 'close', 'volume',
                 'high_limit', 'low_limit', 'paused'
             ]
         else:
             names = [
                 'date', 'factor', 'open', 'high', 'low', 'close', 'volume'
             ]
         path = DataFunction.get_path(security, unit)
         array = data[i].astype('float')
         if not os.path.exists(path):
             os.makedirs(path, exist_ok=True)
             table = bcolz.ctable(rootdir=path,
                                  columns=list(array),
                                  names=names,
                                  mode='w')
             table.flush()
         else:
             # Check existing data: only append rows newer than the last stored date
             table = bcolz.open(path, mode='a')
             date_index = table.names.index('date')
             array = array[:, array[0, :] > table[-1][date_index]]
             array = list(map(lambda x: tuple(x), array))
             table.append(array)
             table.flush()
Example #13
File: toplevel.py  Project: B-Rich/bcolz
def open(rootdir, mode='a'):
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object, or None if no objects are found

    """
    # First try with a carray
    obj = None
    try:
        obj = bcolz.carray(rootdir=rootdir, mode=mode)
    except IOError:
        # Not a carray.  Now with a ctable
        try:
            obj = bcolz.ctable(rootdir=rootdir, mode=mode)
        except IOError:
            # Not a ctable
            pass
    return obj
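Note: a minimal usage sketch of bcolz.open() as documented above; the rootdir is hypothetical.

import numpy as np
import bcolz

ra = np.fromiter(((i, i * 2.0) for i in range(5)), dtype='i4,f8')
bcolz.ctable(ra, rootdir='demo.bcolz', mode='w').flush()

t = bcolz.open(rootdir='demo.bcolz', mode='r')   # read-only; returns a ctable here
print(len(t), t.names)

t = bcolz.open(rootdir='demo.bcolz', mode='a')   # read/write on top of existing data
t.append([(5, 10.0)])
t.flush()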
Example #14
    def _init_ctable(self, path):
        """
        Create empty ctable for given path.

        Parameters:
        -----------
        path : string
            The path to rootdir of the new ctable.
        """
        # Only create the containing subdir on creation.
        # This is not to be confused with the `.bcolz` directory, but is the
        # directory up one level from the `.bcolz` directories.
        sid_containing_dirname = os.path.dirname(path)
        if not os.path.exists(sid_containing_dirname):
            # Other sids may have already created the containing directory.
            os.makedirs(sid_containing_dirname)
        initial_array = np.empty(0, np.uint32)
        table = ctable(
            rootdir=path,
            columns=[initial_array, initial_array, initial_array, initial_array, initial_array],
            names=["open", "high", "low", "close", "volume"],
            expectedlen=self._expectedlen,
            mode="w",
        )
        table.flush()
        return table
Example #15
    def _init_ctable(self, path):
        """
        Create empty ctable for given path.

        Parameters:
        -----------
        path : string
            The path to rootdir of the new ctable.
        """
        # Only create the containing subdir on creation.
        # This is not to be confused with the `.bcolz` directory, but is the
        # directory up one level from the `.bcolz` directories.
        sid_containing_dirname = os.path.dirname(path)
        if not os.path.exists(sid_containing_dirname):
            # Other sids may have already created the containing directory.
            os.makedirs(sid_containing_dirname)
        initial_array = np.empty(0, np.uint32)
        table = ctable(
            rootdir=path,
            columns=[
                initial_array,
                initial_array,
                initial_array,
                initial_array,
                initial_array,
            ],
            names=['open', 'high', 'low', 'close', 'volume'],
            expectedlen=self._expectedlen,
            mode='w',
        )
        table.flush()
        return table
Example #16
 def _ensure_ctable(self, sid):
     """Ensure that a ctable exists for ``sid``, then return it."""
     sid_path = self._sid_path(sid)
     print('sid_path', sid_path)
     if not os.path.exists(sid_path):
         return self._init_ctable(sid_path)
     return bcolz.ctable(rootdir=sid_path, mode='a')
Example #17
	def __init__(self, source_path):
		'''
			Create the Neighborhood, for finding nearest neighbors.

			Args:
			source_path (string): path to a bcolz database with three carray
			columns: 'id', 'vector' and 'norm'

		'''

		self.source_path = source_path

		# open bcolz datastores
		self.vectors = bvec.carray(rootdir=source_path + "/vector")
		self.norms = bvec.carray(rootdir=source_path + "/norm")
		self.source_table = bcolz.ctable(rootdir=source_path)

		#print("Created similarity object from BCOLZ files: source {0}; target: {1}".format(source_path, target_path))

		# create similarity object
		self.similarity = sim.Similarity(self.vectors, self.norms)

		# create domain <-> index maps

		# dictionary taking ids to indeces (source)
		self.id_index_map = self._create_id_index_map(self.source_table)

		self.index_id_map = self._create_index_id_map(self.source_table)
Example #18
def get_quantized_ctable(dtype, cparams, quantize=None, expectedlen=None):
    """Return a ctable with the quantize filter enabled for floating point cols.
    
    License
        This function is taken from the reflexible package (https://github.com/spectraphilic/reflexible/tree/master/reflexible).
        Authored by John F Burkhart <*****@*****.**> with contributions Francesc Alted <*****@*****.**>.
        Licensed under: 'This script follows creative commons usage.'
    """
    columns, names = [], []
    for fname, ftype in dtype.descr:
        names.append(fname)
        if 'f' in ftype:
            cparams2 = bcolz.cparams(clevel=cparams.clevel,
                                     cname=cparams.cname,
                                     quantize=quantize)
            columns.append(
                bcolz.zeros(0,
                            dtype=ftype,
                            cparams=cparams2,
                            expectedlen=expectedlen))
        else:
            columns.append(
                bcolz.zeros(0,
                            dtype=ftype,
                            cparams=cparams,
                            expectedlen=expectedlen))
    return bcolz.ctable(columns=columns, names=names)
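Note: a standalone sketch of the quantize idea used above, assuming a bcolz version whose cparams accepts quantize (as in the snippet); the numbers are illustrative only.

import numpy as np
import bcolz

# Keep roughly 3 significant digits for float data; lossy but usually compresses better.
qparams = bcolz.cparams(clevel=9, cname='zlib', quantize=3)
noisy = np.random.random_sample(100000)

exact = bcolz.carray(noisy)                   # default cparams, lossless
quant = bcolz.carray(noisy, cparams=qparams)  # quantized

print(exact.cbytes, quant.cbytes)             # compare compressed sizes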
Example #19
    def _init_ctable(self, path):
        """
        Create empty ctable for given path.

        Parameters:
        -----------
        path : string
            The path to rootdir of the new ctable.
        """
        # Only create the subdir on container creation.
        sid_dirname = os.path.dirname(path)
        os.makedirs(sid_dirname)
        initial_array = np.empty(0, np.uint32)
        table = ctable(
            rootdir=path,
            columns=[
                initial_array,
                initial_array,
                initial_array,
                initial_array,
                initial_array,
            ],
            names=['open', 'high', 'low', 'close', 'volume'],
            expectedlen=self._expectedlen,
            mode='w',
        )
        table.flush()
        return table
Example #20
File: io.py  Project: amatthies/dask
def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------

    x : bcolz.ctable
        Input data
    chunksize : int (optional)
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool (defaults to True)
        Automatically categorize all string dtypes
    index : string (optional)
        Column to make the index

    See Also
    --------

    from_array: more generic function not optimized for bcolz
    """
    import dask.array as da
    import bcolz
    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = (0,) + tuple(range(-1, len(x), chunksize))[1:]
    if divisions[-1] != len(x) - 1:
        divisions = divisions + (len(x) - 1,)
    new_name = 'from_bcolz' + next(tokens)
    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 None, categories))
               for i in range(0, int(ceil(len(x) / chunksize))))

    result = DataFrame(dsk, new_name, columns, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
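Note: a rough usage sketch, assuming an older dask release that still ships dask.dataframe.from_bcolz (newer versions may not); the rootdir and column names are hypothetical.

import numpy as np
import bcolz
import dask.dataframe as dd

ra = np.array([(i, float(i) * 2, b'a' if i % 2 else b'b') for i in range(10000)],
              dtype=[('x', 'i4'), ('y', 'f8'), ('flag', 'S1')])
x = bcolz.ctable(ra, rootdir='events.bcolz', mode='w')
x.flush()

df = dd.from_bcolz(x, chunksize=4096)
print(df['y'].mean().compute())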
Example #21
def open(rootdir, mode='a'):
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object (an IOError is raised if no objects are found)

    """
    # First try with a carray
    rootsfile = os.path.join(rootdir, ROOTDIRS)
    if os.path.exists(rootsfile):
        return bcolz.ctable(rootdir=rootdir, mode=mode)
    else:
        return bcolz.carray(rootdir=rootdir, mode=mode)
Example #22
def open(rootdir, mode='a'):
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object (an IOError is raised if no objects are found)

    """
    # First try with a carray
    rootsfile = os.path.join(rootdir, ROOTDIRS)
    if os.path.exists(rootsfile):
        return bcolz.ctable(rootdir=rootdir, mode=mode)
    else:
        return bcolz.carray(rootdir=rootdir, mode=mode)
Example #23
File: io.py  Project: ogrisel/dask
def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------

    x : bcolz.ctable
        Input data
    chunksize : int (optional)
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool (defaults to True)
        Automatically categorize all string dtypes
    index : string (optional)
        Column to make the index

    See Also
    --------

    from_array: more generic function not optimized for bcolz
    """
    import dask.array as da
    import bcolz
    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = (0,) + tuple(range(-1, len(x), chunksize))[1:]
    if divisions[-1] != len(x) - 1:
        divisions = divisions + (len(x) - 1,)
    new_name = 'from_bcolz' + next(tokens)
    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 None, categories))
               for i in range(0, int(ceil(len(x) / chunksize))))

    result = DataFrame(dsk, new_name, columns, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
Example #24
    def _init_ctable(self, path):
        """
        Create an empty ctable for the given path.

        Notes on the ctable operations used here:
        - addcol(newcol[, name, pos, move]): add a new column.
        - append(cols): append cols to this ctable.
        - flush(): flush data in internal buffers to disk. This should
          typically be called after modifications (__setitem__(), append())
          in persistence mode; otherwise part of the modifications may be lost.

        Parameters
        ----------
        path : string
            The path to rootdir of the new ctable.
        """
        bcolz_dir = os.path.dirname(path)
        print('bcolz_dir', bcolz_dir)
        if not os.path.exists(bcolz_dir):
            os.makedirs(bcolz_dir)
            print('path', path)
        initial_array = np.empty(0, np.uint32)
        # Configure bcolz: use multiple compression threads
        bcolz.set_nthreads(Num * bcolz.detect_number_of_cores())
        # Print all the versions of packages that bcolz relies on.
        bcolz.print_versions()
        """
        clevel : int (0 <= clevel < 10) The compression level.
        shuffle : int The shuffle filter to be activated. Allowed values are bcolz.NOSHUFFLE (0), 
                bcolz.SHUFFLE (1) and bcolz.BITSHUFFLE (2). The default is bcolz.SHUFFLE.
        cname : string (‘blosclz’, ‘lz4’, ‘lz4hc’, ‘snappy’, ‘zlib’, ‘zstd’)
                Select the compressor to use inside Blosc.
        quantize : int (number of significant digits)
                Quantize data to improve (lossy) compression. Data is quantized using np.around(scale*data)/scale,
                 where scale is 2**bits, and bits is determined from the quantize value. For example,
                  if quantize=1, bits will be 4. 0 means that the quantization is disabled.
        default : cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
        """
        params = bcolz.cparams(clevel=9)
        table = bcolz.ctable(
            rootdir=path,
            columns=[
                initial_array,
                initial_array,
                initial_array,
                initial_array,
                initial_array,
                initial_array,
                initial_array,
            ],
            names=self._bcolz_fields,
            mode='w',
            cparams=params
        )
        print('cparams', table.cparams)
        table.flush()
        table = self._init_attr(table, path)
        # table.attrs['metadata'] = self._init_metadata(path)
        return table
Example #25
File: bcolz.py  Project: leolujuyi/blaze
def into(a, b, **kwargs):
    if isinstance(a, type):
        kwargs = keyfilter(carray_keywords.__contains__, kwargs)
        return ctable(b, **kwargs)
    else:
        a.append(b)
        a.flush()
        return a
Example #26
 def __init__(self,
              daily_dir,
              default_ratio=OHLC_RATIO):
     self._tdx_dir = daily_dir
     self._root_dir = os.path.join(BcolzDir, 'daily')
     self._default_ohlc_ratio = default_ratio
     self._bcolz_fields = BcolzDailyFields
     self.c_table = bcolz.ctable(columns=BcolzDailyFields)
Example #27
File: into.py  Project: dalejung/blaze
def into(a, b, **kwargs):
    names = dshape(nd.dshape_of(b))[1].names
    columns = [getattr(b, name) for name in names]
    columns = [np.asarray(nd.as_py(c))
            if to_numpy_dtype(dshape(nd.dshape_of(c))) == np.dtype('O')
            else into(np.ndarray(0), c) for c in columns]

    return bcolz.ctable(columns, names=names, **kwargs)
Example #28
File: bcolz.py  Project: holdenk/blaze
def into(a, b, **kwargs):
    chunks = partition_all(1024, b)
    chunk = next(chunks)
    a = ctable([into(np.ndarray(0), c2) for c2 in zip(*chunk)], **kwargs)
    for chunk in chunks:
        a.append(list(zip(*chunk)))
    a.flush()
    return a
Example #29
 def __init__(self,
              minutes_dir,
              default_ratio=OHLC_RATIO):
      # tdx_dir --- directory holding the TDX (通达信) market data
     self._tdx_dir = minutes_dir
     self._default_ohlc_ratio = default_ratio
     self._root_dir = os.path.join(BcolzDir, 'minute')
     self._bcolz_fields = BcolzMinuteFields
     self.c_table = bcolz.ctable(columns=BcolzMinuteFields)
Example #30
def into(a, b, **kwargs):
    names = dshape(nd.dshape_of(b))[1].names
    columns = [getattr(b, name) for name in names]
    columns = [
        np.asarray(nd.as_py(c)) if to_numpy_dtype(dshape(nd.dshape_of(c)))
        == np.dtype('O') else into(np.ndarray(0), c) for c in columns
    ]

    return bcolz.ctable(columns, names=names, **kwargs)
Example #31
def tobcolz(table, dtype=None, sample=1000, **kwargs):
    """Load data into a bcolz ctable, e.g.::

        >>> import petl as etl
        >>> table = [('foo', 'bar', 'baz'),
        ...          ('apples', 1, 2.5),
        ...          ('oranges', 3, 4.4),
        ...          ('pears', 7, .1)]
        >>> ctbl = etl.tobcolz(table)
        >>> ctbl
        ctable((3,), [('foo', '<U7'), ('bar', '<i8'), ('baz', '<f8')])
          nbytes: 132; cbytes: 1023.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
        [('apples', 1, 2.5) ('oranges', 3, 4.4) ('pears', 7, 0.1)]
        >>> ctbl.names
        ['foo', 'bar', 'baz']
        >>> ctbl['foo']
        carray((3,), <U7)
          nbytes := 84; cbytes := 511.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
          chunklen := 18724; chunksize: 524272; blocksize: 0
        ['apples' 'oranges' 'pears']

    Other keyword arguments are passed through to the ctable constructor.

    .. versionadded:: 1.1.0

    """

    import bcolz
    import numpy as np

    it = iter(table)
    peek, it = iterpeek(it, sample)
    hdr = next(it)
    # numpy is fussy about having tuples, need to make sure
    it = (tuple(row) for row in it)
    flds = list(map(text_type, hdr))
    dtype = construct_dtype(flds, peek, dtype)

    # create ctable
    kwargs.setdefault('expectedlen', 1000000)
    kwargs.setdefault('mode', 'w')
    ctbl = bcolz.ctable(np.array([], dtype=dtype), **kwargs)

    # fill chunk-wise
    chunklen = sum(ctbl.cols[name].chunklen
                   for name in ctbl.names) // len(ctbl.names)
    while True:
        data = list(itertools.islice(it, chunklen))
        data = np.array(data, dtype=dtype)
        ctbl.append(data)
        if len(data) < chunklen:
            break

    ctbl.flush()
    return ctbl
Example #32
 def table(self, data, names=None, expectedlen=None, **kwargs):
     names, columns = _util.check_table_like(data, names=names)
     kwargs = self._set_defaults(kwargs)
     ctbl = bcolz.ctable(columns, names=names, expectedlen=expectedlen,
                         **kwargs)
     # patch append method
     ctbl.append_original = ctbl.append
     ctbl.append = MethodType(_table_append, ctbl)
     return ctbl
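Note: a generic sketch of the MethodType patching pattern used above; _logging_append is a hypothetical stand-in for the _table_append helper, which is not shown.

from types import MethodType

import numpy as np
import bcolz

def _logging_append(self, data):
    # Delegate to the original bound append method after logging.
    print('appending %d row(s)' % len(data))
    return self.append_original(data)

ctbl = bcolz.ctable([np.arange(3), np.arange(3) * 2.0], names=['a', 'b'])
ctbl.append_original = ctbl.append
ctbl.append = MethodType(_logging_append, ctbl)

ctbl.append([(3, 6.0)])  # prints, then appends as usual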
Example #33
def _update_bcolz_data(symbol):
    item = {}
    item['req'] = 'market.%s.detail' % (symbol)
    item['id'] = cfg.get_id()
    tradeStr = json.dumps(item)
    rootdir = cfg.get_bcolz_tick_path(symbol)
    bcolz_exist = os.path.exists(os.path.join(rootdir, "__rootdirs__"))
    while (1):
        try:
            ws = create_connection("wss://api.huobipro.com/ws")
            break
        except:
            print('connect ws error,retry...')
            time.sleep(5)

    ws.send(tradeStr)
    values = []
    if bcolz_exist:
        ct = bcolz.open(rootdir)
    while (1):
        compressData = ws.recv()
        result = gzip.decompress(compressData).decode('utf-8')
        if result[:7] == '{"ping"':
            ts = result[8:21]
            pong = '{"pong":' + ts + '}'
            ws.send(pong)
            ws.send(tradeStr)
        else:
            res = json.loads(result)
            data = res['data']
            #print(data)
            v = [
                res['ts'], data['low'], data['count'], data['close'],
                data['vol'], data['id'], data['amount'], data['version'],
                data['high'], data['open']
            ]
            values.append(v)
            if len(values) == 10:
                a = np.array(values).reshape(len(values), 10)
                columns = list(a.T)
                if bcolz_exist:
                    #ct = bcolz.open(rootdir)
                    ct.append(columns)
                    ct.flush()
                else:
                    names = [
                        'ts', 'low', 'count', 'close', 'vol', 'id', 'amount',
                        'version', 'high', 'open'
                    ]
                    ba = bcolz.ctable(columns=columns,
                                      mode='w',
                                      names=names,
                                      rootdir=rootdir)
                    ba.flush()
                    bcolz_exist = True
                del values[:]
Example #34
    def _write_internal(self, filename, calendar, iterator):
        """
        Internal implementation of write.

        `iterator` should be an iterator yielding pairs of (asset, ctable).
        """
        total_rows = 0
        first_row = {}
        last_row = {}
        calendar_offset = {}

        # Maps column name -> output carray.
        columns = {k: carray(array([], dtype=uint32)) for k in US_EQUITY_PRICING_BCOLZ_COLUMNS}

        for asset_id, table in iterator:
            nrows = len(table)
            for column_name in columns:
                if column_name == "id":
                    # We know what the content of this column is, so don't
                    # bother reading it.
                    columns["id"].append(full((nrows,), asset_id, uint32))
                    continue
                columns[column_name].append(self.to_uint32(table[column_name][:], column_name))

            # Bcolz doesn't support ints as keys in `attrs`, so convert
            # assets to strings for use as attr keys.
            asset_key = str(asset_id)

            # Calculate the index into the array of the first and last row
            # for this asset. This allows us to efficiently load single
            # assets when querying the data back out of the table.
            first_row[asset_key] = total_rows
            last_row[asset_key] = total_rows + nrows - 1
            total_rows += nrows

            # Calculate the number of trading days between the first date
            # in the stored data and the first date of **this** asset. This
            # offset used for output alignment by the reader.

            # HACK: Index with a list so that we get back an array we can pass
            # to self.to_uint32.  We could try to extract this in the loop
            # above, but that makes the logic a lot messier.
            asset_first_day = self.to_uint32(table["day"][[0]], "day")[0]
            calendar_offset[asset_key] = calendar.get_loc(Timestamp(asset_first_day, unit="s", tz="UTC"))

        # This writes the table to disk.
        full_table = ctable(
            columns=[columns[colname] for colname in US_EQUITY_PRICING_BCOLZ_COLUMNS],
            names=US_EQUITY_PRICING_BCOLZ_COLUMNS,
            rootdir=filename,
            mode="w",
        )
        full_table.attrs["first_row"] = first_row
        full_table.attrs["last_row"] = last_row
        full_table.attrs["calendar_offset"] = calendar_offset
        full_table.attrs["calendar"] = calendar.asi8.tolist()
        return full_table
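Note: a hedged sketch of reading back the attrs written above; the filename is hypothetical and 'close' is assumed to be one of the US_EQUITY_PRICING_BCOLZ_COLUMNS.

import bcolz

table = bcolz.ctable(rootdir='daily_equities.bcolz', mode='r')

first_row = table.attrs['first_row']            # {str(asset_id) -> first row index}
last_row = table.attrs['last_row']              # {str(asset_id) -> last row index}
calendar_offset = table.attrs['calendar_offset']

sid = '24'                                      # hypothetical asset id
start, stop = first_row[sid], last_row[sid] + 1
closes = table['close'][start:stop]             # raw uint32 values for that asset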
Example #35
    def truncate(self, date):
        """Truncate data beyond this date in all ctables."""
        truncate_slice_end = self.data_len_for_day(date)

        glob_path = os.path.join(self._rootdir, "*", "*", "*.bcolz")
        sid_paths = glob(glob_path)

        for sid_path in sid_paths:
            file_name = os.path.basename(sid_path)

            try:
                table = bcolz.open(rootdir=sid_path)
            except IOError:
                continue
            if table.len <= truncate_slice_end:
                logger.info("{0} not past truncate date={1}.", file_name, date)
                continue

            logger.info(
                "Truncting {0} back at end_date={1}", file_name, date.date()
            )

            new_table = table[:truncate_slice_end]
            tmp_path = sid_path + '.bak'
            shutil.move(sid_path, tmp_path)
            try:
                bcolz.ctable(new_table, rootdir=sid_path)
                try:
                    shutil.rmtree(tmp_path)
                except Exception as err:
                    logger.info(
                        "Could not delete tmp_path={0}, err={1}", tmp_path, err
                    )
            except Exception as err:
                # On any ctable write error, restore the original table.
                logger.warn(
                    "Could not write {0}, err={1}", file_name, err
                )
                shutil.move(tmp_path, sid_path)

        # Update end session in metadata.
        metadata = BcolzMinuteBarMetadata.read(self._rootdir)
        metadata.end_session = date
        metadata.write(self._rootdir)
Example #36
 def test_strings2(self):
     """Testing that we can use strings in a variable (II)"""
     dtype = np.dtype([("STATE", "|S32"),
                       ("b", np.int32)])
     recarr = np.array([('California', 1), ('Dakota', 9)], dtype=dtype)
     t = bcolz.ctable(recarr)
     res = [tuple(row) for row in t.where(
         "STATE == b'California'", outcols=["nrow__", "b"])]
     self.assertTrue(res == [(0, 1)],
                     "querying strings not working correctly")
Example #37
 def test02(self):
     """Testing `fetchwhere` method with a `outcols` with 1 field"""
     N = self.N
     ra = np.fromiter(((i, i, i * 3) for i in xrange(N)), dtype='i4,f8,i8')
     t = bcolz.ctable(ra)
     ct = t.fetchwhere('f1 < f2', outcols=('f1',))
     self.assertEqual(ct.names, ['f1'])
     l, s = len(ct), ct['f1'].sum()
     self.assertEqual(l, N - 1)
     self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula
Example #38
 def _raw_coex(self, scores, significance_threshold):
     path = os.path.expanduser(
         os.path.join(cf.options.basedir, 'databases',
                      "{}.{}.{}".format(self.type, self.name, 'coex')))
     self._global('current_significance_threshold', significance_threshold)
     sigs = scores >= significance_threshold
     return bcz.ctable(columns=[scores, sigs],
                       names=['score', 'significant'],
                       mode='w',
                       rootdir=path)
Example #39
def remove_empty_experiments(experiments):
    valid_experiments = []
    for experiment in experiments[:]:
        tl_dirname = os.path.join(experiment, 'trial_log')
        table = bcolz.ctable(rootdir=tl_dirname)
        if len(table) == 0:
            shutil.rmtree(experiment)
        else:
            valid_experiments.append(experiment)
    return valid_experiments
Example #40
 def test00(self):
     """Testing `fetchwhere` method with only an expression"""
     N = self.N
     ra = np.fromiter(((i, i * 2., i * 3)
                       for i in xrange(N)), dtype='i4,f8,i8')
     t = bcolz.ctable(ra)
     ct = t.fetchwhere('f1 < f2')
     l, s = len(ct), ct['f0'].sum()
     self.assertEqual(l, N - 1)
     self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula
Example #41
 def test03(self):
     """Testing `fetchwhere` method with a `limit`, `skip` parameter"""
     N, M = self.N, 101
     ra = np.fromiter(((i, i * 2., i * 3)
                       for i in xrange(N)), dtype='i4,f8,i8')
     t = bcolz.ctable(ra)
     l, s = 0, 0
     ct = t.fetchwhere('f1 < f2', limit=N - M - 2, skip=M)
     l, s = len(ct), ct['f0'].sum()
     self.assertEqual(l, N - M - 2)
     self.assertEqual(s, np.arange(M + 1, N - 1).sum())
Example #42
File: Camoco.py  Project: schae234/Camoco
 def _raw_coex(self,scores,significance_threshold):
     path = os.path.expanduser(
             os.path.join(
                 cf.options.basedir,
                 'databases',
                 "{}.{}.{}".format(self.type, self.name, 'coex')
             )
         )
     self._global('current_significance_threshold',significance_threshold)
     sigs = scores >= significance_threshold
     return bcz.ctable(columns=[scores,sigs], names=['score','significant'], mode='w', rootdir=path)
Example #43
 def test04(self):
     """Testing `fetchwhere` method with an `out_flavor` parameter"""
     N = self.N
     ra = np.fromiter(((i, i * 2., i * 3)
                       for i in xrange(N)), dtype='i4,f8,i8')
     t = bcolz.ctable(ra)
     ct = t.fetchwhere('f1 < f2', out_flavor="numpy")
     self.assertEqual(type(ct), np.ndarray)
     l, s = len(ct), ct['f0'].sum()
     self.assertEqual(l, N - 1)
     self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula
Example #44
 def table(self, data, names=None, expectedlen=None, **kwargs):
     names, columns = _util.check_table_like(data, names=names)
     kwargs = self._set_defaults(kwargs)
     ctbl = bcolz.ctable(columns,
                         names=names,
                         expectedlen=expectedlen,
                         **kwargs)
     # patch append method
     ctbl.append_original = ctbl.append
     ctbl.append = MethodType(_table_append, ctbl)
     return ctbl
Example #45
 def test_strings2(self):
     """Testing that we can use strings in a variable (II)"""
     dtype = np.dtype([("STATE", "|S32"), ("b", np.int32)])
     recarr = np.array([('California', 1), ('Dakota', 9)], dtype=dtype)
     t = bcolz.ctable(recarr)
     res = [
         tuple(row) for row in t.where("STATE == b'California'",
                                       outcols=["nrow__", "b"])
     ]
     self.assertTrue(res == [(0, 1)],
                     "querying strings not working correctly")
Example #46
 def test07(self):
     """Testing `whereblocks` method with a `limit`, `skip` parameter"""
     N, M = self.N, 101
     ra = np.fromiter(((i, i * 2., i * 3)
                       for i in xrange(N)), dtype='i4,f8,i8')
     t = bcolz.ctable(ra)
     l, s = 0, 0
     for block in t.whereblocks('f1 < f2', limit=N - M - 2, skip=M):
         l += len(block)
         s += block['f0'].sum()
     self.assertEqual(l, N - M - 2)
     self.assertEqual(s, np.arange(M + 1, N - 1).sum())
Example #47
 def test05(self):
     """Testing `fetchwhere` method with global and local variables"""
     N = self.N
     lvar = GVAR
     ra = np.fromiter(((i, i * 2., i * 3)
                       for i in xrange(N)), dtype='i4,f8,i8')
     t = bcolz.ctable(ra)
     ct = t.fetchwhere('(f1 + lvar) < (f2 + GVAR)', out_flavor="numpy")
     self.assertEqual(type(ct), np.ndarray)
     l, s = len(ct), ct['f0'].sum()
     self.assertEqual(l, N - 1)
     self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula
Example #48
 def test03(self):
     """Testing `whereblocks` method with a `outfields` with 1 field"""
     N = self.N
     ra = np.fromiter(((i, i, i * 3) for i in xrange(N)), dtype='i4,f8,i8')
     t = bcolz.ctable(ra)
     l, s = 0, 0
     for block in t.whereblocks('f1 < f2', outfields=('f1',)):
         self.assertEqual(block.dtype.names, ('f1',))
         l += len(block)
         s += block['f1'].sum()
     self.assertEqual(l, N - 1)
     self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula
Example #49
 def test07(self):
     """Testing `whereblocks` method with a `limit`, `skip` parameter"""
     N, M = self.N, 101
     ra = np.fromiter(((i, i * 2., i * 3) for i in xrange(N)),
                      dtype='i4,f8,i8')
     t = bcolz.ctable(ra)
     l, s = 0, 0
     for block in t.whereblocks('f1 < f2', limit=N - M - 2, skip=M):
         l += len(block)
         s += block['f0'].sum()
     self.assertEqual(l, N - M - 2)
     self.assertEqual(s, np.arange(M + 1, N - 1).sum())
Example #50
 def test05(self):
     """Testing `whereblocks` method with a `limit` parameter"""
     N, M = self.N, 101
     ra = np.fromiter(((i, i * 2., i * 3) for i in xrange(N)),
                      dtype='i4,f8,i8')
     t = bcolz.ctable(ra)
     l, s = 0, 0
     for block in t.whereblocks('f1 < f2', limit=M):
         l += len(block)
         s += block['f0'].sum()
     self.assertEqual(l, M)
     self.assertEqual(s, M * ((M + 1) / 2))  # Gauss summation formula
Example #51
 def test00(self):
     """Testing `whereblocks` method with only an expression"""
     N = self.N
     ra = np.fromiter(((i, i * 2., i * 3) for i in xrange(N)),
                      dtype='i4,f8,i8')
     t = bcolz.ctable(ra)
     l, s = 0, 0
     for block in t.whereblocks('f1 < f2'):
         l += len(block)
         s += block['f0'].sum()
     self.assertEqual(l, N - 1)
     self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula
Example #52
 def test05(self):
     """Testing `whereblocks` method with a `limit` parameter"""
     N, M = self.N, 101
     ra = np.fromiter(((i, i * 2., i * 3)
                       for i in xrange(N)), dtype='i4,f8,i8')
     t = bcolz.ctable(ra)
     l, s = 0, 0
     for block in t.whereblocks('f1 < f2', limit=M):
         l += len(block)
         s += block['f0'].sum()
     self.assertEqual(l, M)
     self.assertEqual(s, M * ((M + 1) / 2))  # Gauss summation formula
Example #53
 def test00(self):
     """Testing `whereblocks` method with only an expression"""
     N = self.N
     ra = np.fromiter(((i, i * 2., i * 3)
                       for i in xrange(N)), dtype='i4,f8,i8')
     t = bcolz.ctable(ra)
     l, s = 0, 0
     for block in t.whereblocks('f1 < f2'):
         l += len(block)
         s += block['f0'].sum()
     self.assertEqual(l, N - 1)
     self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula
Example #54
File: ctable.py  Project: mrocklin/bquery
    def create_agg_ctable(self, groupby_cols, agg_list, nr_groups, rootdir):
        # create output table
        dtype_list = []

        for col in groupby_cols:
            dtype_list.append((col, self[col].dtype))

        agg_cols = []
        agg_ops = []
        op_translation = {
            'sum': 1,
            'sum_na': 2
        }

        for agg_info in agg_list:

            if not isinstance(agg_info, list):
                # straightforward sum (a ['m1', 'm2', ...] parameter)
                output_col = agg_info
                input_col = agg_info
                agg_op = 1
            else:
                # input/output settings [['mnew1', 'm1'], ['mnew2', 'm2'], ...]
                output_col = agg_info[0]
                input_col = agg_info[1]
                if len(agg_info) == 2:
                    agg_op = 1
                else:
                    # input/output settings [['mnew1', 'm1', 'sum'], ['mnew2', 'm1', 'avg'], ...]
                    agg_op = agg_info[2]
                    if agg_op not in op_translation:
                        raise NotImplementedError(
                            'Unknown Aggregation Type: ' + unicode(agg_op))
                    agg_op = op_translation[agg_op]

            col_dtype = self[input_col].dtype
            # TODO: check if the aggregation columns is numeric
            # NB: we could build a concatenation for strings like pandas, but I would really prefer to see that as a
            # separate operation

            # save output
            agg_cols.append(output_col)
            agg_ops.append((input_col, agg_op))
            dtype_list.append((output_col, col_dtype))

        # create aggregation table
        ct_agg = bcolz.ctable(
            np.zeros(0, dtype_list),
            expectedlen=nr_groups,
            rootdir=rootdir)

        return ct_agg, dtype_list, agg_ops
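Note: the three accepted agg_list shapes, spelled out as plain literals mirroring the comments in the method above; the column names are hypothetical.

# 1. Plain column names: each column is summed into an output column of the same name.
agg_list = ['m1', 'm2']

# 2. [output_col, input_col] pairs: rename the aggregated output (sum is implied).
agg_list = [['mnew1', 'm1'], ['mnew2', 'm2']]

# 3. [output_col, input_col, op] triples: choose the operation explicitly
#    ('sum' and 'sum_na' are the only ops this method knows).
agg_list = [['total_m1', 'm1', 'sum'], ['total_m2', 'm2', 'sum_na']]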