def process_temp_bundle(self, ds_name, path):
    """
    Merge the temp bundle into the main bundle for the specified
    data source.

    Parameters
    ----------
    ds_name : str
        Name of the data source whose bundle is being updated.
    path : str
        Path to the temp bundle to merge in.

    Returns
    -------
    None
    """
    tmp_bundle = extract_bundle(path)
    bundle_folder = get_data_source_folder(ds_name)
    ensure_directory(bundle_folder)
    if os.listdir(bundle_folder):
        zsource = bcolz.ctable(rootdir=tmp_bundle, mode='r')
        ztarget = bcolz.ctable(rootdir=bundle_folder, mode='r')
        merge_bundles(zsource, ztarget)
    else:
        os.rename(tmp_bundle, bundle_folder)

def resource_bcolz(uri, dshape=None, expected_dshape=None, **kwargs):
    if os.path.exists(uri):
        try:
            return ctable(rootdir=uri)
        except IOError:
            # __rootdirs__ doesn't exist because we aren't a ctable
            return carray(rootdir=uri)
    else:
        if not dshape:
            raise ValueError("Must specify either existing bcolz directory or"
                             " valid datashape")
        dshape = datashape.dshape(dshape)
        dt = datashape.to_numpy_dtype(dshape)
        shape_tail = tuple(map(int, dshape.shape[1:]))  # tail of shape
        if dshape.shape[0] == datashape.var:
            shape = (0,) + shape_tail
        else:
            shape = (int(dshape.shape[0]),) + shape_tail

        x = np.empty(shape=shape, dtype=dt)
        kwargs = keyfilter(keywords.__contains__, kwargs)
        expectedlen = kwargs.pop(
            'expectedlen',
            int(expected_dshape[0])
            if expected_dshape is not None
            and isinstance(expected_dshape[0], datashape.Fixed)
            else None)

        if datashape.predicates.isrecord(dshape.measure):
            return ctable(x, rootdir=uri, expectedlen=expectedlen, **kwargs)
        else:
            return carray(x, rootdir=uri, expectedlen=expectedlen, **kwargs)

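# A hedged usage sketch for resource_bcolz above (the URI and datashape
# string are illustrative, not from the original source):
#
#     ct = resource_bcolz('/tmp/prices.bcolz',
#                         dshape='var * {open: float64, close: float64}')
#
# An existing directory is opened as a ctable (falling back to carray);
# otherwise a zero-length container matching the datashape is created.
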
def process_temp_bundle(self, ds_name, path):
    """
    Merge the temp bundle into the main bundle for the specified
    data source.

    Parameters
    ----------
    ds_name : str
        Name of the data source whose bundle is being updated.
    path : str
        Path to the temp bundle to merge in.

    Returns
    -------
    None
    """
    tmp_bundle = extract_bundle(path)
    bundle_folder = get_data_source_folder(ds_name)
    ensure_directory(bundle_folder)
    if os.listdir(bundle_folder):
        zsource = bcolz.ctable(rootdir=tmp_bundle, mode='r')
        ztarget = bcolz.ctable(rootdir=bundle_folder, mode='a')
        ztarget.append(zsource)
        # Flush so the appended rows are persisted to disk.
        ztarget.flush()
    else:
        shutil.rmtree(bundle_folder, ignore_errors=True)
        os.rename(tmp_bundle, bundle_folder)

def into(a, b, types=None, **kwargs):
    if isinstance(b[0], (tuple, list)):
        if not types:
            types = [None] * len(b[0])
        return ctable([into(np.ndarray(0), c2, dtype=dt)
                       for (c2, dt) in zip(zip(*b), types)], **kwargs)
    else:
        return ctable([into(np.ndarray(0), b, dtype=types)], **kwargs)

def resource_bcolz(rootdir, **kwargs):
    if os.path.exists(rootdir):
        kwargs = keyfilter(keywords(ctable).__contains__, kwargs)
        return ctable(rootdir=rootdir, **kwargs)
    else:
        if 'dshape' in kwargs:
            dtype = to_numpy_dtype(kwargs['dshape'])
            kwargs = keyfilter(keywords(ctable).__contains__, kwargs)
            return ctable(np.empty(0, dtype), rootdir=rootdir, **kwargs)
        else:
            raise ValueError("File does not exist and no `dshape=` given")

def resource_bcolz(rootdir, **kwargs):
    if os.path.exists(rootdir):
        kwargs = keyfilter(carray_keywords.__contains__, kwargs)
        return ctable(rootdir=rootdir, **kwargs)
    else:
        if 'dshape' in kwargs:
            dtype = to_numpy_dtype(kwargs['dshape'])
            kwargs = keyfilter(carray_keywords.__contains__, kwargs)
            return ctable(np.empty(0, dtype), rootdir=rootdir, **kwargs)
        else:
            raise ValueError("File does not exist and no `dshape=` given")

def into(a, b, names=None, types=None, **kwargs):
    if isinstance(b[0], (tuple, list)):
        if not types:
            types = [None] * len(b[0])
        return ctable([into(np.ndarray(0), c2, dtype=dt)
                       for (c2, dt) in zip(zip(*b), types)],
                      names, **kwargs)
    else:
        if not names:
            names = [None] * len(b)
        arr = into(np.ndarray(0), b,
                   dtype=np.dtype(list(zip(names, types))))
        return ctable(arr, names, **kwargs)

def _init_ctable(self, path):
    """
    Create empty ctable for given path.

    Parameters
    ----------
    path : string
        The path to rootdir of the new ctable.
    """
    # Only create the subdir on container creation.
    sid_dirname = os.path.dirname(path)
    os.makedirs(sid_dirname)
    initial_array = np.empty(0, np.uint32)
    table = ctable(
        rootdir=path,
        columns=[
            initial_array,
            initial_array,
            initial_array,
            initial_array,
            initial_array,
        ],
        names=[
            'open',
            'high',
            'low',
            'close',
            'volume'
        ],
        expectedlen=self._expectedlen,
        mode='w',
    )
    table.flush()
    return table

def resource_bcolz(uri, dshape=None, **kwargs):
    if os.path.exists(uri):
        return ctable(rootdir=uri)
    else:
        if not dshape:
            raise ValueError("Must specify either existing bcolz directory or"
                             " valid datashape")
        dshape = datashape.dshape(dshape)
        dt = datashape.to_numpy_dtype(dshape)
        x = np.empty(shape=(0,), dtype=dt)
        if datashape.predicates.isrecord(dshape.measure):
            return ctable(x, rootdir=uri,
                          **keyfilter(keywords.__contains__, kwargs))
        else:
            return carray(x, rootdir=uri,
                          **keyfilter(keywords.__contains__, kwargs))

def to_bundle(self, base_data_path, bcolz_data_path, metas, dtypes):
    base_data = []
    for i in range(len(metas)):
        base_data.append(np.array((), dtype=dtypes[i]))
    # Create an on-disk table with one empty column per meta field.
    data = bcolz.ctable(base_data, rootdir=bcolz_data_path,
                        mode='w', names=metas)
    files = os.listdir(base_data_path)
    line_map = {}
    begin_line = 0
    for file in files:
        print("load " + file)
        a = LocalFileToDataFrame(os.path.join(base_data_path, file), file)
        if a is not None:
            for meta in metas:
                a[meta] = change(a, meta)
            for index in a.index:
                data.append(list(a.loc[index]))
            end_line = begin_line + a.index.size - 1
            line_map.update({file: [begin_line, end_line]})
            begin_line = end_line + 1
    data.attrs['line_map'] = line_map
    # Persist the appended rows to disk.
    data.flush()

def to_bcolz(security_list, data, unit):
    # shape: (num, dims, length)
    for i, security in enumerate(security_list):
        if unit == '1d':
            names = [
                'date', 'factor', 'open', 'high', 'low', 'close',
                'volume', 'high_limit', 'low_limit', 'paused'
            ]
        else:
            names = ['date', 'factor', 'open', 'high', 'low', 'close',
                     'volume']
        path = DataFunction.get_path(security, unit)
        array = data[i].astype('float')
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
            table = bcolz.ctable(rootdir=path, columns=list(array),
                                 names=names, mode='w')
            table.flush()
        else:
            # Data check: keep only entries newer than the last stored date.
            table = bcolz.open(path, mode='a')
            date_index = table.names.index('date')
            array = array[:, array[0, :] > table[-1][date_index]]
            array = list(map(lambda x: tuple(x), array))
            table.append(array)
            table.flush()

def open(rootdir, mode='a'):
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object or None (if no objects are found)

    """
    # First try with a carray
    obj = None
    try:
        obj = bcolz.carray(rootdir=rootdir, mode=mode)
    except IOError:
        # Not a carray.  Now try with a ctable
        try:
            obj = bcolz.ctable(rootdir=rootdir, mode=mode)
        except IOError:
            # Not a ctable either
            pass
    return obj

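# A minimal usage sketch for the `open` helper above (not the builtin
# `open`); the demo.bcolz path and data are illustrative:
import numpy as np
import bcolz

ra = np.fromiter(((i, i * 2.0) for i in range(10)), dtype='i4,f8')
bcolz.ctable(ra, rootdir='demo.bcolz', mode='w').flush()

obj = open('demo.bcolz', mode='r')  # the carray attempt fails, ctable wins
print(type(obj).__name__, len(obj))
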
def _init_ctable(self, path):
    """
    Create empty ctable for given path.

    Parameters
    ----------
    path : string
        The path to rootdir of the new ctable.
    """
    # Only create the containing subdir on creation.
    # This is not to be confused with the `.bcolz` directory, but is the
    # directory up one level from the `.bcolz` directories.
    sid_containing_dirname = os.path.dirname(path)
    if not os.path.exists(sid_containing_dirname):
        # Other sids may have already created the containing directory.
        os.makedirs(sid_containing_dirname)

    initial_array = np.empty(0, np.uint32)
    table = ctable(
        rootdir=path,
        columns=[initial_array, initial_array, initial_array,
                 initial_array, initial_array],
        names=["open", "high", "low", "close", "volume"],
        expectedlen=self._expectedlen,
        mode="w",
    )
    table.flush()
    return table

def _init_ctable(self, path):
    """
    Create empty ctable for given path.

    Parameters
    ----------
    path : string
        The path to rootdir of the new ctable.
    """
    # Only create the containing subdir on creation.
    # This is not to be confused with the `.bcolz` directory, but is the
    # directory up one level from the `.bcolz` directories.
    sid_containing_dirname = os.path.dirname(path)
    if not os.path.exists(sid_containing_dirname):
        # Other sids may have already created the containing directory.
        os.makedirs(sid_containing_dirname)

    initial_array = np.empty(0, np.uint32)
    table = ctable(
        rootdir=path,
        columns=[
            initial_array,
            initial_array,
            initial_array,
            initial_array,
            initial_array,
        ],
        names=['open', 'high', 'low', 'close', 'volume'],
        expectedlen=self._expectedlen,
        mode='w',
    )
    table.flush()
    return table

def _ensure_ctable(self, sid):
    """Ensure that a ctable exists for ``sid``, then return it."""
    sid_path = self._sid_path(sid)
    print('sid_path', sid_path)
    if not os.path.exists(sid_path):
        return self._init_ctable(sid_path)
    return bcolz.ctable(rootdir=sid_path, mode='a')

def __init__(self, source_path):
    '''
    Create the Neighborhood, for finding nearest neighbors.

    Args:
        source_path (string): path to a bcolz database with three carray
            columns: 'id', 'vector' and 'norm'
    '''
    self.source_path = source_path

    # open bcolz datastores
    self.vectors = bvec.carray(rootdir=source_path + "/vector")
    self.norms = bvec.carray(rootdir=source_path + "/norm")
    self.source_table = bcolz.ctable(rootdir=source_path)
    # print("Created similarity object from BCOLZ files: source {0};"
    #       " target: {1}".format(source_path, target_path))

    # create similarity object
    self.similarity = sim.Similarity(self.vectors, self.norms)

    # create domain <-> index maps
    # dictionary taking ids to indices (source)
    self.id_index_map = self._create_id_index_map(self.source_table)
    self.index_id_map = self._create_index_id_map(self.source_table)

def get_quantized_ctable(dtype, cparams, quantize=None, expectedlen=None):
    """Return a ctable with the quantize filter enabled for floating
    point cols.

    License
        This function is taken from the reflexible package
        (https://github.com/spectraphilic/reflexible/tree/master/reflexible).
        Authored by John F Burkhart <*****@*****.**> with contributions
        from Francesc Alted <*****@*****.**>.
        Licensed under: 'This script follows creative commons usage.'
    """
    columns, names = [], []
    for fname, ftype in dtype.descr:
        names.append(fname)
        if 'f' in ftype:
            cparams2 = bcolz.cparams(clevel=cparams.clevel,
                                     cname=cparams.cname,
                                     quantize=quantize)
            columns.append(
                bcolz.zeros(0, dtype=ftype, cparams=cparams2,
                            expectedlen=expectedlen))
        else:
            columns.append(
                bcolz.zeros(0, dtype=ftype, cparams=cparams,
                            expectedlen=expectedlen))
    return bcolz.ctable(columns=columns, names=names)

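# A hedged usage sketch for get_quantized_ctable above; the dtype and
# compression settings are illustrative:
import numpy as np
import bcolz

dtype = np.dtype([('t', 'i8'), ('value', 'f8')])
cparams = bcolz.cparams(clevel=5, cname='lz4')
ct = get_quantized_ctable(dtype, cparams, quantize=3, expectedlen=10000)
ct.append((1, 3.14159265))  # 'value' is stored with the quantize filter
print(ct['value'].cparams)
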
def _init_ctable(self, path):
    """
    Create empty ctable for given path.

    Parameters
    ----------
    path : string
        The path to rootdir of the new ctable.
    """
    # Only create the subdir on container creation.
    sid_dirname = os.path.dirname(path)
    os.makedirs(sid_dirname)
    initial_array = np.empty(0, np.uint32)
    table = ctable(
        rootdir=path,
        columns=[
            initial_array,
            initial_array,
            initial_array,
            initial_array,
            initial_array,
        ],
        names=['open', 'high', 'low', 'close', 'volume'],
        expectedlen=self._expectedlen,
        mode='w',
    )
    table.flush()
    return table

def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------
    x : bcolz.ctable
        Input data
    chunksize : int (optional)
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool (defaults to True)
        Automatically categorize all string dtypes
    index : string (optional)
        Column to make the index

    See Also
    --------
    from_array: more generic function not optimized for bcolz
    """
    import dask.array as da
    import bcolz
    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name],
                                  chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = (0,) + tuple(range(-1, len(x), chunksize))[1:]
    if divisions[-1] != len(x) - 1:
        divisions = divisions + (len(x) - 1,)
    new_name = 'from_bcolz' + next(tokens)
    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 None, categories))
               for i in range(0, int(ceil(len(x) / chunksize))))

    result = DataFrame(dsk, new_name, columns, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result

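# A hedged usage sketch for from_bcolz above (path and column name are
# illustrative; this targets the old dask DataFrame API the snippet uses):
#
#     df = from_bcolz('/data/trades.bcolz', chunksize=2**16, index='id')
#
# Passing a rootdir string opens the ctable; `index='id'` re-partitions
# the resulting frame on that column via percentile-based divisions.
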
def open(rootdir, mode='a'):
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object (an IOError is raised if no object is
        found)

    """
    # A ctable rootdir contains a ROOTDIRS marker file; dispatch on that.
    rootsfile = os.path.join(rootdir, ROOTDIRS)
    if os.path.exists(rootsfile):
        return bcolz.ctable(rootdir=rootdir, mode=mode)
    else:
        return bcolz.carray(rootdir=rootdir, mode=mode)

def _init_ctable(self, path):
    """
    Create an empty ctable for the given path.

    Obtain, create, append to, and set attributes on an empty ctable for
    the given path.

    addcol(newcol[, name, pos, move])
        Add a new `newcol` object as a column.
    append(cols)
        Append `cols` to this ctable.
    flush()
        Flush data in internal buffers to disk.  This call should
        typically be done after performing modifications (__setitem__(),
        append()) in persistence mode.  If you don't do this, you risk
        losing part of your modifications.

    Parameters
    ----------
    path : string
        The path to rootdir of the new ctable.
    """
    bcolz_dir = os.path.dirname(path)
    print('bcolz_dir', bcolz_dir)
    if not os.path.exists(bcolz_dir):
        os.makedirs(bcolz_dir)
    print('path', path)
    initial_array = np.empty(0, np.uint32)
    # Configure bcolz.
    bcolz.set_nthreads(Num * bcolz.detect_number_of_cores())
    # Print all the versions of packages that bcolz relies on.
    bcolz.print_versions()
    """
    clevel : int (0 <= clevel < 10)
        The compression level.
    shuffle : int
        The shuffle filter to be activated.  Allowed values are
        bcolz.NOSHUFFLE (0), bcolz.SHUFFLE (1) and bcolz.BITSHUFFLE (2).
        The default is bcolz.SHUFFLE.
    cname : string ('blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd')
        Select the compressor to use inside Blosc.
    quantize : int (number of significant digits)
        Quantize data to improve (lossy) compression.  Data is quantized
        using np.around(scale*data)/scale, where scale is 2**bits, and
        bits is determined from the quantize value.  For example, if
        quantize=1, bits will be 4.  0 means that the quantization is
        disabled.
    default: cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
    """
    params = bcolz.cparams(clevel=9)
    table = bcolz.ctable(
        rootdir=path,
        columns=[
            initial_array,
            initial_array,
            initial_array,
            initial_array,
            initial_array,
            initial_array,
            initial_array,
        ],
        names=self._bcolz_fields,
        mode='w',
        cparams=params
    )
    print('cparams', table.cparams)
    table.flush()
    table = self._init_attr(table, path)
    # table.attrs['metadata'] = self._init_metadata(path)
    return table

def into(a, b, **kwargs):
    if isinstance(a, type):
        kwargs = keyfilter(carray_keywords.__contains__, kwargs)
        return ctable(b, **kwargs)
    else:
        a.append(b)
        a.flush()
        return a

def __init__(self, daily_dir, default_ratio=OHLC_RATIO):
    self._tdx_dir = daily_dir
    self._root_dir = os.path.join(BcolzDir, 'daily')
    self._default_ohlc_ratio = default_ratio
    self._bcolz_fields = BcolzDailyFields
    self.c_table = bcolz.ctable(columns=BcolzDailyFields)

def into(a, b, **kwargs):
    names = dshape(nd.dshape_of(b))[1].names
    columns = [getattr(b, name) for name in names]
    columns = [np.asarray(nd.as_py(c))
               if to_numpy_dtype(dshape(nd.dshape_of(c))) == np.dtype('O')
               else into(np.ndarray(0), c)
               for c in columns]
    return bcolz.ctable(columns, names=names, **kwargs)

def into(a, b, **kwargs):
    chunks = partition_all(1024, b)
    chunk = next(chunks)
    a = ctable([into(np.ndarray(0), c2) for c2 in zip(*chunk)], **kwargs)
    for chunk in chunks:
        a.append(list(zip(*chunk)))
    a.flush()
    return a

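# A self-contained sketch of the chunked-append pattern above, using
# plain bcolz/numpy (partition_all comes from toolz); the row generator
# and dtype are illustrative:
import numpy as np
import bcolz
from toolz import partition_all

rows = ((i, i * 2.0) for i in range(5000))
chunks = partition_all(1024, rows)
ct = bcolz.ctable(np.fromiter(next(chunks), dtype='i8,f8'))
for chunk in chunks:
    ct.append(np.fromiter(chunk, dtype='i8,f8'))
print(len(ct))  # 5000
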
def __init__(self, minutes_dir, default_ratio=OHLC_RATIO):
    # tdx_dir --- location of the TDX (通达信) data
    self._tdx_dir = minutes_dir
    self._default_ohlc_ratio = default_ratio
    self._root_dir = os.path.join(BcolzDir, 'minute')
    self._bcolz_fields = BcolzMinuteFields
    self.c_table = bcolz.ctable(columns=BcolzMinuteFields)

def into(a, b, **kwargs):
    names = dshape(nd.dshape_of(b))[1].names
    columns = [getattr(b, name) for name in names]
    columns = [
        np.asarray(nd.as_py(c))
        if to_numpy_dtype(dshape(nd.dshape_of(c))) == np.dtype('O')
        else into(np.ndarray(0), c)
        for c in columns
    ]
    return bcolz.ctable(columns, names=names, **kwargs)

def tobcolz(table, dtype=None, sample=1000, **kwargs):
    """Load data into a bcolz ctable, e.g.::

        >>> import petl as etl
        >>> table = [('foo', 'bar', 'baz'),
        ...          ('apples', 1, 2.5),
        ...          ('oranges', 3, 4.4),
        ...          ('pears', 7, .1)]
        >>> ctbl = etl.tobcolz(table)
        >>> ctbl
        ctable((3,), [('foo', '<U7'), ('bar', '<i8'), ('baz', '<f8')])
          nbytes: 132; cbytes: 1023.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
        [('apples', 1, 2.5) ('oranges', 3, 4.4) ('pears', 7, 0.1)]
        >>> ctbl.names
        ['foo', 'bar', 'baz']
        >>> ctbl['foo']
        carray((3,), <U7)
          nbytes := 84; cbytes := 511.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
          chunklen := 18724; chunksize: 524272; blocksize: 0
        ['apples' 'oranges' 'pears']

    Other keyword arguments are passed through to the ctable constructor.

    .. versionadded:: 1.1.0

    """
    import bcolz
    import numpy as np

    it = iter(table)
    peek, it = iterpeek(it, sample)
    hdr = next(it)
    # numpy is fussy about having tuples, need to make sure
    it = (tuple(row) for row in it)
    flds = list(map(text_type, hdr))
    dtype = construct_dtype(flds, peek, dtype)

    # create ctable
    kwargs.setdefault('expectedlen', 1000000)
    kwargs.setdefault('mode', 'w')
    ctbl = bcolz.ctable(np.array([], dtype=dtype), **kwargs)

    # fill chunk-wise
    chunklen = sum(ctbl.cols[name].chunklen
                   for name in ctbl.names) // len(ctbl.names)
    while True:
        data = list(itertools.islice(it, chunklen))
        data = np.array(data, dtype=dtype)
        ctbl.append(data)
        if len(data) < chunklen:
            break

    ctbl.flush()
    return ctbl

def table(self, data, names=None, expectedlen=None, **kwargs):
    names, columns = _util.check_table_like(data, names=names)
    kwargs = self._set_defaults(kwargs)
    ctbl = bcolz.ctable(columns, names=names, expectedlen=expectedlen,
                        **kwargs)
    # patch append method
    ctbl.append_original = ctbl.append
    ctbl.append = MethodType(_table_append, ctbl)
    return ctbl

def _update_bcolz_data(symbol):
    item = {}
    item['req'] = 'market.%s.detail' % (symbol)
    item['id'] = cfg.get_id()
    tradeStr = json.dumps(item)
    rootdir = cfg.get_bcolz_tick_path(symbol)
    bcolz_exist = os.path.exists(os.path.join(rootdir, "__rootdirs__"))
    while True:
        try:
            ws = create_connection("wss://api.huobipro.com/ws")
            break
        except:
            print('connect ws error, retry...')
            time.sleep(5)
    ws.send(tradeStr)
    values = []
    if bcolz_exist:
        ct = bcolz.open(rootdir)
    while True:
        compressData = ws.recv()
        result = gzip.decompress(compressData).decode('utf-8')
        if result[:7] == '{"ping"':
            ts = result[8:21]
            pong = '{"pong":' + ts + '}'
            ws.send(pong)
            ws.send(tradeStr)
        else:
            res = json.loads(result)
            data = res['data']
            # print(data)
            v = [
                res['ts'], data['low'], data['count'], data['close'],
                data['vol'], data['id'], data['amount'],
                data['version'], data['high'], data['open']
            ]
            values.append(v)
            if len(values) == 10:
                a = np.array(values).reshape(len(values), 10)
                columns = list(a.T)
                if bcolz_exist:
                    # ct = bcolz.open(rootdir)
                    ct.append(columns)
                    ct.flush()
                else:
                    names = [
                        'ts', 'low', 'count', 'close', 'vol', 'id',
                        'amount', 'version', 'high', 'open'
                    ]
                    ba = bcolz.ctable(columns=columns, mode='w',
                                      names=names, rootdir=rootdir)
                    ba.flush()
                    # Reuse the new table for subsequent batches;
                    # without this, `ct` would be undefined above.
                    ct = ba
                    bcolz_exist = True
                del values[:]

def _write_internal(self, filename, calendar, iterator):
    """
    Internal implementation of write.

    `iterator` should be an iterator yielding pairs of (asset, ctable).
    """
    total_rows = 0
    first_row = {}
    last_row = {}
    calendar_offset = {}

    # Maps column name -> output carray.
    columns = {k: carray(array([], dtype=uint32))
               for k in US_EQUITY_PRICING_BCOLZ_COLUMNS}

    for asset_id, table in iterator:
        nrows = len(table)
        for column_name in columns:
            if column_name == "id":
                # We know what the content of this column is, so don't
                # bother reading it.
                columns["id"].append(full((nrows,), asset_id, uint32))
                continue
            columns[column_name].append(
                self.to_uint32(table[column_name][:], column_name))

        # Bcolz doesn't support ints as keys in `attrs`, so convert
        # assets to strings for use as attr keys.
        asset_key = str(asset_id)

        # Calculate the index into the array of the first and last row
        # for this asset. This allows us to efficiently load single
        # assets when querying the data back out of the table.
        first_row[asset_key] = total_rows
        last_row[asset_key] = total_rows + nrows - 1
        total_rows += nrows

        # Calculate the number of trading days between the first date
        # in the stored data and the first date of **this** asset. This
        # offset is used for output alignment by the reader.

        # HACK: Index with a list so that we get back an array we can
        # pass to self.to_uint32. We could try to extract this in the
        # loop above, but that makes the logic a lot messier.
        asset_first_day = self.to_uint32(table["day"][[0]], "day")[0]
        calendar_offset[asset_key] = calendar.get_loc(
            Timestamp(asset_first_day, unit="s", tz="UTC"))

    # This writes the table to disk.
    full_table = ctable(
        columns=[columns[colname]
                 for colname in US_EQUITY_PRICING_BCOLZ_COLUMNS],
        names=US_EQUITY_PRICING_BCOLZ_COLUMNS,
        rootdir=filename,
        mode="w",
    )
    full_table.attrs["first_row"] = first_row
    full_table.attrs["last_row"] = last_row
    full_table.attrs["calendar_offset"] = calendar_offset
    full_table.attrs["calendar"] = calendar.asi8.tolist()
    return full_table

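# A small, self-contained illustration of the ctable `attrs` persistence
# the writer above relies on (path and values are illustrative; attrs
# must be JSON-serializable, hence the string keys):
import numpy as np
from bcolz import ctable

t = ctable(np.zeros(0, dtype='u4,u4'), rootdir='ohlc_demo.bcolz', mode='w')
t.attrs['first_row'] = {'1': 0}
t.attrs['last_row'] = {'1': 41}
t.flush()
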
def truncate(self, date):
    """Truncate data beyond this date in all ctables."""
    truncate_slice_end = self.data_len_for_day(date)

    glob_path = os.path.join(self._rootdir, "*", "*", "*.bcolz")
    sid_paths = glob(glob_path)

    for sid_path in sid_paths:
        file_name = os.path.basename(sid_path)

        try:
            table = bcolz.open(rootdir=sid_path)
        except IOError:
            continue
        if table.len <= truncate_slice_end:
            logger.info("{0} not past truncate date={1}.", file_name, date)
            continue

        logger.info(
            "Truncating {0} back at end_date={1}", file_name, date.date()
        )

        new_table = table[:truncate_slice_end]
        tmp_path = sid_path + '.bak'
        shutil.move(sid_path, tmp_path)
        try:
            bcolz.ctable(new_table, rootdir=sid_path)
            try:
                shutil.rmtree(tmp_path)
            except Exception as err:
                logger.info(
                    "Could not delete tmp_path={0}, err={1}", tmp_path, err
                )
        except Exception as err:
            # On any ctable write error, restore the original table.
            logger.warn(
                "Could not write {0}, err={1}", file_name, err
            )
            shutil.move(tmp_path, sid_path)

    # Update end session in metadata.
    metadata = BcolzMinuteBarMetadata.read(self._rootdir)
    metadata.end_session = date
    metadata.write(self._rootdir)

def test_strings2(self):
    """Testing that we can use strings in a variable (II)"""
    dtype = np.dtype([("STATE", "|S32"), ("b", np.int32)])
    recarr = np.array([('California', 1), ('Dakota', 9)], dtype=dtype)
    t = bcolz.ctable(recarr)
    # 'nrow__' is the special virtual column giving the row number.
    res = [tuple(row) for row in t.where(
        "STATE == b'California'", outcols=["nrow__", "b"])]
    self.assertTrue(res == [(0, 1)],
                    "querying strings not working correctly")

def test02(self):
    """Testing `fetchwhere` method with an `outcols` of 1 field"""
    N = self.N
    ra = np.fromiter(((i, i, i * 3) for i in xrange(N)), dtype='i4,f8,i8')
    t = bcolz.ctable(ra)
    ct = t.fetchwhere('f1 < f2', outcols=('f1',))
    self.assertEqual(ct.names, ['f1'])
    l, s = len(ct), ct['f1'].sum()
    self.assertEqual(l, N - 1)
    self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula

def _raw_coex(self, scores, significance_threshold):
    path = os.path.expanduser(
        os.path.join(cf.options.basedir, 'databases',
                     "{}.{}.{}".format(self.type, self.name, 'coex')))
    self._global('current_significance_threshold', significance_threshold)
    sigs = scores >= significance_threshold
    return bcz.ctable(columns=[scores, sigs],
                      names=['score', 'significant'],
                      mode='w', rootdir=path)

def remove_empty_experiments(experiments):
    valid_experiments = []
    for experiment in experiments[:]:
        tl_dirname = os.path.join(experiment, 'trial_log')
        table = bcolz.ctable(rootdir=tl_dirname)
        if len(table) == 0:
            shutil.rmtree(experiment)
        else:
            valid_experiments.append(experiment)
    return valid_experiments

def test00(self):
    """Testing `fetchwhere` method with only an expression"""
    N = self.N
    ra = np.fromiter(((i, i * 2., i * 3) for i in xrange(N)),
                     dtype='i4,f8,i8')
    t = bcolz.ctable(ra)
    ct = t.fetchwhere('f1 < f2')
    l, s = len(ct), ct['f0'].sum()
    self.assertEqual(l, N - 1)
    self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula

def test03(self):
    """Testing `fetchwhere` method with `limit` and `skip` parameters"""
    N, M = self.N, 101
    ra = np.fromiter(((i, i * 2., i * 3) for i in xrange(N)),
                     dtype='i4,f8,i8')
    t = bcolz.ctable(ra)
    l, s = 0, 0
    ct = t.fetchwhere('f1 < f2', limit=N - M - 2, skip=M)
    l, s = len(ct), ct['f0'].sum()
    self.assertEqual(l, N - M - 2)
    self.assertEqual(s, np.arange(M + 1, N - 1).sum())

def _raw_coex(self, scores, significance_threshold):
    path = os.path.expanduser(
        os.path.join(
            cf.options.basedir,
            'databases',
            "{}.{}.{}".format(self.type, self.name, 'coex')
        )
    )
    self._global('current_significance_threshold', significance_threshold)
    sigs = scores >= significance_threshold
    return bcz.ctable(columns=[scores, sigs],
                      names=['score', 'significant'],
                      mode='w',
                      rootdir=path)

def test04(self):
    """Testing `fetchwhere` method with an `out_flavor` parameter"""
    N = self.N
    ra = np.fromiter(((i, i * 2., i * 3) for i in xrange(N)),
                     dtype='i4,f8,i8')
    t = bcolz.ctable(ra)
    ct = t.fetchwhere('f1 < f2', out_flavor="numpy")
    self.assertEqual(type(ct), np.ndarray)
    l, s = len(ct), ct['f0'].sum()
    self.assertEqual(l, N - 1)
    self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula

def test_strings2(self):
    """Testing that we can use strings in a variable (II)"""
    dtype = np.dtype([("STATE", "|S32"), ("b", np.int32)])
    recarr = np.array([('California', 1), ('Dakota', 9)], dtype=dtype)
    t = bcolz.ctable(recarr)
    res = [
        tuple(row)
        for row in t.where("STATE == b'California'",
                           outcols=["nrow__", "b"])
    ]
    self.assertTrue(res == [(0, 1)],
                    "querying strings not working correctly")

def test07(self):
    """Testing `whereblocks` method with `limit` and `skip` parameters"""
    N, M = self.N, 101
    ra = np.fromiter(((i, i * 2., i * 3) for i in xrange(N)),
                     dtype='i4,f8,i8')
    t = bcolz.ctable(ra)
    l, s = 0, 0
    for block in t.whereblocks('f1 < f2', limit=N - M - 2, skip=M):
        l += len(block)
        s += block['f0'].sum()
    self.assertEqual(l, N - M - 2)
    self.assertEqual(s, np.arange(M + 1, N - 1).sum())

def test05(self):
    """Testing `fetchwhere` method with global and local variables"""
    N = self.N
    lvar = GVAR
    ra = np.fromiter(((i, i * 2., i * 3) for i in xrange(N)),
                     dtype='i4,f8,i8')
    t = bcolz.ctable(ra)
    ct = t.fetchwhere('(f1 + lvar) < (f2 + GVAR)', out_flavor="numpy")
    self.assertEqual(type(ct), np.ndarray)
    l, s = len(ct), ct['f0'].sum()
    self.assertEqual(l, N - 1)
    self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula

def test03(self):
    """Testing `whereblocks` method with an `outfields` of 1 field"""
    N = self.N
    ra = np.fromiter(((i, i, i * 3) for i in xrange(N)), dtype='i4,f8,i8')
    t = bcolz.ctable(ra)
    l, s = 0, 0
    for block in t.whereblocks('f1 < f2', outfields=('f1',)):
        self.assertEqual(block.dtype.names, ('f1',))
        l += len(block)
        s += block['f1'].sum()
    self.assertEqual(l, N - 1)
    self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula

def test05(self):
    """Testing `whereblocks` method with a `limit` parameter"""
    N, M = self.N, 101
    ra = np.fromiter(((i, i * 2., i * 3) for i in xrange(N)),
                     dtype='i4,f8,i8')
    t = bcolz.ctable(ra)
    l, s = 0, 0
    for block in t.whereblocks('f1 < f2', limit=M):
        l += len(block)
        s += block['f0'].sum()
    self.assertEqual(l, M)
    self.assertEqual(s, M * ((M + 1) / 2))  # Gauss summation formula

def test00(self):
    """Testing `whereblocks` method with only an expression"""
    N = self.N
    ra = np.fromiter(((i, i * 2., i * 3) for i in xrange(N)),
                     dtype='i4,f8,i8')
    t = bcolz.ctable(ra)
    l, s = 0, 0
    for block in t.whereblocks('f1 < f2'):
        l += len(block)
        s += block['f0'].sum()
    self.assertEqual(l, N - 1)
    self.assertEqual(s, (N - 1) * (N / 2))  # Gauss summation formula

def create_agg_ctable(self, groupby_cols, agg_list, nr_groups, rootdir):
    # create output table
    dtype_list = []
    for col in groupby_cols:
        dtype_list.append((col, self[col].dtype))
    agg_cols = []
    agg_ops = []
    op_translation = {
        'sum': 1,
        'sum_na': 2
    }
    for agg_info in agg_list:
        if not isinstance(agg_info, list):
            # straightforward sum (a ['m1', 'm2', ...] parameter)
            output_col = agg_info
            input_col = agg_info
            agg_op = 1
        else:
            # input/output settings [['mnew1', 'm1'], ['mnew2', 'm2'], ...]
            output_col = agg_info[0]
            input_col = agg_info[1]
            if len(agg_info) == 2:
                agg_op = 1
            else:
                # input/output settings
                # [['mnew1', 'm1', 'sum'], ['mnew2', 'm1', 'avg'], ...]
                agg_op = agg_info[2]
                if agg_op not in op_translation:
                    raise NotImplementedError(
                        'Unknown Aggregation Type: ' + unicode(agg_op))
                agg_op = op_translation[agg_op]
        col_dtype = self[input_col].dtype
        # TODO: check if the aggregation column is numeric
        # NB: we could build a concatenation for strings like pandas, but
        # I would really prefer to see that as a separate operation
        # save output
        agg_cols.append(output_col)
        agg_ops.append((input_col, agg_op))
        dtype_list.append((output_col, col_dtype))
    # create aggregation table
    ct_agg = bcolz.ctable(
        np.zeros(0, dtype_list),
        expectedlen=nr_groups,
        rootdir=rootdir)
    return ct_agg, dtype_list, agg_ops

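# An illustrative (hypothetical) call showing the three agg_list forms
# handled above; 'key', 'm1' and 'm2' are assumed column names:
#
#     ct_agg, dtype_list, agg_ops = ct.create_agg_ctable(
#         groupby_cols=['key'],
#         agg_list=['m1',                        # plain sum, same name
#                   ['m1_new', 'm1'],            # sum into a renamed column
#                   ['m2_na', 'm2', 'sum_na']],  # explicit aggregation op
#         nr_groups=100,
#         rootdir=None)
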