def to_bcolz(df, rootdir, expectedlen=None, reset_index=True, compute=True,
             overwrite=True, get=get_sync):
    """ Save dask DataFrame to BColz table

    Parameters
    ----------
    df: dd.DataFrame (dask DataFrame)
    rootdir: directory to save BColz table
    expectedlen: expected length of table
    reset_index: move the index into a regular column before writing
    compute: execute the graph now; otherwise return a Delayed
    overwrite: remove an existing rootdir instead of raising
    get: dask scheduler used when compute=True
    """
    from bcolz import ctable

    if os.path.exists(rootdir):
        if overwrite:
            import shutil
            shutil.rmtree(rootdir)
        else:
            raise ValueError('Directory already exists')

    name = 'to-bcolz-' + uuid.uuid1().hex

    # Alternative "create empty ctable and append" solution:
    # dtype = [(name, dtype.str) for name, dtype in zip(ddf.columns, ddf.dtypes)]
    # dsk[(name, -1)] = (bcolz.fromiter, (), dtype, 0)

    # Estimate the expected length from the first partition if not given
    if expectedlen is None:
        # row_bytes = sum([d.itemsize for d in df.dtypes])
        # if reset_index:
        #     row_bytes += df.index.dtype.itemsize
        # chunksize = np.ceil(1e7 / row_bytes).astype(int)
        expectedlen = (lambda df, n: len(df) * n, (df._name, 0), df.npartitions)

    if reset_index:
        df = df.reset_index()

    dsk = dict()
    dsk[(name, -1)] = expectedlen
    # The first partition creates the on-disk ctable ...
    dsk[(name, 0)] = (lambda df, exp_len, rt: ctable.fromdataframe(
        df, expectedlen=int(exp_len), rootdir=rt),
        (df._name, 0), (name, -1), rootdir)

    # ... and each remaining partition appends to it.  The appends are chained
    # on the previous key so that every partition is written before the final
    # key is computed.
    def _append(ct, df):
        ct.append(ctable.fromdataframe(df))
        return ct

    for i in range(1, df.npartitions):
        dsk[(name, i)] = (_append, (name, i - 1), (df._name, i))

    dsk = merge(df.dask, dsk)
    keys = [(name, df.npartitions - 1)]
    if compute:
        return DataFrame._get(dsk, keys, get=get)
    else:
        return delayed([Delayed(key, [dsk]) for key in keys])
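A minimal usage sketch for to_bcolz above, assuming a small dask DataFrame built from pandas. The directory name and column names are illustrative only, and the call relies on the same older dask scheduler API (DataFrame._get / get_sync) that the function itself uses.

import dask.dataframe as dd
import pandas as pd

# Illustrative two-partition dask DataFrame
pdf = pd.DataFrame({'price': [1.0, 2.0, 3.0, 4.0], 'volume': [10, 20, 30, 40]})
ddf = dd.from_pandas(pdf, npartitions=2)

# Writes a bcolz ctable to ./example.bcolz, one partition at a time
ct = to_bcolz(ddf, 'example.bcolz', reset_index=False)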
def to_ctable(self, raw_data, invalid_data_behavior):
    if isinstance(raw_data, ctable):
        # we already have a ctable so do nothing
        return raw_data

    # winsorise the pricing fields plus volume and open interest
    winsorise_uint32(raw_data, invalid_data_behavior, "volume", *PRICING)
    winsorise_uint32(raw_data, invalid_data_behavior, "open_interest", *PRICING)

    # process the pricing fields and greeks separately (greeks are signed)
    processed_pricing = (raw_data[list(PRICING)] * 1000).round().astype("uint32")
    processed_greeks = (raw_data[list(GREEKS)] * 1000).round().astype("int64")
    processed = pd.concat([processed_pricing, processed_greeks], axis=1)

    # process the dates
    dates = raw_data.index.values.astype("datetime64[s]")
    days_to_expiration = raw_data.days_to_expiration.values.astype(
        "timedelta64[D]")
    check_uint32_safe(dates.max().view(np.int64), "day")
    check_uint32_safe(days_to_expiration.max().view(np.int64),
                      "days_to_expiration")
    processed["day"] = dates.astype("uint32")
    processed["days_to_expiration"] = days_to_expiration.astype("uint32")

    processed["volume"] = raw_data.volume.astype("uint32")
    processed["open_interest"] = raw_data.open_interest.astype("uint32")
    return ctable.fromdataframe(processed)
def to_ctable(raw_data, invalid_data_behavior):
    if isinstance(raw_data, ctable):
        # we already have a ctable so do nothing
        return raw_data

    winsorise_uint32(raw_data, invalid_data_behavior, 'volume', *OHLC)
    processed = (raw_data[list(OHLC)] * OHLC_RATIO).astype('uint32')
    dates = raw_data.index.values.astype('datetime64[s]')
    check_uint32_safe(dates.max().view(np.int64), 'day')
    processed['day'] = dates.astype('uint32')
    processed['volume'] = raw_data.volume.astype('uint32')

    # Options-only columns; ideally this would be split into separate
    # functions (or a switch) for equities vs. options.
    try:
        processed['open_interest'] = raw_data.open_interest.astype('uint32')
        # FIXME: bid and ask should be uint for consistency
        processed['bid'] = (raw_data.bid * OHLC_RATIO).astype(numpy.uint32)
        processed['ask'] = (raw_data.ask * OHLC_RATIO).astype(numpy.uint32)
        processed['delta'] = (raw_data.delta * OHLC_RATIO).astype(numpy.int32)
        processed['gamma'] = (raw_data.gamma * OHLC_RATIO).astype(numpy.int32)
        processed['theta'] = (raw_data.theta * OHLC_RATIO).astype(numpy.int32)
        processed['vega'] = (raw_data.vega * OHLC_RATIO).astype(numpy.int32)
        processed['rho'] = (raw_data.rho * OHLC_RATIO).astype(numpy.int32)
        processed['iv'] = (raw_data.iv * OHLC_RATIO).astype(numpy.int32)
    except AttributeError:
        # Not an options frame: the extra columns are simply absent.
        pass

    return ctable.fromdataframe(processed)
def shards(bcolz_dir, taxi_df):
    single_bcolz = str(bcolz_dir.join('yellow_tripdata_2016-01.bcolz'))
    ct = ctable.fromdataframe(taxi_df, rootdir=single_bcolz)

    step, remainder = divmod(len(ct), NR_SHARDS)
    count = 0
    shards = [single_bcolz]
    for idx in range(0, len(ct), step):
        if count == NR_SHARDS:
            break
        print("Creating shard {}".format(count + 1))
        # fold any remainder rows into the final shard
        if count == NR_SHARDS - 1:
            step = step + remainder
        shard_file = str(bcolz_dir.join('tripdata_2016-01-%s.bcolzs' % count))
        ct_shard = bcolz.fromiter(ct.iter(idx, idx + step), ct.dtype, step,
                                  rootdir=shard_file, mode='w')
        shards.append(shard_file)
        ct_shard.flush()
        count += 1
    yield shards
def _raw_data_for_asset(self, asset_id):
    """
    Generate 'raw' data that encodes information about the asset.

    See class docstring for a description of the data format.
    """
    # Get the dates for which this asset existed according to our asset
    # info.
    dates = self._calendar[self._calendar.slice_indexer(
        self.asset_start(asset_id), self.asset_end(asset_id))]

    data = full(
        (len(dates), len(US_EQUITY_PRICING_BCOLZ_COLUMNS)),
        asset_id * (100 * 1000),
        dtype=uint32,
    )

    # Add 10,000 * column-index to OHLCV columns
    data[:, :5] += arange(5, dtype=uint32) * (10 * 1000)

    # Add days since Jan 1 2001 for OHLCV columns.
    data[:, :5] += (dates - self.PSEUDO_EPOCH).days[:, None].astype(uint32)

    frame = DataFrame(
        data,
        index=dates,
        columns=US_EQUITY_PRICING_BCOLZ_COLUMNS,
    )

    frame['day'] = nanos_to_seconds(dates.asi8)
    frame['id'] = asset_id

    return ctable.fromdataframe(frame)
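A small worked example of the encoding used above, with illustrative values and assuming PSEUDO_EPOCH is Jan 1 2001 as the in-code comment states: each OHLCV cell is asset_id * 100,000, plus 10,000 times the column index, plus the number of days since the epoch.

# asset_id=2, 'high' column (index 1), three days after PSEUDO_EPOCH
expected = 2 * (100 * 1000) + 1 * (10 * 1000) + 3
assert expected == 210003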
def test_downloader(redis_server, downloader, tmpdir):
    # Make a bcolz from a pandas DataFrame
    data_df = pd.DataFrame(
        data=np.random.rand(100, 10),
        columns=['col_{}'.format(i + 1) for i in range(10)])
    local_bcolz = str(tmpdir.join('test_bcolz'))
    ctable.fromdataframe(data_df, rootdir=local_bcolz)
    assert os.path.isdir(local_bcolz)

    # Zip up the bcolz directory and upload to S3
    upload_dir = tmpdir.mkdir('upload')
    zipfile_path = bqueryd.util.zip_to_file(local_bcolz, str(upload_dir))[0]
    assert os.path.isfile(zipfile_path)
    upload_file = str(upload_dir.join('test.bcolz'))
    shutil.move(zipfile_path, upload_file)
    assert os.path.isfile(upload_file)

    s3_conn = downloader._get_s3_conn()[-1]
    with clean_bucket(s3_conn, 'bcolz') as bucket:
        bucket.put_object(Key='test.bcolz', Body=open(upload_file, 'rb'))
        uploads = [key.key for key in bucket.objects.all()]
        assert uploads == ['test.bcolz']

        # Construct the redis entry that the downloader is looking for
        progress_slot = '%s_%s' % (time.time() - 60, -1)
        node_filename_slot = '%s_%s' % (socket.gethostname(),
                                        's3://bcolz/test.bcolz')

        ticket = str(uuid4())
        incoming_dir = os.path.join(bqueryd.INCOMING, ticket)
        assert not os.path.isdir(incoming_dir)

        redis_server.hset(bqueryd.REDIS_TICKET_KEY_PREFIX + ticket,
                          node_filename_slot, progress_slot)

        # wait for the downloader to catch up
        sleep(10)

        # Check that incoming dir now has the test.bcolz file.
        assert os.listdir(incoming_dir) == ['test.bcolz']

        # Check that the progress slot has been updated
        updated_slot = redis_server.hget(
            bqueryd.REDIS_TICKET_KEY_PREFIX + ticket, node_filename_slot)
        assert updated_slot.split('_')[-1] == 'DONE'
def gen_tables(self, assets):
    """
    Read CSVs as DataFrames from our asset map.
    """
    dtypes = self._csv_dtypes
    for asset in assets:
        path = self._asset_map.get(asset)
        if path is None:
            raise KeyError("No path supplied for asset %s" % asset)
        data = read_csv(path, parse_dates=['day'], dtype=dtypes)
        yield asset, ctable.fromdataframe(data)
def to_ctable(raw_data, invalid_data_behavior):
    if isinstance(raw_data, ctable):
        # we already have a ctable so do nothing
        return raw_data

    winsorise_uint32(raw_data, invalid_data_behavior, 'volume', *OHLC)
    processed = (raw_data[list(OHLC)] * 1000).astype('uint32')
    dates = raw_data.index.values.astype('datetime64[s]')
    check_uint32_safe(dates.max().view(np.int64), 'day')
    processed['day'] = dates.astype('uint32')
    processed['volume'] = raw_data.volume.astype('uint32')
    return ctable.fromdataframe(processed)
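A minimal sketch of the scaling convention the to_ctable variants in this section share, using a synthetic one-row OHLCV frame: prices are stored as uint32 in thousandths (price * 1000) and the index is stored in the 'day' column as seconds since the Unix epoch. Column names and sample values here are illustrative only.

import numpy as np
import pandas as pd

raw = pd.DataFrame(
    {'open': [10.0], 'high': [10.5], 'low': [9.8], 'close': [10.2],
     'volume': [1500]},
    index=pd.to_datetime(['2016-01-04']),
)

scaled = (raw[['open', 'high', 'low', 'close']] * 1000).astype('uint32')
scaled['day'] = raw.index.values.astype('datetime64[s]').astype('uint32')
scaled['volume'] = raw['volume'].astype('uint32')
# `scaled` now matches the layout passed to ctable.fromdataframe above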
def to_ctable(self, raw_data, invalid_data_behavior):
    if isinstance(raw_data, ctable):
        # we already have a ctable so do nothing
        return raw_data

    winsorise_uint32(raw_data, invalid_data_behavior, "volume", *OHLC)
    processed = (raw_data[list(OHLC)] * 1000).round().astype("uint32")
    dates = raw_data.index.values.astype("datetime64[s]")
    check_uint32_safe(dates.max().view(np.int64), "day")
    processed["day"] = dates.astype("uint32")
    processed["volume"] = raw_data.volume.astype("uint32")
    return ctable.fromdataframe(processed)
def to_ctable(self, raw_data, invalid_data_behavior):
    if isinstance(raw_data, ctable):
        # we already have a ctable so do nothing
        return raw_data

    # Check that OHLCV plus the extra columns will not overflow uint32
    winsorise_uint32(raw_data, invalid_data_behavior, 'volume',
                     *OHLC.union(EXTRA_COLUMNS))

    # Cast all value columns uniformly to uint32
    # processed = (raw_data[list(OHLC)] * 1000).astype('uint32')
    processed = (raw_data[
        list(OHLC.union(EXTRA_COLUMNS).union(['volume']))]).astype('uint32')
    dates = raw_data.index.values.astype('datetime64[s]')
    check_uint32_safe(dates.max().view(np.int64), 'day')
    processed['day'] = dates.astype('uint32')
    processed['volume'] = raw_data.volume.astype('uint32')
    return ctable.fromdataframe(processed)
def to_ctable(self, raw_data, invalid_data_behavior):
    if isinstance(raw_data, ctable):
        # we already have a ctable so do nothing
        return raw_data

    winsorise_uint32(raw_data, invalid_data_behavior, 'volume', *OHLC)
    processed = (raw_data[list(OHLC)] * 1000).round().astype('uint32')
    dates = raw_data.index.values.astype('datetime64[s]')
    check_uint32_safe(dates.max().view(np.int64), 'day')
    processed['day'] = dates.astype('uint32')
    processed['volume'] = raw_data.volume.astype('uint32')

    # Convert the extra (non-adjusted) columns to uint32 as well
    for c in NON_ADJUSTED_COLUMN_FACTOR.keys():
        if c in raw_data.columns:
            processed[c] = (
                raw_data.loc[:, c] *
                NON_ADJUSTED_COLUMN_FACTOR.get(c, 1)).astype('uint32')

    return ctable.fromdataframe(processed)
def _raw_data_for_asset(self, asset_id):
    """
    Generate 'raw' data that encodes information about the asset.

    See class docstring for a description of the data format.
    """
    # Get the dates for which this asset existed according to our asset
    # info.
    dates = self._calendar[self._calendar.slice_indexer(
        self.asset_start(asset_id), self.asset_end(asset_id))]

    data = full((len(dates), len(US_EQUITY_PRICING_BCOLZ_COLUMNS)),
                asset_id * (100 * 1000),
                dtype=uint32)

    # Add 10,000 * column-index to OHLCV columns
    data[:, :5] += arange(5) * (10 * 1000)

    # Add days since Jan 1 2001 for OHLCV columns.
    data[:, :5] += (dates - self.PSEUDO_EPOCH).days[:, None]

    frame = DataFrame(data,
                      index=dates,
                      columns=US_EQUITY_PRICING_BCOLZ_COLUMNS)

    frame["day"] = nanos_to_seconds(dates.asi8)
    frame["id"] = asset_id

    return ctable.fromdataframe(frame)
def gen_tables(self, assets):
    for asset in assets:
        yield asset, ctable.fromdataframe(assets[asset])