def ingest(environ,
           asset_db_writer,
           minute_bar_writer,
           daily_bar_writer,
           adjustment_writer,
           calendar,
           start_session,
           end_session,
           cache,
           show_progress,
           output_dir):
    """Zipline bundle ingest entry point (no dividend/split handling).

    Writes daily and/or minute bars between ``start_session`` and
    ``end_session`` for every asset returned by ``list_assets()``, then
    registers the surviving assets and empty adjustment tables.

    NOTE(review): ``interval`` is read from module scope — presumably an
    iterable containing '1d' and/or '1m'; confirm where it is defined.
    """
    assets_to_sids = asset_to_sid_map(asset_db_writer.asset_finder,
                                      list_assets())

    # The star-target unpacking below is a deliberate trick: df_generator
    # yields (sid_df, *metadata_row_fields), and iterating the generator
    # expression writes each row into ``metadata`` via the
    # ``*metadata.iloc[sid]`` assignment target as a side effect.
    def minute_data_generator():
        return (sid_df for (sid_df, *metadata.iloc[sid_df[0]])
                in df_generator(interval='1m',
                                start=start_session,
                                end=end_session,
                                assets_to_sids=assets_to_sids))

    def daily_data_generator():
        return (sid_df for (sid_df, *metadata.iloc[sid_df[0]])
                in df_generator(interval='1d',
                                start=start_session,
                                end=end_session,
                                assets_to_sids=assets_to_sids))

    # Create the metadata frame ONCE, before the interval loop. The
    # original re-created it on every iteration, which discarded the
    # rows populated by the first interval's writer when both '1d' and
    # '1m' were requested (the sibling ingest below already creates it
    # once, before its loop).
    metadata = metadata_df()

    for _interval in interval:
        if _interval == '1d':
            daily_bar_writer.write(daily_data_generator(),
                                   assets=assets_to_sids.values(),
                                   show_progress=True)
        elif _interval == '1m':
            minute_bar_writer.write(minute_data_generator(),
                                    assets=assets_to_sids.values(),
                                    show_progress=True)

    # Drop the ticker rows which have missing sessions in their data sets.
    metadata.dropna(inplace=True)

    asset_db_writer.write(equities=metadata)
    # No splits/dividends for this bundle; the bare call still creates
    # the (empty) adjustment tables so readers do not fail.
    adjustment_writer.write()
def csvdir_bundle(environ,
                  asset_db_writer,
                  minute_bar_writer,
                  daily_bar_writer,
                  adjustment_writer,
                  calendar,
                  start_session,
                  end_session,
                  cache,
                  show_progress,
                  output_dir,
                  tframes=None,
                  csvdir=None):
    """
    Build a zipline data bundle from the directory with csv files.
    """
    # Resolve the source directory: explicit argument wins, then the
    # CSVDIR environment variable.
    if not csvdir:
        csvdir = environ.get('CSVDIR')
    if not csvdir:
        raise ValueError("CSVDIR environment variable is not set")
    if not os.path.isdir(csvdir):
        raise ValueError("%s is not a directory" % csvdir)

    # Default to whichever of the 'daily'/'minute' subdirectories exist.
    if not tframes:
        tframes = {"daily", "minute"}.intersection(os.listdir(csvdir))
        if not tframes:
            raise ValueError("'daily' and 'minute' directories "
                             "not found in '%s'" % csvdir)

    # Accumulators for corporate actions gathered while reading prices.
    divs_splits = {
        'divs': DataFrame(columns=['sid', 'amount', 'ex_date', 'record_date',
                                   'declared_date', 'pay_date']),
        'splits': DataFrame(columns=['sid', 'ratio', 'effective_date']),
    }

    for timeframe in tframes:
        frame_dir = os.path.join(csvdir, timeframe)
        tickers = sorted(entry.split('.csv')[0]
                         for entry in os.listdir(frame_dir)
                         if '.csv' in entry)
        if not tickers:
            raise ValueError("no <symbol>.csv* files found in %s" % frame_dir)

        # Per-asset metadata, filled in by _pricing_iter as a side effect.
        meta_dtype = [('start_date', 'datetime64[ns]'),
                      ('end_date', 'datetime64[ns]'),
                      ('auto_close_date', 'datetime64[ns]'),
                      ('symbol', 'object')]
        metadata = DataFrame(empty(len(tickers), dtype=meta_dtype))

        writer = minute_bar_writer if timeframe == 'minute' else daily_bar_writer

        assets_to_sids = asset_to_sid_map(asset_db_writer.asset_finder,
                                          tickers)

        writer.write(_pricing_iter(frame_dir, tickers, metadata, divs_splits,
                                   show_progress,
                                   assets_to_sids=assets_to_sids),
                     show_progress=show_progress)

        # Hardcode the exchange to "CSVDIR" for all assets and (elsewhere)
        # register "CSVDIR" to resolve to the NYSE calendar, because these
        # are all equities and thus can use the NYSE calendar.
        metadata['exchange'] = "CSVDIR"

    asset_db_writer.write(equities=metadata)

    # The sid columns come back as floats after concatenation; the
    # adjustment writer wants integer sids.
    divs_splits['divs']['sid'] = divs_splits['divs']['sid'].astype(int)
    divs_splits['splits']['sid'] = divs_splits['splits']['sid'].astype(int)
    adjustment_writer.write(splits=divs_splits['splits'],
                            dividends=divs_splits['divs'])
def ingest(environ,
           asset_db_writer,
           minute_bar_writer,
           daily_bar_writer,
           adjustment_writer,
           calendar,
           start_session,
           end_session,
           cache,
           show_progress,
           output_dir):
    """Zipline bundle ingest entry point with dividend/split collection.

    Writes daily and/or minute bars between ``start_session`` and
    ``end_session``, collecting splits and dividends into ``divs_splits``
    as a side effect of ``df_generator``, then registers the assets and
    the adjustments.

    NOTE(review): ``interval`` is read from module scope — presumably an
    iterable containing '1d' and/or '1m'; confirm where it is defined.
    """
    # Accumulators that df_generator appends to while producing bars.
    divs_splits = {
        'divs': pd.DataFrame(columns=['sid', 'amount', 'ex_date',
                                      'record_date', 'declared_date',
                                      'pay_date']),
        'splits': pd.DataFrame(columns=['sid', 'ratio', 'effective_date']),
    }

    assets_to_sids = asset_to_sid_map(asset_db_writer.asset_finder,
                                      list_assets())

    # The star-target unpacking below is a deliberate trick: df_generator
    # yields (sid_df, *metadata_row_fields), and iterating the generator
    # expression writes each row into ``metadata`` as a side effect.
    # NOTE(review): the minute generator indexes with .iloc while the
    # daily one uses .loc — confirm which indexing metadata_df's frame
    # actually expects; they differ once sids are not 0..n-1.
    def minute_data_generator():
        return (sid_df for (sid_df, *metadata.iloc[sid_df[0]])
                in df_generator(interval='1m',
                                start=start_session,
                                end=end_session,
                                assets_to_sids=assets_to_sids,
                                divs_splits=divs_splits))

    def daily_data_generator():
        return (sid_df for (sid_df, *metadata.loc[sid_df[0]])
                in df_generator(interval='1d',
                                start=start_session,
                                end=end_session,
                                assets_to_sids=assets_to_sids,
                                divs_splits=divs_splits))

    metadata = metadata_df(assets_to_sids)

    for _interval in interval:
        if _interval == '1d':
            daily_bar_writer.write(daily_data_generator(),
                                   assets=assets_to_sids.values(),
                                   show_progress=True,
                                   invalid_data_behavior='raise')
        elif _interval == '1m':
            minute_bar_writer.write(minute_data_generator(),
                                    show_progress=True)

    # Drop the ticker rows which have missing sessions in their data sets.
    metadata.dropna(inplace=True)

    asset_db_writer.write(equities=metadata)

    # Convert back wrong datatypes after pd.concat. The builtin ``int``
    # replaces ``np.int``, which was deprecated in NumPy 1.20 and removed
    # in 1.24 (it was only ever an alias for the builtin).
    divs_splits['splits']['sid'] = divs_splits['splits']['sid'].astype(int)
    divs_splits['divs']['sid'] = divs_splits['divs']['sid'].astype(int)
    divs_splits['divs']['ex_date'] = pd.to_datetime(
        divs_splits['divs']['ex_date'], utc=True)
    divs_splits['divs']['pay_date'] = pd.to_datetime(
        divs_splits['divs']['pay_date'], utc=True)

    adjustment_writer.write(splits=divs_splits['splits'],
                            dividends=divs_splits['divs'])