def create_market_trade_data_eikon():
    """Creates a small dataset for testing purposes for market, trade and order data for EURUSD at the start of
    May 2017, which is dumped to the designated tcapy test harness folder.

    Returns
    -------

    """

    # Use a database source such as Arctic (or fetch directly from Dukascopy) for the market data
    tca_market = TCAMarketTradeLoaderImpl()

    util_func = UtilFunc()

    market_df = []

    for tick in ticker:
        market_request = MarketRequest(ticker=tick, data_store=data_store,
                                       start_date=start_date, finish_date=finish_date)

        market_df.append(tca_market.get_market_data(market_request=market_request))

    # Note: it can be very slow to write these CSV files
    market_df = pd.concat(market_df)
    market_df.to_csv(os.path.join(folder, 'small_test_market_df_eikon.csv.gz'), compression='gzip')

    # Also write to disk as a binary file (easier to load up later)
    util_func.write_dataframe_to_binary(market_df, os.path.join(folder, 'small_test_market_df_eikon.gzip'))

    # Create a spot file in reverse order
    market_df_reverse = market_df.sort_index(ascending=False)

    market_df_reverse.to_csv(os.path.join(folder, 'small_test_market_df_reverse_eikon.csv.gz'),
                             compression='gzip')

    # Also write to disk as a Parquet file (easier to load up later) - note, we write the *reversed* DataFrame
    # here, so the file contents match the filename
    util_func.write_dataframe_to_binary(market_df_reverse,
                                        os.path.join(folder, 'small_test_market_df_reverse_eikon.parquet'))

    if create_trade_order_data:
        # Use the market data we just downloaded to CSV, and perturb it to generate the trade data
        data_test_creator = DataTestCreator(
            market_data_postfix=postfix,
            csv_market_data=os.path.join(folder, 'small_test_market_df_eikon.csv.gz'),
            write_to_db=False)

        # Create randomised trade/order data
        trade_order = data_test_creator.create_test_trade_order(ticker_trades, start_date=start_date,
                                                                finish_date=finish_date)

        trade_order['trade_df'].to_csv(os.path.join(folder, 'small_test_trade_df_eikon.csv'))
        trade_order['order_df'].to_csv(os.path.join(folder, 'small_test_order_df_eikon.csv'))
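# A minimal sketch of the script-level inputs that create_market_trade_data_eikon() relies on (ticker, data_store,
# start_date, finish_date, folder, postfix, create_trade_order_data and ticker_trades are never defined inside the
# function, so they are assumed to live at module level). Every value below is an illustrative assumption, not a
# tcapy default.

ticker = ['EURUSD']            # tickers for the market data download
ticker_trades = ['EURUSD']     # tickers for the randomised trade/order data

data_store = 'arctic-ncfx'     # assumption: any tcapy market data_store you have populated
start_date = '01 May 2017'
finish_date = '05 May 2017'

postfix = 'ncfx'               # market data postfix passed to DataTestCreator
create_trade_order_data = True

folder = '/home/tcapyuser/tcapy_tests_data/'  # hypothetical test harness folder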
def _fetch_market_data(self, start, finish, ticker, write_to_disk=True, read_cached_from_disk=True,
                       web_proxies=constants.web_proxies):
    logger = LoggerManager.getLogger(__name__)

    key = (str(start) + str(finish) + ticker + '_' + self._get_postfix()).replace(":", '_')

    filename = os.path.join(self.temp_data_folder, key) + '.' + fileformat

    util_func = UtilFunc()

    start_time_stamp = pd.Timestamp(start)
    finish_time_stamp = pd.Timestamp(finish)

    if self._remove_weekend_points():
        weekend_data = "Weekend? " + key

        weekday_point = util_func.is_weekday_point(start_time_stamp, finish_time_stamp,
                                                   friday_close_nyc_hour=constants.friday_close_utc_hour,
                                                   sunday_open_utc_hour=constants.sunday_open_utc_hour)

        if not weekday_point:
            return None, weekend_data

    df = None

    if read_cached_from_disk:
        if os.path.exists(filename):
            df = util_func.read_dataframe_from_binary(filename, format=binary_format)

            if df is not None:
                logger.debug("Read " + filename + " from disk")

    if df is None:
        # Convert tcapy ticker into vendor ticker
        df = self._get_input_data_source().fetch_market_data(start, finish,
                                                             ticker=self._get_tickers_vendor()[ticker],
                                                             web_proxies=web_proxies)

        if df is not None:
            if write_to_disk:
                # Write a small temporary DataFrame to disk (if the process fails later, these can be picked up,
                # without having to call the external vendor again)
                util_func.write_dataframe_to_binary(df, filename, format=binary_format)

    msg = None

    if df is None:
        msg = "No data? " + key

    return df, msg
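# The call above delegates the weekend filter to UtilFunc.is_weekday_point, whose implementation is not shown
# here. Below is a minimal standalone sketch of the kind of check such a function could perform, assuming UTC
# timestamps; the function name (*_sketch), the default cutoff hours and the exact boundary conditions are all
# illustrative assumptions, not tcapy's actual implementation.

import pandas as pd

def is_weekday_point_sketch(start, finish, friday_close_utc_hour=22, sunday_open_utc_hour=20):
    # Return True if the [start, finish] window plausibly contains tradable FX data (illustration only)
    start = pd.Timestamp(start)
    finish = pd.Timestamp(finish)

    # Saturday (dayofweek == 5) is always closed
    if start.dayofweek == 5 or finish.dayofweek == 5:
        return False

    # Sunday (dayofweek == 6) before the rough market open
    if finish.dayofweek == 6 and finish.hour < sunday_open_utc_hour:
        return False

    # Friday (dayofweek == 4) after the rough market close
    if start.dayofweek == 4 and start.hour >= friday_close_utc_hour:
        return False

    return True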
def _fetch_market_data(self, start, finish, ticker, write_to_disk=True, read_cached_from_disk=True,
                       web_proxies=constants.web_proxies):
    logger = LoggerManager.getLogger(__name__)

    key = (str(start) + str(finish) + ticker + '_' + self._get_postfix()).replace(":", '_')

    filename = os.path.join(self.temp_data_folder, key) + '.' + fileformat

    util_func = UtilFunc()

    start_time_stamp = pd.Timestamp(start)
    finish_time_stamp = pd.Timestamp(finish)

    if self._remove_saturday():
        weekend_data = "Saturday? " + key

        # Ignore Saturday, and don't attempt to download
        if start_time_stamp.dayofweek == 5 or finish_time_stamp.dayofweek == 5:
            return None, weekend_data

    if self._remove_weekend_points():
        weekend_data = "Weekend? " + key

        # Ignore Sunday before the (rough) market open, and Friday after the (rough) close
        if start_time_stamp.dayofweek == 6 and start_time_stamp.hour < 20:
            return None, weekend_data

        if start_time_stamp.dayofweek == 4 and start_time_stamp.hour > 22:
            return None, weekend_data

    df = None

    if read_cached_from_disk:
        if os.path.exists(filename):
            df = util_func.read_dataframe_from_binary(filename, format=binary_format)

            if df is not None:
                logger.debug("Read " + filename + " from disk")

    if df is None:
        # Convert tcapy ticker into vendor ticker
        df = self._get_input_data_source().fetch_market_data(start, finish,
                                                             ticker=self._get_tickers_vendor()[ticker],
                                                             web_proxies=web_proxies)

        if df is not None:
            # Drop the ticker column (the ticker is already encoded in the filename/key)
            df = df.drop('ticker', axis=1)

            if write_to_disk:
                # Write a small temporary DataFrame to disk (if the process fails later, these can be picked up,
                # without having to call the external vendor again)
                util_func.write_dataframe_to_binary(df, filename, format=binary_format)

    msg = None

    if df is None:
        msg = "No data? " + key

    return df, msg
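# The two _fetch_market_data variants above encode the same weekend filter, either via is_weekday_point or via
# hard-coded dayofweek/hour checks. To make the hard-coded cutoffs concrete, the runnable snippet below (an
# illustration, not part of the loader) classifies a few example timestamps using exactly the conditions above
# (hours in UTC; 2017-05-01 was a Monday).

import pandas as pd

examples = {
    'Saturday tick':          pd.Timestamp('2017-05-06 12:00'),  # dayofweek == 5 -> skipped
    'Sunday pre-open tick':   pd.Timestamp('2017-05-07 10:00'),  # dayofweek == 6, hour < 20 -> skipped
    'Sunday post-open tick':  pd.Timestamp('2017-05-07 21:00'),  # dayofweek == 6, hour >= 20 -> downloaded
    'Friday post-close tick': pd.Timestamp('2017-05-05 23:00'),  # dayofweek == 4, hour > 22 -> skipped
    'Mid-week tick':          pd.Timestamp('2017-05-03 14:00'),  # downloaded
}

for label, ts in examples.items():
    skipped = (ts.dayofweek == 5) or (ts.dayofweek == 6 and ts.hour < 20) \
        or (ts.dayofweek == 4 and ts.hour > 22)

    print(label + (' -> skipped' if skipped else ' -> downloaded'))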
def _combine_mini_df_from_disk_single_thread(self, ticker, remove_duplicates=True):

    logger = LoggerManager.getLogger(__name__)
    time_series_ops = TimeSeriesOps()

    logger.info('Getting ' + ticker + ' filenames...')

    temp_data_folder = self.temp_data_folder

    filename_list = []

    for root, dirnames, filenames in os.walk(temp_data_folder):
        for filename in filenames:
            if ticker in filename and '.' + fileformat in filename:
                filename_h5_parquet = os.path.join(root, filename)

                # If the file is less than 10MB, add it (otherwise it is likely a very large aggregated file!)
                if os.path.getsize(filename_h5_parquet) < 10 * 1024 * 1024:
                    filename_list.append(filename_h5_parquet)

    df_list = []

    util_func = UtilFunc()

    logger.info('Loading ' + ticker + ' mini dataframes into memory')

    i = 0

    if len(filename_list) == 0:
        logger.warning("Looks like there are no files for " + ticker + " in " + temp_data_folder +
                       ". Are you sure the path is correct?")

    # Go through each mini file, which represents a few minutes of data, and append it
    for filename in filename_list:
        filesize = 0

        try:
            filesize = os.path.getsize(filename) / 1024.0
            df = util_func.read_dataframe_from_binary(filename, format=binary_format)

            i = i + 1

            # Every 100 files, print reading output
            if i % 100 == 0:
                logger.info('Reading ' + filename + ' number ' + str(i))

            if df is not None:
                df = df.sort_index()
                df = self._remove_duplicates_time_series(df, remove_duplicates, time_series_ops, field='mid')

                df_list.append(df)
        except Exception as e:
            logger.warning('Failed to parse ' + filename + " of " + str(filesize) + "KB")  # + str(e))

        # if i > 1000:
        #    break

    if df_list == []:
        logger.warning('No dataframe read for ' + ticker + ', cannot combine!')

        return

    logger.info('About to combine ' + ticker + ' into large dataframe to write to disk...')

    # Assume UTC time (don't want to mix UTC and non-UTC in the database!)
    df = pd.concat(df_list)
    df = time_series_ops.localize_as_UTC(df)

    df = df.sort_index()
    df = self._remove_duplicates_time_series(df, remove_duplicates, time_series_ops, field='mid')

    postfix = '-' + self._get_postfix() + '-with-duplicates'

    if remove_duplicates:
        postfix = '-' + self._get_postfix() + '-no-duplicates'

    filename = os.path.join(self.temp_large_data_folder, ticker + postfix) + '.' + fileformat

    util_func.write_dataframe_to_binary(df, filename, format=binary_format)
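# _remove_duplicates_time_series is not shown here. A common approach for tick data, sketched below under that
# assumption, is to drop consecutive rows whose 'mid' price is unchanged, keeping the first of each run; the
# helper name and the example data are illustrative only.

import pandas as pd

def drop_consecutive_duplicates_sketch(df, field='mid'):
    # Keep a row only if its value differs from the previous row's value (diff() is NaN for the
    # first row, so that row is always kept)
    return df[df[field].diff() != 0]

# Five ticks where the first three repeat the same mid: the 2nd and 3rd rows are dropped
ticks = pd.DataFrame({'mid': [1.0871, 1.0871, 1.0871, 1.0872, 1.0871]},
                     index=pd.date_range('2017-05-02 09:00', periods=5, freq='s'))

print(drop_consecutive_duplicates_sketch(ticks))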
import os

from tcapy.util.loggermanager import LoggerManager
from tcapy.util.utilfunc import UtilFunc

add_vendor = 'dukascopy'

path = '/home/tcapyuser/csv_dump/' + add_vendor + '/'

filenames = os.listdir(path)

util_func = UtilFunc()
logger = LoggerManager.getLogger(__name__)

for filename in filenames:
    # Map the file extension to the format name understood by UtilFunc
    format = filename.split('.')[-1]

    if format == 'gzip':
        format = 'parquet'
    elif format == 'h5':
        format = 'hdf5'

    logger.info('Reading to patch file ' + filename)

    df = util_func.read_dataframe_from_binary(os.path.join(path, filename), format=format)

    # Do your edits here, in this case overwriting the ticker column
    ticker = filename.split('_')[0]

    df['ticker'] = ticker

    util_func.write_dataframe_to_binary(df, os.path.join(path, filename), format=format)
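# As a quick sanity check after patching (an illustrative addition, not part of the original script), we could
# read one file back and confirm the ticker column was overwritten
if len(filenames) > 0:
    check_filename = filenames[0]

    check_format = check_filename.split('.')[-1]

    if check_format == 'gzip':
        check_format = 'parquet'
    elif check_format == 'h5':
        check_format = 'hdf5'

    df_check = util_func.read_dataframe_from_binary(os.path.join(path, check_filename), format=check_format)

    logger.info('Ticker column of ' + check_filename + ': ' + str(df_check['ticker'].unique()))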