def test_stress_tca(fill_market_trade_databases):
    """Makes several large TCARequests at the same time to stress test the tcapy application and also to check it
    works with parallel requests (note: you may need to reduce the length of the dataset if your machine has limited
    amounts of RAM). It is possible that when deployed on the web, several users might make simultaneous requests.

    Note, do not use pylibmc, and instead use python-memcached, when using memcached as a result backend. pylibmc is
    not thread-safe so will come undone if you end up making parallel requests.
    """
    from tcapy.util.swim import Swim

    if not stress_test:
        return

    # Clear cache to ensure all test code runs!
    Mediator.get_volatile_cache().clear_cache()

    tca_request = TCARequest(start_date=start_date, finish_date=finish_date, ticker=valid_ticker_list,
                             dummy_market=True,
                             trade_data_store=trade_data_store, trade_data_database_name=trade_data_database_name,
                             market_data_store=market_data_store,
                             market_data_database_table=market_data_database_table,
                             trade_order_mapping=trade_order_mapping, use_multithreading=True, tca_type='aggregated')

    # Kick off several simultaneous large TCA requests
    request_no = 2

    tca_request_list = []

    for i in range(0, request_no):
        tca_request_list.append(TCARequest(tca_request=tca_request))

    tca_engine = TCAEngineImpl(version=tcapy_version)

    swim = Swim(parallel_library='thread')
    pool = swim.create_pool(thread_no=len(tca_request_list))

    result = []

    for item in tca_request_list:
        result.append(pool.apply_async(tca_engine.calculate_tca, args=(item,)))

    output = [p.get() for p in result]

    swim.close_pool(pool, True)

    assert len(output) == len(tca_request_list)

    # Check that several DataFrames exist in the results
    for trade_order_results_df_dict in output:
        assert 'trade_df' in trade_order_results_df_dict.keys()
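# Hedged sketch of the Swim fan-out/gather pattern used in test_stress_tca above, applied to a dummy
# workload so it can run without any market/trade databases. The Swim calls (create_pool, apply_async,
# close_pool) mirror the usage in the test; the dummy workload and function names below are illustrative
# only, not part of tcapy.
from tcapy.util.swim import Swim


def _dummy_workload(n):
    # Stand-in for tca_engine.calculate_tca(tca_request)
    return sum(i * i for i in range(n))


def run_parallel_dummy_requests(request_no=2):
    swim = Swim(parallel_library='thread')
    pool = swim.create_pool(thread_no=request_no)

    # Submit all requests, then gather the results (same pattern as the stress test)
    result = [pool.apply_async(_dummy_workload, args=(1000000,)) for _ in range(request_no)]
    output = [p.get() for p in result]

    swim.close_pool(pool, True)

    return output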
def write_df_to_db(self, tickers=None, remove_duplicates=True, if_exists_table='append', if_exists_ticker='replace'):
    """Loads up a large HDF5/Parquet file from disk into a pd DataFrame and then dumps it into the database.
    Uses multithreading to speed it up, by using a thread for each different ticker.

    Parameters
    ----------
    tickers : str (list or dict)
        List of tickers

    remove_duplicates : bool
        True (default) - removes any follow on duplicates in the dataset

    if_exists_table : str
        'append' - if database table already exists append data to it
        'replace' - remove existing database table

    if_exists_ticker : str
        'append' - if ticker already exists in the database, append to it
        'replace' - replace any data for this ticker

    Returns
    -------

    """
    if tickers is None:
        tickers = list(self.tickers.keys())

    if isinstance(tickers, dict):
        tickers = list(tickers.keys())

    if not isinstance(tickers, list):
        tickers = [tickers]

    if constants.use_multithreading:
        swim = Swim(parallel_library=constants.database_populator_threading_library)
        pool = swim.create_pool(thread_no=self._get_threads())

        result = []

        for i in range(0, len(tickers)):
            result.append(
                pool.apply_async(self._write_df_to_db_single_thread,
                                 args=(tickers[i], remove_duplicates, if_exists_table, if_exists_ticker,)))

        output = [p.get() for p in result]

        swim.close_pool(pool, True)
    else:
        for i in range(0, len(tickers)):
            self._write_df_to_db_single_thread(tickers[i], remove_duplicates, if_exists_table, if_exists_ticker)
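# Hedged usage sketch for write_df_to_db: `populator` is assumed to be an already-constructed instance
# of the class defining this method (with its tickers and output database configured); the ticker names
# here are placeholders only.
def push_combined_files_to_db(populator):
    # Append to the market data table, but fully replace any existing rows for these tickers
    populator.write_df_to_db(tickers=['EURUSD', 'GBPUSD'],
                             remove_duplicates=True,
                             if_exists_table='append',
                             if_exists_ticker='replace')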
def combine_mini_df_from_disk(self, tickers=None, remove_duplicates=True):
    """Combines the mini HDF5/Parquet files (eg. 5 min chunks) into a very large HDF5/Parquet file, which is likely
    to be for multiple months of data. Uses multithreading to speed it up, by using a thread for each different
    ticker.

    Parameters
    ----------
    tickers : str (list or dict)
        List of tickers

    remove_duplicates : bool
        Remove duplicated market prices, which follow one another

    Returns
    -------

    """
    if tickers is None:
        tickers = list(self.tickers.keys())

    if isinstance(tickers, dict):
        tickers = list(tickers.keys())

    if not isinstance(tickers, list):
        tickers = [tickers]

    if constants.use_multithreading:
        swim = Swim(parallel_library=constants.database_populator_threading_library)
        pool = swim.create_pool(thread_no=self._get_threads())

        result = []

        for i in range(0, len(tickers)):
            result.append(
                pool.apply_async(self._combine_mini_df_from_disk_single_thread,
                                 args=(tickers[i], remove_duplicates,)))

        output = [p.get() for p in result]

        swim.close_pool(pool, True)
    else:
        for i in range(0, len(tickers)):
            self._combine_mini_df_from_disk_single_thread(tickers[i], remove_duplicates)
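# Hedged sketch of how the combine and write steps are typically chained: first stitch the small
# per-chunk HDF5/Parquet files into one large file per ticker, then push the result into the market
# database. `populator` is assumed to be an instance of the class defining these methods; it is not
# constructed here.
def rebuild_database_from_chunks(populator, tickers=None):
    # Combine the mini chunk files on disk into one large file per ticker
    populator.combine_mini_df_from_disk(tickers=tickers, remove_duplicates=True)

    # Then write the combined files to the database, replacing each ticker's existing data
    populator.write_df_to_db(tickers=tickers, if_exists_table='append', if_exists_ticker='replace')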
def download_from_external_source(self, append_data=True, remove_duplicates=True, if_exists_table='append',
                                  if_exists_ticker='append', number_of_days=30 * 7, chunk_int_min=None,
                                  start_date=None, finish_date=None, delete_cached_files=False, tickers=None,
                                  write_temp_to_disk=True,
                                  write_to_disk_db=True, read_cached_from_disk=True, write_large_csv=False,
                                  write_large_hdf5_parquet=True,
                                  csv_folder=constants.csv_folder, csv_compression=None, return_df=False,
                                  web_proxies=constants.web_proxies):
    """Downloads market data from an external source and then dumps it to HDF5/Parquet files for temporary storage,
    which is cached. If HDF5/Parquet cached files already exist for a time segment we read them in, saving us from
    making an external data call. Lastly, dumps it to an internal database.

    Parameters
    ----------
    append_data : bool
        True - only start collecting later data not already in database (ignoring number_of_days parameter)
        False - start collecting all data, ignoring anything stored in database

    remove_duplicates : bool
        True (default) - remove values which are repeated
        False - leave in repeated values

    if_exists_table : str
        'append' - if database table already exists append data to it
        'replace' - remove existing database table

    if_exists_ticker : str
        'append' - if ticker already exists in the database, append to it
        'replace' - replace any data for this ticker

    number_of_days : int
        Number of days to download data for

    chunk_int_min : int (None)
        Size of each download (default - specified in constants)

    Returns
    -------

    """
    logger = LoggerManager.getLogger(__name__)

    if write_to_disk_db:
        data_source_local = self._get_output_data_source()

    if write_large_csv:
        if not os.path.isdir(csv_folder):
            logger.warn("CSV folder " + self.temp_data_folder + " where we are about to write does not exist")

    # What chunk size in minutes do we want for this data provider?
    if chunk_int_min is None:
        chunk_int_min = self._get_download_chunk_min_size()

    if chunk_int_min is None:
        chunk_size_str = None
    else:
        chunk_size_str = str(chunk_int_min) + "min"

    if tickers is None:
        tickers = self._get_tickers()

    if isinstance(tickers, str):
        tickers = [tickers]

    # If there's no start or finish date, choose a default start/finish date
    if start_date is None and finish_date is None:
        finish_date = datetime.datetime.utcnow()
        finish_date = datetime.datetime(finish_date.year, finish_date.month, finish_date.day, 0, 0, 0, 0)

        start_date = finish_date - timedelta(days=number_of_days)  # 30*7
    else:
        start_date = self.time_series_ops.date_parse(start_date)
        finish_date = self.time_series_ops.date_parse(finish_date)

        if finish_date < start_date:
            logger.error("Download finish date is before start date!")

            return

    now = pd.Timestamp(datetime.datetime.utcnow(), tz='utc')

    # Do not allow downloading of future data!
    if finish_date > now:
        finish_date = now

    df_dict = {}

    # Loop through each ticker
    for ticker in tickers:

        has_old = False

        if delete_cached_files and write_to_disk_db:
            logger.info("Deleting all cached temp files for " + ticker)

            for name in glob.glob(self.temp_data_folder + '/*' + ticker + "*"):
                try:
                    os.remove(name)
                except:
                    logger.warn("Couldn't delete file " + name)

            logger.info("Finished deleting cached files for " + ticker)

        # If we have been asked to append data, load up what you can from the internal database
        # and find the last point
        if append_data and if_exists_ticker == 'append' and write_to_disk_db:
            logger.info("Trying to download old data first for " + ticker)

            try:
                df_old = data_source_local.fetch_market_data(start_date, finish_date, ticker,
                                                             web_proxies=web_proxies)

                # This will vary between tickers (in particular if we happen to add a new ticker)
                start_date = df_old.index[-1]

                has_old = True

                # Remove reference - big file!
                df_old = None
            except Exception as e:
                logger.info("No data found for ticker " + ticker + " with error: " + str(e))
        else:
            logger.info("Downloading new data for " + ticker + ".")

        # Date range may not work with timezones
        start_date = pd.Timestamp(start_date.replace(tzinfo=None))
        finish_date = pd.Timestamp(finish_date.replace(tzinfo=None))

        if finish_date - start_date < pd.Timedelta(days=1):
            start_date_list = [start_date, finish_date]
        else:
            # Download from that last point to the present day
            start_date_list = pd.date_range(start_date, finish_date)

            start_date_list = [pd.Timestamp(x.to_pydatetime()) for x in start_date_list]

            if finish_date > start_date_list[-1]:
                start_date_list.append(finish_date)

        df = None
        filename = os.path.join(self.temp_data_folder, ticker) + '.' + fileformat

        try:
            # df = UtilFunc().read_dataframe_from_hdf(filename)
            pass
        except:
            logger.info("Couldn't read HDF5/Parquet file for " + ticker)

        # Create downloads in x minute chunks (if we request very large chunks of data with certain data providers,
        # we could cause problems!)
        if df is None:
            df_remote_list = []

            # Loop by day (otherwise can end up with too many open files!)
            for i in range(0, len(start_date_list) - 1):

                if chunk_size_str is not None:
                    if start_date_list[i + 1] - start_date_list[i] < pd.Timedelta(minutes=chunk_int_min):
                        start_date_hist = [start_date_list[i]]
                        finish_date_hist = [start_date_list[i + 1]]
                    else:
                        start_date_hist, finish_date_hist = UtilFunc().split_into_freq(
                            start_date_list[i], start_date_list[i + 1], freq=chunk_size_str)
                else:
                    start_date_hist = [start_date_list[i]]
                    finish_date_hist = [start_date_list[i + 1]]

                # For FX and most other markets we should remove weekends (cryptocurrencies do have weekend data)
                if self._remove_weekend_points():
                    start_date_hist, finish_date_hist = UtilFunc().remove_weekend_points(start_date_hist,
                                                                                         finish_date_hist)

                output = []

                if constants.use_multithreading:

                    # Create a multiprocess object for downloading data
                    swim = Swim(parallel_library=constants.database_populator_threading_library)
                    pool = swim.create_pool(thread_no=self._get_threads())

                    result = []

                    for i in range(0, len(start_date_hist)):
                        # output.append(self._fetch_market_data(start_date_hist[i], finish_date_hist[i], ticker))

                        result.append(
                            pool.apply_async(self._fetch_market_data,
                                             args=(start_date_hist[i], finish_date_hist[i], ticker,
                                                   write_temp_to_disk, read_cached_from_disk, web_proxies)))

                    output = [p.get() for p in result]

                    swim.close_pool(pool, True)
                else:
                    # Otherwise run in single threaded fashion
                    for i in range(0, len(start_date_hist)):
                        output.append(
                            self._fetch_market_data(start_date_hist[i], finish_date_hist[i], ticker,
                                                    write_to_disk=write_temp_to_disk,
                                                    read_cached_from_disk=read_cached_from_disk,
                                                    web_proxies=web_proxies))

                # Get all the dataframe chunks and returned messages
                df_list = [self._remove_duplicates_time_series(x, remove_duplicates, field='mid')
                           for x, y in output if x is not None]

                msg_list = [y for x, y in output if x is not None and y is not None]

                # Concatenate all the 5 (or larger) minute data chunks
                try:
                    if df_list != []:
                        df_temp = pd.concat(df_list)

                        if df_temp is not None:
                            if not df_temp.empty:
                                df_remote_list.append(df_temp)

                except Exception as e:
                    logger.error(str(e))

            if df_remote_list != []:
                df = pd.concat(df_remote_list)

                # Need to sort data (database assumes sorted data for chunking/searches)
                df = df.sort_index()

                df = self.time_series_ops.localize_as_UTC(df)

                if write_large_hdf5_parquet:
                    if df is not None:
                        if not df.empty:
                            key = '_' + self._get_postfix() + "_" + \
                                  (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')

                            filename = os.path.join(csv_folder, ticker + key) + '.' + fileformat

                            # Temporary cache for testing purposes (also if the process crashes, we can read
                            # this back in)
                            UtilFunc().write_dataframe_to_binary(df, filename, format=binary_format)

        if df is not None:
            # Assume UTC time (don't want to mix UTC and non-UTC in database!)
            df = self.time_series_ops.localize_as_UTC(df)

        # Write CSV
        if write_large_csv:
            if df is not None:
                if not df.empty:
                    key = '_' + self._get_postfix() + "_" + \
                          (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')

                    if csv_compression == 'gzip':
                        df.to_csv(os.path.join(csv_folder, ticker + key + ".csv.gz"), compression='gzip')
                    else:
                        df.to_csv(os.path.join(csv_folder, ticker + key + ".csv"))

        if return_df:
            df_dict[ticker] = df

        # Dump what we have locally (or whatever DatabaseSource we have defined)
        try:
            start_date = start_date.replace(tzinfo=pytz.utc)

            # Remove the first point if it matches the last point of the old dataset
            if has_old:
                if df.index[0] == start_date:
                    df = df[1:]

            if df is not None:
                df = df.sort_index()

                df = self._remove_duplicates_time_series(df, remove_duplicates, field='mid')

            if write_to_disk_db and df is not None:
                data_source_local.append_market_data(df, ticker,
                                                     if_exists_table=if_exists_table,
                                                     if_exists_ticker=if_exists_ticker)

                logger.info("Wrote to database for " + ticker)

        except Exception as e:
            final_err = "Data was missing for these dates " + str(start_date) + " - " + str(finish_date) + " for " \
                        + str(tickers) + " Didn't write anything to disk or return any valid dataframe: " + str(e)

            logger.error(final_err)

        if df is None:
            msg_list.append("No downloaded data for " + str(start_date) + " - " + str(finish_date)
                            + ". Is this a holiday?")

    # Returns a status containing any failed downloads, which can be read by a user
    return msg_list, df_dict
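# Hedged usage sketch for download_from_external_source: `populator` is assumed to be an already-configured
# instance of the class above (external data source, temp folder and output database set up); the dates and
# ticker are illustrative placeholders. The returned msg_list/df_dict follow the function's own return values.
def download_recent_history(populator):
    msg_list, df_dict = populator.download_from_external_source(
        start_date='01 Jan 2020', finish_date='01 Feb 2020',
        tickers='EURUSD',
        append_data=False, if_exists_ticker='replace',
        write_to_disk_db=True, return_df=True)

    # msg_list flags any date ranges where nothing could be downloaded (eg. holidays)
    for msg in msg_list:
        print(msg)

    return df_dict.get('EURUSD')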