def _write_df_to_db_single_thread(self, ticker, remove_duplicates=True, if_exists_table='append',
                                  if_exists_ticker='replace'):
    """Reads a ticker's large aggregated dataframe from the temporary large-data folder and appends
    it to the output market database.

    Parameters
    ----------
    ticker : str
        Ticker whose cached aggregated dataframe should be written to the database

    remove_duplicates : bool
        Selects the '-no-duplicates' cached file instead of the '-with-duplicates' one (default True)

    if_exists_table : str
        Behaviour when the database table already exists (default 'append')

    if_exists_ticker : str
        Behaviour when data for the ticker already exists in the table (default 'replace')
    """
    logger = LoggerManager.getLogger(__name__)

    # The filename postfix must match the duplicates setting used when the file was written
    postfix = '-' + self._get_postfix() + '-with-duplicates'

    if remove_duplicates:
        postfix = '-' + self._get_postfix() + '-no-duplicates'

    filename = os.path.join(self.temp_large_data_folder, ticker + postfix) + '.' + fileformat

    logger.info("Reading " + filename)

    util_func = UtilFunc()
    time_series_ops = TimeSeriesOps()
    data_source_local = self._get_output_data_source()

    df = util_func.read_dataframe_from_binary(filename, format=binary_format)

    if df is not None:
        # Don't want to mix UTC and non-UTC timestamps in the database
        df = time_series_ops.localize_as_UTC(df)

        data_source_local.append_market_data(df, ticker, if_exists_table=if_exists_table,
                                             if_exists_ticker=if_exists_ticker)
    else:
        # logger.warn is deprecated in the stdlib logging module; use warning
        logger.warning("Couldn't write dataframe for " + ticker + " to database, appears it is empty!")
def _fetch_market_data(self, start, finish, ticker, write_to_disk=True, read_cached_from_disk=True,
                       web_proxies=constants.web_proxies):
    """Fetches market data for a ticker between start and finish, preferring a small cached binary
    file on disk before calling the external data vendor. Skips fetching entirely for weekend
    periods when the instance is configured to remove weekend points.

    Parameters
    ----------
    start : str/Timestamp
        Start of the period to download

    finish : str/Timestamp
        End of the period to download

    ticker : str
        tcapy ticker (converted to a vendor ticker before the vendor call)

    write_to_disk : bool
        Cache freshly downloaded data to a small temporary file (default True)

    read_cached_from_disk : bool
        Try the temporary cache file before calling the vendor (default True)

    web_proxies : dict
        Proxy settings passed through to the vendor call

    Returns
    -------
    DataFrame, str
        The market data (None if unavailable) and a diagnostic message (None on success)
    """
    logger = LoggerManager.getLogger(__name__)

    # Key doubles as the cache filename, so strip characters invalid in paths
    key = (str(start) + str(finish) + ticker + '_' + self._get_postfix()).replace(":", '_')
    filename = os.path.join(self.temp_data_folder, key) + '.' + fileformat

    util_func = UtilFunc()

    start_time_stamp = pd.Timestamp(start)
    finish_time_stamp = pd.Timestamp(finish)

    if self._remove_weekend_points():
        weekend_data = "Weekend? " + key

        # Reuse the existing UtilFunc instance rather than constructing a second one
        # NOTE(review): the friday_close_nyc_hour keyword is fed constants.friday_close_utc_hour -
        # confirm the constant really is in the timezone the parameter expects
        weekday_point = util_func.is_weekday_point(
            start_time_stamp, finish_time_stamp,
            friday_close_nyc_hour=constants.friday_close_utc_hour,
            sunday_open_utc_hour=constants.sunday_open_utc_hour)

        if not weekday_point:
            return None, weekend_data

    df = None

    if read_cached_from_disk:
        if os.path.exists(filename):
            df = util_func.read_dataframe_from_binary(filename, format=binary_format)

            if df is not None:
                logger.debug("Read " + filename + " from disk")

    if df is None:
        # Convert tcapy ticker into vendor ticker
        df = self._get_input_data_source().fetch_market_data(
            start, finish, ticker=self._get_tickers_vendor()[ticker], web_proxies=web_proxies)

        if df is not None:
            if write_to_disk:
                # Write a small temporary dataframe to disk (if the process fails later, these can
                # be picked up, without having to call the external vendor again)
                util_func.write_dataframe_to_binary(df, filename, format=binary_format)

    msg = None

    if df is None:
        msg = "No data? " + key

    return df, msg
def _fetch_market_data(self, start, finish, ticker, write_to_disk=True, read_cached_from_disk=True,
                       web_proxies=constants.web_proxies):
    """Fetches market data for a ticker between start and finish, trying a small cached file on
    disk first before calling the external data vendor. Saturday and weekend periods can be
    skipped entirely, depending on the instance's configuration.

    Returns
    -------
    DataFrame, str
        The market data (None if unavailable) and a diagnostic message (None on success)
    """
    logger = LoggerManager.getLogger(__name__)

    # The key doubles as a cache filename, so replace characters invalid in file paths
    key = (str(start) + str(finish) + ticker + '_' + self._get_postfix()).replace(":", '_')
    filename = os.path.join(self.temp_data_folder, key) + '.' + fileformat

    util_func = UtilFunc()

    start_ts = pd.Timestamp(start)
    finish_ts = pd.Timestamp(finish)

    # Ignore Saturday, and don't attempt to download
    if self._remove_saturday():
        if 5 in (start_ts.dayofweek, finish_ts.dayofweek):
            return None, "Saturday? " + key

    if self._remove_weekend_points():
        day = start_ts.dayofweek
        hour = start_ts.hour

        # Before Sunday open or after Friday close there's no market data to download
        if (day == 6 and hour < 20) or (day == 4 and hour > 22):
            return None, "Weekend? " + key

    df = None

    if read_cached_from_disk and os.path.exists(filename):
        df = util_func.read_dataframe_from_binary(filename, format=binary_format)

        if df is not None:
            logger.debug("Read " + filename + " from disk")

    if df is None:
        # Convert tcapy ticker into vendor ticker
        vendor_ticker = self._get_tickers_vendor()[ticker]

        df = self._get_input_data_source().fetch_market_data(
            start, finish, ticker=vendor_ticker, web_proxies=web_proxies)

        if df is not None:
            df = df.drop('ticker', axis=1)

            if write_to_disk:
                # Write a small temporary dataframe to disk (if the process fails later, these can be picked up,
                # without having a call the external vendor again
                util_func.write_dataframe_to_binary(df, filename, format=binary_format)

    msg = "No data? " + key if df is None else None

    return df, msg
def _combine_mini_df_from_disk_single_thread(self, ticker, remove_duplicates=True):
    """Combines the small temporary dataframe files for a ticker (each covering a few minutes of
    data) into one large dataframe, which is then written to the large-data temporary folder.

    Parameters
    ----------
    ticker : str
        Ticker whose mini cache files should be combined

    remove_duplicates : bool
        Drop consecutive duplicated points (on the 'mid' field) before writing (default True)
    """
    logger = LoggerManager.getLogger(__name__)
    time_series_ops = TimeSeriesOps()

    logger.info('Getting ' + ticker + ' filenames...')

    temp_data_folder = self.temp_data_folder

    # Files at or above this size are assumed to be already-aggregated output, not mini chunks
    max_mini_file_size = 10 * 1024 * 1024

    filename_list = []

    for root, dirnames, filenames in os.walk(temp_data_folder):
        for filename in filenames:
            if ticker in filename and '.' + fileformat in filename:
                filename_h5_parquet = os.path.join(root, filename)

                # If filename is less than 10MB add (otherwise likely a very large aggregated file!)
                if os.path.getsize(filename_h5_parquet) < max_mini_file_size:
                    filename_list.append(filename_h5_parquet)

    df_list = []

    util_func = UtilFunc()

    logger.info('Loading ' + ticker + ' mini dataframe into memory')

    i = 0

    if not filename_list:
        # logger.warn is deprecated in the stdlib logging module; use warning
        logger.warning("Looks like there are no files for " + ticker + " in " + temp_data_folder +
                       ". Are you sure path is correct?")

    # Go through each mini file which represents a few minutes of data and append it
    for filename in filename_list:
        filesize = 0

        try:
            filesize = os.path.getsize(filename) / 1024.0
            df = util_func.read_dataframe_from_binary(filename, format=binary_format)

            i = i + 1

            # Every 100 files print reading output
            if i % 100 == 0:
                logger.info('Reading ' + filename + ' number ' + str(i))

            if df is not None:
                df = df.sort_index()
                df = self._remove_duplicates_time_series(df, remove_duplicates, time_series_ops,
                                                         field='mid')

                df_list.append(df)
        except Exception as e:
            # Deliberately best-effort: a single corrupt mini file shouldn't abort the whole combine
            logger.warning('Failed to parse ' + filename + " of " + str(filesize) + "KB")  # + str(e))

    if not df_list:
        logger.warning('No dataframe read for ' + ticker + ', cannot combine!')

        return

    logger.info('About to combine ' + ticker + ' into large dataframe to write to disk...')

    # Assume UTC time (don't want to mix UTC and non-UTC in database!)
    df = pd.concat(df_list)
    df = time_series_ops.localize_as_UTC(df)

    df = df.sort_index()
    df = self._remove_duplicates_time_series(df, remove_duplicates, time_series_ops, field='mid')

    postfix = '-' + self._get_postfix() + '-with-duplicates'

    if remove_duplicates:
        postfix = '-' + self._get_postfix() + '-no-duplicates'

    filename = os.path.join(self.temp_large_data_folder, ticker + postfix) + '.' + fileformat

    # localize_as_UTC was already applied right after pd.concat, so there's no need to repeat it
    util_func.write_dataframe_to_binary(df, filename, format=binary_format)
import os

from tcapy.util.loggermanager import LoggerManager
from tcapy.util.utilfunc import UtilFunc

# Patch script: rewrites the 'ticker' column of every dumped market data file for a vendor,
# inferring the ticker from each filename's prefix (before the first underscore)

add_vendor = 'dukascopy'

path = parquet_path = '/home/tcapyuser/csv_dump/' + add_vendor + '/'

filenames = os.listdir(path)

util_func = UtilFunc()
logger = LoggerManager.getLogger(__name__)

for filename in filenames:
    # Map the file extension to the binary format understood by UtilFunc
    # (variable renamed from 'format' to avoid shadowing the builtin)
    file_format = filename.split('.')[-1]

    if file_format == 'gzip':
        file_format = 'parquet'
    elif file_format == 'h5':
        file_format = 'hdf5'

    logger.info('Reading to patch file ' + filename)

    df = util_func.read_dataframe_from_binary(os.path.join(path, filename), format=file_format)

    # Do your edits here, in this case overwriting the ticker column
    ticker = filename.split('_')[0]

    df['ticker'] = ticker

    util_func.write_dataframe_to_binary(df, os.path.join(path, filename), format=file_format)