def _prep_for_extraction(tickers, target_date, end_time):
    """
    Sets up the per-date cache directories and returns the tickers that still
    need to be processed, along with the success and failure cache paths.
    """
    # stdout.write(f'=> Setting things up for extraction for: {target_date}\n')

    # form data caching directory
    cache_directory = join(_HISTORICAL_DATA_STORAGE, target_date)

    # create cache directory for success
    cache_success = join(cache_directory, '.success')
    make_dirs(cache_success)

    # create cache directory for failure
    cache_failure = join(cache_directory, '.failure')
    make_dirs(cache_failure)
    # stdout.write(f'\t-> Cache directories created...\n')

    # save tickers for later use
    path_input_tickers = join(cache_directory, 'input_tickers.json')

    # todo: find a better way to do this
    if not isfile(path_input_tickers):
        save_data_as_json(tickers, path_input_tickers, indent=0)
    # stdout.write(f'\t-> Cached target tickers...\n')

    # extract tickers that are yet to be processed
    tickers = _get_unprocessed_tickers(tickers, cache_success, cache_failure,
                                       target_date, end_time)
    # stdout.write(f'\t-> Excluded already processed tickers...\n')
    # stdout.write(f'\t-> Total target tickers: {len(tickers)}\n')

    return tickers, cache_success, cache_failure

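# Illustrative usage of the helper above (the date, time, and ticker ids are
# made up; the cache lives under _HISTORICAL_DATA_STORAGE/<target_date>):
#
#   tickers, cache_success, cache_failure = _prep_for_extraction(
#       tickers=[12345, 67890], target_date='20210115', end_time='15:30:00')
#   # `tickers` now holds only the ids not already processed on a previous run
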
def _setup_storage_directories(date, bar_size='1 min'):
    """
    Creates (if needed) and returns the storage directory for the given date,
    laid out as <root>/<bar size>/<year>/<month>/<date>.

    :param date: target date in YYYYMMDD format
    :param bar_size: bar size of the stored data, e.g. '1 min'
    """
    y, m = date[:4], date[4:6]
    storage_dir = join(HISTORICAL_DATA_STORAGE, bar_size.replace(' ', ''),
                       y, MONTH_MAP[int(m)], date)
    make_dirs(storage_dir)

    return storage_dir

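# Illustrative call (assumes MONTH_MAP maps the month number to a month name
# and HISTORICAL_DATA_STORAGE points at the data root):
#
#   path = _setup_storage_directories('20210115', bar_size='1 min')
#   # -> <HISTORICAL_DATA_STORAGE>/1min/2021/<MONTH_MAP[1]>/20210115
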
def generate_failure_dataframe(target_directory):
    """
    Creates a pandas data frame from JSON files present at the given failure
    location. Assumes that all these JSON files have valid error stacks.

    :param target_directory: location to read JSON files from
    """
    stdout.write(f'=> Generating dataframe for failure tickers...\n')

    def _get_ticker_id(file_name):
        return int(file_name.split(sep)[-1].split('.')[0])

    # create a placeholder dataframe
    expected_columns = ['ecode', 'status', 'code', 'message', 'attempts']
    data = pd.DataFrame(columns=expected_columns)

    # create temporary directory to store smaller CSV files
    temp_directory = '.temp'
    make_dirs(temp_directory)

    # extract all json files from target directory
    file_pattern = join(target_directory, '*.json')  # TODO: can be modified to match digital values
    failure_files = glob(file_pattern)
    total = len(failure_files)

    if bool(total):
        json_generator = map(read_json_file, failure_files)
        counter = 0  # to count temp CSV files

        with alive_bar(total=total, **_BAR_CONFIG) as bar:
            for i in range(total):
                ticker_data = next(json_generator)
                meta_data = ticker_data['meta_data']
                error_stack = meta_data['_error_stack']

                temp_data = pd.DataFrame(error_stack)
                status, attempts = meta_data['status'], meta_data['attempts']
                temp_data['ecode'] = meta_data.get('ecode', _get_ticker_id(failure_files[i]))
                temp_data['status'], temp_data['attempts'] = status, attempts
                data = data.append(temp_data)

                # cache every 100 tickers and on the last iteration
                _time_to_cache = (i + 1 == total) or ((i > 0) and (i % 100 == 0))
                if _time_to_cache:
                    if data.shape[0] > 0:
                        temp_file = join(temp_directory, f'failure_{counter}.csv')
                        data.to_csv(temp_file)
                        data = pd.DataFrame(columns=expected_columns)
                        counter += 1
                bar()

        # merge all CSV files into a single dataframe
        # delete all temp files
        temp_files = glob(join(temp_directory, 'failure_*.csv'))
        data = pd.concat(map(read_csv, temp_files))
        data.sort_values(by=['ecode'], ignore_index=True, inplace=True)
        data = data[expected_columns]

    delete_directory(temp_directory)

    return data

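# Shape of a failure JSON file as implied by the reads above (field values are
# illustrative; only the keys accessed by generate_failure_dataframe are shown):
#
#   {
#       "meta_data": {
#           "ecode": 12345,
#           "status": "failure",
#           "attempts": 3,
#           "_error_stack": [
#               {"code": 162, "message": "Historical Market Data Service error"}
#           ]
#       }
#   }
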
def generate_success_dataframe(target_directory):
    """
    Creates a pandas data frame from JSON files present at the given success
    location. Assumes that all these JSON files have valid bar data.

    :param target_directory: location to read JSON files from
    """
    stdout.write(f'=> Generating dataframe for success tickers...\n')

    def _get_ticker_id(file_name):
        return int(file_name.split(sep)[-1].split('.')[0])

    # create a placeholder dataframe
    expected_columns = ['time_stamp', 'ecode', 'session', 'high', 'low',
                        'close', 'volume', 'average', 'count']
    data = pd.DataFrame(columns=expected_columns)

    # create temporary directory to store smaller CSV files
    temp_directory = '.temp'
    make_dirs(temp_directory)

    # extract all json files from target directory
    success_file_pattern = join(target_directory, '*.json')
    success_files = glob(success_file_pattern)
    total = len(success_files)

    if bool(total):
        json_generator = (read_json_file(file) for file in success_files)
        counter = 0  # to count temp files

        with alive_bar(total=total, **_BAR_CONFIG) as bar:
            for i in range(total):
                ticker_data = next(json_generator)
                bar_data, meta_data = ticker_data['bar_data'], ticker_data['meta_data']

                temp_data = pd.DataFrame(bar_data)
                temp_data['ecode'] = meta_data.get('ecode', _get_ticker_id(success_files[i]))
                data = data.append(temp_data)

                # cache every 100 tickers and on the last iteration
                _time_to_cache = (i + 1 == total) or ((i > 0) and (i % 100 == 0))
                if _time_to_cache:
                    if data.shape[0] > 0:
                        temp_file = join(temp_directory, f'success_{counter}.csv')
                        data.to_csv(temp_file)
                        data = pd.DataFrame(columns=expected_columns)
                        counter += 1
                bar()

        # merge all CSV files into a single dataframe
        # delete all temp files
        temp_files = glob(join(temp_directory, 'success_*.csv'))
        data = pd.concat(map(read_csv, temp_files))
        data.sort_values(by=['ecode', 'time_stamp'], inplace=True, ignore_index=True)
        data = data[expected_columns]

    delete_directory(temp_directory)

    return data

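# Illustrative usage (the cache path is hypothetical):
#
#   frame = generate_success_dataframe('.cache/20210115/.success')
#   frame.to_csv('success_20210115.csv', index=False)
#
# The returned frame holds one row per bar per ticker, sorted by
# ['ecode', 'time_stamp'].
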
def _prep_for_extraction(tickers, end_date, end_time, bar_size):
    """
    Sets up the cache directories for an extraction run and returns the
    tickers that still need to be processed, along with the success and
    failure cache paths. Any previously failed tickers that are targeted
    again are removed from the failure cache.

    :param tickers: tickers targeted for extraction
    :param end_date: target end date, used to form the cache path
    :param end_time: target end time, used to form the cache path
    :param bar_size: bar size of the requested data, e.g. '1 min'
    """
    # form data caching directory
    cache_directory = join(CACHE_DIR, bar_size.replace(' ', ''), end_date,
                           end_time.replace(':', '_'))

    # create cache directory for success
    cache_success = join(cache_directory, 'success')
    make_dirs(cache_success)

    # create cache directory for failure
    cache_failure = join(cache_directory, 'failure')
    make_dirs(cache_failure)

    # save tickers for later use
    path_input_tickers = join(cache_directory, 'input_tickers.json')

    # todo: find a better way to do this
    if not isfile(path_input_tickers):
        save_data_as_json(tickers, path_input_tickers, indent=1, sort_keys=True)

    # extract tickers that are yet to be processed
    tickers = _get_unprocessed_tickers(tickers, cache_success)

    # clean failure directory, all these tickers will have to be processed again
    failure_tickers = list(map(_get_ticker_id, get_files_by_type(cache_failure)))
    common_tickers = list(set(tickers).intersection(failure_tickers))
    for ticker in common_tickers:
        file_name = f'{ticker}.json'
        delete_file(cache_failure, file_name)

    return tickers, cache_success, cache_failure

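# Note: this version relies on a module-level _get_ticker_id helper that is
# not part of this section. Judging by the nested helpers used elsewhere in
# this module, it is presumably equivalent to:
#
#   def _get_ticker_id(file_name):
#       return int(file_name.split(sep)[-1].split('.')[0])
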
def generate_success_dataframe(target_directory, bar_title=None, verbose=False):
    """
    Creates a pandas data frame from JSON files present at the given success
    location. Assumes that all these JSON files have valid bar data.

    :param target_directory: location to read JSON files from
    :param bar_title: message to show in front of the progress bar
    :param verbose: set to true to see info messages on console
    """
    if bar_title is not None:
        _BAR_CONFIG['title'] = bar_title

    def _get_ticker_id(file_name):
        return int(file_name.split(sep)[-1].split('.')[0])

    # create a placeholder dataframe
    expected_columns = ['time_stamp', 'ecode', 'session', 'high', 'low',
                        'close', 'volume', 'average', 'count']

    # create temporary directory to store smaller CSV files
    temp_directory = '.temp'
    make_dirs(temp_directory)

    # extract all json files from target directory
    success_files = get_files_by_type(target_directory)
    success_tickers = list(map(_get_ticker_id, success_files))
    total = len(success_tickers)

    data = pd.DataFrame(columns=expected_columns)

    if bool(total):
        write_to_console(f'=> Generating dataframe for success tickers...', verbose=verbose)
        json_generator = map(read_json_file, success_files)
        counter = 0  # to count temp files

        with alive_bar(total=total, **_BAR_CONFIG) as bar:
            for i in range(total):
                ticker = success_tickers[i]
                ticker_data = next(json_generator)  # load data into a dictionary
                bar_data, meta_data = ticker_data['bar_data'], ticker_data['meta_data']

                temp_data = pd.DataFrame(bar_data)
                temp_data['ecode'] = ticker
                data = data.append(temp_data)

                # cache every 100 tickers and on the last iteration
                _time_to_cache = ((i > 0) and (i % 100 == 0)) or (i + 1 == total)
                if _time_to_cache:
                    if data.shape[0] > 0:
                        temp_file = join(temp_directory, f'success_{counter}.csv')
                        data.to_csv(temp_file)
                        data = pd.DataFrame(columns=expected_columns)
                        counter += 1
                bar()

    # merge all CSV files into a single dataframe
    # delete all temp files
    temp_files = get_files_by_type(temp_directory, file_type='csv')
    if bool(temp_files):
        data = pd.concat(map(read_csv, temp_files))
        data.sort_values(by=['ecode', 'time_stamp'], inplace=True, ignore_index=True)
        data = data[expected_columns]

    delete_directory(temp_directory)

    return data

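# Note: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. On
# newer pandas, the per-ticker accumulation above can be rewritten by
# collecting frames in a list and concatenating once per cache flush, e.g.:
#
#   frames = []                      # instead of data = pd.DataFrame(...)
#   ...
#   frames.append(temp_data)         # instead of data = data.append(temp_data)
#   ...
#   data = pd.concat(frames)         # build the chunk right before to_csv
#   frames = []
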
def generate_failure_dataframe(target_directory, bar_title=None, verbose=False):
    """
    Creates a pandas data frame from JSON files present at the given failure
    location. Assumes that all these JSON files have valid error stacks.

    :param target_directory: location to read JSON files from
    :param bar_title: message to show in front of the progress bar
    :param verbose: set to true to see info messages on console
    """
    if bar_title is not None:
        _BAR_CONFIG['title'] = bar_title

    def _get_ticker_id(file_name):
        return int(file_name.split(sep)[-1].split('.')[0])

    # create a placeholder dataframe
    expected_columns = ['ecode', 'code', 'message']
    data = pd.DataFrame(columns=expected_columns)

    # create temporary directory to store smaller CSV files
    temp_directory = '.temp'
    make_dirs(temp_directory)

    # extract all json files from target directory
    file_pattern = join(target_directory, '*.json')  # TODO: can be modified to match digital values
    failure_files = glob(file_pattern)
    total = len(failure_files)

    if bool(total):
        write_to_console(f'=> Generating dataframe for failure tickers...', verbose=verbose)
        json_generator = map(read_json_file, failure_files)
        counter = 0  # to count temp CSV files

        with alive_bar(total=total, **_BAR_CONFIG) as bar:
            for i in range(total):
                ticker_data = next(json_generator)
                meta = ticker_data['meta_data']
                error_stack = meta['_error_stack']
                ecode = meta.get('ecode', _get_ticker_id(failure_files[i]))

                temp_data = pd.DataFrame(error_stack, columns=expected_columns)
                temp_data['ecode'] = ecode

                # if error stack is empty, then create a dummy row
                if temp_data.shape[0] == 0:  # fixme: find a way to control this in the TWS Client
                    dummy_row = {'ecode': ecode, 'code': 'unknown', 'message': 'not available'}
                    temp_data = temp_data.append(dummy_row, ignore_index=True)

                data = data.append(temp_data)

                # cache every 100 tickers and on the last iteration
                _time_to_cache = (i + 1 == total) or ((i > 0) and (i % 100 == 0))
                if _time_to_cache:
                    if data.shape[0] > 0:
                        temp_file = join(temp_directory, f'failure_{counter}.csv')
                        data.to_csv(temp_file)
                        data = pd.DataFrame(columns=expected_columns)
                        counter += 1
                bar()

        # merge all CSV files into a single dataframe
        # delete all temp files
        temp_files = get_files_by_type(temp_directory, file_type='csv')
        data = pd.concat(map(read_csv, temp_files))
        data.sort_values(by=['ecode'], ignore_index=True, inplace=True)
        data = data[expected_columns]

    delete_directory(temp_directory)

    return data

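# Illustrative end-of-run summary (the cache paths are hypothetical and follow
# the layout produced by _prep_for_extraction above):
#
#   ok = generate_success_dataframe('.cache/1min/20210115/15_30_00/success',
#                                   bar_title='success', verbose=True)
#   bad = generate_failure_dataframe('.cache/1min/20210115/15_30_00/failure',
#                                    bar_title='failure', verbose=True)
#   print(f'extracted: {ok["ecode"].nunique()}, failed: {bad["ecode"].nunique()}')
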