Example #1
def _prep_for_extraction(tickers, target_date, end_time):
    # stdout.write(f'=> Setting things up for extraction for: {target_date}\n')

    # form data caching directory
    cache_directory = join(_HISTORICAL_DATA_STORAGE, target_date)

    # create cache directory for success
    cache_success = join(cache_directory, '.success')
    make_dirs(cache_success)

    # create cache directory for failure
    cache_failure = join(cache_directory, '.failure')
    make_dirs(cache_failure)
    # stdout.write(f'\t-> Cache directories created...\n')

    # save tickers for later use
    path_input_tickers = join(cache_directory, 'input_tickers.json')
    # todo: find a better way to do this
    if not isfile(path_input_tickers):
        save_data_as_json(tickers, path_input_tickers, indent=0)
    # stdout.write(f'\t-> Cached target tickers...\n')

    # extract tickers that are yet to be processed
    tickers = _get_unprocessed_tickers(tickers, cache_success, cache_failure,
                                       target_date, end_time)
    # stdout.write(f'\t-> Excluded already processed tickers...\n')
    # stdout.write(f'\t-> Total target tickers: {len(tickers)}\n')

    return tickers, cache_success, cache_failure
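A minimal usage sketch (hypothetical ticker IDs, date, and time; assumes _HISTORICAL_DATA_STORAGE and the module helpers above are in scope):

# hypothetical call: prepare the caches for a single trading day
input_tickers = [1001, 1002, 1003]  # illustrative ticker IDs only
pending, cache_success, cache_failure = _prep_for_extraction(
    input_tickers, target_date='20210315', end_time='15:30:00')
# `pending` now holds only the tickers with no file in either cache directory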
Example #2
def _setup_storage_directories(date, bar_size='1 min'):
    y, m = date[:4], date[4:6]
    storage_dir = join(HISTORICAL_DATA_STORAGE, bar_size.replace(' ', ''), y,
                       MONTH_MAP[int(m)], date)

    make_dirs(storage_dir)
    return storage_dir
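For illustration, assuming MONTH_MAP maps month numbers to month names, a call resolves the storage path roughly as follows (hypothetical date):

# hypothetical call with the default '1 min' bar size
storage_dir = _setup_storage_directories('20210315')
# -> join(HISTORICAL_DATA_STORAGE, '1min', '2021', MONTH_MAP[3], '20210315')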
Example #3
def generate_failure_dataframe(target_directory):
    """
        Creates a pandas data frame from JSON files present at the given failure location.
        Assumes that all these JSON files have valid error stacks.
        :param target_directory: location to read JSON files from
    """
    stdout.write(f'=> Generating dataframe for failure tickers...\n')

    def _get_ticker_id(file_name):
        return int(file_name.split(sep)[-1].split('.')[0])

    # create a place holder dataframe
    expected_columns = ['ecode', 'status', 'code', 'message', 'attempts']
    data = pd.DataFrame(columns=expected_columns)

    # create temporary directory to store smaller CSV files
    temp_directory = '.temp'
    make_dirs(temp_directory)

    # extract all json files from target directory
    file_pattern = join(target_directory, '*.json')  # TODO: can be narrowed to match digit-only file names
    failure_files = glob(file_pattern)
    total = len(failure_files)

    if total:
        json_generator = map(read_json_file, failure_files)
        counter = 0  # to count temp CSV files
        with alive_bar(total=total, **_BAR_CONFIG) as bar:
            for i in range(total):
                ticker_data = next(json_generator)
                meta_data = ticker_data['meta_data']
                error_stack = meta_data['_error_stack']
                temp_data = pd.DataFrame(error_stack)
                status, attempts = meta_data['status'], meta_data['attempts']
                temp_data['ecode'] = meta_data.get('ecode', _get_ticker_id(failure_files[i]))
                temp_data['status'], temp_data['attempts'] = status, attempts
                data = pd.concat([data, temp_data])
                _time_to_cache = (i+1 == total) or ((i > 0) and (i % 100 == 0))
                if _time_to_cache:
                    if data.shape[0] > 0:
                        temp_file = join(temp_directory, f'failure_{counter}.csv')
                        data.to_csv(temp_file)
                        data = pd.DataFrame(columns=expected_columns)
                        counter += 1
                bar()

        # merge all CSV files into a single dataframe
        # delete all temp files
        temp_files = glob(join(temp_directory, 'failure_*.csv'))
        data = pd.concat(map(read_csv, temp_files))
        data.sort_values(by=['ecode'], ignore_index=True, inplace=True)
        data = data[expected_columns]
    delete_directory(temp_directory)

    return data
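A usage sketch (hypothetical path; assumes the directory holds the failure JSON files written during extraction):

failure_df = generate_failure_dataframe('.cache/20210315/.failure')
print(failure_df.columns.tolist())
# -> ['ecode', 'status', 'code', 'message', 'attempts']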
Example #4
def generate_success_dataframe(target_directory):
    """
        Creates a pandas data frame from JSON files present at the given success location.
        Assumes that all these JSON files have valid bar data.
        :param target_directory: location to read JSON files from
    """
    stdout.write(f'=> Generating dataframe for success tickers...\n')

    def _get_ticker_id(file_name):
        return int(file_name.split(sep)[-1].split('.')[0])
    # create a place holder dataframe
    expected_columns = ['time_stamp', 'ecode', 'session', 'high', 'low', 'close',
                        'volume', 'average', 'count']
    data = pd.DataFrame(columns=expected_columns)

    # create temporary directory to store smaller CSV files
    temp_directory = '.temp'
    make_dirs(temp_directory)

    # extract all json files from target directory
    success_file_pattern = join(target_directory, '*.json')
    success_files = glob(success_file_pattern)
    total = len(success_files)

    if total:
        json_generator = (read_json_file(file) for file in success_files)
        counter = 0  # to count temp files
        with alive_bar(total=total, **_BAR_CONFIG) as bar:
            for i in range(total):
                ticker_data = next(json_generator)
                bar_data, meta_data = ticker_data['bar_data'], ticker_data['meta_data']
                temp_data = pd.DataFrame(bar_data)
                temp_data['ecode'] = meta_data.get('ecode', _get_ticker_id(success_files[i]))
                data = pd.concat([data, temp_data])
                _time_to_cache = (i+1 == total) or ((i > 0) and (i % 100 == 0))
                if _time_to_cache:
                    if data.shape[0] > 0:
                        temp_file = join(temp_directory, f'success_{counter}.csv')
                        data.to_csv(temp_file)
                        data = pd.DataFrame(columns=expected_columns)
                        counter += 1
                bar()

        # merge all CSV files into a single dataframe
        # delete all temp files
        temp_files = glob(join(temp_directory, 'success_*.csv'))
        data = pd.concat(map(read_csv, temp_files))
        data.sort_values(by=['ecode', 'time_stamp'], inplace=True, ignore_index=True)
        data = data[expected_columns]
    delete_directory(temp_directory)

    return data
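The success counterpart can be used the same way (hypothetical path); the result comes back sorted by ('ecode', 'time_stamp'), one row per bar:

success_df = generate_success_dataframe('.cache/20210315/.success')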
Example #5
def _prep_for_extraction(tickers, end_date, end_time, bar_size):
    """
        # todo: to be added...
    """
    # form data caching directory
    cache_directory = join(CACHE_DIR, bar_size.replace(' ', ''), end_date,
                           end_time.replace(':', '_'))

    # create cache directory for success
    cache_success = join(cache_directory, 'success')
    make_dirs(cache_success)

    # create cache directory for failure
    cache_failure = join(cache_directory, 'failure')
    make_dirs(cache_failure)

    # save tickers for later use
    path_input_tickers = join(cache_directory, 'input_tickers.json')

    # todo: find a better way to do this
    if not isfile(path_input_tickers):
        save_data_as_json(tickers,
                          path_input_tickers,
                          indent=1,
                          sort_keys=True)

    # extract tickers that are yet to be processed
    tickers = _get_unprocessed_tickers(tickers, cache_success)

    # clean failure directory, all these tickers will have to be processed again
    failure_tickers = list(
        map(_get_ticker_id, get_files_by_type(cache_failure)))
    common_tickers = list(set(tickers).intersection(failure_tickers))
    for ticker in common_tickers:
        file_name = f'{ticker}.json'
        delete_file(cache_failure, file_name)

    return tickers, cache_success, cache_failure
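A usage sketch for this revision (hypothetical values); unlike the version in Example #1, tickers found in the failure cache are deleted so they get retried:

pending, cache_success, cache_failure = _prep_for_extraction(
    tickers=[1001, 1002], end_date='20210315',
    end_time='15:30:00', bar_size='1 min')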
Example #6
def generate_success_dataframe(target_directory,
                               bar_title=None,
                               verbose=False):
    """
        Creates a pandas data frame from JSON files present at the given success location.
        Assumes that all these JSON files have valid bar data.
        :param target_directory: location to read JSON files from
        :param bar_title: message to show in front of the progress bar
        :param verbose: set to True to see info messages on the console
    """
    if bar_title is not None:
        _BAR_CONFIG['title'] = bar_title

    def _get_ticker_id(file_name):
        return int(file_name.split(sep)[-1].split('.')[0])

    # create a place holder dataframe
    expected_columns = [
        'time_stamp', 'ecode', 'session', 'high', 'low', 'close', 'volume',
        'average', 'count'
    ]

    # create temporary directory to store smaller CSV files
    temp_directory = '.temp'
    make_dirs(temp_directory)

    # extract all json files from target directory
    success_files = get_files_by_type(target_directory)
    success_tickers = list(map(_get_ticker_id, success_files))
    total = len(success_tickers)
    data = pd.DataFrame(columns=expected_columns)

    if total:
        write_to_console(f'=> Generating dataframe for success tickers...',
                         verbose=verbose)
        json_generator = map(read_json_file, success_files)
        counter = 0  # to count temp files
        with alive_bar(total=total, **_BAR_CONFIG) as bar:
            for i in range(total):
                ticker = success_tickers[i]
                ticker_data = next(json_generator)  # load data into a dictionary
                bar_data, meta_data = ticker_data['bar_data'], ticker_data['meta_data']
                temp_data = pd.DataFrame(bar_data)
                temp_data['ecode'] = ticker
                data = pd.concat([data, temp_data])
                _time_to_cache = ((i > 0) and
                                  (i % 100 == 0)) or (i + 1 == total)
                if _time_to_cache:
                    if data.shape[0] > 0:
                        temp_file = join(temp_directory,
                                         f'success_{counter}.csv')
                        data.to_csv(temp_file)
                        data = pd.DataFrame(columns=expected_columns)
                        counter += 1
                bar()

        # merge all CSV files into a single dataframe
        # delete all temp files
        temp_files = get_files_by_type(temp_directory, file_type='csv')
        if temp_files:
            data = pd.concat(map(read_csv, temp_files))
            data.sort_values(by=['ecode', 'time_stamp'],
                             inplace=True,
                             ignore_index=True)
            data = data[expected_columns]
    delete_directory(temp_directory)

    return data
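A usage sketch for this revision, which adds a configurable progress-bar title and optional console output (hypothetical path and title):

bars = generate_success_dataframe('.cache/1min/20210315/15_30_00/success',
                                  bar_title='success tickers',
                                  verbose=True)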
Example #7
def generate_failure_dataframe(target_directory,
                               bar_title=None,
                               verbose=False):
    """
        Creates a pandas data frame from JSON files present at the given failure location.
        Assumes that all these JSON files have valid error stacks.
        :param target_directory: location to read JSON files from
        :param bar_title: message to show in front of the progress bar
        :param verbose: set to True to see info messages on the console
    """
    if bar_title is not None:
        _BAR_CONFIG['title'] = bar_title

    def _get_ticker_id(file_name):
        return int(file_name.split(sep)[-1].split('.')[0])

    # create a place holder dataframe
    expected_columns = ['ecode', 'code', 'message']
    data = pd.DataFrame(columns=expected_columns)

    # create temporary directory to store smaller CSV files
    temp_directory = '.temp'
    make_dirs(temp_directory)

    # extract all json files from target directory
    file_pattern = join(target_directory, '*.json')  # TODO: can be narrowed to match digit-only file names
    failure_files = glob(file_pattern)
    total = len(failure_files)

    if total:
        write_to_console(f'=> Generating dataframe for failure tickers...',
                         verbose=verbose)
        json_generator = map(read_json_file, failure_files)
        counter = 0  # to count temp CSV files
        with alive_bar(total=total, **_BAR_CONFIG) as bar:
            for i in range(total):
                ticker_data = next(json_generator)
                meta = ticker_data['meta_data']
                error_stack = meta['_error_stack']
                ecode = meta.get('ecode', _get_ticker_id(failure_files[i]))
                temp_data = pd.DataFrame(error_stack, columns=expected_columns)
                temp_data['ecode'] = ecode
                # if the error stack is empty, create a dummy row
                # fixme: find a way to control this in the TWS Client
                if temp_data.shape[0] == 0:
                    dummy_row = {
                        'ecode': ecode,
                        'code': 'unknown',
                        'message': 'not available'
                    }
                    temp_data = pd.concat([temp_data, pd.DataFrame([dummy_row])],
                                          ignore_index=True)

                data = pd.concat([data, temp_data])
                _time_to_cache = (i + 1 == total) or ((i > 0) and
                                                      (i % 100 == 0))
                if _time_to_cache:
                    if data.shape[0] > 0:
                        temp_file = join(temp_directory,
                                         f'failure_{counter}.csv')
                        data.to_csv(temp_file)
                        data = pd.DataFrame(columns=expected_columns)
                        counter += 1
                bar()

        # merge all CSV files into a single dataframe
        # delete all temp files
        temp_files = get_files_by_type(temp_directory, file_type='csv')
        data = pd.concat(map(read_csv, temp_files))
        data.sort_values(by=['ecode'], ignore_index=True, inplace=True)
        data = data[expected_columns]
    delete_directory(temp_directory)

    return data
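A usage sketch for this revision (hypothetical path); tickers whose error stack was empty surface with the dummy-row values:

errors = generate_failure_dataframe('.cache/1min/20210315/15_30_00/failure',
                                    bar_title='failure tickers',
                                    verbose=True)
# rows with code='unknown', message='not available' had empty error stacks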