Example #1
    def _write_df_to_db_single_thread(self, ticker, remove_duplicates=True, if_exists_table='append',
                                      if_exists_ticker='replace'):

        logger = LoggerManager.getLogger(__name__)

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder, ticker + postfix) + '.' + fileformat

        logger.info("Reading " + filename)

        util_func = UtilFunc()
        time_series_ops = TimeSeriesOps()
        data_source_local = self._get_output_data_source()

        df = util_func.read_dataframe_from_binary(filename, format=binary_format)

        if df is not None:
            df = time_series_ops.localize_as_UTC(df)

            data_source_local.append_market_data(df, ticker, if_exists_table=if_exists_table,
                                                 if_exists_ticker=if_exists_ticker)
        else:
            logger.warn("Couldn't write dataframe for " + ticker + " to database, appears it is empty!")
Example #2
    def _fetch_market_data(self, start, finish, ticker, write_to_disk=True, read_cached_from_disk=True, web_proxies=constants.web_proxies):
        logger = LoggerManager.getLogger(__name__)

        key = (str(start) + str(finish) + ticker + '_' + self._get_postfix()).replace(":", '_')

        filename = os.path.join(self.temp_data_folder, key) + '.' + fileformat
        util_func = UtilFunc()

        start_time_stamp = pd.Timestamp(start)
        finish_time_stamp = pd.Timestamp(finish)

        if self._remove_weekend_points():
            weekend_data = "Weekend? " + key

            weekday_point = UtilFunc().is_weekday_point(start_time_stamp, finish_time_stamp,
                                                        friday_close_nyc_hour=constants.friday_close_utc_hour,
                                                        sunday_open_utc_hour=constants.sunday_open_utc_hour)

            if not weekday_point:
                return None, weekend_data

        df = None

        if read_cached_from_disk:
            if os.path.exists(filename):
                df = util_func.read_dataframe_from_binary(filename, format=binary_format)

                if df is not None:
                    logger.debug("Read " + filename + " from disk")

        if df is None:
            # Convert tcapy ticker into vendor ticker
            df = self._get_input_data_source().fetch_market_data(start, finish,
                                                                 ticker=self._get_tickers_vendor()[ticker], web_proxies=web_proxies)

            if df is not None:

                if write_to_disk:
                    # Write a small temporary dataframe to disk (if the process fails later, these can be
                    # picked up without having to call the external vendor again)
                    util_func.write_dataframe_to_binary(df, filename, format=binary_format)

        msg = None

        if df is None:
            msg = "No data? " + key

        return df, msg
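
The caching pattern here is: build a filename key from the request parameters, try the on-disk cache first, and only call the external vendor on a cache miss, writing the result back for next time. A minimal standalone sketch of the same idea with plain pandas parquet files (fetch_from_vendor is a hypothetical placeholder, not a tcapy call):

import os

import pandas as pd

def fetch_with_disk_cache(start, finish, ticker, cache_folder, fetch_from_vendor):
    """Read the request from a parquet cache if present, otherwise download and cache it."""
    key = (str(start) + str(finish) + ticker).replace(':', '_')
    filename = os.path.join(cache_folder, key + '.parquet')

    # Cache hit: reuse the previously downloaded slice
    if os.path.exists(filename):
        return pd.read_parquet(filename)

    # Cache miss: go to the vendor and store the result for next time
    df = fetch_from_vendor(start, finish, ticker)

    if df is not None:
        df.to_parquet(filename)

    return df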
Example #3
    def _fetch_market_data(self,
                           start,
                           finish,
                           ticker,
                           write_to_disk=True,
                           read_cached_from_disk=True,
                           web_proxies=constants.web_proxies):
        logger = LoggerManager.getLogger(__name__)

        key = (str(start) + str(finish) + ticker + '_' +
               self._get_postfix()).replace(":", '_')

        filename = os.path.join(self.temp_data_folder, key) + '.' + fileformat
        util_func = UtilFunc()

        start_time_stamp = pd.Timestamp(start)
        finish_time_stamp = pd.Timestamp(finish)

        if self._remove_saturday():
            weekend_data = "Saturday? " + key

            # Ignore Saturday, and don't attempt to download
            if start_time_stamp.dayofweek == 5 or finish_time_stamp.dayofweek == 5:
                return None, weekend_data

        if self._remove_weekend_points():
            weekend_data = "Weekend? " + key

            if start_time_stamp.dayofweek == 6 and start_time_stamp.hour < 20:
                return None, weekend_data

            if start_time_stamp.dayofweek == 4 and start_time_stamp.hour > 22:
                return None, weekend_data

        df = None

        if read_cached_from_disk:
            if os.path.exists(filename):
                df = util_func.read_dataframe_from_binary(filename,
                                                          format=binary_format)

                if df is not None:
                    logger.debug("Read " + filename + " from disk")

        if df is None:
            # Convert tcapy ticker into vendor ticker
            df = self._get_input_data_source().fetch_market_data(
                start,
                finish,
                ticker=self._get_tickers_vendor()[ticker],
                web_proxies=web_proxies)

            if df is not None:
                df = df.drop('ticker', axis=1)

                if write_to_disk:
                    # Write a small temporary dataframe to disk (if the process fails later, these can be
                    # picked up without having to call the external vendor again)
                    util_func.write_dataframe_to_binary(df,
                                                        filename,
                                                        format=binary_format)

        msg = None

        if df is None:
            msg = "No data? " + key

        return df, msg
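
The weekend filtering relies on pandas' day-of-week convention (Monday=0 through Sunday=6), so 4 is Friday, 5 is Saturday and 6 is Sunday. A small standalone sketch of the same checks, with the cut-off hours from the method above treated as illustrative defaults:

import pandas as pd

def is_weekend_point(ts, friday_close_hour=22, sunday_open_hour=20):
    """Rough weekend test mirroring the checks above (hours are illustrative)."""
    if ts.dayofweek == 5:                                  # Saturday
        return True
    if ts.dayofweek == 6 and ts.hour < sunday_open_hour:   # Sunday before the market open
        return True
    if ts.dayofweek == 4 and ts.hour > friday_close_hour:  # Friday after the market close
        return True
    return False

print(is_weekend_point(pd.Timestamp('2021-01-02 12:00')))  # Saturday -> True
print(is_weekend_point(pd.Timestamp('2021-01-04 12:00')))  # Monday -> False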
Example #4
    def _combine_mini_df_from_disk_single_thread(self,
                                                 ticker,
                                                 remove_duplicates=True):

        logger = LoggerManager.getLogger(__name__)
        time_series_ops = TimeSeriesOps()

        logger.info('Getting ' + ticker + ' filenames...')
        temp_data_folder = self.temp_data_folder

        filename_list = []

        for root, dirnames, filenames in os.walk(temp_data_folder):

            for filename in filenames:
                if ticker in filename and '.' + fileformat in filename:
                    filename_h5_parquet = os.path.join(root, filename)

                    # If the file is smaller than 10MB, add it (otherwise it is likely a very large aggregated file!)
                    if os.path.getsize(filename_h5_parquet) < 10 * 1024 * 1024:
                        filename_list.append(filename_h5_parquet)

        df_list = []

        util_func = UtilFunc()

        logger.info('Loading ' + ticker + ' mini dataframes into memory')

        i = 0

        if len(filename_list) == 0:
            logger.warn("Looks like there are no files for " + ticker +
                        " in " + temp_data_folder +
                        ". Are you sure path is correct?")

        # Go through each mini file which represents a few minutes of data and append it
        for filename in filename_list:
            filesize = 0

            try:
                filesize = os.path.getsize(filename) / 1024.0
                df = util_func.read_dataframe_from_binary(filename,
                                                          format=binary_format)

                i = i + 1

                # Every 100 files, log reading progress
                if i % 100 == 0:
                    logger.info('Reading ' + filename + ' number ' + str(i))

                if df is not None:
                    df = df.sort_index()
                    df = self._remove_duplicates_time_series(df,
                                                             remove_duplicates,
                                                             time_series_ops,
                                                             field='mid')

                    df_list.append(df)
            except Exception as e:
                logger.warn('Failed to parse ' + filename + " of " +
                            str(filesize) + "KB: " + str(e))

            # if i > 1000:
            #    break

        if not df_list:
            logger.warn('No dataframe read for ' + ticker +
                        ', cannot combine!')

            return

        logger.info('About to combine ' + ticker +
                    ' into large dataframe to write to disk...')

        # Assume UTC time (don't want to mix UTC and non-UTC in the database!)
        df = pd.concat(df_list)
        df = time_series_ops.localize_as_UTC(df)

        df = df.sort_index()

        df = self._remove_duplicates_time_series(df,
                                                 remove_duplicates,
                                                 time_series_ops,
                                                 field='mid')

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder,
                                ticker + postfix) + '.' + fileformat

        df = time_series_ops.localize_as_UTC(df)
        util_func.write_dataframe_to_binary(df, filename, format=binary_format)
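
The combine step is essentially a standard pandas pattern: concatenate the mini dataframes, sort by the time index, drop duplicated points and make the index UTC. A minimal sketch of that pattern with plain pandas (a simplification, not tcapy's own _remove_duplicates_time_series helper, which is also passed the field='mid' argument above):

import pandas as pd

def combine_mini_dataframes(df_list):
    """Concatenate mini dataframes, sort by time and drop repeated timestamps (illustrative)."""
    df = pd.concat(df_list).sort_index()

    # Keep the first row for any duplicated timestamp
    df = df[~df.index.duplicated(keep='first')]

    # Make the index UTC before writing to the database
    if df.index.tz is None:
        df = df.tz_localize('UTC')

    return df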
Example #5
import os

from tcapy.util.loggermanager import LoggerManager
from tcapy.util.utilfunc import UtilFunc

add_vendor = 'dukascopy'

path = '/home/tcapyuser/csv_dump/' + add_vendor + '/'

filenames = os.listdir(path)

util_func = UtilFunc()
logger = LoggerManager.getLogger(__name__)

for filename in filenames:
    # Infer the binary format from the file extension
    file_format = filename.split('.')[-1]

    if file_format == 'gzip':
        file_format = 'parquet'
    elif file_format == 'h5':
        file_format = 'hdf5'

    logger.info('Reading file to patch: ' + filename)

    df = util_func.read_dataframe_from_binary(os.path.join(path, filename), format=file_format)

    # Do your edits here, in this case overwriting the ticker column
    ticker = filename.split('_')[0]
    df['ticker'] = ticker

    util_func.write_dataframe_to_binary(df, os.path.join(path, filename), format=file_format)
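
The extension handling in the script maps '.gzip' dumps to the parquet format name and '.h5' to hdf5, leaving anything else unchanged. The same mapping can be written as a small helper, should more extensions turn up in the dump folder (the example filenames below are hypothetical):

# Extension -> format name expected by read/write_dataframe_to_binary
# (covers only the cases handled by the script above)
EXTENSION_TO_FORMAT = {'gzip': 'parquet', 'h5': 'hdf5'}

def infer_format(filename):
    """Infer the binary format from a dump filename, falling back to the raw extension."""
    extension = filename.split('.')[-1]
    return EXTENSION_TO_FORMAT.get(extension, extension)

print(infer_format('EURUSD_2020.gzip'))  # parquet
print(infer_format('EURUSD_2020.h5'))    # hdf5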