def test_stress_tca(fill_market_trade_databases):
    """Stress tests tcapy by submitting several large aggregated TCARequests concurrently from a thread pool, which
    also checks that the application copes with parallel requests (note: you may need to reduce the length of the
    dataset if your machine has limited amounts of RAM).

    When deployed on the web, several users might make simultaneous requests. Note, do not use pylibmc, and instead
    use python-memcached, when using memcached as a result backend. pylibmc is not thread-safe so will come undone if
    you end up making parallel requests.

    The test is skipped (returns immediately) unless the module level ``stress_test`` flag is set.
    """
    from tcapy.util.swim import Swim

    if not stress_test:
        return

    # Start from an empty cache, so that every code path is really exercised
    Mediator.get_volatile_cache().clear_cache()

    # Template request, which is copied once for every simultaneous request
    template_request = TCARequest(
        start_date=start_date,
        finish_date=finish_date,
        ticker=valid_ticker_list,
        dummy_market=True,
        trade_data_store=trade_data_store,
        trade_data_database_name=trade_data_database_name,
        market_data_store=market_data_store,
        market_data_database_table=market_data_database_table,
        trade_order_mapping=trade_order_mapping,
        use_multithreading=True,
        tca_type='aggregated')

    # Kick off several simultaneous large TCA requests
    request_no = 2

    tca_request_list = [
        TCARequest(tca_request=template_request) for _ in range(request_no)
    ]

    tca_engine = TCAEngineImpl(version=tcapy_version)

    # One worker thread per request
    swim = Swim(parallel_library='thread')
    pool = swim.create_pool(thread_no=len(tca_request_list))

    pending = [
        pool.apply_async(tca_engine.calculate_tca, args=(request, ))
        for request in tca_request_list
    ]

    # Block until every request has finished
    output = [job.get() for job in pending]

    swim.close_pool(pool, True)

    # Every request should have produced a result ...
    assert len(output) == len(tca_request_list)

    # ... and each result should contain the trade DataFrame
    for results_df_dict in output:
        assert 'trade_df' in results_df_dict.keys()
示例#2
0
    def write_df_to_db(self, tickers=None, remove_duplicates=True, if_exists_table='append', if_exists_ticker='replace'):
        """Writes the data for each ticker to the database, by calling ``_write_df_to_db_single_thread`` once per
        ticker. If ``constants.use_multithreading`` is set, the calls are dispatched to a Swim pool, otherwise they
        are run one after another.

        NOTE(review): the per ticker work (presumably loading a large HDF5/Parquet file from disk into a pd DataFrame
        and dumping it) is done by ``_write_df_to_db_single_thread``, which is not visible here - confirm there.

        Parameters
        ----------
        tickers : str (list or dict)
            Ticker(s) to write. A dict contributes its keys, a single ticker is wrapped in a list. If None (default),
            the keys of ``self.tickers`` are used

        remove_duplicates : bool
            True (default) - removes any follow on duplicates in the dataset

        if_exists_table : str
            'append' - if database table already exists append data to it
            'replace' - remove existing database table

        if_exists_ticker : str
            'append' - if ticker already exists in the database, append to it
            'replace' - replace any data for this ticker

        Returns
        -------

        """

        if tickers is None:
            tickers = self.tickers.keys()

        if isinstance(tickers, dict):
            tickers = tickers.keys()

        # On Python 3, keys() returns a view (not a list), which would otherwise be wrapped below as a single
        # "ticker" - so materialise anything which came from keys() (and tuples) as a proper list
        if isinstance(tickers, tuple) or (not isinstance(tickers, (list, str)) and hasattr(tickers, 'isdisjoint')):
            tickers = list(tickers)

        if not isinstance(tickers, list):
            tickers = [tickers]

        if constants.use_multithreading:

            swim = Swim(parallel_library=constants.database_populator_threading_library)
            pool = swim.create_pool(thread_no=self._get_threads())

            try:
                result = [
                    pool.apply_async(self._write_df_to_db_single_thread,
                                     args=(ticker, remove_duplicates, if_exists_table, if_exists_ticker,))
                    for ticker in tickers
                ]

                # Wait for every ticker to finish (also re-raises any exception from a worker)
                for p in result:
                    p.get()
            finally:
                # Always release the pool, even if one of the workers failed
                swim.close_pool(pool, True)
        else:
            for ticker in tickers:
                self._write_df_to_db_single_thread(ticker, remove_duplicates, if_exists_table, if_exists_ticker)
示例#3
0
    def combine_mini_df_from_disk(self, tickers=None, remove_duplicates=True):
        """Combines the mini HDF5/Parquet files (for eg. 5 min chunks) into a very large HDF5/Parquet file, which is
        likely to be for multiple months of data, by calling ``_combine_mini_df_from_disk_single_thread`` once per
        ticker. If ``constants.use_multithreading`` is set, the calls are dispatched to a Swim pool, otherwise they
        are run one after another.

        Parameters
        ----------
        tickers : str (list or dict)
            Ticker(s) to combine. A dict contributes its keys, a single ticker is wrapped in a list. If None
            (default), the keys of ``self.tickers`` are used

        remove_duplicates : bool
            Remove duplicated market prices, which follow one another

        Returns
        -------

        """

        if tickers is None:
            tickers = self.tickers.keys()

        if isinstance(tickers, dict):
            tickers = tickers.keys()

        # On Python 3, keys() returns a view (not a list), which would otherwise be wrapped below as a single
        # "ticker" - so materialise anything which came from keys() (and tuples) as a proper list
        if isinstance(tickers, tuple) or (not isinstance(tickers, (list, str)) and hasattr(tickers, 'isdisjoint')):
            tickers = list(tickers)

        if not isinstance(tickers, list):
            tickers = [tickers]

        if constants.use_multithreading:
            swim = Swim(parallel_library=constants.
                        database_populator_threading_library)
            pool = swim.create_pool(thread_no=self._get_threads())

            try:
                result = [
                    pool.apply_async(
                        self._combine_mini_df_from_disk_single_thread,
                        args=(
                            ticker,
                            remove_duplicates,
                        )) for ticker in tickers
                ]

                # Wait for every ticker to finish (also re-raises any exception from a worker)
                for p in result:
                    p.get()
            finally:
                # Always release the pool, even if one of the workers failed
                swim.close_pool(pool, True)

        else:
            for ticker in tickers:
                self._combine_mini_df_from_disk_single_thread(
                    ticker, remove_duplicates)
示例#4
0
    def download_from_external_source(self,
                                      append_data=True,
                                      remove_duplicates=True,
                                      if_exists_table='append',
                                      if_exists_ticker='append',
                                      number_of_days=30 * 7,
                                      chunk_int_min=None,
                                      start_date=None,
                                      finish_date=None,
                                      delete_cached_files=False,
                                      tickers=None,
                                      write_temp_to_disk=True,
                                      write_to_disk_db=True,
                                      read_cached_from_disk=True,
                                      write_large_csv=False,
                                      write_large_hdf5_parquet=True,
                                      csv_folder=constants.csv_folder,
                                      csv_compression=None,
                                      return_df=False,
                                      web_proxies=constants.web_proxies):
        """Downloads market data from an external source and then dumps to HDF5/Parquet files for temporary storage which is cached.
        If HDF5/Parquet cached files already exist for a time segment we read them in, saving us to make an external data call.

        Lastly, dumps it to an internal database.

        The download is done ticker by ticker. For every ticker the date range is split into daily segments, and
        each segment is (optionally) split again into chunks of chunk_int_min minutes, which are fetched via
        ``_fetch_market_data`` (using a Swim pool if ``constants.use_multithreading`` is set).

        Parameters
        ----------
        append_data : bool
            True - only start collecting later data not already in database (ignoring number_of_days parameter)
            False - start collecting all data, ignoring anything stored in database

        remove_duplicates : bool
            True (default) - remove values which are repeated
            False - leave in repeated values

        if_exists_table : str
            'append' - if database table already exists append data to it
            'replace' - remove existing database table

        if_exists_ticker : str
            'append' - if ticker already exists in the database, append to it
            'replace' - replace any data for this ticker

        number_of_days : int
            Number of days to download data for (only used when neither start_date nor finish_date are given)

        chunk_int_min : int (None)
            Size of each download (default - specified in constants)

        start_date : str/datetime (None)
            Start of the download window (parsed with ``time_series_ops.date_parse``)

        finish_date : str/datetime (None)
            End of the download window, capped at the current UTC time

        delete_cached_files : bool
            True - delete files in ``self.temp_data_folder`` matching each ticker first (only if write_to_disk_db
            is also True)

        tickers : str (list)
            Tickers to download (default - whatever ``_get_tickers`` returns)

        write_temp_to_disk : bool
            Passed on to ``_fetch_market_data`` as its write to disk flag

        write_to_disk_db : bool
            True (default) - append the downloaded data to the output data source

        read_cached_from_disk : bool
            Passed on to ``_fetch_market_data``

        write_large_csv : bool
            True - dump the full DataFrame for each ticker as a CSV in csv_folder

        write_large_hdf5_parquet : bool
            True (default) - dump the full DataFrame for each ticker as a binary file in csv_folder

        csv_folder : str
            Folder for the large CSV/binary dumps

        csv_compression : str (None)
            'gzip' - write the large CSV as a gzip compressed ``.csv.gz`` file

        return_df : bool
            True - also return the downloaded DataFrames

        web_proxies : dict
            Passed on to the market data fetch calls

        Returns
        -------
        (list, dict)
            Messages gathered during the download (those returned by ``_fetch_market_data`` alongside non-empty
            chunks, plus a note for every ticker with no data) and a dictionary of ticker -> DataFrame (only
            populated when return_df is True). None is returned if finish_date is before start_date.
        """
        logger = LoggerManager.getLogger(__name__)

        if write_to_disk_db:
            data_source_local = self._get_output_data_source()

        if write_large_csv and not os.path.isdir(csv_folder):
            # Report the folder we are actually going to write the CSV into
            logger.warning("CSV folder " + str(csv_folder) +
                           " where we are about to write does not exist")

        # What chunk size in minutes do we want for this data provider?
        if chunk_int_min is None:
            chunk_int_min = self._get_download_chunk_min_size()

        if chunk_int_min is None:
            chunk_size_str = None
        else:
            chunk_size_str = str(chunk_int_min) + "min"

        if tickers is None:
            tickers = self._get_tickers()

        if isinstance(tickers, str):
            tickers = [tickers]

        # If there's no start or finish date, choose a default start finish data
        if start_date is None and finish_date is None:
            finish_date = datetime.datetime.utcnow()
            finish_date = datetime.datetime(finish_date.year,
                                            finish_date.month, finish_date.day,
                                            0, 0, 0, 0)

            start_date = finish_date - timedelta(days=number_of_days)  # 30*7
        else:
            start_date = self.time_series_ops.date_parse(start_date)
            finish_date = self.time_series_ops.date_parse(finish_date)

        if finish_date < start_date:
            logger.error("Download finish date is before start data!")

            return

        now = pd.Timestamp(datetime.datetime.utcnow(), tz='utc')

        # Do not allow downloading of future data! The default finish date above is tz-naive, and cannot be compared
        # directly with the tz-aware "now", so compare using a UTC view of it (naive dates are assumed to be UTC)
        finish_date_utc = pd.Timestamp(finish_date)

        if finish_date_utc.tzinfo is None:
            finish_date_utc = finish_date_utc.tz_localize('utc')

        if finish_date_utc > now:
            finish_date = now

        # Accumulated over every ticker and every day (also defined if there are no tickers at all)
        msg_list = []
        df_dict = {}

        # Loop through each ticker
        for ticker in tickers:

            has_old = False

            # Each ticker gets its own copy of the date window, so that the start date found in the database for one
            # ticker does not narrow the download for the tickers which follow
            ticker_start_date = start_date
            ticker_finish_date = finish_date

            if delete_cached_files and write_to_disk_db:
                logger.info("Deleting all cached temp files for " + ticker)

                for name in glob.glob(self.temp_data_folder + '/*' + ticker +
                                      "*"):
                    try:
                        os.remove(name)
                    except OSError:
                        logger.warning("Couldn't delete file " + name)

                logger.info("Finished deleting cached files for " + ticker)

            # If we have been asked to append data, load up what you can from the internal database
            # find the last point
            if append_data and if_exists_ticker == 'append' and write_to_disk_db:
                logger.info("Trying to download old data first for " + ticker)

                try:
                    df_old = data_source_local.fetch_market_data(
                        ticker_start_date,
                        ticker_finish_date,
                        ticker,
                        web_proxies=web_proxies)

                    # This will vary between tickers (in particular if we happen to add a new ticker)
                    ticker_start_date = df_old.index[-1]

                    has_old = True

                    # Remove reference - big file!
                    df_old = None

                except Exception as e:
                    logger.info("No data found for ticker " + ticker +
                                " with error: " + str(e))
            else:
                logger.info("Downloading new data for " + ticker + ".")

            # Date range may not work with timezones
            ticker_start_date = pd.Timestamp(
                ticker_start_date.replace(tzinfo=None))
            ticker_finish_date = pd.Timestamp(
                ticker_finish_date.replace(tzinfo=None))

            if ticker_finish_date - ticker_start_date < pd.Timedelta(days=1):
                start_date_list = [ticker_start_date, ticker_finish_date]
            else:
                # download from that last point to the present day
                start_date_list = [
                    pd.Timestamp(x.to_pydatetime())
                    for x in pd.date_range(ticker_start_date,
                                           ticker_finish_date)
                ]

                if ticker_finish_date > start_date_list[-1]:
                    start_date_list.append(ticker_finish_date)

            df = None
            df_remote_list = []

            # Create downloads in x minute chunks (if we request very large chunks of data with certain data providers,
            # we could cause problems!)
            # Loop by day (otherwise can end up with too many open files!)
            for day_start, day_finish in zip(start_date_list[:-1],
                                             start_date_list[1:]):

                if chunk_size_str is not None and \
                        not (day_finish - day_start < pd.Timedelta(minutes=chunk_int_min)):
                    start_date_hist, finish_date_hist = UtilFunc(
                    ).split_into_freq(day_start,
                                      day_finish,
                                      freq=chunk_size_str)
                else:
                    # No chunking requested, or the segment is already smaller than one chunk
                    start_date_hist = [day_start]
                    finish_date_hist = [day_finish]

                # For FX and most other markets we should remove weekends (cryptocurrencies do have weekend data)
                if self._remove_weekend_points():
                    start_date_hist, finish_date_hist = UtilFunc(
                    ).remove_weekend_points(start_date_hist,
                                            finish_date_hist)

                if constants.use_multithreading:

                    # Create a multiprocess object for downloading data
                    swim = Swim(parallel_library=constants.
                                database_populator_threading_library)
                    pool = swim.create_pool(thread_no=self._get_threads())

                    try:
                        result = [
                            pool.apply_async(
                                self._fetch_market_data,
                                args=(chunk_start, chunk_finish, ticker,
                                      write_temp_to_disk,
                                      read_cached_from_disk, web_proxies))
                            for chunk_start, chunk_finish in zip(
                                start_date_hist, finish_date_hist)
                        ]

                        output = [p.get() for p in result]
                    finally:
                        # Always release the pool, even if one of the downloads failed
                        swim.close_pool(pool, True)
                else:
                    # Otherwise run in single threaded fashion
                    output = [
                        self._fetch_market_data(
                            chunk_start,
                            chunk_finish,
                            ticker,
                            write_to_disk=write_temp_to_disk,
                            read_cached_from_disk=read_cached_from_disk,
                            web_proxies=web_proxies)
                        for chunk_start, chunk_finish in zip(
                            start_date_hist, finish_date_hist)
                    ]

                # Get all the dataframe chunks and returned messages
                df_list = [
                    self._remove_duplicates_time_series(x,
                                                        remove_duplicates,
                                                        field='mid')
                    for x, y in output if x is not None
                ]

                # Keep the messages from every day/ticker (not only from the last chunk processed)
                msg_list.extend(
                    y for x, y in output if x is not None and y is not None)

                # Concatenate all the 5 (or larger) minute data chunks
                try:
                    if df_list:
                        df_temp = pd.concat(df_list)

                        if df_temp is not None and not df_temp.empty:
                            df_remote_list.append(df_temp)

                except Exception as e:
                    logger.error(str(e))

            if df_remote_list:
                df = pd.concat(df_remote_list)

                # Need to sort data (database assumes sorted data for chunking/searches)
                df = df.sort_index()
                df = self.time_series_ops.localize_as_UTC(df)

                if write_large_hdf5_parquet and df is not None and not df.empty:
                    key = '_' + self._get_postfix() + "_" + \
                          (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')
                    filename = os.path.join(csv_folder,
                                            ticker + key) + '.' + fileformat

                    # Temporary cache for testing purposes (also if the process crashes, we can read this back in)
                    UtilFunc().write_dataframe_to_binary(
                        df, filename, format=binary_format)

            if df is not None:
                # Assume UTC time (don't want to mix UTC and non-UTC in database!)
                df = self.time_series_ops.localize_as_UTC(df)

            # write CSV
            if write_large_csv and df is not None and not df.empty:
                key = '_' + self._get_postfix() + "_" + \
                      (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')

                # Compare by value ("is" on a string literal depends on interning)
                if csv_compression == 'gzip':
                    df.to_csv(os.path.join(csv_folder,
                                           ticker + key + ".csv.gz"),
                              compression='gzip')
                else:
                    df.to_csv(os.path.join(csv_folder, ticker + key + ".csv"))

            if return_df:
                df_dict[ticker] = df

            # Dump what we have locally (or whatever DatabaseSource we have defined)
            try:

                ticker_start_date = ticker_start_date.replace(tzinfo=pytz.utc)

                # Remove first point if matches last point from dataset (ie. drop the row we already have stored,
                # keeping everything after it)
                if has_old and df is not None and not df.empty:
                    if df.index[0] == ticker_start_date:
                        df = df.iloc[1:]

                if df is not None:
                    df = df.sort_index()

                    df = self._remove_duplicates_time_series(df,
                                                             remove_duplicates,
                                                             field='mid')

                # Nothing to write if only the overlapping point was downloaded
                if write_to_disk_db and df is not None and not df.empty:
                    data_source_local.append_market_data(
                        df,
                        ticker,
                        if_exists_table=if_exists_table,
                        if_exists_ticker=if_exists_ticker)

                    logger.info("Wrote to database for " + ticker)

            except Exception as e:
                final_err = "Data was missing for these dates " + str(ticker_start_date) + " - " \
                            + str(ticker_finish_date) + " for " + str(tickers) \
                            + " Didn't write anything to disk or return any valid dataframe: " + str(e)

                logger.error(final_err)

            if df is None:
                msg_list.append("No downloaded data for " +
                                str(ticker_start_date) + " - " +
                                str(ticker_finish_date) +
                                ". Is this a holiday?")

        # Returns a status containing any failed downloads, which can be read by a user
        return msg_list, df_dict