Example no. 1
class DatabasePopulator(ABC):
    """DatabasePopulator connects from one data source (typically an external one via a DatabaseSource eg. DatabaseNCFX)
    downloads historical data from that and then dumps it locally
    """
    def __init__(self,
                 temp_data_folder=constants.temp_data_folder,
                 temp_large_data_folder=constants.temp_large_data_folder,
                 tickers=None,
                 data_store=None):

        self.temp_data_folder = temp_data_folder
        self.temp_large_data_folder = temp_large_data_folder
        self.tickers = None
        self.util_func = UtilFunc()
        self.time_series_ops = TimeSeriesOps()
        self.data_store = data_store

        logger = LoggerManager().getLogger(__name__)

        if not (os.path.isdir(self.temp_data_folder)):
            logger.warn("Temp data folder " + self.temp_data_folder +
                        " does not exist")

        if not (os.path.isdir(self.temp_large_data_folder)):
            logger.warn("Temp large data folder " + self.temp_data_folder +
                        " does not exist")

        if tickers is not None:
            self.tickers = tickers

    @abc.abstractmethod
    def _fetch_market_data(self,
                           start,
                           finish,
                           ticker,
                           write_to_disk=True,
                           read_cached_from_disk=True,
                           web_proxies=constants.web_proxies):
        """Fetches market data in a single download for a ticker. We need to be careful not to specify chunks which are
        too large, as many external sources will have a limit on how much data we can download in one chunk.

        Parameters
        ----------
        start : datetime
            Start date/time of the download

        finish : datetime
            Finish date/time of the download

        ticker : str
            Ticker to be downloaded

        write_to_disk : bool
            Should we cache the downloaded chunk on disk?

        read_cached_from_disk : bool
            Should we read a cached chunk from disk, if it already exists?

        web_proxies : dict
            Addresses for web proxies

        Returns
        -------

        """
        pass

    @abc.abstractmethod
    def _get_postfix(self):
        """The postfix which represents this data source, eg. 'ncfx' for New Change FX or 'dukascopy' for Dukascopy

        Returns
        -------
        str
        """
        pass

    @abc.abstractmethod
    def _get_output_data_source(self):
        """Gets the DatabaseSource object which represents how we wish to store the market data internally

        Returns
        -------
        DatabaseSource
        """
        return

    def _remove_weekend_points(self):
        return True

    @abc.abstractmethod
    def _get_input_data_source(self):
        """Gets the DatabaseSource object which represents how we input the market data (typically, this will be from
        an external data source)

        Returns
        -------
        DatabaseSource
        """
        return

    @abc.abstractmethod
    def _get_tickers(self):
        """List of tickers that can accessedd from the external/input DatabaseSource

        Returns
        -------
        str (list)
        """
        return

    @abc.abstractmethod
    def _get_threads(self, start_date_hist=None, finish_date_hist=None):
        """How many threads to use when downloading from our external/input DatabaseSource

        Returns
        -------
        int
        """
        return

    def download_to_csv(self,
                        start_date,
                        finish_date,
                        tickers,
                        remove_duplicates=True,
                        split_size='monthly',
                        chunk_int_min=None,
                        include_partial_periods=False,
                        write_temp_to_disk=True,
                        write_large_csv=True,
                        write_large_hdf5_parquet=True,
                        csv_folder=constants.csv_folder,
                        csv_compression=None,
                        return_df=False,
                        web_proxies=constants.web_proxies):

        start_date = self.time_series_ops.date_parse(start_date)
        finish_date = self.time_series_ops.date_parse(finish_date)

        dates = self.util_func.split_date_single_list(
            start_date,
            finish_date,
            split_size=split_size,
            add_partial_period_start_finish_dates=include_partial_periods)

        df_dict = {}
        msg = []

        for i in range(0, len(dates) - 1):
            msg_list, df_dict_list = self.download_from_external_source(
                start_date=dates[i],
                finish_date=dates[i + 1],
                tickers=tickers,
                chunk_int_min=chunk_int_min,
                append_data=False,
                remove_duplicates=remove_duplicates,
                write_temp_to_disk=write_temp_to_disk,
                write_to_disk_db=False,
                write_large_csv=write_large_csv,
                write_large_hdf5_parquet=write_large_hdf5_parquet,
                csv_folder=csv_folder,
                csv_compression=csv_compression,
                return_df=return_df,
                web_proxies=web_proxies)

            if msg_list != []:
                msg.append(msg_list)

            if return_df:
                for k in df_dict_list.keys():
                    if k in df_dict.keys():
                        df_dict[k] = pd.concat([df_dict[k], df_dict_list[k]])
                    else:
                        df_dict[k] = df_dict_list[k]

        return self.util_func.flatten_list_of_lists(msg), df_dict
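
    # A minimal usage sketch (not from the original source): assuming a hypothetical concrete
    # subclass DatabasePopulatorNCFX and valid folder constants, this would download a few months
    # of tick data in monthly splits, write large CSV/Parquet files and also return the DataFrames.
    #
    #   populator = DatabasePopulatorNCFX(tickers=['EURUSD', 'GBPUSD'])
    #   msg, df_dict = populator.download_to_csv(
    #       '01 Jan 2020', '01 Apr 2020', ['EURUSD', 'GBPUSD'],
    #       split_size='monthly', csv_compression='gzip', return_df=True)
    #
    #   # msg collects any download warnings, df_dict maps each ticker to its DataFrame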

    def download_from_external_source(self,
                                      append_data=True,
                                      remove_duplicates=True,
                                      if_exists_table='append',
                                      if_exists_ticker='append',
                                      number_of_days=30 * 7,
                                      chunk_int_min=None,
                                      start_date=None,
                                      finish_date=None,
                                      delete_cached_files=False,
                                      tickers=None,
                                      write_temp_to_disk=True,
                                      write_to_disk_db=True,
                                      read_cached_from_disk=True,
                                      write_large_csv=False,
                                      write_large_hdf5_parquet=True,
                                      csv_folder=constants.csv_folder,
                                      csv_compression=None,
                                      return_df=False,
                                      web_proxies=constants.web_proxies):
        """Downloads market data from an external source and then dumps to HDF5/Parquet files for temporary storage which is cached.
        If HDF5/Parquet cached files already exist for a time segment we read them in, saving us to make an external data call.

        Lastly, dumps it to an internal database.

        Parameters
        ----------
        append_data : bool
            True - only start collecting later data not already in database (ignoring number_of_days parameter)
            False - start collecting all data, ignoring anything stored in database

        remove_duplicates : bool
            True (default) - remove values which are repeated
            False - leave in repeated values

        if_exists_table : str
            'append' - if database table already exists append data to it
            'replace' - remove existing database table

        if_exists_ticker : str
            'append' - if ticker already exists in the database, append to it
            'replace' - replace any data for this ticker

        number_of_days : int
            Number of days to download data for

        chunk_int_min : int (None)
            Size of each download chunk in minutes (default: specified in constants)

        Returns
        -------

        """
        # Swim()

        logger = LoggerManager.getLogger(__name__)

        if write_to_disk_db:
            data_source_local = self._get_output_data_source()

        if write_large_csv:
            if not (os.path.isdir(csv_folder)):
                logger.warn("CSV folder " + self.temp_data_folder +
                            " where we are about to write does not exist")

        # What chunk size in minutes do we want for this data provider?
        if chunk_int_min is None:
            chunk_int_min = self._get_download_chunk_min_size()

        if chunk_int_min is None:
            chunk_size_str = None
        else:
            chunk_size_str = str(chunk_int_min) + "min"

        if tickers is None:
            tickers = self._get_tickers()

        if isinstance(tickers, str):
            tickers = [tickers]

        # If there's no start or finish date, choose a default start/finish date
        if start_date is None and finish_date is None:
            finish_date = datetime.datetime.utcnow()
            finish_date = datetime.datetime(finish_date.year,
                                            finish_date.month, finish_date.day,
                                            0, 0, 0, 0)

            start_date = finish_date - timedelta(days=number_of_days)  # 30*7
        else:
            start_date = self.time_series_ops.date_parse(start_date)
            finish_date = self.time_series_ops.date_parse(finish_date)

        if finish_date < start_date:
            logger.error("Download finish date is before start data!")

            return

        now = pd.Timestamp(datetime.datetime.utcnow(), tz='utc')

        # Do not allow downloading of future data!
        if finish_date > now:
            finish_date = now

        df_dict = {}
        msg_list = []

        # Loop through each ticker
        for ticker in tickers:

            has_old = False

            if delete_cached_files and write_to_disk_db:
                logger.info("Deleting all cached temp files for " + ticker)

                for name in glob.glob(self.temp_data_folder + '/*' + ticker +
                                      "*"):
                    try:
                        os.remove(name)
                    except:
                        logger.warn("Couldn't delete file " + name)

                logger.info("Finished deleting cached files for " + ticker)

            # If we have been asked to append data, load up what you can from the internal database
            # find the last point
            if append_data and if_exists_ticker == 'append' and write_to_disk_db:
                logger.info("Trying to download old data first for " + ticker)

                try:
                    df_old = data_source_local.fetch_market_data(
                        start_date,
                        finish_date,
                        ticker,
                        web_proxies=web_proxies)

                    # This will vary between tickers (in particular if we happen to add a new ticker)
                    start_date = df_old.index[-1]

                    has_old = True

                    # Remove reference - big file!
                    df_old = None

                except Exception as e:
                    logger.info("No data found for ticker " + ticker +
                                " with error: " + str(e))
            else:
                logger.info("Downloading new data for " + ticker + ".")

            # Date range may not work with timezones
            start_date = pd.Timestamp(start_date.replace(tzinfo=None))
            finish_date = pd.Timestamp(finish_date.replace(tzinfo=None))

            if finish_date - start_date < pd.Timedelta(days=1):
                start_date_list = [start_date, finish_date]
            else:
                # download from that last point to the present day
                start_date_list = pd.date_range(start_date, finish_date)

                start_date_list = [
                    pd.Timestamp(x.to_pydatetime()) for x in start_date_list
                ]

                if finish_date > start_date_list[-1]:
                    start_date_list.append(finish_date)

            df = None
            filename = os.path.join(self.temp_data_folder,
                                    ticker) + '.' + fileformat

            try:
                # df = UtilFunc().read_dataframe_from_hdf(filename)
                pass
            except:
                logger.info("Couldn't read HDF5/Parquet file for " + ticker)

            # Create downloads in x minute chunks (if we request very large chunks of data with certain data providers,
            # we could cause problems!)
            if df is None:
                df_remote_list = []

                # Loop by day (otherwise can end up with too many open files!)
                for i in range(0, len(start_date_list) - 1):

                    if chunk_size_str is not None:
                        if start_date_list[
                                i + 1] - start_date_list[i] < pd.Timedelta(
                                    minutes=chunk_int_min):
                            start_date_hist = [start_date_list[i]]
                            finish_date_hist = [start_date_list[i + 1]]
                        else:
                            start_date_hist, finish_date_hist = UtilFunc(
                            ).split_into_freq(start_date_list[i],
                                              start_date_list[i + 1],
                                              freq=chunk_size_str)
                    else:
                        start_date_hist = [start_date_list[i]]
                        finish_date_hist = [start_date_list[i + 1]]

                    # For FX and most other markets we should remove weekends (cryptocurrencies do have weekend data)
                    if self._remove_weekend_points():
                        start_date_hist, finish_date_hist = UtilFunc(
                        ).remove_weekend_points(start_date_hist,
                                                finish_date_hist)

                    output = []

                    if constants.use_multithreading:

                        # Create a multiprocess object for downloading data
                        swim = Swim(parallel_library=constants.
                                    database_populator_threading_library)
                        pool = swim.create_pool(thread_no=self._get_threads())

                        result = []

                        for i in range(0, len(start_date_hist)):
                            # output.append(self._fetch_market_data(start_date_hist[i], finish_date_hist[i], ticker))

                            result.append(
                                pool.apply_async(
                                    self._fetch_market_data,
                                    args=(start_date_hist[i],
                                          finish_date_hist[i], ticker,
                                          write_temp_to_disk,
                                          read_cached_from_disk, web_proxies)))

                        output = [p.get() for p in result]

                        swim.close_pool(pool, True)
                    else:
                        # Otherwise run in single threaded fashion
                        for i in range(0, len(start_date_hist)):
                            output.append(
                                self._fetch_market_data(
                                    start_date_hist[i],
                                    finish_date_hist[i],
                                    ticker,
                                    write_to_disk=write_temp_to_disk,
                                    read_cached_from_disk=read_cached_from_disk,
                                    web_proxies=web_proxies))

                    # Get all the dataframe chunks and returned messages
                    df_list = [
                        self._remove_duplicates_time_series(x,
                                                            remove_duplicates,
                                                            field='mid')
                        for x, y in output if x is not None
                    ]
                    msg_list.extend(
                        [y for x, y in output if x is not None and y is not None])

                    # Concatenate all the 5 (or larger) minute data chunks
                    try:
                        if df_list != []:
                            df_temp = pd.concat(df_list)

                            if df_temp is not None:
                                if not (df_temp.empty):
                                    df_remote_list.append(df_temp)

                    except Exception as e:
                        logger.error(str(e))

                if df_remote_list != []:
                    df = pd.concat(df_remote_list)

                    # Need to sort data (database assumes sorted data for chunking/searches)
                    df = df.sort_index()
                    df = self.time_series_ops.localize_as_UTC(df)

                    if write_large_hdf5_parquet:
                        if df is not None:
                            if not (df.empty):
                                key =  '_' + self._get_postfix() + "_" + \
                                       (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')
                                filename = os.path.join(
                                    csv_folder,
                                    ticker + key) + '.' + fileformat

                                # Temporary cache for testing purposes (also if the process crashes, we can read this back in)
                                UtilFunc().write_dataframe_to_binary(
                                    df, filename, format=binary_format)

            if df is not None:
                # Assume UTC time (don't want to mix UTC and non-UTC in database!)
                df = self.time_series_ops.localize_as_UTC(df)

            # write CSV
            if write_large_csv:
                if df is not None:
                    if not (df.empty):
                        key = '_' + self._get_postfix() + "_" + \
                              (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')

                        if csv_compression == 'gzip':
                            df.to_csv(os.path.join(csv_folder,
                                                   ticker + key + ".csv.gz"),
                                      compression='gzip')
                        else:
                            df.to_csv(
                                os.path.join(csv_folder,
                                             ticker + key + ".csv"))

            if return_df:
                df_dict[ticker] = df

            # Dump what we have locally (or whatever DatabaseSource we have defined)
            try:

                start_date = start_date.replace(tzinfo=pytz.utc)

                # Remove the first point if it matches the last point from the existing dataset
                if has_old:
                    if df.index[0] == start_date:
                        df = df[1:]

                if df is not None:
                    df = df.sort_index()

                    df = self._remove_duplicates_time_series(df,
                                                             remove_duplicates,
                                                             field='mid')

                if write_to_disk_db and df is not None:
                    data_source_local.append_market_data(
                        df,
                        ticker,
                        if_exists_table=if_exists_table,
                        if_exists_ticker=if_exists_ticker)

                    logger.info("Wrote to database for " + ticker)

            except Exception as e:
                final_err = "Data was missing for these dates " + str(start_date) + " - " + str(finish_date) + " for " \
                            + str(tickers) + " Didn't write anything to disk or return any valid dataframe: " + str(e)

                logger.error(final_err)

            if df is None:
                msg_list.append("No downloaded data for " + str(start_date) +
                                " - " + str(finish_date) +
                                ". Is this a holiday?")

        # Returns a status containing any failed downloads, which can be read by a user
        return msg_list, df_dict
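
    # Illustrative sketch (not in the original source) of the flags documented above: with
    # append_data=True and if_exists_ticker='append' only data after the last point already stored
    # for the ticker is fetched, while if_exists_ticker='replace' rewrites the ticker's history for
    # the chosen window. DatabasePopulatorNCFX is a hypothetical concrete subclass.
    #
    #   populator = DatabasePopulatorNCFX(tickers=['EURUSD'])
    #
    #   # Top up the database with anything newer than what is already stored
    #   msg, _ = populator.download_from_external_source(
    #       tickers='EURUSD', append_data=True, if_exists_ticker='append')
    #
    #   # Re-download a fixed window from scratch for this ticker
    #   msg, _ = populator.download_from_external_source(
    #       tickers='EURUSD', start_date='01 Jan 2020', finish_date='01 Feb 2020',
    #       append_data=False, if_exists_ticker='replace')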

    def _remove_duplicates_time_series(self,
                                       df,
                                       remove_duplicates,
                                       field='mid'):

        if remove_duplicates:
            df = self.time_series_ops.drop_consecutive_duplicates(df, field)

        return df
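
    # Sketch of the intended effect (an assumption based on the method and field names, not taken
    # from TimeSeriesOps itself): consecutive repeats of the 'mid' field are dropped, keeping the
    # first row of each run of identical values.
    #
    #   df = pd.DataFrame({'mid': [1.10, 1.10, 1.11, 1.11, 1.10]},
    #                     index=pd.date_range('2020-01-01', periods=5, freq='1min'))
    #   populator._remove_duplicates_time_series(df, True, field='mid')
    #   # -> rows at 00:00 (1.10), 00:02 (1.11) and 00:04 (1.10) would remain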

    def combine_mini_df_from_disk(self, tickers=None, remove_duplicates=True):
        """Combines the mini HDF5/Parquet files for eg. 5 min chunks and combine into a very large HDF5/Parquet file, which is likely to be
        for multiple months of data. Uses multithreading to speed up, by using a thread for each different ticker.

        Parameters
        ----------
        tickers : str (list or dict)
            List of tickers

        remove_duplicates : bool
            Remove duplicated market prices, which follow one another

        Returns
        -------

        """

        if tickers is None: tickers = self.tickers

        if isinstance(tickers, dict): tickers = list(tickers.keys())

        if not (isinstance(tickers, list)):
            tickers = [tickers]

        if constants.use_multithreading:
            swim = Swim(parallel_library=constants.
                        database_populator_threading_library)
            pool = swim.create_pool(thread_no=self._get_threads())

            result = []

            for i in range(0, len(tickers)):
                result.append(
                    pool.apply_async(
                        self._combine_mini_df_from_disk_single_thread,
                        args=(
                            tickers[i],
                            remove_duplicates,
                        )))

            output = [p.get() for p in result]

            swim.close_pool(pool, True)

        else:
            for i in range(0, len(tickers)):
                self._combine_mini_df_from_disk_single_thread(
                    tickers[i], remove_duplicates)

    def _combine_mini_df_from_disk_single_thread(self,
                                                 ticker,
                                                 remove_duplicates=True):

        logger = LoggerManager.getLogger(__name__)
        time_series_ops = TimeSeriesOps()

        logger.info('Getting ' + ticker + ' filenames...')
        temp_data_folder = self.temp_data_folder

        filename_list = []

        for root, dirnames, filenames in os.walk(temp_data_folder):

            for filename in filenames:
                if ticker in filename and '.' + fileformat in filename:
                    filename_h5_parquet = os.path.join(root, filename)

                    # if filename is less than 10MB add (otherwise likely a very large aggregated file!)
                    if os.path.getsize(filename_h5_parquet) < 10 * 1024 * 1024:
                        filename_list.append(filename_h5_parquet)

        df_list = []

        util_func = UtilFunc()

        logger.info('Loading ' + ticker + ' mini dataframes into memory')

        i = 0

        if len(filename_list) == 0:
            logger.warn("Looks like there are no files for " + ticker +
                        " in " + temp_data_folder +
                        ". Are you sure path is correct?")

        # Go through each mini file which represents a few minutes of data and append it
        for filename in filename_list:
            filesize = 0

            try:
                filesize = os.path.getsize(filename) / 1024.0
                df = util_func.read_dataframe_from_binary(filename,
                                                          format=binary_format)

                i = i + 1

                # Every 100 files, print reading progress
                if i % 100 == 0:
                    logger.info('Reading ' + filename + ' number ' + str(i))

                if df is not None:
                    df = df.sort_index()
                    df = self._remove_duplicates_time_series(df,
                                                             remove_duplicates,
                                                             field='mid')

                    df_list.append(df)
            except Exception as e:
                logger.warn('Failed to parse ' + filename + " of " +
                            str(filesize) + "KB")  # + str(e))

            # if i > 1000:
            #    break

        # Assume UTC time (don't want to mix UTC and non-UTC in database!)
        if df_list == []:
            logger.warn('No dataframe read for ' + ticker +
                        ', cannot combine!')

            return

        logger.info('About to combine ' + ticker +
                    ' into large dataframe to write to disk...')

        df = pd.concat(df_list)
        df = time_series_ops.localize_as_UTC(df)

        df = df.sort_index()

        df = self._remove_duplicates_time_series(df,
                                                 remove_duplicates,
                                                 field='mid')

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder,
                                ticker + postfix) + '.' + fileformat

        df = time_series_ops.localize_as_UTC(df)
        util_func.write_dataframe_to_binary(df, filename, format=binary_format)

    def write_df_to_db(self,
                       tickers=None,
                       remove_duplicates=True,
                       if_exists_table='append',
                       if_exists_ticker='replace'):
        """Loads up a large HDF5/Parquet file from disk into a pd DataFrame and then dumps locally.
        Uses multithreading to speed it up, by using a thread for each different ticker.

        Parameters
        ----------
        tickers : str (list or dict)
            List of tickers

        remove_duplicates : bool
            True (default) - removes any follow on duplicates in the dataset

        if_exists_table : str
            'append' - if database table already exists append data to it
            'replace' - remove existing database table

        if_exists_ticker : str
            'append' - if ticker already exists in the database, append to it
            'replace' - replace any data for this ticker

        Returns
        -------

        """

        if tickers is None: tickers = self.tickers

        if isinstance(tickers, dict): tickers = list(tickers.keys())

        if not (isinstance(tickers, list)):
            tickers = [tickers]

        if constants.use_multithreading:

            swim = Swim(parallel_library=constants.
                        database_populator_threading_library)
            pool = swim.create_pool(thread_no=self._get_threads())

            result = []

            for i in range(0, len(tickers)):
                result.append(
                    pool.apply_async(self._write_df_to_db_single_thread,
                                     args=(
                                         tickers[i],
                                         remove_duplicates,
                                         if_exists_table,
                                         if_exists_ticker,
                                     )))

            output = [p.get() for p in result]

            swim.close_pool(pool, True)
        else:
            for i in range(0, len(tickers)):
                self._write_df_to_db_single_thread(tickers[i],
                                                   remove_duplicates,
                                                   if_exists_table,
                                                   if_exists_ticker)
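
    # A possible end-to-end sketch (not from the original source) combining the two steps above:
    # first stitch the small cached chunk files into one large file per ticker, then push those
    # large files into the database. DatabasePopulatorNCFX is a hypothetical concrete subclass.
    #
    #   populator = DatabasePopulatorNCFX(tickers={'EURUSD': 'EURUSD'})
    #   populator.combine_mini_df_from_disk(tickers=['EURUSD'], remove_duplicates=True)
    #   populator.write_df_to_db(tickers=['EURUSD'],
    #                            if_exists_table='append', if_exists_ticker='replace')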

    def _write_df_to_db_single_thread(self,
                                      ticker,
                                      remove_duplicates=True,
                                      if_exists_table='append',
                                      if_exists_ticker='replace'):

        logger = LoggerManager.getLogger(__name__)

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder,
                                ticker + postfix) + '.' + fileformat

        logger.info("Reading " + filename)

        util_func = UtilFunc()
        time_series_ops = TimeSeriesOps()
        data_source_local = self._get_output_data_source()

        df = util_func.read_dataframe_from_binary(filename,
                                                  format=binary_format)

        if df is not None:
            df = time_series_ops.localize_as_UTC(df)

            data_source_local.append_market_data(
                df,
                ticker,
                if_exists_table=if_exists_table,
                if_exists_ticker=if_exists_ticker)
        else:
            logger.warn("Couldn't write dataframe for " + ticker +
                        " to database, appears it is empty!")

    def _remove_saturday(self):
        return True
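
# A minimal sketch (not part of the original source) of what a concrete subclass of
# DatabasePopulator has to provide, using entirely hypothetical names (DatabaseSourceMyVendor,
# DatabaseSourceMyStore, constants.myvendor_tickers) purely to illustrate the abstract interface above.
#
#   class DatabasePopulatorMyVendor(DatabasePopulator):
#
#       def _get_postfix(self): return 'myvendor'
#
#       def _get_tickers(self): return list(constants.myvendor_tickers.keys())
#
#       def _get_threads(self, start_date_hist=None, finish_date_hist=None): return 4
#
#       def _get_download_chunk_min_size(self): return 60
#
#       def _get_input_data_source(self): return DatabaseSourceMyVendor()
#
#       def _get_output_data_source(self): return DatabaseSourceMyStore()
#
#       def _fetch_market_data(self, start, finish, ticker, write_to_disk=True,
#                              read_cached_from_disk=True, web_proxies=constants.web_proxies):
#           df = self._get_input_data_source().fetch_market_data(
#               start, finish, ticker, web_proxies=web_proxies)
#
#           # download_from_external_source expects a (DataFrame, message) tuple back
#           return df, None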
Example no. 2
    def create_test_trade_order(self,
                                ticker,
                                start_date='01 Jan 2016',
                                finish_date='01 May 2018',
                                order_min_size=0.5 * constants.MILLION,
                                order_max_size=20.0 * constants.MILLION,
                                number_of_orders_min_per_year=252 * 20,
                                number_of_orders_max_per_year=252 * 200):
        """Create a randomised list of orders & trade using indicative market data as a source (and perturbing the
        execution prices, within various constraints, such as the approximate size of orders trades, the orders per _year

        Parameters
        ----------
        ticker : str
            Ticker

        start_date : str
            Start date of the orders

        finish_date : str
            Finish date of the orders

        order_min_size : float
            Minimum size of orders

        order_max_size : float
            Maximum size of orders

        number_of_orders_min_per_year : int
            Minimum number of orders per year

        number_of_orders_max_per_year : int
            Maximum number of orders per year

        Returns
        -------
        DataFrame
        """
        logger = LoggerManager.getLogger(__name__)

        if isinstance(ticker, str):
            ticker = [ticker]

        order_list = []
        trade_list = []

        start_date = self.time_series_ops.date_parse(start_date,
                                                     assume_utc=True)
        finish_date = self.time_series_ops.date_parse(finish_date,
                                                      assume_utc=True)
        util_func = UtilFunc()

        # Make this parallel? but may have memory issues
        for tick in ticker:

            logger.info("Loading market data for " + tick)

            # split into yearly chunks (otherwise can run out of memory easily)
            date_list = util_func.split_date_single_list(
                start_date,
                finish_date,
                split_size='yearly',
                add_partial_period_start_finish_dates=True)

            # TODO do in a batch fashion
            for i in range(0, len(date_list) - 1):
                df = self._tca_market.get_market_data(
                    MarketRequest(start_date=date_list[i],
                                  finish_date=date_list[i + 1],
                                  ticker=tick,
                                  data_store=self._market_data_source))

                # self.database_source_market.fetch_market_data(start_date = start_date, finish_date = finish_date, ticker = tick)

                # Need to make sure there's sufficient market data!
                if df is not None:
                    if len(df.index) >= 2:
                        # Get the percentage of the year represented by the difference between the start and finish dates
                        year_perc = float(
                            (df.index[-1] - df.index[0]).total_seconds() /
                            (24.0 * 60.0 * 60.0)) / 365.0

                        logger.info("Constructing randomised trades for " +
                                    tick)

                        number_of_orders_min = int(
                            year_perc * number_of_orders_min_per_year)
                        number_of_orders_max = int(
                            year_perc * number_of_orders_max_per_year)

                        # Split up the data frame into equally sized chunks
                        df_orders = self._derive_order_no(
                            self._strip_columns(df, tick),
                            number_of_orders_min, number_of_orders_max)

                        # Don't want a memory leak, so delete this from memory as soon as possible!
                        del df

                        # order_counter = 0

                        logger.info("Now beginning order construction for " +
                                    tick)

                        # For each order create randomised associated trades
                        # group together all the trades per day as orders
                        for df_order in df_orders:

                            # Set duration of the grandparent order (find randomised start/finish time)
                            # somewhere between 0-25% for start, and 75% to 100% for end point
                            df_order = self.rand_time_series.randomly_truncate_data_frame_within_bounds(
                                df_order, start_perc=0.25, finish_perc=0.75)

                            logger.debug("Creating order between " +
                                         str(df_order.index[0]) + " - " +
                                         str(df_order.index[-1]))

                            # Assume all orders/trades are in the same direction (which is randomly chosen)
                            buy_sell = randint(0, 1)

                            # Sell trades
                            if buy_sell == 0:
                                side_no = -1
                                side = 'bid'

                            # Buy trades
                            else:
                                side_no = 1
                                side = 'ask'

                            magnitude = 10000.0 * 2

                            if tick == 'USDJPY': magnitude = 100.0 * 2.0

                            if randint(0, 100) > 97:
                                new_tick = tick[3:6] + tick[0:3]

                                if 'ticker' in df_order.columns:
                                    df_order['ticker'] = new_tick

                                if 'bid' in df_order.columns and 'ask' in df_order.columns:
                                    ask = 1.0 / df_order['bid']
                                    bid = 1.0 / df_order['ask']

                                    df_order['bid'] = bid
                                    df_order['ask'] = ask

                                df_order['mid'] = 1.0 / df_order['mid']
                            else:
                                new_tick = tick

                            # Get 'bid' for sells, and 'ask' for buys
                            df_order['trade_value'] = df_order[side]

                            # We want to simulate the executions by perturbing the buys randomly
                            df_order = self.rand_time_series.randomly_perturb_column(
                                df_order,
                                column='trade_value',
                                magnitude=magnitude)

                            # Assume notional is in base currency in vast majority of cases
                            if randint(0, 100) > 97:
                                notional_currency = new_tick[3:6]
                            else:
                                notional_currency = new_tick[0:3]

                            notional_multiplier = 1.0

                            if notional_currency == 'JPY':
                                notional_multiplier = 100.0

                            # Randomly choose a realistic order notional
                            # This will later be subdivided into trade notional
                            order_notional = randint(
                                int(order_min_size * notional_multiplier),
                                int(order_max_size * notional_multiplier))

                            order_additional_attributes = {
                                'broker_id':
                                constants.test_brokers_dictionary['All'],
                                'broker_sub_id':
                                constants.test_sub_brokers_dictionary['All'],
                                'algo_id':
                                constants.test_algos_dictionary['All'],
                                'algo_settings':
                                'default',
                            }

                            # Construct an order and add it to list
                            ind_order = self._construct_order(
                                df_order,
                                order_type='order',
                                notional=order_notional,
                                notional_currency=notional_currency,
                                side=side_no,
                                tick=new_tick,
                                additional_attributes=
                                order_additional_attributes)

                            order_list.append(ind_order)

                            trade_additional_attributes = self.grab_attributes_from_trade_order(
                                ind_order, [
                                    'broker_id', 'broker_sub_id', 'algo_id',
                                    'algo_settings'
                                ])

                            # Now create all the broker messages for the order

                            # These will consist firstly of placement messages
                            # then potentially cancels, cancel/replaces, and in most cases randomly assigned trade fills
                            trade_list = self._create_trades_from_order(
                                trade_list=trade_list,
                                df_order=df_order,
                                tick=new_tick,
                                ind_order=ind_order,
                                side_no=side_no,
                                order_notional=order_notional,
                                notional_currency=notional_currency,
                                additional_attributes=
                                trade_additional_attributes)

                            # order_counter = order_counter + 1

        # Aggregate all the lists into DataFrames (setting 'date' as the index)

        # For the trade dataframe, also drop the 'index' column, which was previously used to ensure that fills were after placements
        trade_order_dict = {
            'trade_df':
            self.time_series_ops.aggregate_dict_to_dataframe(
                trade_list, 'date', 'index'),
            'order_df':
            self.time_series_ops.aggregate_dict_to_dataframe(
                order_list, 'date')
        }

        return trade_order_dict
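
    # Usage sketch (not from the original source), assuming this method lives on a test data
    # generator class (eg. something like tcapy's DataTestCreator) which has already been wired
    # up with a market data source; the constructor arguments shown are hypothetical.
    #
    #   creator = DataTestCreator(market_data_source='arctic-testharness')
    #   trade_order = creator.create_test_trade_order(
    #       ['EURUSD'], start_date='01 Jan 2016', finish_date='01 May 2018')
    #
    #   trade_df = trade_order['trade_df']   # randomised trades/fills indexed by 'date'
    #   order_df = trade_order['order_df']   # randomised parent orders indexed by 'date'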