예제 #1
0
 def __init__(self):
     """Create the Calendar instance that is kept on ``self._calendar``."""
     self._calendar = Calendar()
예제 #2
0
class Filter(object):
    """Functions for filtering time series by dates and columns.

    This class is used extensively in both findatapy and finmarketpy.

    Market holidays are collected from web sources such as https://www.timeanddate.com/holidays/ and from individual
    exchange websites. They are manually updated from time to time to take into account newly instituted holidays, and
    are stored in conf/holidays_table.parquet, which is also where you can add your own holidays.

    """

    _time_series_cache = {}  # shared across all instances of object!

    def __init__(self):
        """Create the filter together with its Calendar helper.

        The Calendar kept on ``self._calendar`` is what the holiday filter
        asks for the list of holidays.
        """
        self._calendar = Calendar()

    def filter_time_series(self,
                           market_data_request,
                           data_frame,
                           pad_columns=False):
        """Filters a time series given a set of criteria (like start/finish date and tickers)

        Parameters
        ----------
        market_data_request : MarketDataRequest
            defining time series filtering
        data_frame : DataFrame
            time series to be filtered
        pad_columns : boolean
            true, non-existant columns with nan

        Returns
        -------
        DataFrame
        """
        start_date = market_data_request.start_date
        finish_date = market_data_request.finish_date

        data_frame = self.filter_time_series_by_date(start_date, finish_date,
                                                     data_frame)

        # filter by ticker.field combinations requested
        columns = self.create_tickers_fields_list(market_data_request)

        if (pad_columns):
            data_frame = self.pad_time_series_columns(columns, data_frame)
        else:
            data_frame = self.filter_time_series_by_columns(
                columns, data_frame)

        return data_frame

    def filter_time_series_by_holidays(self,
                                       data_frame,
                                       cal='FX',
                                       holidays_list=[]):
        """Removes holidays from a given time series

        Parameters
        ----------
        data_frame : DataFrame
            data frame to be filtered
        cal : str
            business calendar to use

        Returns
        -------
        DataFrame
        """

        # Optimal case for weekdays: remove Saturday and Sunday
        if (cal == 'WEEKDAY' or cal == 'WKY'):
            return data_frame[data_frame.index.dayofweek <= 4]

        # Select only those holidays in the sample
        holidays_start = self._calendar.get_holidays(
            data_frame.index[0],
            data_frame.index[-1],
            cal,
            holidays_list=holidays_list)

        if (holidays_start.size == 0):
            return data_frame

        holidays_end = holidays_start + np.timedelta64(1, 'D')

        # floored_dates = data_frame.index.normalize()
        #
        # filter_by_index_start = floored_dates.searchsorted(holidays_start)
        # filter_by_index_end = floored_dates.searchsorted(holidays_end)
        #
        # indices_to_keep = []
        #
        # if filter_by_index_end[0] == 0:
        #     counter = filter_by_index_end[0] + 1
        #     start_index = 1
        # else:
        #     counter = 0
        #     start_index = 0
        #
        # for i in range(start_index, len(holidays_start)):
        #     indices = list(range(counter, filter_by_index_start[i] - 1))
        #     indices_to_keep = indices_to_keep + indices
        #
        #     counter = filter_by_index_end[i] + 1
        #
        # indices = list(range(counter, len(floored_dates)))
        # indices_to_keep = indices_to_keep + indices
        #
        # data_frame_filtered = data_frame[indices_to_keep]

        if data_frame.index.tz is None:
            holidays_start = holidays_start.tz_localize(None)
            holidays_end = holidays_end.tz_localize(None)

        data_frame_left = data_frame
        data_frame_filtered = []

        for i in range(0, len(holidays_start)):
            data_frame_temp = data_frame_left[
                data_frame_left.index < holidays_start[i]]
            data_frame_left = data_frame_left[
                data_frame_left.index >= holidays_end[i]]

            data_frame_filtered.append(data_frame_temp)

        data_frame_filtered.append(data_frame_left)

        return pd.concat(data_frame_filtered)

    def filter_time_series_by_date(self, start_date, finish_date, data_frame):
        """Filter time series by start/finish dates

        Parameters
        ----------
        start_date : DateTime
            start date of calendar
        finish_date : DataTime
            finish date of calendar
        data_frame : DataFrame
            data frame to be filtered

        Returns
        -------
        DataFrame
        """
        offset = 0  # inclusive

        return self.filter_time_series_by_date_offset(start_date,
                                                      finish_date,
                                                      data_frame,
                                                      offset,
                                                      exclude_start_end=False)

    def filter_time_series_by_days(self, days, data_frame):
        """Filter time series by start/finish dates

        Parameters
        ----------
        start_date : DateTime
            start date of calendar
        finish_date : DataTime
            finish date of calendar
        data_frame : DataFrame
            data frame to be filtered

        Returns
        -------
        DataFrame
        """
        offset = 0  # inclusive

        finish_date = datetime.datetime.utcnow()
        start_date = finish_date - timedelta(days=days)
        return self.filter_time_series_by_date_offset(start_date, finish_date,
                                                      data_frame, offset)

    def filter_time_series_by_date_exc(self, start_date, finish_date,
                                       data_frame):
        """Filter time series by start/finish dates (exclude start & finish dates)

        Parameters
        ----------
        start_date : DateTime
            start date of calendar
        finish_date : DataTime
            finish date of calendar
        data_frame : DataFrame
            data frame to be filtered

        Returns
        -------
        DataFrame
        """
        offset = 1  # exclusive of start finish date

        return self.filter_time_series_by_date_offset(start_date,
                                                      finish_date,
                                                      data_frame,
                                                      offset,
                                                      exclude_start_end=True)

        # try:
        #     # filter by dates for intraday data
        #     if(start_date is not None):
        #         data_frame = data_frame.loc[start_date <= data_frame.index]
        #
        #     if(finish_date is not None):
        #         # filter by start_date and finish_date
        #         data_frame = data_frame.loc[data_frame.index <= finish_date]
        # except:
        #     # filter by dates for daily data
        #     if(start_date is not None):
        #         data_frame = data_frame.loc[start_date.date() <= data_frame.index]
        #
        #     if(finish_date is not None):
        #         # filter by start_date and finish_date
        #         data_frame = data_frame.loc[data_frame.index <= finish_date.date()]
        #
        # return data_frame

    def filter_time_series_by_date_offset(self,
                                          start_date,
                                          finish_date,
                                          data_frame,
                                          offset,
                                          exclude_start_end=False):
        """Filter time series by start/finish dates (and an offset)

        Parameters
        ----------
        start_date : DateTime
            start date of calendar
        finish_date : DataTime
            finish date of calendar
        data_frame : DataFrame
            data frame to be filtered
        offset : int
            offset to be applied

        Returns
        -------
        DataFrame
        """

        if hasattr(data_frame.index, 'tz'):
            if data_frame.index.tz is not None:

                # If the start/finish dates are timezone naive, overwrite with the DataFrame timezone
                if not (isinstance(start_date, str)):
                    start_date = start_date.replace(tzinfo=data_frame.index.tz)

                if not (isinstance(finish_date, str)):
                    finish_date = finish_date.replace(
                        tzinfo=data_frame.index.tz)
            else:
                # Otherwise remove timezone from start_date/finish_date
                if not (isinstance(start_date, str)):
                    start_date = start_date.replace(tzinfo=None)

                if not (isinstance(finish_date, str)):
                    finish_date = finish_date.replace(tzinfo=None)

        if 'int' in str(data_frame.index.dtype):
            return data_frame

        try:
            data_frame = self.filter_time_series_aux(start_date, finish_date,
                                                     data_frame, offset)
        except:
            # start_date = start_date.date()
            # finish_date = finish_date.date()
            # if isinstance(start_date, str):
            #     # format expected 'Jun 1 2005 01:33', '%b %d %Y %H:%M'
            #     try:
            #         start_date = datetime.datetime.strptime(start_date, '%b %d %Y %H:%M')
            #     except:
            #         i = 0
            #
            # if isinstance(finish_date, str):
            #     # format expected 'Jun 1 2005 01:33', '%b %d %Y %H:%M'
            #     try:
            #         finish_date = datetime.datetime.strptime(finish_date, '%b %d %Y %H:%M')
            #     except:
            #         i = 0

            # try:
            #     start_date = start_date.date()
            # except: pass
            #
            # try:
            #     finish_date = finish_date.date()
            # except: pass

            # if we have dates stored as opposed to TimeStamps (ie. daily data), we use a simple (slower) method
            # for filtering daily data
            if (start_date is not None):
                if exclude_start_end:
                    data_frame = data_frame.loc[start_date < data_frame.index]
                else:
                    data_frame = data_frame.loc[start_date <= data_frame.index]

            if (finish_date is not None):
                if exclude_start_end:
                    data_frame = data_frame.loc[data_frame.index < finish_date]
                else:
                    # filter by start_date and finish_date
                    data_frame = data_frame.loc[
                        data_frame.index <= finish_date]

        return data_frame

    def filter_time_series_aux(self, start_date, finish_date, data_frame,
                               offset):
        """Filter time series by start/finish dates (and an offset)

        Parameters
        ----------
        start_date : DateTime
            start date of calendar
        finish_date : DataTime
            finish date of calendar
        data_frame : DataFrame
            data frame to be filtered
        offset : int (not implemented!)
            offset to be applied

        Returns
        -------
        DataFrame
        """

        # start_index = 0
        # finish_index = len(data_frame.index) - offset

        # filter by dates for intraday data
        # if(start_date is not None):
        #     start_index = data_frame.index.searchsorted(start_date)
        #
        #     if (0 <= start_index + offset < len(data_frame.index)):
        #         start_index = start_index + offset
        #
        #         # data_frame = data_frame[start_date < data_frame.index]
        #
        # if(finish_date is not None):
        #     finish_index = data_frame.index.searchsorted(finish_date)
        #
        #     if (0 <= finish_index - offset < len(data_frame.index)):
        #         finish_index = finish_index - offset
        # CAREFUL: need + 1 otherwise will only return 1 less than usual
        # return data_frame.iloc[start_date:finish_date]

        # Just use pandas, quicker and simpler code!
        if data_frame is None:
            return None

        # Slower method..
        # return data_frame.loc[start_date:finish_date]

        # Much faster, start and finish dates are inclusive
        return data_frame[(data_frame.index >= start_date)
                          & (data_frame.index <= finish_date)]

    def filter_time_series_by_time_of_day_timezone(self,
                                                   hour,
                                                   minute,
                                                   data_frame,
                                                   timezone_of_snap='UTC'):
        """Keep the rows at hour:minute as seen in a given time zone

        Parameters
        ----------
        hour : int
            hour of day, in timezone_of_snap
        minute : int
            minute of hour, in timezone_of_snap
        data_frame : DataFrame
            data frame to be filtered (it is converted with tz_convert)
        timezone_of_snap : str
            name of the time zone in which hour and minute are read

        Returns
        -------
        DataFrame
            the matching rows, converted back to the time zone of the input
        """
        original_tz = data_frame.index.tz

        # Look at the clock in the snap time zone
        snapped = data_frame.tz_convert(pytz.timezone(timezone_of_snap))
        snap_index = snapped.index
        at_snap_time = (snap_index.minute == minute) & (snap_index.hour == hour)

        return snapped[at_snap_time].tz_convert(original_tz)

    def filter_time_series_by_time_of_day(self,
                                          hour,
                                          minute,
                                          data_frame,
                                          in_tz=None,
                                          out_tz=None):
        """Filter time series by time of day

        Parameters
        ----------
        hour : int
            hour of day
        minute : int
            minute of day
        data_frame : DataFrame
            data frame to be filtered
        in_tz : str (optional)
            time zone of input data frame
        out_tz : str (optional)
            time zone of output data frame

        Returns
        -------
        DataFrame
        """
        if out_tz is not None:
            try:
                if in_tz is not None:
                    data_frame = data_frame.tz_localize(pytz.timezone(in_tz))
            except:
                data_frame = data_frame.tz_convert(pytz.timezone(in_tz))

            data_frame = data_frame.tz_convert(pytz.timezone(out_tz))

            # change internal representation of time
            data_frame.index = pd.DatetimeIndex(data_frame.index.values)

        data_frame = data_frame[data_frame.index.minute == minute]
        data_frame = data_frame[data_frame.index.hour == hour]

        return data_frame

    def filter_time_series_by_minute_of_hour(self,
                                             minute,
                                             data_frame,
                                             in_tz=None,
                                             out_tz=None):
        """Filter time series by minute of hour

        Parameters
        ----------
        minute : int
            minute of hour
        data_frame : DataFrame
            data frame to be filtered
        in_tz : str (optional)
            time zone of input data frame
        out_tz : str (optional)
            time zone of output data frame

        Returns
        -------
        DataFrame
        """
        if out_tz is not None:
            if in_tz is not None:
                data_frame = data_frame.tz_localize(pytz.timezone(in_tz))

            data_frame = data_frame.tz_convert(pytz.timezone(out_tz))

            # change internal representation of time
            data_frame.index = pd.DatetimeIndex(data_frame.index.values)

        data_frame = data_frame[data_frame.index.minute == minute]

        return data_frame

    def filter_time_series_between_hours(self, start_hour, finish_hour,
                                         data_frame):
        """Filter time series between hours of the day

        Parameters
        ----------
        start_hour : int
            start of hour filter
        finish_hour : int
            finish of hour filter
        data_frame : DataFrame
            data frame to be filtered

        Returns
        -------
        DataFrame
        """

        data_frame = data_frame[data_frame.index.hour <= finish_hour]
        data_frame = data_frame[data_frame.index.hour >= start_hour]

        return data_frame

    def filter_time_series_by_columns(self, columns, data_frame):
        """Filter time series by certain columns

        Parameters
        ----------
        columns : list(str)
            start of hour filter
        data_frame : DataFrame
            data frame to be filtered

        Returns
        -------
        DataFrame
        """
        if data_frame is not None and columns is not None:
            return data_frame[columns]

        return None

    def pad_time_series_columns(self, columns, data_frame):
        """Selects time series from a dataframe and if necessary creates empty columns

        Parameters
        ----------
        columns : str
            columns to be included with this keyword
        data_frame : DataFrame
            data frame to be filtered

        Returns
        -------
        DataFrame
        """
        old_columns = data_frame.columns

        common_columns = [val for val in columns if val in old_columns]
        uncommon_columns = [val for val in columns if val not in old_columns]
        uncommon_columns = [str(x) for x in uncommon_columns]

        data_frame = data_frame[common_columns]

        if len(uncommon_columns) > 0:
            logger = LoggerManager().getLogger(__name__)

            logger.info(
                "Padding missing columns...")  # " + str(uncommon_columns))

            new_data_frame = pd.DataFrame(index=data_frame.index,
                                          columns=uncommon_columns)

            data_frame = pd.concat([data_frame, new_data_frame], axis=1)

            # SLOW method below
            # for x in uncommon_columns: data_frame.loc[:,x] = np.nan

        # get columns in same order again
        data_frame = data_frame[columns]

        return data_frame

    def filter_time_series_by_excluded_keyword(self, keyword, data_frame):
        """Filter time series to exclude columns which contain keyword

        Parameters
        ----------
        keyword : str
            columns to be excluded with this keyword
        data_frame : DataFrame
            data frame to be filtered

        Returns
        -------
        DataFrame
        """

        if not (isinstance(keyword, list)):
            keyword = [keyword]

        columns = []

        for k in keyword:
            columns.append(
                [elem for elem in data_frame.columns if k not in elem])

        columns = self._calendar.flatten_list_of_lists(columns)

        return self.filter_time_series_by_columns(columns, data_frame)

    def filter_time_series_by_included_keyword(self, keyword, data_frame):
        """Filter time series to include columns which contain keyword

        Parameters
        ----------
        keyword : str
            columns to be included with this keyword
        data_frame : DataFrame
            data frame to be filtered

        Returns
        -------
        DataFrame
        """

        if not (isinstance(keyword, list)):
            keyword = [keyword]

        columns = []

        for k in keyword:
            columns.append([elem for elem in data_frame.columns if k in elem])

        columns = self._calendar.flatten_list_of_lists(columns)

        return self.filter_time_series_by_columns(columns, data_frame)

    def filter_time_series_by_minute_freq(self, freq, data_frame):
        """Filter time series where minutes correspond to certain minute filter

        Parameters
        ----------
        freq : int
            minute frequency to be filtered
        data_frame : DataFrame
            data frame to be filtered

        Returns
        -------
        DataFrame
        """
        return data_frame.loc[data_frame.index.minute % freq == 0]

    def create_tickers_fields_list(self, market_data_request):
        """Creates a list of tickers concatenated with fields from a MarketDataRequest

        Parameters
        ----------
        market_data_request : MarketDataRequest
            request to be expanded

        Returns
        -------
        list(str)
        """
        tickers = market_data_request.tickers
        fields = market_data_request.fields

        if isinstance(tickers, str): tickers = [tickers]
        if isinstance(fields, str): fields = [fields]

        tickers_fields_list = []

        # Create ticker.field combination for series we wish to return
        for f in fields:
            for t in tickers:
                tickers_fields_list.append(t + '.' + f)

        return tickers_fields_list

    def resample_time_series(self, data_frame, freq):
        return data_frame.asfreq(freq, method='pad')

    def resample_time_series_frequency(self,
                                       data_frame,
                                       data_resample_freq,
                                       data_resample_type='mean',
                                       fill_empties=False):
        # Should we take the mean, first, last in our resample
        if data_resample_type == 'mean':
            data_frame_r = data_frame.resample(data_resample_freq).mean()
        elif data_resample_type == 'first':
            data_frame_r = data_frame.resample(data_resample_freq).first()
        elif data_resample_type == 'last':
            data_frame_r = data_frame.resample(data_resample_freq).last()
        else:
            # TODO implement other types
            return

        if fill_empties == True:
            data_frame, data_frame_r = data_frame.align(data_frame_r,
                                                        join='left',
                                                        axis=0)
            data_frame_r = data_frame_r.fillna(method='ffill')

        return data_frame_r

    def make_FX_1_min_working_days(self, data_frame):
        data_frame = data_frame.resample('1min').mean()
        data_frame = self.filter_time_series_by_holidays(data_frame, 'FX')
        data_frame = data_frame.fillna(method='ffill')
        data_frame = self.remove_out_FX_out_of_hours(data_frame)

        return data_frame

    def remove_out_FX_out_of_hours(self, data_frame):
        """Filtered a time series for FX hours (ie. excludes 22h GMT Fri - 19h GMT Sun and New Year's Day)

        Parameters
        ----------
        data_frame : DataFrame
            data frame with FX prices

        Returns
        -------
        list(str)
        """
        # assume data_frame is in GMT time
        # remove Fri after 22:00 GMT
        # remove Sat
        # remove Sun before 19:00 GMT

        # Monday = 0, ..., Sunday = 6
        data_frame = data_frame[~((data_frame.index.dayofweek == 4) &
                                  (data_frame.index.hour > 22))]
        data_frame = data_frame[~((data_frame.index.dayofweek == 5))]
        data_frame = data_frame[~((data_frame.index.dayofweek == 6) &
                                  (data_frame.index.hour < 19))]
        data_frame = data_frame[~((data_frame.index.day == 1) &
                                  (data_frame.index.month == 1))]

        return data_frame

    def remove_duplicate_indices(self, df):
        return df[~df.index.duplicated(keep='first')]

    def mask_time_series_by_time(self, df, time_list, time_zone):
        """ Mask a time series by time of day and time zone specified
        e.g. given a time series minutes data
             want to keep data at specific time periods every day with a considered time zone

        Parameters
        ----------
        df : DateTime
            time series needed to be masked
        time_list : list of tuples
            deciding the time periods which we want to keep the data on each day
            e.g. time_list = [('01:08', '03:02'),('12:24','12:55'),('17:31','19:24')]
            * Note: assume no overlapping of these tuples
        time_zone: str
            e.g. 'Europe/London'

        Returns
        -------
        DataFrame  (which the time zone is 'UTC')
        """

        # Change the time zone from 'UTC' to a given one
        df.index = df.index.tz_convert(time_zone)
        df_mask = pd.DataFrame(0, index=df.index, columns=['mask'])

        # Mask data with each given tuple
        for i in range(0, len(time_list)):
            start_hour = int(time_list[i][0].split(':')[0])
            start_minute = int(time_list[i][0].split(':')[1])
            end_hour = int(time_list[i][1].split(':')[0])
            end_minute = int(time_list[i][1].split(':')[1])

            # E.g. if tuple is ('01:08', '03:02'),
            # take hours in target - take values in [01:00,04:00]
            narray = np.where(
                df.index.hour.isin(range(start_hour, end_hour + 1)), 1, 0)
            df_mask_temp = pd.DataFrame(index=df.index,
                                        columns=df_mask.columns.tolist(),
                                        data=narray)

            # Remove minutes not in target - remove values in [01:00,01:07], [03:03,03:59]
            narray = np.where(((df.index.hour == start_hour) &
                               (df.index.minute < start_minute)), 0, 1)
            df_mask_temp = df_mask_temp * pd.DataFrame(
                index=df.index, columns=df_mask.columns.tolist(), data=narray)
            narray = np.where(
                (df.index.hour == end_hour) & (df.index.minute > end_minute),
                0, 1)
            df_mask_temp = df_mask_temp * pd.DataFrame(
                index=df.index, columns=df_mask.columns.tolist(), data=narray)

            # Collect all the periods we want to keep the data
            df_mask = df_mask + df_mask_temp

        narray = np.where(df_mask == 1, df, 0)
        df = pd.DataFrame(index=df.index,
                          columns=df.columns.tolist(),
                          data=narray)
        df.index = df.index.tz_convert('UTC')  # change the time zone to 'UTC'

        return df