def _sample_aligned_interval(
        self,
        interval,
        align_left=False,
        b_alpha=1.0,
        b_beta=1.0,
        name='interval_sample_',
        force_interval=False,
        **kwargs
):
    """
    Samples continuous subset of data whose start row is picked from `interval`.

    Start row selection:
        - `align_left=True`: candidates are `interval[0]`, `interval[0] + 1`, ... ; the shift grows by one
          after every rejected candidate;
        - otherwise: `interval[0] + int((interval[-1] - interval[0] - sample_num_records) * B)`, where `B` is
          drawn from beta-distribution parametrised by `b_alpha, b_beta` (uniform by default).

    A candidate is accepted when its start weekday is in `self.start_weekdays` and the sample duration exceeds
    `self.max_sample_len_delta` by less than `self.max_time_gap`. If `self.start_00` is set, the start row is
    moved to the record nearest to 00:00 of the chosen day before the sample is cut.

    Args:
        interval:       tuple, list or 1d-array of integers of length 2: [lower_row_number, upper_row_number];
        align_left:     if True - try to align sample to beginning of interval;
        b_alpha:        float > 0, sampling B-distribution alpha param, def=1;
        b_beta:         float > 0, sampling B-distribution beta param, def=1;
        name:           str, sample filename id
        force_interval: bool, if true: delegate to `self._sample_exact_interval(interval, name)`
        **kwargs:       accepted for call compatibility, not used here

    Returns:
        - new `self.nested_class_ref(**self.nested_params)` instance holding `sample_num_records` rows, with
          `filename`, `data` and `metadata` ('type', 'first_row', 'last_row') set;
        - or result of `self._sample_exact_interval()` if `force_interval` is set.

    Raises:
        AssertionError: instance holds no data, `interval` is not of length 2,
                        or beta-distribution params are not positive;
        RuntimeError:   no acceptable sample has been found within allowed number of attempts.
    """
    # Explicit checks instead of `assert` statements: asserts are stripped when running with `python -O`.
    try:
        has_no_data = self.data.empty
    except AttributeError:
        has_no_data = True

    if has_no_data:
        msg = 'Instance holds no data. Hint: forgot to call .read_csv()?'
        self.log.error(msg)
        raise AssertionError(msg)

    if len(interval) != 2:
        msg = 'Invalid interval arg: expected list or tuple of size 2, got: {}'.format(interval)
        self.log.error(msg)
        raise AssertionError(msg)

    if force_interval:
        return self._sample_exact_interval(interval, name)

    if not (b_alpha > 0 and b_beta > 0):
        msg = 'Expected positive B-distribution [alpha, beta] params, got: {}'.format([b_alpha, b_beta])
        self.log.error(msg)
        raise AssertionError(msg)

    sample_num_records = self.sample_num_records

    self.log.debug('Maximum sample time duration set to: {}.'.format(self.max_sample_len_delta))
    self.log.debug('Respective number of steps: {}.'.format(sample_num_records))
    self.log.debug('Maximum allowed data time gap set to: {}.\n'.format(self.max_time_gap))

    # Sanity check param: aligned search may walk through entire interval, random search is capped.
    if align_left:
        max_attempts = interval[-1] - interval[0]
    else:
        max_attempts = 100

    def _draw_first_row(shift):
        # Candidate start row: deterministic when aligned, beta-distributed otherwise.
        if align_left:
            return interval[0] + shift
        return interval[0] + int(
            (interval[-1] - interval[0] - sample_num_records) * random_beta(a=b_alpha, b=b_beta)
        )

    attempts = 0
    align_shift = 0

    # Sample enter point as close to beginning until all conditions are met:
    while attempts <= max_attempts:
        first_row = _draw_first_row(align_shift)

        self.log.debug('_sample_interval_first_row: {}'.format(first_row))

        sample_first_day = self.data[first_row:first_row + 1].index[0]
        self.log.debug('Sample start: {}, weekday: {}.'.format(sample_first_day, sample_first_day.weekday()))

        # Keep sampling until good day:
        while sample_first_day.weekday() not in self.start_weekdays and attempts <= max_attempts:
            align_shift += 1

            self.log.debug('Not a good day to start, resampling...')

            first_row = _draw_first_row(align_shift)

            self.log.debug('_sample_interval_first_row: {}'.format(first_row))

            sample_first_day = self.data[first_row:first_row + 1].index[0]

            self.log.debug('Sample start: {}, weekday: {}.'.format(sample_first_day, sample_first_day.weekday()))

            attempts += 1

        # Check if managed to get proper weekday:
        if attempts > max_attempts:
            msg = 'Quitting after {} sampling attempts. Hint: check sampling params / dataset consistency.'.\
                format(attempts)
            self.log.error(msg)
            raise RuntimeError(msg)

        # If 00 option set, get index of first record of that day:
        if self.start_00:
            adj_timedate = sample_first_day.date()
            self.log.debug('Start time adjusted to <00:00>')
            # `Index.get_loc(..., method='nearest')` is gone since pandas 2.0;
            # `get_indexer` performs the same nearest-match lookup. `normalize()` yields 00:00 of the
            # same day and keeps timezone of the index.
            first_row = int(
                self.data.index.get_indexer([sample_first_day.normalize()], method='nearest')[0]
            )

        else:
            adj_timedate = sample_first_day

        # Easy part:
        last_row = first_row + sample_num_records
        sampled_data = self.data[first_row: last_row]
        sample_len = (sampled_data.index[-1] - sampled_data.index[0]).to_pytimedelta()
        self.log.debug('Actual sample duration: {}.'.format(sample_len))
        self.log.debug('Total sample time gap: {}.'.format(sample_len - self.max_sample_len_delta))

        # Perform data gap check:
        if sample_len - self.max_sample_len_delta < self.max_time_gap:
            self.log.debug('Sample accepted.')
            # If sample OK - return new dataset:
            new_instance = self.nested_class_ref(**self.nested_params)
            new_instance.filename = name + 'num_{}_at_{}'.format(self.sample_num, adj_timedate)
            self.log.info('New sample id: <{}>.'.format(new_instance.filename))
            new_instance.data = sampled_data
            new_instance.metadata['type'] = 'interval_sample'
            new_instance.metadata['first_row'] = first_row
            new_instance.metadata['last_row'] = last_row

            return new_instance

        self.log.debug('Attempt {}: duration too big, resampling, ...\n'.format(attempts))
        attempts += 1
        align_shift += 1

    # Got here -> sanity check failed:
    msg = (
        'Quitting after {} sampling attempts. '
        'Hint: check sampling params / dataset consistency.'
    ).format(attempts)
    self.log.error(msg)
    raise RuntimeError(msg)
def _sample_interval(self, interval, b_alpha=1, b_beta=1):
    """
    Samples continuous subset of data whose start row is picked at random from `interval`.

    Candidate start row is `interval[0] + round((interval[-1] - interval[0] - sample_num_records - 1) * B)`,
    where `B` is drawn from beta-distribution parametrised by `b_alpha, b_beta`
    (uniform with default params) and `sample_num_records` is `self.episode_num_records`.

    A candidate is accepted when its start weekday is in `self.start_weekdays` and the sample duration exceeds
    `self.max_episode_len` by less than `self.max_time_gap`. If `self.start_00` is set, the start row is
    moved to the record nearest to 00:00 of the chosen day before the sample is cut.

    Args:
        interval:       tuple, list or 1d-array of integers of length 2: [lower_position, upper_position];
        b_alpha:        sampling B-distribution alpha param;
        b_beta:         sampling B-distribution beta param;

    Returns:
        new `self.__class__(**self.params)` instance holding `sample_num_records` rows, with `filename`,
        `data` and `metadata` ('type', 'first_row') set.

    Raises:
        AssertionError: instance holds no data, `interval` is invalid for this dataset,
                        or no acceptable sample has been found within 100 attempts.
    """
    # Explicit checks instead of `assert` statements: asserts are stripped when running with `python -O`.
    try:
        has_no_data = self.data.empty
    except AttributeError:
        has_no_data = True

    if has_no_data:
        raise AssertionError('BTgymDataset instance holds no data. Hint: forgot to call .read_csv()?')

    if len(interval) != 2:
        raise AssertionError(
            'Invalid interval arg: expected list or tuple of size 2, got: {}'.format(interval)
        )

    sample_num_records = self.episode_num_records

    if not interval[0] < interval[-1] < int(self.data.shape[0] - sample_num_records):
        raise AssertionError(
            'Cannot sample with size {}, in {} from dataset of {} records'.
            format(sample_num_records, interval, self.data.shape[0])
        )

    self.log.debug('Maximum sample time duration set to: {}.'.format(self.max_episode_len))
    self.log.debug('Respective number of steps: {}.'.format(sample_num_records))
    self.log.debug('Maximum allowed data time gap set to: {}.\n'.format(self.max_time_gap))

    def _draw_first_row():
        # `int()` guarantees an integer slice bound even if `round()` of a numpy float returns a float.
        return interval[0] + int(round(
            (interval[-1] - interval[0] - sample_num_records - 1) * random_beta(a=b_alpha, b=b_beta)
        ))

    # Sanity check param:
    max_attempts = 100
    attempts = 0

    # Keep sampling random enter points until all conditions are met:
    while attempts <= max_attempts:
        first_row = _draw_first_row()

        episode_first_day = self.data[first_row:first_row + 1].index[0]
        self.log.debug(
            'Sample start: {}, weekday: {}.'.format(episode_first_day, episode_first_day.weekday())
        )

        # Keep sampling until good day:
        while episode_first_day.weekday() not in self.start_weekdays and attempts <= max_attempts:
            self.log.debug('Not a good day to start, resampling...')

            first_row = _draw_first_row()

            episode_first_day = self.data[first_row:first_row + 1].index[0]
            self.log.debug(
                'Sample start: {}, weekday: {}.'.format(episode_first_day, episode_first_day.weekday())
            )
            attempts += 1

        # Check if managed to get proper weekday:
        if attempts > max_attempts:
            raise AssertionError(
                'Quitting after {} sampling attempts. Hint: check sampling params / dataset consistency.'.
                format(attempts)
            )

        # If 00 option set, get index of first record of that day:
        if self.start_00:
            adj_timedate = episode_first_day.date()
            # `normalize()` yields 00:00 of the same day and keeps timezone of the index.
            lookup_time = episode_first_day.normalize()
            self.log.debug('Start time adjusted to <00:00>')

        else:
            adj_timedate = episode_first_day
            lookup_time = episode_first_day

        # `Index.get_loc(..., method='nearest')` is gone since pandas 2.0;
        # `get_indexer` performs the same nearest-match lookup.
        first_row = int(self.data.index.get_indexer([lookup_time], method='nearest')[0])

        # Easy part:
        last_row = first_row + sample_num_records
        episode_sample = self.data[first_row: last_row]
        episode_sample_len = (episode_sample.index[-1] - episode_sample.index[0]).to_pytimedelta()
        self.log.debug('Sample duration: {}.'.format(episode_sample_len, ))
        self.log.debug('Total sample time gap: {}.'.format(episode_sample_len - self.max_episode_len))

        # Perform data gap check:
        if episode_sample_len - self.max_episode_len < self.max_time_gap:
            self.log.debug('Sample accepted.')
            # If sample OK - return episodic-dataset:
            episode = self.__class__(**self.params)
            episode.filename = '_btgym_interval_sample_' + str(adj_timedate)
            self.log.info('Sample id: <{}>.'.format(episode.filename))
            episode.data = episode_sample
            episode.metadata['type'] = 'interval_sample'
            episode.metadata['first_row'] = first_row
            return episode

        self.log.debug('Attempt {}: duration too big, resampling, ...\n'.format(attempts))
        attempts += 1

    # Got here -> sanity check failed:
    msg = (
        'Quitting after {} sampling attempts. '
        'Hint: check sampling params / dataset consistency.'
    ).format(attempts)
    self.log.warning(msg)
    raise AssertionError(msg)