def generate_goal(datum, threshold=2.0,
                  min_time_above_threshold=datetime.timedelta(weeks=3),
                  keep_data_within=datetime.timedelta(weeks=5),
                  event_boundary=datetime.timedelta(weeks=1)):
    """Generates goal for early detection from datum in standard format.

    Parameters
    ----------
    :param datum: datum from which events are generated
    :type datum: data dictionary in standard format

    Remaining input parameters same as for find_events.

    :return: (goal datum restricted to event intervals, list of three
        filters corresponding to intervals before/during/after events)
    :rtype: (data dictionary, list of three boolean lists)
    """
    events = find_events(datum, threshold=threshold,
                         min_time_above_threshold=min_time_above_threshold,
                         min_time_between_events=keep_data_within)

    # One (start_offset, end_offset) window per phase, relative to each
    # event time: before (label 0), during (label 1), after (label 2).
    phase_windows = [
        (-keep_data_within, -event_boundary),
        (-event_boundary, event_boundary),
        (event_boundary, keep_data_within),
    ]

    merged_filters = []
    labeled_parts = []
    for label, (start_offset, end_offset) in enumerate(phase_windows):
        phase_filters = [
            fs.filter_on_times(datum,
                               after=event + start_offset,
                               before=event + end_offset)
            for event in events
        ]
        merged = fs.merge_filters(phase_filters, method='any')
        part = fs.select(datum, merged)
        # Overwrite observed values with the phase label (0/1/2).
        part['data']['values'] = [label] * len(part['data']['values'])
        merged_filters.append(merged)
        labeled_parts.append(part)

    goal = copy.deepcopy(datum)
    frames = [dm.raw_to_pandas(part['data']) for part in labeled_parts]
    # Same addition order as the original: d0 + (d1 + d2).
    combined = frames[0].add(frames[1].add(frames[2], fill_value=0),
                             fill_value=0)
    goal['data'] = dm.sort_raw(dm.pandas_to_raw(combined))
    return (goal, merged_filters)
def find_peaks_between(datum, events):
    """Return the peak time inside each interval between consecutive events.

    The last interval is closed by the final timestamp in ``datum``.

    :param datum: data dictionary in standard format
    :param events: list of event start times
    :return: list with one peak time per event interval
    """
    boundaries = events + [datum['data']['times'][-1]]
    peaks = []
    for interval_start, interval_end in zip(boundaries, boundaries[1:]):
        window = fs.filter_on_times(datum,
                                    after=interval_start,
                                    before=interval_end)
        _, peak_time = find_max(fs.select(datum, window))
        peaks.append(peak_time)
    return peaks
def null_dist(goal_datum, data_sources, threshold):
    r"""Generate null distributions each of which only includes observations
    below the threshold from each data source (non-outbreak period).

    Parameters
    ----------
    goal_datum : dictionary
        The gold standard data source.
    data_sources : list
        List of data sources that will be evaluated. Each data source is
        a dictionary.
    threshold : float
        Event baseline. When the value is larger than the threshold, there
        is an event. Otherwise, there is no event.

    Returns
    -------
    empirical_dist_0 : list
        List of arrays. Each array contains observations below the
        threshold at time t from all data sources.
    mu_0 : array
        Elements in the array are mean values of m observations below the
        threshold from one data source.
    inv_Sigma_0 : array
        Inverse of array Sigma_0. Elements in Sigma_0 are covariances of
        each two data sources (only including observations below the
        threshold).
    """
    f = fs.filter_on_values(goal_datum, below=threshold)
    series_list_null = [
        fs.select(d, f)['data']['values'] for d in data_sources
    ]
    # Convert to an array once, instead of rebuilding it for every time
    # step inside the comprehension (was an O(T) repeated conversion).
    null_array = np.array(series_list_null)
    empirical_dist_0 = [null_array[:, t] for t in range(null_array.shape[1])]
    mu_0 = np.mean(series_list_null, axis=1)
    Sigma_0 = np.cov(series_list_null)
    # Uses pseudoinverse in case of singular matrix from missing data
    if len(data_sources) == 1:
        # With a single source np.cov returns a 0-d array (scalar
        # variance); guard against zero variance before inverting.
        inv_Sigma_0 = 1 / Sigma_0 if Sigma_0 else 0
    else:
        inv_Sigma_0 = np.linalg.pinv(Sigma_0)
    return empirical_dist_0, mu_0, inv_Sigma_0
def generate_goal3(datum, threshold=2.0,
                   min_time_above_threshold=datetime.timedelta(weeks=3),
                   min_time_between_events=datetime.timedelta(weeks=5),
                   event_boundary=datetime.timedelta(weeks=3)):
    """Generates goal for early detection from datum in standard format.

    Parameters
    ----------
    :param datum: datum from which events are generated
    :type datum: data dictionary in standard format

    Remaining input parameters same as for find_events.
    NOTE(review): ``event_boundary`` is accepted but not used by this
    implementation; kept for interface compatibility.

    :return: (goal datum labeled 0 outside and 1 inside onset-to-peak
        intervals, merged filter marking those intervals)
    :rtype: (data dictionary, boolean list)
    """
    events = find_events(datum, threshold=threshold,
                         min_time_above_threshold=min_time_above_threshold,
                         min_time_between_events=min_time_between_events)
    peaks = find_peaks_between(datum, events)

    # Mark the rising interval from each event onset to its following peak.
    rise_filters = [
        fs.filter_on_times(datum, after=onset, before=peak)
        for onset, peak in zip(events, peaks)
    ]
    f_after_merged = fs.merge_filters(rise_filters, method='any')

    goal_before = fs.exclude(datum, f_after_merged)
    goal_before['data']['values'] = [0] * len(goal_before['data']['values'])
    goal_after = fs.select(datum, f_after_merged)
    goal_after['data']['values'] = [1] * len(goal_after['data']['values'])

    goal = copy.deepcopy(datum)
    before_frame = dm.raw_to_pandas(goal_before['data'])
    after_frame = dm.raw_to_pandas(goal_after['data'])
    goal['data'] = dm.sort_raw(
        dm.pandas_to_raw(before_frame.add(after_frame, fill_value=0)))
    return (goal, f_after_merged)
def filter_data(data, date_start='2010-11-22', date_end='2016-09-19',
                copy=True):
    """Restrict every datum in ``data`` to the same date range.

    Parameters
    ----------
    data : list
        List of data dictionaries in standard format.
    date_start, date_end : str
        Inclusive-range boundaries passed to ``fs.filter_on_times``.
    copy : bool
        If True (default), leave the input list untouched and return a
        new list. NOTE: this parameter shadows the stdlib ``copy`` module
        inside this function; the name is kept for interface
        compatibility.

    Returns
    -------
    list
        List of filtered data dictionaries.
    """
    # A shallow copy of the list suffices: entries are replaced wholesale
    # by fs.select, never mutated in place.
    data_f = list(data) if copy else data
    for i, datum in enumerate(data_f):
        bool_mask = fs.filter_on_times(datum,
                                       after=date_start,
                                       before=date_end)
        # Materialize the mask in case the filter is a one-shot iterator
        # (e.g. a Python 3 map object, which can only be consumed once).
        data_f[i] = fs.select(datum, list(bool_mask))
    return data_f
def generate_lagged_data(datum, goal_datum,
                         min_lag=datetime.timedelta(weeks=3),
                         max_history=5):
    """Formats datum for early detection.

    Parameters
    ----------
    :param datum: datum to be formatted
    :param goal_datum: goal datum generated from generate_goal
    :param min_lag: minimum amount of time to lag data
    :param max_history: maximum size of lagged dataset for each time point
    :type datum: data dictionary in standard format
    :type goal_datum: data dictionary
    :type min_lag: datatime.timedelta
    :type max_history: positive integer

    :return: formatted datum with lagged data appended to key 'lagged' in
        dictionary 'data'
    :rtype: data dictionary in (appended) standard format
    """
    # Fill NAs with 0 automatically for now.
    datum = dm.fillna(datum, 0)
    lagged_data = []
    for t in goal_datum['data']['times']:
        mask = fs.filter_on_times(datum, before=(t - min_lag))
        history = fs.select(datum, mask)['data']['values'][-max_history:]
        # Left-pad with zeros when fewer than max_history values exist.
        shortfall = max_history - len(history)
        if shortfall > 0:
            history = [0.0] * shortfall + history
        lagged_data.append(history)
    formatted = dm.fillna(dm.interpolate(datum, goal_datum), 0)
    formatted['data']['lagged'] = lagged_data
    return formatted
def ED_obj(alarm, goal_datum, threshold=2.0,
           min_time_above_threshold=datetime.timedelta(weeks=3),
           min_time_between_events=datetime.timedelta(weeks=8)):
    r"""Objective function for Early Event Detection.

    Parameters
    ----------
    alarm : dict
        Including date and whether or not alarm is triggered at that date.
    goal_datum : dictionary
        The gold standard data source.
    threshold : float
        Event baseline. When the value is larger than the threshold, there
        is an event. Otherwise, there is no event.
    min_time_above_threshold : datetime.timedelta
        Minimum amount of time above threshold required post-crossing to
        qualify as event.
    min_time_between_events : datetime.timedelta
        Minimum buffer between potential events.

    Returns
    -------
    average_obj : number
        The average of the objective value.
    weeks_ahead_list : list
        How many weeks ahead in an event the alarm is triggered
        ('NA' when no alarm fired inside the event window).
    """
    events = find_events(goal_datum, threshold=threshold,
                         min_time_above_threshold=min_time_above_threshold,
                         min_time_between_events=min_time_between_events)
    # Loop-invariant half-width of the per-event evaluation window.
    half_window = min_time_between_events / 2
    alarm0 = start(alarm)
    total_obj = 0
    weeks_ahead_list = []
    for event in events:
        f = fs.filter_on_times(alarm0,
                               after=event - half_window,
                               before=event + half_window)
        event_alarm = fs.select(alarm0, f)
        alarm_times = [
            t for t, v in zip(event_alarm['data']['times'],
                              event_alarm['data']['values']) if v == True
        ]
        if alarm_times:
            first_alarm = min(alarm_times)
            # Weeks between the window start and the first alarm; larger
            # (earlier alarm) yields a larger exp(diff) reward below.
            diff = (event - half_window - first_alarm).days / 7
            weeks_ahead = (first_alarm - event).days / 7
        else:
            # No alarm fired in the window: exp(-inf) == 0 contribution.
            diff = -np.inf
            weeks_ahead = 'NA'
        total_obj += np.exp(diff)
        weeks_ahead_list.append(weeks_ahead)
    average_obj = total_obj / len(events) if events else 0
    return average_obj, weeks_ahead_list
def test_on_filter(test_func, data_sources, bool_mask):
    """Apply ``test_func`` to the data sources restricted by ``bool_mask``.

    :param test_func: callable taking a list of data dictionaries
    :param data_sources: list of data dictionaries in standard format
    :param bool_mask: boolean filter applied to each data source
    :return: whatever ``test_func`` returns on the restricted sources
    """
    restricted = []
    for source in data_sources:
        restricted.append(fs.select(source, bool_mask))
    return test_func(restricted)
def train_on_filter(train_func, goal_datum, data_sources, bool_mask):
    """Apply ``train_func`` to the goal and data sources restricted by
    ``bool_mask``.

    :param train_func: callable taking (goal datum, list of data dicts)
    :param goal_datum: gold standard data dictionary
    :param data_sources: list of data dictionaries in standard format
    :param bool_mask: boolean filter applied to the goal and each source
    :return: whatever ``train_func`` returns on the restricted inputs
    """
    restricted_goal = fs.select(goal_datum, bool_mask)
    restricted_sources = [fs.select(source, bool_mask)
                          for source in data_sources]
    return train_func(restricted_goal, restricted_sources)