示例#1
0
def generate_goal(datum,
                  threshold=2.0,
                  min_time_above_threshold=datetime.timedelta(weeks=3),
                  keep_data_within=datetime.timedelta(weeks=5),
                  event_boundary=datetime.timedelta(weeks=1)):
    """Generates a three-class goal for early detection from datum.

    Events are located with find_events.  Around every event three time
    windows are built with fs.filter_on_times and labelled:

    * 0 -- "before": after ``event - keep_data_within`` and
      before ``event - event_boundary``
    * 1 -- "during": after ``event - event_boundary`` and
      before ``event + event_boundary``
    * 2 -- "after": after ``event + event_boundary`` and
      before ``event + keep_data_within``

    The per-event windows of each class are merged with method='any', the
    selected values are overwritten by the class label, and the three
    labelled series are summed (missing entries count as 0) to form the goal.

    Parameters
    ----------
    :param datum: datum from which events are generated
    :param threshold: passed through to find_events
    :param min_time_above_threshold: passed through to find_events
    :param keep_data_within: outer half-width of the windows; also passed to
        find_events as min_time_between_events
    :param event_boundary: half-width of the "during" window

    :type datum: data dictionary in standard format
    :type keep_data_within: datetime.timedelta
    :type event_boundary: datetime.timedelta

    :return: (deep copy of datum whose 'data' holds the class labels,
        [merged before filter, merged during filter, merged after filter])
    :rtype: (data dictionary, list of three filters)
    """
    event_times = find_events(
        datum,
        threshold=threshold,
        min_time_above_threshold=min_time_above_threshold,
        min_time_between_events=keep_data_within)

    # One filter per event and per class (before / during / after).
    before_masks, during_masks, after_masks = [], [], []
    for event_time in event_times:
        before_masks.append(
            fs.filter_on_times(datum,
                               after=event_time - keep_data_within,
                               before=event_time - event_boundary))
        during_masks.append(
            fs.filter_on_times(datum,
                               after=event_time - event_boundary,
                               before=event_time + event_boundary))
        after_masks.append(
            fs.filter_on_times(datum,
                               after=event_time + event_boundary,
                               before=event_time + keep_data_within))

    # Merge the per-event filters of each class and overwrite the selected
    # values with the class label (0 = before, 1 = during, 2 = after).
    merged_masks = []
    labelled_segments = []
    for label, masks in enumerate((before_masks, during_masks, after_masks)):
        merged = fs.merge_filters(masks, method='any')
        segment = fs.select(datum, merged)
        segment['data']['values'] = [label] * len(segment['data']['values'])
        merged_masks.append(merged)
        labelled_segments.append(segment)

    goal = copy.deepcopy(datum)
    frame_before, frame_during, frame_after = [
        dm.raw_to_pandas(segment['data']) for segment in labelled_segments
    ]
    # Entries missing from one operand are treated as 0 by fill_value.
    during_plus_after = frame_during.add(frame_after, fill_value=0)
    combined = frame_before.add(during_plus_after, fill_value=0)
    goal['data'] = dm.sort_raw(dm.pandas_to_raw(combined))

    return (goal, merged_masks)
示例#2
0
def find_peaks_between(datum, events):
    """Finds the peak time following each event.

    For every event the datum is restricted (via fs.filter_on_times) to the
    span between that event and the next one; for the last event the span
    ends at the final time stamp of datum.  find_max is applied to each
    restricted datum and the second element of its result is collected.

    :param datum: datum in which the peaks are searched
    :param events: event times; must be a list, since the final time stamp
        is appended by list concatenation
    :return: one peak time per event, in the order of events
    :rtype: list
    """
    last_time = datum['data']['times'][-1]
    edges = events + [last_time]

    peak_times = []
    for span_start, span_end in zip(edges, edges[1:]):
        span_filter = fs.filter_on_times(datum,
                                         after=span_start,
                                         before=span_end)
        _peak_value, peak_time = find_max(fs.select(datum, span_filter))
        peak_times.append(peak_time)

    return peak_times
示例#3
0
def null_dist(goal_datum, data_sources, threshold):
    r"""Generate null distributions each of which only includes observations
    below the threshold from each data source (non-outbreak period).

    The time points are chosen from the gold standard: a filter selecting the
    entries of ``goal_datum`` below ``threshold`` is built once and then
    applied to every data source.

    Parameters
    ----------
    goal_datum : dictionary
        The gold standard data source.
    data_sources : list
        List of data sources that will be evaluated. Each data source is a
        dictionary.
    threshold : float
        Event baseline. When the value is larger than the threshold, there is
        an event. Otherwise, there is no event.

    Returns
    -------
    empirical_dist_0 : list
        List of arrays. Each array contains observations below the
        threshold at time t from all data sources.
    mu_0 : array
        Elements in the array are mean values of m observations below the
        threshold from one data source.
    inv_Sigma_0 : array or number
        Inverse of Sigma_0, the covariance (``np.cov``) of the selected
        series. With several data sources this is the pseudoinverse of the
        covariance matrix. With a single data source it is the reciprocal
        of the variance, or 0 when the variance is zero.

    """
    f = fs.filter_on_values(goal_datum, below=threshold)
    series_list_null = [
        fs.select(d, f)['data']['values'] for d in data_sources
    ]
    # Convert to an array once; rebuilding it for every time index made the
    # construction of empirical_dist_0 needlessly quadratic.
    null_matrix = np.array(series_list_null)
    empirical_dist_0 = [
        null_matrix[:, t] for t in range(len(series_list_null[0]))
    ]
    mu_0 = np.mean(null_matrix, axis=1)
    Sigma_0 = np.cov(null_matrix)
    if len(data_sources) == 1:
        # np.cov of a single series is a scalar variance; guard against
        # division by zero for a constant series.
        if Sigma_0:
            inv_Sigma_0 = 1 / Sigma_0
        else:
            inv_Sigma_0 = 0
    else:
        # Uses pseudoinverse in case of singular matrix from missing data
        inv_Sigma_0 = np.linalg.pinv(Sigma_0)
    return empirical_dist_0, mu_0, inv_Sigma_0
示例#4
0
def generate_goal3(datum,
                   threshold=2.0,
                   min_time_above_threshold=datetime.timedelta(weeks=3),
                   min_time_between_events=datetime.timedelta(weeks=5),
                   event_boundary=datetime.timedelta(weeks=3)):
    """Generates a binary goal marking the span from each event to its peak.

    Events are located with find_events and the peak following each event
    with find_peaks_between.  The spans between an event and its peak are
    merged into one filter (method='any'); entries selected by that filter
    are labelled 1 and the entries left by fs.exclude are labelled 0.

    Parameters
    ----------
    :param datum: datum from which events are generated
    :param threshold: passed through to find_events
    :param min_time_above_threshold: passed through to find_events
    :param min_time_between_events: passed through to find_events
    :param event_boundary: not used by this function

    :type datum: data dictionary in standard format

    :return: (deep copy of datum whose 'data' holds the 0/1 labels,
        merged filter of the event-to-peak spans)
    :rtype: (data dictionary, filter)
    """
    event_times = find_events(
        datum,
        threshold=threshold,
        min_time_above_threshold=min_time_above_threshold,
        min_time_between_events=min_time_between_events)
    peak_times = find_peaks_between(datum, event_times)

    # One filter per event, covering the time from the event to its peak.
    rise_filters = []
    for idx, event_time in enumerate(event_times):
        rise_filters.append(
            fs.filter_on_times(datum,
                               after=event_time,
                               before=peak_times[idx]))
    f_after_merged = fs.merge_filters(rise_filters, method='any')

    outside = fs.exclude(datum, f_after_merged)
    outside['data']['values'] = [0] * len(outside['data']['values'])
    inside = fs.select(datum, f_after_merged)
    inside['data']['values'] = [1] * len(inside['data']['values'])

    goal = copy.deepcopy(datum)
    frame_outside = dm.raw_to_pandas(outside['data'])
    frame_inside = dm.raw_to_pandas(inside['data'])
    # Entries missing from one operand are treated as 0 by fill_value.
    combined = frame_outside.add(frame_inside, fill_value=0)
    goal['data'] = dm.sort_raw(dm.pandas_to_raw(combined))

    return (goal, f_after_merged)
示例#5
0
def filter_data(data,
                date_start='2010-11-22',
                date_end='2016-09-19',
                copy=True):
    """
    Filter data to make sure all data are within the same date range

    Every element of ``data`` is replaced by the result of fs.select with a
    filter built by fs.filter_on_times(after=date_start, before=date_end).

    :param data: sequence of datums to restrict
    :param date_start: passed to fs.filter_on_times as ``after``
    :param date_end: passed to fs.filter_on_times as ``before``
    :param copy: when truthy, work on a slice copy of ``data`` so the
        caller's container is left unchanged; otherwise the elements of
        ``data`` are replaced in place
    :return: the container holding the filtered datums (the copy, or
        ``data`` itself when ``copy`` is falsy)
    """
    # The slice is a shallow copy: only the container is duplicated.
    data_f = data[:] if copy else data

    for i, datum in enumerate(data_f):
        # Materialise the mask: in Python 3 it may be a one-shot iterator.
        bool_mask_list = list(
            fs.filter_on_times(datum, after=date_start, before=date_end))
        data_f[i] = fs.select(datum, bool_mask_list)
    return data_f
示例#6
0
def generate_lagged_data(datum,
                         goal_datum,
                         min_lag=datetime.timedelta(weeks=3),
                         max_history=5):
    """Formats datum for early detection.

    For every time stamp t of goal_datum, the last ``max_history`` values of
    datum selected by ``fs.filter_on_times(datum, before=t - min_lag)`` are
    collected.  Histories shorter than ``max_history`` are left-padded with
    0.0.  Missing values in datum are replaced by 0 beforehand.

    Parameters
    ----------
    :param datum: datum to be formatted
    :param goal_datum: goal datum generated from generate_goal
    :param min_lag: minimum amount of time to lag data
    :param max_history: maximum size of lagged dataset for each time point

    :type datum: data dictionary in standard format
    :type goal_datum: data dictionary
    :type min_lag: datetime.timedelta
    :type max_history: positive integer

    :return: datum interpolated onto goal_datum (NAs filled with 0), with
        the lagged data stored under key 'lagged' in dictionary 'data'
    :rtype: data dictionary in (appended) standard format
    """
    # NAs are filled with 0 automatically for now.
    datum = dm.fillna(datum, 0)

    histories = []
    for goal_time in goal_datum['data']['times']:
        cutoff = goal_time - min_lag
        past_filter = fs.filter_on_times(datum, before=cutoff)
        history = fs.select(datum, past_filter)['data']['values']
        history = history[-max_history:]
        # Left-pad so that every history has exactly max_history entries.
        shortfall = max_history - len(history)
        if shortfall > 0:
            history = [0.0] * shortfall + history
        histories.append(history)

    lagged_datum = dm.fillna(dm.interpolate(datum, goal_datum), 0)
    lagged_datum['data']['lagged'] = histories

    return lagged_datum
示例#7
0
def ED_obj(alarm,
           goal_datum,
           threshold=2.0,
           min_time_above_threshold=datetime.timedelta(weeks=3),
           min_time_between_events=datetime.timedelta(weeks=8)):
    r"""Objective function for Early Event Detection.

    Events are located in ``goal_datum`` with find_events.  For each event
    the alarms inside a window of half-width ``min_time_between_events / 2``
    around the event are inspected.  The event contributes ``exp(diff)`` to
    the objective, where ``diff`` is the number of weeks from the first
    alarm in the window back to the window start (so an alarm right at the
    window start contributes 1), and contributes 0 when no alarm is raised
    in the window.

    Parameters
    ----------
    alarm : dict
        Including date and whether or not alarm is triggered at that date.
    goal_datum : dictionary
        The gold standard data source.
    threshold : float
        Event baseline. When the value is larger than the threshold, there is
        an event. Otherwise, there is no event.
    min_time_above_threshold : datetime.timedelta
        Minimum amount of time above threshold required post-crossing to qualify
        as event.
    min_time_between_events : datetime.timedelta
        Minimum buffer between potential events. Half of it is also used as
        the half-width of the alarm window around each event.

    Returns
    -------
    average_obj : number
        The average of the objective value over all events; 0 when no event
        is found.
    weeks_ahead_list : list
        One entry per event: ``(first_alarm - event)`` in weeks (negative
        when the alarm precedes the event), or the string 'NA' when no alarm
        was raised in the event's window.

    """
    events = find_events(goal_datum,
                         threshold=threshold,
                         min_time_above_threshold=min_time_above_threshold,
                         min_time_between_events=min_time_between_events)
    alarm0 = start(alarm)
    # Loop-invariant half-width of the alarm window around each event.
    half_window = min_time_between_events / 2

    total_obj = 0
    weeks_ahead_list = []
    for event in events:
        window_start = event - half_window
        f = fs.filter_on_times(alarm0,
                               after=window_start,
                               before=event + half_window)
        event_alarm = fs.select(alarm0, f)
        # `== True` is deliberate: plain truthiness would also count NaN
        # (and any other non-zero value) as a triggered alarm.
        alarm_times = [
            t for t, v in zip(event_alarm['data']['times'],
                              event_alarm['data']['values']) if v == True
        ]
        if alarm_times:
            first_alarm = min(alarm_times)
            diff = (window_start - first_alarm).days / 7
            weeks_ahead = (first_alarm - event).days / 7
        else:
            # exp(-inf) == 0: an event without alarm adds nothing.
            diff = -np.inf
            weeks_ahead = 'NA'
        total_obj = total_obj + np.exp(diff)
        weeks_ahead_list.append(weeks_ahead)
    if len(events) > 0:
        average_obj = total_obj / len(events)
    else:
        average_obj = 0
    return average_obj, weeks_ahead_list
示例#8
0
def test_on_filter(test_func, data_sources, bool_mask):
    """Applies test_func to the data sources restricted by bool_mask.

    :param test_func: callable taking the list of restricted data sources
    :param data_sources: iterable of datums to restrict with fs.select
    :param bool_mask: filter handed to fs.select for every data source
    :return: whatever test_func returns

    NOTE(review): the same bool_mask object is reused for every data source;
    presumably it must be re-iterable (not a one-shot iterator) -- confirm
    against callers.
    """
    restricted_sources = [
        fs.select(source, bool_mask) for source in data_sources
    ]
    result = test_func(restricted_sources)
    return result
示例#9
0
def train_on_filter(train_func, goal_datum, data_sources, bool_mask):
    """Applies train_func to the goal and data sources restricted by bool_mask.

    :param train_func: callable taking (restricted goal datum, list of
        restricted data sources)
    :param goal_datum: goal datum to restrict with fs.select
    :param data_sources: iterable of datums to restrict with fs.select
    :param bool_mask: filter handed to fs.select for the goal and for every
        data source
    :return: whatever train_func returns

    NOTE(review): the same bool_mask object is reused for every fs.select
    call; presumably it must be re-iterable (not a one-shot iterator) --
    confirm against callers.
    """
    restricted_goal = fs.select(goal_datum, bool_mask)
    restricted_sources = [
        fs.select(source, bool_mask) for source in data_sources
    ]
    result = train_func(restricted_goal, restricted_sources)
    return result