def arrange_data(data, pop, periods=1):
    """Build the working data dictionary from raw notification counts.

    Works for yearly data as well as for data reported several times
    per year.

    Input:
        data: dict with 'years' (time points, each indexed as
            (year, period)) and 'abs_notif' (absolute notifications,
            aligned by position with 'years')
        pop: population lookup, indexed by year
        periods: int, number of periods per year (1 for yearly data)
    Output:
        dict with keys 'abs_notif', 'notif' (notifications per 100,000
        population), 'rel_change', 'interpolated', 'times' and 'years'
        ('times' and 'years' both hold data['years'])
    """
    stamps = data['years']  # each entry is (year, period)

    # Notification rate per 100,000 population of the matching year
    rates = []
    for idx, stamp in enumerate(stamps):
        rates.append(data['abs_notif'][idx] * 10.0**5 / pop[stamp[0]])

    # Relative change with respect to the previous point; the first point
    # has no predecessor, so it is NaN
    changes = [float('NaN')]
    for idx in range(1, len(stamps)):
        before, after = stamps[idx - 1], stamps[idx]
        # Two points are successive when they are neighbouring periods of
        # the same year, or the last period of one year followed by the
        # first period of the next year
        consecutive = (
            (before[0] == after[0] and before[1] == after[1] - 1)
            or (before[0] == after[0] - 1
                and before[1] == periods
                and after[1] == 1)
        )
        if consecutive:
            # 1 is added to the denominator to avoid division by 0
            step = (rates[idx] - rates[idx - 1]) / float(1 + rates[idx - 1])
        else:
            step = float('NaN')
        changes.append(step)

    # Time points without data, then interpolation across those gaps
    missing, missing_intervals = ut._missing(stamps, periods)
    interpolated = _interpolate(stamps, rates, missing, missing_intervals,
                                periods)

    return {
        'abs_notif': data['abs_notif'],
        'notif': rates,
        'rel_change': changes,
        'interpolated': interpolated,
        'times': stamps,
        'years': data['years'],
    }
def _select_data(first, data, periods=1, outliers=None, threshold=5,
                 plot=False):
    """Cut the data down to the points used for projection.

    Input:
        first: starting time point; it is compared by equality against
            the entries of data['times'], so it must be given in the same
            format as those entries
        data: dict with keys 'times', 'notif', 'abs_notif', 'rel_change'
            and 'interpolated'
        periods: int, data frequency: 1 for years, 4 for quarters...
        outliers: list of time points considered outliers; they are
            dropped before the data is cut
        threshold: int, minimum number of points required for projection
            (excluding outliers)
        plot: boolean, whether we want to produce a plot
    Output:
        enough_data: boolean, whether enough data was selected
        new_data: dict, containing the selected data. 'notif',
            'abs_notif', 'times' and 'rel_change' start at `first`;
            'interpolated' is not cut and covers the whole series
            (outliers excluded).
        figdata: result of _plot_time_series(new_data) if plot is set,
            otherwise None
    Raises:
        ValueError: if `first` is not among the selected time points
            (for example because it is missing or listed as an outlier)
    """
    new_data = {}

    if outliers:
        # Rewrite the lists excluding outliers
        years_new = []
        notif_new = []
        abs_notif_new = []
        changes_new = []
        for i, time_point in enumerate(data['times']):
            if time_point not in outliers:
                years_new.append(time_point)
                notif_new.append(data['notif'][i])
                abs_notif_new.append(data['abs_notif'][i])
                changes_new.append(data['rel_change'][i])
        # Re-run the interpolation to accommodate the removed outliers
        missing, missing_int = ut._missing(years_new, periods)
        new_data['interpolated'] = _interpolate(years_new, notif_new,
                                                missing, missing_int,
                                                periods)
    else:
        # No outliers: use the original data as is
        years_new = data['times']
        notif_new = data['notif']
        abs_notif_new = data['abs_notif']
        changes_new = data['rel_change']
        new_data['interpolated'] = data['interpolated']

    # Find the index of the starting point. Fail with a clear message
    # instead of an UnboundLocalError when it cannot be found.
    first_i = next(
        (i for i, time_point in enumerate(years_new) if time_point == first),
        None,
    )
    if first_i is None:
        raise ValueError(
            'starting point {!r} not found in the selected data '
            '(it may be missing or listed as an outlier)'.format(first)
        )

    # Cut the data lists starting from the first point
    new_data['notif'] = notif_new[first_i:]
    new_data['abs_notif'] = abs_notif_new[first_i:]
    new_data['times'] = years_new[first_i:]
    new_data['rel_change'] = changes_new[first_i:]

    # Check if enough data is available
    enough_data = len(new_data['notif']) >= threshold

    if plot:
        plt.clf()
        figdata = _plot_time_series(new_data)
    else:
        figdata = None

    return enough_data, new_data, figdata