def validate_on_paths(models, net, estimation_sampling_process,
                      estimation_sampling_parameters,
                      min_validation_path_lengths=None, **param):
    """Compute validation metrics on selected validation paths and plot them.

    Input:
        models: list of tuples where each tuple is (data, gmrf, hmm, name)
        net: network object
        estimation_sampling_process: string, see getTTDistribution
            Found at: mm.arterial_hkt.mixture_functions
        estimation_sampling_parameters: dict, see getTTDistribution
            Found at: mm.arterial_hkt.mixture_functions
        min_validation_path_lengths: optional list of minimum path lengths;
            one validation-path selection is run per entry.  Mutually
            exclusive with param["min_validation_path_length"].
        param: constant variables defined in pipeline_script_#.py
    """
    # Either a list of lengths is given here, or a single length comes in via
    # param -- never both.
    assert not min_validation_path_lengths or ("min_validation_path_length" not in param)
    if min_validation_path_lengths:
        validation_paths = []
        # Get one batch of validation paths for each requested minimum length.
        for min_validation_path_length in min_validation_path_lengths:
            param2 = dict(param)
            param2["min_validation_path_length"] = min_validation_path_length
            validation_paths += vp.select_validation_paths(models[0][0], net, debug=False, **param2)
    else:
        validation_paths = vp.select_validation_paths(models[0][0], net, debug=False, **param)
    print("Len val paths, %d" % len(validation_paths))
    learned_dist = defaultdict(dict)   # val_path -> {model name -> learned mixture}
    validation_tt = defaultdict(list)  # val_path -> observed travel times
    lmrs = defaultdict(dict)           # val_path -> {model name -> percentile bounds}
    for i, (all_data, gmrf, hmm, name) in enumerate(models):
        validation_data = vp.select_validation_data_given_paths(all_data, validation_paths)
        for val_path, data in validation_data.items():
            gmix = getTTDistribution(list(val_path), gmrf, hmm,
                                     sampling_procedure=estimation_sampling_process,
                                     sampling_parameters=estimation_sampling_parameters)
            learned_dist[val_path][name] = gmix
            if i == 0:
                # Observed travel time of a trajectory = sum of its per-link
                # observation values.  Only computed for the first model since
                # the observed data is shared across models.
                val_tt = [sum([obs.value for obs in d]) for d in data]
                validation_tt[val_path] = val_tt
            # 5/25/50/75/95 percentiles of the learned distribution, used by
            # the scatter/box plot below.
            bounds = (gmix.inverseCumulative(0.05),
                      gmix.inverseCumulative(0.25),
                      gmix.inverseCumulative(0.5),
                      gmix.inverseCumulative(0.75),
                      gmix.inverseCumulative(0.95))
            lmrs[val_path][name] = bounds
    plot_scatter_box(lmrs, validation_tt)
    plot_pdf(validation_tt, learned_dist)
    plot_cdf(validation_tt, learned_dist, **param)
    plot_pp_plot(validation_tt, learned_dist)
    plot_qq_plot(validation_tt, learned_dist)
def model_validation(data, gmrf_est, hmm, net, confidence_levels, given_mode,
                     estimation_sampling_process, estimation_sampling_parameters,
                     **param):
    """Validate a learned travel-time model against observed trajectories.

    Input:
        data: sequence of trajectory observations; each element has an
            .observations list whose items expose .value (travel time) and
            .varId (with .nodeId and .mode)
        gmrf_est: estimated GMRF model
        hmm: HMM model (unused when given_mode is True)
        net: network object, indexed by node id; links expose .length
        confidence_levels: 1-D array of confidence levels to evaluate
        given_mode: bool; if True, condition the travel-time distribution on
            the observed stop/go modes (getTTDistributionGivenStop), otherwise
            marginalize over modes (getTTDistribution)
        estimation_sampling_process: string, see getTTDistribution
        estimation_sampling_parameters: dict, see getTTDistribution
        param: must contain 'length_bin_size', 'max_nb_paths' and
            'min_nb_validation_points'
    Output:
        (ll_res,
         [conf, conf_up_area, conf_down_area],
         [percentile, percentile_up_area, percentile_down_area])
        where ll_res maps a path-length bin to (median, std) of the
        log-probabilities of the observed travel times in that bin.
    """
    ll = defaultdict(list)
    conf = np.zeros(len(confidence_levels), dtype=float)
    percentile = np.zeros(len(confidence_levels), dtype=float)
    length_bin_size = param['length_bin_size']
    max_nb_paths = param['max_nb_paths']
    # Slice once instead of re-slicing (and copying) the list three times.
    eval_data = data[:max_nb_paths]
    for traj_obs in eval_data:
        tt = sum([obs.value for obs in traj_obs.observations])
        length = sum([net[obs.varId.nodeId].length for obs in traj_obs.observations])
        if given_mode:
            dist = getTTDistributionGivenStop(
                gmrf_est,
                [obs.varId.nodeId for obs in traj_obs.observations],
                [obs.varId.mode for obs in traj_obs.observations])
        else:
            dist = getTTDistribution(
                [obs.varId.nodeId for obs in traj_obs.observations],
                gmrf_est, hmm,
                sampling_procedure=estimation_sampling_process,
                sampling_parameters=estimation_sampling_parameters)
        # Bin the log-likelihood of the observed travel time by path length.
        ll[int(length / length_bin_size)] += [dist.logProbability(tt)]
        (c, q) = tt_bound_quantiles(tt, dist, confidence_levels)
        conf += c
        percentile += q
    nb_paths = float(len(eval_data))
    # Average over paths and pad with the trivial 0/1 endpoints so the
    # calibration curves span the full [0, 1] range for the area computation.
    conf = conf / nb_paths
    conf = np.hstack(([0], conf, [1]))
    percentile = percentile / nb_paths
    percentile = np.hstack(([0], percentile, [1]))
    confidence_levels = np.hstack(([0], confidence_levels, [1]))
    # Positive parts of the deviation of each empirical curve from the ideal
    # diagonal (elementwise max with zero), integrated with Simpson's rule.
    conf_up_area = np.max(np.vstack((conf - confidence_levels, np.zeros_like(conf))), axis=0)
    conf_down_area = np.max(np.vstack((confidence_levels - conf, np.zeros_like(conf))), axis=0)
    conf_up_area = simps(conf_up_area, confidence_levels)
    conf_down_area = simps(conf_down_area, confidence_levels)
    percentile_up_area = np.max(np.vstack((percentile - confidence_levels, np.zeros_like(percentile))), axis=0)
    percentile_down_area = np.max(np.vstack((confidence_levels - percentile, np.zeros_like(percentile))), axis=0)
    percentile_up_area = simps(percentile_up_area, confidence_levels)
    percentile_down_area = simps(percentile_down_area, confidence_levels)
    ll_res = {}
    # Only report length bins that have enough validation points to be
    # statistically meaningful.
    for b, ll_val in ll.items():
        if len(ll_val) > param['min_nb_validation_points']:
            ll_res[b] = (np.median(ll_val), np.std(ll_val))
    return ll_res, [conf, conf_up_area, conf_down_area], [percentile, percentile_up_area, percentile_down_area]