Example #1
from collections import defaultdict

# Assumed imports: getTTDistribution lives in
# mm.arterial_hkt.mixture_functions (per the docstring below); `vp` is the
# validation-path selection module used throughout, and the plot_* helpers
# are defined alongside this function.
from mm.arterial_hkt.mixture_functions import getTTDistribution


def validate_on_paths(models,
                      net,
                      estimation_sampling_process,
                      estimation_sampling_parameters,
                      min_validation_path_lengths=None,
                      **param):
  """ Main function which computes validate_on_paths metrics and plots
  Input:
  models: list of tuples where each tuple is (data, gmrf, hmm, name)
  net: network object
  estimation_sampling_process: string, see getTTDistribution Found at: mm.arterial_hkt.mixture_functions
  estimation_sampling_parameters: dict, see getTTDistribution Found at: mm.arterial_hkt.mixture_functions
  param: constant variables defined in pipeline_script_#.py
  """
  # The path-length constraint can be given either as a list here or as a
  # single value in param, but not both.
  assert not min_validation_path_lengths or ("min_validation_path_length" not in param)
  if min_validation_path_lengths:
    validation_paths = []
    # Select validation paths for each requested minimum length
    for min_validation_path_length in min_validation_path_lengths:
      param2 = dict(param, min_validation_path_length=min_validation_path_length)
      validation_paths += vp.select_validation_paths(models[0][0], net, debug=False, **param2)
  else:
    validation_paths = vp.select_validation_paths(models[0][0], net, debug=False, **param)
  print "Len val paths,",len(validation_paths)
  learned_dist = defaultdict(dict)   # val_path -> model name -> fitted mixture
  validation_tt = defaultdict(list)  # val_path -> observed travel times
  lmrs = defaultdict(dict)           # val_path -> model name -> quantile bounds
  
  for i, (all_data, gmrf, hmm, name) in enumerate(models):
    validation_data = vp.select_validation_data_given_paths(all_data, validation_paths)
    for val_path, data in validation_data.items():
      gmix = getTTDistribution(list(val_path), gmrf, hmm,
                               sampling_procedure=estimation_sampling_process,
                               sampling_parameters=estimation_sampling_parameters)
      learned_dist[val_path][name] = gmix
      
      if i == 0:
        val_tt = [sum(obs.value for obs in d) for d in data]
        validation_tt[val_path] = val_tt
      
      # Quantile bounds used by the scatter/box plot: the 5th, 25th, 50th,
      # 75th, and 95th percentiles of the learned distribution.
      bounds = (gmix.inverseCumulative(0.05),
                gmix.inverseCumulative(0.25),
                gmix.inverseCumulative(0.5),
                gmix.inverseCumulative(0.75),
                gmix.inverseCumulative(0.95))
      lmrs[val_path][name] = bounds
  plot_scatter_box(lmrs, validation_tt)
  plot_pdf(validation_tt, learned_dist)
  plot_cdf(validation_tt, learned_dist, **param)
  plot_pp_plot(validation_tt, learned_dist)
  plot_qq_plot(validation_tt, learned_dist)
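
For context, a call from a pipeline script might look like the sketch below. The model objects, the sampling-procedure name, and the parameter values are illustrative placeholders, not taken from the original pipeline.

# Hypothetical usage sketch: `data`, `gmrf`, `hmm`, and `net` are assumed to
# be produced by earlier pipeline steps, and all keyword values below are
# placeholders.
models = [(data, gmrf, hmm, "gmrf-hmm")]
validate_on_paths(models,
                  net,
                  estimation_sampling_process="sampling",  # placeholder name
                  estimation_sampling_parameters={"num_samples": 1000},
                  min_validation_path_lengths=[5, 10])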
Example #2
from collections import defaultdict

import numpy as np
from scipy.integrate import simps

# getTTDistribution and getTTDistributionGivenStop are assumed to come from
# mm.arterial_hkt.mixture_functions, as in example #1.
from mm.arterial_hkt.mixture_functions import getTTDistribution, getTTDistributionGivenStop


def model_validation(data,
                     gmrf_est,
                     hmm,
                     net,
                     confidence_levels,
                     given_mode,
                     estimation_sampling_process,
                     estimation_sampling_parameters,
                     **param):
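  """Validates a travel-time model on observed trajectories.

  Bins the log-likelihood of each trajectory's total travel time by path
  length, and accumulates empirical coverage and percentile curves at the
  given confidence levels (via tt_bound_quantiles), to be compared against
  the ideal diagonal.
  """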
  ll = defaultdict(list)  # length bin index -> per-trajectory log-likelihoods
  conf = np.zeros(len(confidence_levels), dtype=float)
  percentile = np.zeros(len(confidence_levels), dtype=float)
  length_bin_size = param['length_bin_size']
  max_nb_paths = param['max_nb_paths']
  for traj_obs in data[:max_nb_paths]:
    tt = sum([obs.value for obs in traj_obs.observations])
    length = sum([net[obs.varId.nodeId].length for obs in traj_obs.observations])
    if given_mode:
      dist = getTTDistributionGivenStop(gmrf_est,
                                        [obs.varId.nodeId for obs in traj_obs.observations],
                                        [obs.varId.mode for obs in traj_obs.observations])
    else:
      dist = getTTDistribution([obs.varId.nodeId for obs in traj_obs.observations],
                               gmrf_est,
                               hmm,
                               sampling_procedure=estimation_sampling_process,
                               sampling_parameters=estimation_sampling_parameters)
    ll[int(length / length_bin_size)].append(dist.logProbability(tt))
    (c, q) = tt_bound_quantiles(tt, dist, confidence_levels)
    conf += c
    percentile += q
  # Normalize by the number of evaluated trajectories and pad the curves
  # with the trivial endpoints at levels 0 and 1.
  nb_eval = float(len(data[:max_nb_paths]))
  conf = np.hstack(([0], conf / nb_eval, [1]))
  percentile = np.hstack(([0], percentile / nb_eval, [1]))
  confidence_levels = np.hstack(([0], confidence_levels, [1]))
  
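  # Deviation of each empirical curve from the ideal diagonal, split into
  # overshooting and undershooting parts and integrated with Simpson's rule.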
  conf_up_area = simps(np.maximum(conf - confidence_levels, 0), confidence_levels)
  conf_down_area = simps(np.maximum(confidence_levels - conf, 0), confidence_levels)

  percentile_up_area = simps(np.maximum(percentile - confidence_levels, 0), confidence_levels)
  percentile_down_area = simps(np.maximum(confidence_levels - percentile, 0), confidence_levels)
  
  # Report the median and standard deviation of the log-likelihoods for
  # each length bin that has enough validation points.
  ll_res = {}
  for b, ll_val in ll.items():
    if len(ll_val) > param['min_nb_validation_points']:
      ll_res[b] = (np.median(ll_val), np.std(ll_val))
  return ll_res, [conf, conf_up_area, conf_down_area], [percentile, percentile_up_area, percentile_down_area]
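
The helper tt_bound_quantiles is not shown in these examples. Judging only from its call site above (its outputs are averaged into the coverage and percentile curves), a minimal sketch might look like the following; the dist.cumulative method and the central-interval convention are assumptions, not confirmed by the source.

# Hypothetical sketch of tt_bound_quantiles, inferred from its usage above.
# `dist.cumulative` is an assumed CDF method mirroring the
# `inverseCumulative` method used in example #1.
def tt_bound_quantiles(tt, dist, confidence_levels):
  coverage = np.zeros(len(confidence_levels), dtype=float)
  quantile = np.zeros(len(confidence_levels), dtype=float)
  p = dist.cumulative(tt)
  for i, level in enumerate(confidence_levels):
    # Indicator: tt falls inside the central interval holding `level` mass.
    lower = dist.inverseCumulative(0.5 - level / 2.0)
    upper = dist.inverseCumulative(0.5 + level / 2.0)
    coverage[i] = float(lower <= tt <= upper)
    # Indicator: the observed travel time lies below the `level` quantile.
    quantile[i] = float(p <= level)
  return coverage, quantile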