def sites_and_env(session, species, layer_names, glob_name, glob_channels, buffer_width, n_pseudoabsences, dblock=None, simdata=False):
    """
    Queries the DB to get a list of locations. Writes it out along with matching
    extractions of the requested layers to a temporary csv file, which serves the
    dual purpose of caching the extraction and making it easier to get data into
    the BRT package.

    Parameters
    ----------
    session, species
        Forwarded unchanged to ``sites_as_ndarray``. ``species[1]`` is used as
        the species label in the simulation log message.
    layer_names
        Sequence of strings; each is passed to ``extract_environment`` and
        joined into the cache key.
    glob_name
        String passed to ``extract_environment`` for the per-channel layers;
        its basename is used as the prefix of the channel column names.
    glob_channels
        Iterable of channel values. One column is produced per channel using
        ``postproc=lambda d: d == ch``.
    buffer_width
        Forwarded to ``get_pseudoabsences`` when drawing the pseudoabsences.
    n_pseudoabsences
        Number of pseudoabsence rows; also the number of simulated presences
        when ``simdata`` is true.
    dblock
        Passed as ``lock=`` to every ``extract_environment`` call.
    simdata
        If true, the observed locations are replaced by
        ``get_pseudoabsences(eo, -1, ...)`` and all of them are marked found.

    Returns
    -------
    (fname, pseudoabsences, x)
        ``fname`` is the cache file name (relative to ``anopheles-caches``),
        ``pseudoabsences`` the freshly drawn pseudoabsence locations and ``x``
        the presence locations stacked on top of the pseudoabsences. Note that
        ``x`` is returned unfiltered: rows dropped from the csv because of NaNs
        are still present in ``x``.

    Raises
    ------
    ValueError
        If every environmental layer has exactly one distinct non-NaN value
        (this includes the case where there are no layers at all).
    """
    breaks, x, found, zero, others_found, multipoints, eo = sites_as_ndarray(session, species)

    if simdata:
        print('Process %i simulating presences for species %s.' % (multiprocessing.current_process().ident, species[1]))
        x = get_pseudoabsences(eo, -1, n_pseudoabsences, layer_names, glob_name)
        found = np.ones(n_pseudoabsences)

    # Cache key: built from the locations/found flags *before* pseudoabsences are
    # appended, plus the layer configuration. Kept byte-for-byte compatible with
    # the historical key so existing cache files remain valid.
    # NOTE(review): str(x) is NumPy's printed form, which abbreviates large
    # arrays with '...', so distinct large inputs could share a key - confirm
    # whether that matters here.
    # NOTE(review): buffer_width and n_pseudoabsences are not part of the key.
    fname = hashlib.sha1(
        str(x) + found.tostring()
        + glob_name + 'channel'.join([str(i) for i in glob_channels])
        + 'layer'.join(layer_names)
    ).hexdigest() + '.csv'

    pseudoabsences = get_pseudoabsences(eo, buffer_width, n_pseudoabsences, layer_names, glob_name)

    # Keep only the locations flagged as found, then append the pseudoabsences
    # as the "not found" class.
    x_found = x[np.where(found)]
    x = np.vstack((x_found, pseudoabsences))
    found = np.concatenate((np.ones(len(x_found)), np.zeros(n_pseudoabsences)))

    # NOTE(review): on a cache hit the csv on disk is left untouched while the
    # pseudoabsences returned below are freshly drawn - verify that callers
    # expect this.
    if fname not in os.listdir('anopheles-caches'):

        def channel_layer(ch):
            # A dedicated function gives each postproc closure its own ``ch``.
            key = os.path.basename(glob_name) + '_' + str(ch)
            values = extract_environment(glob_name, x, postproc=lambda d: d == ch, id_=ch, lock=dblock)[1]
            return key, values

        # List of (key, value) tuples: plain layers first, then one per channel.
        env_layers = [extract_environment(ln, x, lock=dblock) for ln in layer_names]
        env_layers += [channel_layer(ch) for ch in glob_channels]

        arrays = [(found > 0).astype('int')] + [layer[1] for layer in env_layers]
        names = ['found'] + [layer[0] for layer in env_layers]

        data = np.rec.fromarrays(arrays, names=','.join(names))

        # True for every row that contains at least one NaN.
        nancheck = np.array([np.any(np.isnan(row.tolist())) for row in data])
        if np.any(nancheck):
            print('There were some NaNs in the data, probably points in the sea')

        # Count the layers that take a single distinct value once NaNs are
        # ignored. np.logical_not replaces the old ``True - mask`` idiom, which
        # newer NumPy versions reject for boolean arrays.
        singletons = 0
        for layer in env_layers:
            values = layer[1]
            valid = values[np.where(np.logical_not(np.isnan(values)))]
            if len(set(valid)) == 1:
                singletons += 1
        if singletons == len(env_layers):
            raise ValueError('All environmental layer evaluations contained only single values.')

        # Drop the NaN rows before writing the cache file.
        data = data[np.where(np.logical_not(nancheck))]
        rec2csv(data, os.path.join('anopheles-caches', fname))

    return fname, pseudoabsences, x
def validate(M, session, x, a, burn=0, trace_thin=1):
    """
    Computes posterior predictive distributions for all validation metrics
    at holdout locations x. The true classification is a.

    Every ``trace_thin``-th sample of the 'p' trace, starting at index
    ``burn``, is called on the holdout design matrix; each function in
    ``simple_assessments`` is then applied to the resulting prediction and
    ``a``. ``session`` is accepted but not used in this function.

    Returns a 1-tuple ``(results,)`` where ``results`` maps each assessment's
    ``__name__`` to an array with one entry per retained sample, plus a
    ``'roc'`` entry computed by ``roc`` from all retained predictions.

    NOTE(review): the 1-tuple comes from a trailing comma on the return
    statement - confirm that callers rely on it.
    """
    n_samples = len(M.db._h5file.root.chain0.PyMCsamples)
    species = M.species  # not used below; attribute access retained as-is

    metric_names = [metric.__name__ for metric in simple_assessments]
    per_metric = dict((name, []) for name in metric_names)

    # Environmental covariates at the holdout points, one column per variable.
    # x is scaled by 180/pi before the lookup (presumably radians -> degrees;
    # verify against extract_environment).
    # NOTE(review): sites_and_env indexes extract_environment's return value as
    # a (key, value) pair, whereas here it is used directly as array data -
    # confirm which is intended.
    env_columns = [extract_environment(var, x * 180. / np.pi) for var in M.env_variables]
    env_x = np.array(env_columns).T
    full_x = np.hstack((x, env_x))

    p_trace = M.trace('p')[:]
    predictions = []
    for idx in xrange(burn, n_samples, trace_thin):
        p = p_trace[idx](full_x)
        predictions.append(p)
        for metric in simple_assessments:
            per_metric[metric.__name__].append(metric(p, a))

    results = dict((name, np.asarray(per_metric[name])) for name in metric_names)
    results['roc'] = roc(np.asarray(predictions), a)

    return (results,)