Пример #1
0
def sites_and_env(session, species, layer_names, glob_name, glob_channels, buffer_width, n_pseudoabsences, dblock=None, simdata=False):
    """
    Queries the DB to get a list of locations, appends pseudoabsences, and
    writes the presence flag along with matching extractions of the requested
    layers to a csv file in the 'anopheles-caches' directory. The file serves
    the dual purpose of caching the extraction and making it easier to get
    data into the BRT package. If a file with the computed name is already
    present in that directory, the extraction is skipped.

    Parameters
    ----------
    session, species
        Passed straight through to sites_as_ndarray. species[1] is also used
        in the log message printed when simdata is true.
    layer_names : sequence of str
        Each name is passed to extract_environment; each result is indexed as
        a (column name, values) pair.
    glob_name : str
        Layer that is extracted once per entry of glob_channels, with
        postproc ``d == channel``. The column is named
        ``basename(glob_name) + '_' + str(channel)``.
    glob_channels : sequence
        Channel values for glob_name.
    buffer_width, n_pseudoabsences
        Passed to get_pseudoabsences to generate the pseudoabsence points.
    dblock : optional
        Forwarded to extract_environment as ``lock``.
    simdata : bool
        If true, the queried locations are replaced by n_pseudoabsences points
        obtained from get_pseudoabsences with a buffer width of -1, all
        flagged as found.

    Returns
    -------
    (fname, pseudoabsences, x)
        fname is the csv file name (relative to 'anopheles-caches'),
        pseudoabsences is the array returned by get_pseudoabsences, and x is
        the found locations stacked on top of the pseudoabsences. Rows that
        contain NaNs are dropped from the csv but NOT from the returned x.

    Raises
    ------
    ValueError
        If every environmental layer has exactly one distinct non-NaN value.
    """

    breaks, x, found, zero, others_found, multipoints, eo = sites_as_ndarray(session, species)

    if simdata:
        print('Process %i simulating presences for species %s.'%(multiprocessing.current_process().ident,species[1]))
        x = get_pseudoabsences(eo, -1, n_pseudoabsences, layer_names, glob_name)
        found = np.ones(n_pseudoabsences)

    # The cache key is built from the locations and flags as they are *before*
    # the pseudoabsences are appended, plus the layer configuration.
    # NOTE(review): if x is a large ndarray, str(x) is NumPy's abbreviated
    # repr, so distinct site sets could in principle share a key -- confirm.
    # NOTE(review): the key does not depend on the pseudoabsences, so if
    # get_pseudoabsences is not deterministic a cache hit pairs an old csv
    # with newly generated points -- confirm this is intended.
    cache_key = str(x) + found.tostring() + glob_name \
        + 'channel'.join([str(i) for i in glob_channels]) \
        + 'layer'.join(layer_names)
    fname = hashlib.sha1(cache_key).hexdigest() + '.csv'

    pseudoabsences = get_pseudoabsences(eo, buffer_width, n_pseudoabsences, layer_names, glob_name)

    x_found = x[np.where(found)]

    x = np.vstack((x_found, pseudoabsences))
    found = np.concatenate((np.ones(len(x_found)), np.zeros(n_pseudoabsences)))

    if fname not in os.listdir('anopheles-caches'):

        def _extract_channel(ch):
            # A real function scope keeps ch bound per channel for the
            # postproc closure, exactly as the original map/lambda did.
            column = os.path.basename(glob_name)+'_'+str(ch)
            values = extract_environment(glob_name, x,
                        postproc=lambda d: d==ch, id_=ch, lock=dblock)[1]
            return (column, values)

        # List of (column name, values) pairs: plain layers first, then one
        # entry per channel of glob_name.
        env_layers = [extract_environment(ln, x, lock=dblock) for ln in layer_names]
        env_layers += [_extract_channel(ch) for ch in glob_channels]

        arrays = [(found>0).astype('int')] + [l[1] for l in env_layers]
        names = ['found'] + [l[0] for l in env_layers]

        data = np.rec.fromarrays(arrays, names=','.join(names))

        # One flag per row: True if any field of that row is NaN. The explicit
        # dtype keeps the mask boolean even when there are no rows.
        nancheck = np.array([np.any(np.isnan(row.tolist())) for row in data], dtype=bool)
        if np.any(nancheck):
            print('There were some NaNs in the data, probably points in the sea')

        # Count layers whose non-NaN values are all identical. `~` is used to
        # invert the masks because boolean subtraction (True - mask) raises
        # TypeError on NumPy >= 1.13.
        singletons = 0
        for e in env_layers:
            if len(set(e[1][~np.isnan(e[1])]))==1:
                singletons += 1
        if singletons == len(env_layers):
            raise ValueError('All environmental layer evaluations contained only single values.')

        # Only NaN-free rows are written to the cache file.
        data = data[~nancheck]
        rec2csv(data, os.path.join('anopheles-caches',fname))

    return fname, pseudoabsences, x
Пример #2
0
def validate(M, session, x, a, burn=0, trace_thin=1):
    """
    Evaluates every function in simple_assessments, plus roc, on the
    posterior predictive samples of 'p' at the holdout locations x, where a
    is the true classification.

    Every trace_thin-th sample of M.trace('p'), starting at index burn, is
    called on x augmented with one extracted column per entry of
    M.env_variables. session is not used in this function body.

    Returns a ONE-ELEMENT TUPLE whose single item is a dict. The dict maps
    each assessment's __name__ to an array with one entry per retained
    sample, and 'roc' to the result of roc over all retained predictions.
    """

    n_samples = len(M.db._h5file.root.chain0.PyMCsamples)

    # Read but not used below; the attribute access is kept as in the original.
    species = M.species
    metric_names = [metric.__name__ for metric in simple_assessments]
    per_metric = dict((name, []) for name in metric_names)

    # x is scaled by 180/pi before extraction -- presumably x is in radians
    # and extract_environment expects degrees; confirm against callers.
    # NOTE(review): the np.array(...).T below assumes extract_environment
    # returns a plain array of values for each variable -- confirm.
    extracted = [extract_environment(var, x * 180./np.pi) for var in M.env_variables]
    env_x = np.array(extracted).T
    full_x = np.hstack((x, env_x))

    p_trace = M.trace('p')[:]
    predictions = []
    for step in xrange(burn, n_samples, trace_thin):
        # Each stored sample of 'p' is a callable evaluated at the holdout
        # covariates.
        prediction = p_trace[step](full_x)
        predictions.append(prediction)
        for metric in simple_assessments:
            per_metric[metric.__name__].append(metric(prediction, a))

    summary = dict((name, np.asarray(per_metric[name])) for name in metric_names)
    summary['roc'] = roc(np.asarray(predictions), a)

    # The 1-tuple wrapper is part of the existing interface.
    return (summary,)