def get_rad_from_obs(ab, dist):
    """Return the predicted rank-abundance distribution fitted to ``ab``.

    Parameters
    ----------
    ab : sequence
        Observed abundances; ``len(ab)`` is passed on as the number of ranks.
    dist : str
        Which distribution to fit: ``'negbin'`` or ``'pln'``.

    Returns
    -------
    Whatever ``get_rad_negbin`` / ``get_rad_pln`` returns for the fitted
    parameters.

    Raises
    ------
    ValueError
        If ``dist`` is not one of the supported names.  (Previously this fell
        through to ``return pred_rad`` with the name unbound.)
    """
    if dist == 'negbin':
        n, p = negbin_solver(ab)
        return get_rad_negbin(len(ab), n, p)
    if dist == 'pln':
        mu, sigma = pln_solver(ab)
        return get_rad_pln(len(ab), mu, sigma)
    raise ValueError("distribution not recognized: %r" % (dist,))
def get_rad_from_obs(self):
    """Fit ``self.dist`` to ``self.obs`` and return the predicted RAD.

    Reads ``self.dist`` (``'negbin'`` or ``'pln'``) and ``self.obs`` (the
    observed abundances).

    Returns
    -------
    tuple
        ``(pred_rad, par1, par2)`` where the two trailing entries are the
        parameters returned by the solver that was used: ``(n, p)`` for
        ``'negbin'`` and ``(mu, sigma)`` for ``'pln'``.

    Raises
    ------
    ValueError
        If ``self.dist`` is not one of the supported names.
    """
    if self.dist == 'negbin':
        n, p = nbinom_lower_trunc_solver(self.obs)
        pred_rad = self.get_rad_negbin(len(self.obs), n, p)
        # The original returned (pred_rad, mu, sigma) here, but mu/sigma are
        # only assigned in the 'pln' branch, so this path always failed with a
        # NameError.  Return the parameters that were actually fitted.
        return (pred_rad, n, p)
    if self.dist == 'pln':
        mu, sigma = pln_solver(self.obs)
        pred_rad = self.get_rad_pln(len(self.obs), mu, sigma)
        return (pred_rad, mu, sigma)
    raise ValueError("distribution not recognized: %r" % (self.dist,))
def get_rad_from_obs(self):
    """Return the predicted RAD for ``self.obs`` under ``self.dist``.

    Reads ``self.dist`` (``'negbin'`` or ``'pln'``) and ``self.obs`` (the
    observed abundances); ``len(self.obs)`` is passed on as the number of
    ranks.

    Returns
    -------
    Whatever ``self.get_rad_negbin`` / ``self.get_rad_pln`` returns for the
    fitted parameters.

    Raises
    ------
    ValueError
        If ``self.dist`` is not one of the supported names.  (Previously this
        fell through to ``return pred_rad`` with the name unbound.)
    """
    if self.dist == 'negbin':
        n, p = negbin_solver(self.obs)
        return self.get_rad_negbin(len(self.obs), n, p)
    if self.dist == 'pln':
        mu, sigma = pln_solver(self.obs)
        return self.get_rad_pln(len(self.obs), mu, sigma)
    raise ValueError("distribution not recognized: %r" % (self.dist,))
def run_test(raw_data, dataset_name, data_dir='./data/', cutoff = 9):
    """Use data to compare the predicted and empirical SADs and get results in csv files

    Keyword arguments:
    raw_data : numpy structured array with 4 columns: 'site','year','sp','ab'
    dataset_name : short code that will indicate the name of the dataset in
                    the output file names
    data_dir : directory in which to store output; it is concatenated
               directly with the file name, so it must end with a path
               separator
    cutoff : minimum number of species required to run - 1.

    Two files are written: ``<dataset_name>_obs_pred.csv`` (one row per
    species: site, observed abundance, predicted abundance) and
    ``<dataset_name>_dist_test.csv`` (one row per site: site, S, N, p, weight,
    p_untruncated, weight_untruncated).  Both files are created even when no
    site passes the cutoff.  Returns None.

    """
    usites = np.sort(list(set(raw_data["site"])))
    obs_pred_path = data_dir + dataset_name + '_obs_pred.csv'
    dist_test_path = data_dir + dataset_name + '_dist_test.csv'

    # 'wb' is the Python 2 convention for files handed to csv.writer.
    # The context manager guarantees both handles are flushed and closed,
    # including when a solver raises part-way through the site loop.
    with open(obs_pred_path, 'wb') as obs_pred_file, \
            open(dist_test_path, 'wb') as dist_test_file:
        f1 = csv.writer(obs_pred_file)
        f2 = csv.writer(dist_test_file)

        for i, site in enumerate(usites):
            site_mask = raw_data["site"] == site
            subsites = raw_data["site"][site_mask]
            subab = raw_data["ab"][site_mask]
            N = sum(subab)
            S = len(subsites)
            if S <= cutoff:
                continue

            # The printed "Site" value is the index into the sorted unique
            # site list, not the site label itself.
            print("%s, Site %s, S=%s, N=%s" % (dataset_name, i, S, N))

            # Generate predicted values and p (e ** -beta) based on METE:
            mete_pred = mete.get_mete_rad(int(S), int(N))
            pred = np.array(mete_pred[0])
            p = mete_pred[1]
            p_untruncated = exp(-mete.get_beta(S, N, version='untruncated'))
            obsab = np.sort(subab)[::-1]  # descending, to line up with pred

            # Calculate Akaike weight of log-series:
            L_logser = md.logser_ll(obsab, p)
            L_logser_untruncated = md.logser_ll(obsab, p_untruncated)
            mu, sigma = md.pln_solver(obsab)
            # NOTE(review): the argument order here is (mu, sigma, obs) while
            # other call sites in this file pass the abundances first --
            # confirm against the installed macroecodistributions version.
            L_pln = md.pln_ll(mu,sigma,obsab)
            k1 = 1  # number of fitted parameters, log-series
            k2 = 2  # number of fitted parameters, Poisson lognormal
            AICc_logser = macroecotools.AICc(k1, L_logser, S)
            AICc_logser_untruncated = macroecotools.AICc(k1, L_logser_untruncated, S)
            AICc_pln = macroecotools.AICc(k2, L_pln, S)
            weight = macroecotools.aic_weight(AICc_logser, AICc_pln, S, cutoff = 4)
            weight_untruncated = macroecotools.aic_weight(AICc_logser_untruncated,
                                                          AICc_pln, S, cutoff = 4)

            #save results to a csv file:
            results = np.column_stack((subsites, obsab, pred))
            results2 = np.column_stack((np.array(site, dtype='S20'),
                                        S, N, p, weight,
                                        p_untruncated, weight_untruncated))
            f1.writerows(results)
            f2.writerows(results2)
def get_par_multi_dists(ab, dist_name):
    """Returns the parameters given the observed abundances and the designated distribution.

    Parameters
    ----------
    ab : sequence of numbers
        Observed abundances.
    dist_name : str
        One of 'logser', 'pln', 'geom', 'negbin' or 'zipf'.

    Returns
    -------
    The parameter collection for the requested distribution (a 1-tuple for
    'logser', 'geom' and 'zipf'; whatever the solver returns for 'pln' and
    'negbin'), or None when the negative-binomial solver's first parameter is
    NaN or when ``dist_name`` is not recognized (an error message is printed
    in the latter case).
    """
    if dist_name == 'logser':
        beta = mete.get_beta(len(ab), sum(ab), version='untruncated')
        par = (np.exp(-beta), )
    elif dist_name == 'pln':
        par = md.pln_solver(ab)
    elif dist_name == 'geom':
        # Force true division: with integer inputs, Python 2's `/` would
        # truncate S / N to 0 whenever N > S.
        par = (float(len(ab)) / sum(ab), )
    elif dist_name == 'negbin':
        par = md.negbin_solver(ab)
        if np.isnan(par[0]):
            par = None
    elif dist_name == 'zipf':
        par = (md.zipf_solver(ab), )
    else:
        # Call form prints the same text on Python 2 and is valid on Python 3.
        print("Error: distribution not recognized.")
        par = None
    return par
def get_par_multi_dists(ab, dist_name):
    """Returns the parameters given the observed abundances and the designated distribution.

    Parameters
    ----------
    ab : sequence of numbers
        Observed abundances.
    dist_name : str
        One of 'logser', 'pln', 'geom', 'negbin' or 'zipf'.

    Returns
    -------
    The parameter collection for the requested distribution (a 1-tuple for
    'logser', 'geom' and 'zipf'; whatever the solver returns for 'pln' and
    'negbin'), or None when the negative-binomial solver's first parameter is
    NaN or when ``dist_name`` is not recognized (an error message is printed
    in the latter case).
    """
    if dist_name == 'logser':
        beta = mete.get_beta(len(ab), sum(ab), version = 'untruncated')
        par = (np.exp(-beta), )
    elif dist_name == 'pln':
        par = md.pln_solver(ab)
    elif dist_name == 'geom':
        # Force true division: with integer inputs, Python 2's `/` would
        # truncate S / N to 0 whenever N > S.
        par = (float(len(ab)) / sum(ab), )
    elif dist_name == 'negbin':
        par = md.negbin_solver(ab)
        if np.isnan(par[0]):
            par = None
    elif dist_name == 'zipf':
        par = (md.zipf_solver(ab), )
    else:
        # Call form prints the same text on Python 2 and is valid on Python 3.
        print("Error: distribution not recognized.")
        par = None
    return par
def get_pln_from_obs(ab, dist):
    """Solve for the two parameters with ``pln_solver`` and feed them to ``get_pln``.

    ``ab`` is the observed abundance sequence; its length is forwarded as the
    first argument of ``get_pln``.  ``dist`` is accepted for signature
    compatibility but is not used.
    """
    fitted_mu, fitted_sigma = pln_solver(ab)
    return get_pln(len(ab), fitted_mu, fitted_sigma)
def get_rad_from_obs(self, ab):
    """Solve for the two parameters with ``pln_solver`` and return ``self.get_rad_pln``'s result.

    ``ab`` is the observed abundance sequence; its length is forwarded as the
    first argument of ``self.get_rad_pln``.
    """
    fitted_mu, fitted_sigma = pln_solver(ab)
    return self.get_rad_pln(len(ab), fitted_mu, fitted_sigma)
# Build a 1x3 figure: one panel per abundance vector in ab_array[0..2].
# For each panel, four distributions are fitted to the observed abundances
# and their pmf / log-likelihood values are computed.
fig_example = plt.figure(figsize = (12, 4))
for i in range(3):
    ax = plt.subplot(1, 3, i + 1)
    ab = ab_array[i]
    # Abundance classes 1 .. max(ab) + 1 (index 0 of the range is dropped).
    x_values = np.array(range(max(ab) + 2)[1:])

    # Log-series fit.
    logser_p = md.logser_solver(ab)
    logser_values = md.trunc_logser.pmf(x_values, logser_p, upper_bound=float("inf"))
    lsll = md.logser_ll(ab, logser_p)

    # Lower-truncated negative binomial fit.
    nb_n, nb_p = md.nbinom_lower_trunc_solver(ab)
    nb_values = md.nbinom_lower_trunc.pmf(x_values, nb_n, nb_p)
    nbll = md.nbinom_lower_trunc_ll(ab, nb_n, nb_p)

    # Poisson lognormal fit.
    pln_mu, pln_sigma = md.pln_solver(ab)
    pln_values = md.pln.pmf(x_values, pln_mu, pln_sigma, lower_trunc=True)
    plnll = md.pln_ll(ab, pln_mu, pln_sigma)

    # Zipf fit.
    zipf_par = md.zipf_solver(ab)
    zipf_values = zipf.pmf(x_values, zipf_par)
    zll = md.zipf_ll(ab, zipf_par)

    # Empirical relative frequency per abundance value: ab_y[k] accumulates
    # 1/len(ab) for every observation equal to k.
    # NOTE(review): under Python 2 without `from __future__ import division`
    # the expression 1/len(ab) is integer division -- confirm the file header.
    ab_y = np.zeros(len(x_values) + 1)
    for j in range(len(ab)):
        ab_y[ab[j]] = ab_y[ab[j]] + 1/len(ab)

    # Show at most abundance classes 0..50.
    # NOTE(review): the fitted *_values, the log-likelihoods and ab_y are not
    # used within this span; presumably they are plotted elsewhere -- verify.
    ax.set_xlim([0,min(50, max(x_values))])
    plt.ylabel('frequency')
    plt.xlabel('abundance')
# Draw panel i + 1 of a 1x3 grid for the abundance vector ab_array[i].
# NOTE(review): `i` and `ab_array` are not defined in this span; presumably
# they come from an enclosing loop -- verify against the surrounding code.
ax = plt.subplot(1, 3, i + 1)
ab = ab_array[i]
# Abundance classes 1 .. max(ab) + 1 (index 0 of the range is dropped).
x_values = np.array(range(max(ab) + 2)[1:])

# Log-series fit.
logser_p = md.logser_solver(ab)
logser_values = md.trunc_logser.pmf(x_values, logser_p, upper_bound=float("inf"))
lsll = md.logser_ll(ab, logser_p)

# Lower-truncated negative binomial fit.
nb_n, nb_p = md.nbinom_lower_trunc_solver(ab)
nb_values = md.nbinom_lower_trunc.pmf(x_values, nb_n, nb_p)
nbll = md.nbinom_lower_trunc_ll(ab, nb_n, nb_p)

# Poisson lognormal fit.
pln_mu, pln_sigma = md.pln_solver(ab)
pln_values = md.pln.pmf(x_values, pln_mu, pln_sigma, lower_trunc=True)
plnll = md.pln_ll(ab, pln_mu, pln_sigma)

# Zipf fit.
zipf_par = md.zipf_solver(ab)
zipf_values = zipf.pmf(x_values, zipf_par)
zll = md.zipf_ll(ab, zipf_par)

# Empirical relative frequency per abundance value: ab_y[k] accumulates
# 1 / len(ab) for every observation equal to k.
# NOTE(review): under Python 2 without `from __future__ import division`
# the expression 1 / len(ab) is integer division -- confirm the file header.
ab_y = np.zeros(len(x_values) + 1)
for j in range(len(ab)):
    ab_y[ab[j]] = ab_y[ab[j]] + 1 / len(ab)

# Show at most abundance classes 0..50.
ax.set_xlim([0, min(50, max(x_values))])
plt.ylabel('frequency')
plt.xlabel('abundance')
def get_rad_from_obs(ab, dist):
    """Solve for the two parameters with ``pln_solver`` and return ``get_rad_pln``'s result.

    ``ab`` is the observed abundance sequence; its length is forwarded as the
    first argument of ``get_rad_pln``.  ``dist`` is accepted for signature
    compatibility but is not used.
    """
    fitted_mu, fitted_sigma = pln_solver(ab)
    return get_rad_pln(len(ab), fitted_mu, fitted_sigma)
def model_comparisons(raw_data, dataset_name, data_dir, cutoff=9):
    """ Uses raw species abundance data to compare predicted vs. empirical
    species abundance distributions (SAD) and output results in csv files.

    Keyword arguments:
    raw_data: numpy structured array with 4 columns: 'site', 'year',
              'sp' (species), 'ab' (abundance).
    dataset_name: short code to indicate the name of the dataset in the
                  output file names.
    data_dir: directory in which to store results output.  The three
              per-model files are named by plain string concatenation
              (data_dir + dataset_name + suffix), so data_dir must end with a
              path separator for them to land next to the combined
              '_likelihood_results.csv' file, which uses os.path.join.
    cutoff: minimum number of species required to run -1.

    Sites are skipped when any abundance is not positive or when the number
    of species does not exceed ``cutoff``.

    Output files:
      <dataset_name>_dist_test.csv           AICc weights per site
      <dataset_name>_likelihoods.csv         log-likelihoods per site
      <dataset_name>_relative_L.csv          relative likelihoods per site
      <dataset_name>_likelihood_results.csv  all of the above, one row per site

    SAD models and packages used:
    Logseries (macroecotools/macroecodistributions)
    Poisson lognormal (macroecotools/macroecodistributions)
    Negative binomial (macroecotools/macroecodistributions)
    Zipf (macroecotools/macroecodistributions)

    Neutral theory: Neutral theory predicts the negative binomial
    distribution (Connolly et al. 2014. Commonness and rarity in the marine
    biosphere. PNAS 111: 8524-8529.
    http://www.pnas.org/content/111/23/8524.abstract)

    Returns None.
    """
    # np.unique already returns its values sorted.
    usites = np.unique(raw_data["site"])

    # NOTE(review): the 'AICc_*' columns hold the values returned by
    # macroecotools.aic_weight, not raw AICc values.  The header text is kept
    # unchanged because downstream readers may depend on it.
    aicc_columns = ['AICc_logseries', 'AICc_pln', 'AICc_negbin', 'AICc_zipf']
    likelihood_columns = ['likelihood_logseries', 'likelihood_pln',
                          'likelihood_negbin', 'likelihood_zipf']
    relative_columns = ['relative_ll_logseries', 'relative_ll_pln',
                        'relative_ll_negbin', 'relative_ll_zipf']
    id_columns = ['site', 'S', 'N']

    # Number of fitted parameters for one- and two-parameter models.
    k1 = 1
    k2 = 2

    combined_rows = []

    # 'wb' is the Python 2 convention for files handed to csv.writer.
    # The context manager closes all three handles even if a solver raises
    # part-way through the site loop (the original only closed them on the
    # success path).
    with open(data_dir + dataset_name + '_dist_test.csv', 'wb') as f1, \
            open(data_dir + dataset_name + '_likelihoods.csv', 'wb') as f2, \
            open(data_dir + dataset_name + '_relative_L.csv', 'wb') as f3:
        output1 = csv.writer(f1)
        output2 = csv.writer(f2)
        output3 = csv.writer(f3)

        # Insert header
        output1.writerow(id_columns + aicc_columns)
        output2.writerow(id_columns + likelihood_columns)
        output3.writerow(id_columns + relative_columns)

        for site in usites:
            site_mask = raw_data["site"] == site
            subsites = raw_data["site"][site_mask]
            subabundance = raw_data["ab"][site_mask]
            N = sum(subabundance)  # N = total abundance for a site
            S = len(subsites)  # S = species richness at a site
            if not ((min(subabundance) > 0) and (S > cutoff)):
                continue

            print("%s, Site %s, S=%s, N=%s" % (dataset_name, site, S, N))

            # Logseries (untruncated), one fitted parameter.
            p_untruncated = md.logser_solver(subabundance)
            L_logser_untruncated = md.logser_ll(subabundance, p_untruncated)
            AICc_logser_untruncated = macroecotools.AICc(
                k1, L_logser_untruncated, S)
            relative_ll_logser_untruncated = AICc_logser_untruncated

            # Poisson lognormal, two fitted parameters.  The "relative
            # likelihood" entry is the AICc computed with k1 instead of k2,
            # i.e. without the extra-parameter penalty.
            mu, sigma = md.pln_solver(subabundance)
            L_pln = md.pln_ll(subabundance, mu, sigma)
            AICc_pln = macroecotools.AICc(k2, L_pln, S)
            relative_ll_pln = macroecotools.AICc(k1, L_pln, S)

            # Negative binomial (lower-truncated), two fitted parameters.
            n0, p0 = md.nbinom_lower_trunc_solver(subabundance)
            L_negbin = md.nbinom_lower_trunc_ll(subabundance, n0, p0)
            AICc_negbin = macroecotools.AICc(k2, L_negbin, S)
            relative_ll_negbin = macroecotools.AICc(k1, L_negbin, S)

            # Zipf distribution, one fitted parameter.
            par = md.zipf_solver(subabundance)
            L_zipf = md.zipf_ll(subabundance, par)
            AICc_zipf = macroecotools.AICc(k1, L_zipf, S)
            relative_ll_zipf = AICc_zipf

            # Model order everywhere: logseries, pln, negbin, zipf.
            AICc_list = [AICc_logser_untruncated, AICc_pln,
                         AICc_negbin, AICc_zipf]
            likelihood_list = [L_logser_untruncated, L_pln,
                               L_negbin, L_zipf]
            relative_likelihood_list = [relative_ll_logser_untruncated,
                                        relative_ll_pln,
                                        relative_ll_negbin,
                                        relative_ll_zipf]

            # Calculate AICc weight
            weight = macroecotools.aic_weight(AICc_list, S, cutoff=4)
            # Calculate relative likelihood
            relative_likelihoods = macroecotools.aic_weight(
                relative_likelihood_list, S, cutoff=4)

            weights_output = weight.tolist()
            relative_likelihoods_output = relative_likelihoods.tolist()

            # One row per site in each file.  (The original wrapped the first
            # assignment in a loop over the weights that re-assigned the same
            # value each iteration; it is not needed.)
            site_id = [site, S, N]
            output1.writerow(site_id + weights_output)
            output2.writerow(site_id + likelihood_list)
            output3.writerow(site_id + relative_likelihoods_output)
            combined_rows.append(site_id + weights_output + likelihood_list
                                 + relative_likelihoods_output)

    results = DataFrame(
        combined_rows,
        columns=(id_columns + aicc_columns + likelihood_columns
                 + relative_columns))
    results.to_csv(os.path.join(data_dir,
                                dataset_name + '_likelihood_results.csv'),
                   index=False)