def _fit_alternative_numpy(self, pa, a):
    from scipy.linalg import solve, LinAlgError
    from scipy.stats.distributions import chi2

    gamma = self.gamma
    dpa = self._d_alt * pa

    # single thread => no need to copy
    ydy = self._ydy_alt
    xdy = self._xdy_alt
    xdx = self._xdx_alt

    if self.low_rank:
        xdy[0] = self.py @ dpa + gamma * (self.y @ a)
        xdx[0, 0] = pa @ dpa + gamma * (a @ a)
        xdx[0, 1:] = self.px.T @ dpa + gamma * (self.x.T @ a)
    else:
        xdy[0] = self.py @ dpa
        xdx[0, 0] = pa @ dpa
        xdx[0, 1:] = self.px.T @ dpa

    try:
        beta = solve(xdx, xdy, assume_a='pos')  # only uses upper triangle
        residual_sq = ydy - xdy.T @ beta
        sigma_sq = residual_sq / self._dof_alt
        chi_sq = self.n * np.log(self._residual_sq / residual_sq)  # division => precision
        p_value = chi2.sf(chi_sq, 1)
        return beta[0], sigma_sq, chi_sq, p_value
    except LinAlgError:
        return tuple(4 * [float('nan')])
def LikelihoodRatioTest(LRT_table_0_het, LRT_table_s_het, LRT_table_0_common, LRT_table_s_common,
                        LRT_table_0_bins, LRT_table_s_bins, LRT_num_sims, obs_het, obs_common,
                        obs_bins, constant_het, denom_het, constant_common, denom_common, eps_bins,
                        use_het, use_common, use_bins):
    # Get likelihood s = 0
    likelihood_0 = GetLikelihoodFromTable(LRT_table_0_het, LRT_table_0_common, LRT_table_0_bins,
                                          LRT_num_sims, obs_het, obs_common, obs_bins, constant_het,
                                          denom_het, constant_common, denom_common, eps_bins, use_het,
                                          use_common, use_bins)

    # Get likelihood s = ABC_s
    likelihood_s_ABC = GetLikelihoodFromTable(LRT_table_s_het, LRT_table_s_common, LRT_table_s_bins,
                                              LRT_num_sims, obs_het, obs_common, obs_bins, constant_het,
                                              denom_het, constant_common, denom_common, eps_bins, use_het,
                                              use_common, use_bins)

    # Calculate likelihood ratio
    LR = likelihood_0 / likelihood_s_ABC

    # Calculate LogLR
    LogLR = -2 * np.log(LR)

    # LogLR ~ Mixture distribution (50% 0, 50% Chi-square (df=1))
    pval = 0.5 * SF(LogLR) + 0.5 * chi2.sf(LogLR, 1)

    return likelihood_0, likelihood_s_ABC, LR, LogLR, pval
def find_global_keywords(data):
    key_words = set()
    with open("data/aggregate_frequency.json", "r", encoding="utf8") as file:
        agg_dict = json.load(file)
    keys = list(agg_dict.keys())
    global_freqs = list()
    step = 900
    for chunk in range(step, len(keys), step):
        search_str = ",".join(keys[chunk - step: chunk])
        retrieved = NGramRequest(search_str, start_year=2018).getJSON()
        global_freqs.append(retrieved)
    with open("data/global_freqs.json", "w", encoding="utf8") as g_freqs_file:
        g_freqs_file.write(json.dumps(global_freqs, indent=4))
    for ngram in global_freqs:
        g_word = ngram['ngram']
        g_freq = ngram['timeseries'][-1]
        if g_freq != 0:
            g_ll = math.log(g_freq)
            local_ll = math.log(agg_dict[g_word])
            lr = likelihood_ratio(local_ll, g_ll)
            p = chi2.sf(lr, 1)
            if p < 0.001:
                print(g_word)
                key_words.add(g_word)
    print("\n".join(key_words))
def likelihood_ratio(mod, mod_r):
    """
    Deviance difference between two logistic models (likelihood ratio test)

    Parameters
    ----------
    mod : statsmodels results object from GLM
        First model to compare
    mod_r : statsmodels results object from GLM
        Second model to compare against

    Returns
    -------
    float : p-value of the likelihood ratio

    Comments
    --------
    Source : http://rnowling.github.io/machine/learning/2017/10/07/likelihood-ratio-test.html
    Checked against R (lmtest)
    """
    val = [mod.llf, mod_r.llf]
    LR = 2 * (max(val) - min(val))  # deviance difference
    val = [mod.df_model, mod_r.df_model]
    diff_df = max(val) - min(val)  # difference in degrees of freedom
    p = chi2.sf(LR, diff_df)  # significance of the test
    return p
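# Hedged usage sketch for likelihood_ratio() above: compares a logistic GLM with and
# without an extra predictor. The data and variable names here are illustrative
# assumptions, not part of the original source.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
x1 = rng.normal(size=200)
x2 = rng.normal(size=200)
y = (x1 + 0.5 * rng.normal(size=200) > 0).astype(int)

mod_full = sm.GLM(y, sm.add_constant(np.column_stack([x1, x2])),
                  family=sm.families.Binomial()).fit()
mod_reduced = sm.GLM(y, sm.add_constant(x1), family=sm.families.Binomial()).fit()

print(likelihood_ratio(mod_full, mod_reduced))  # p-value of the deviance difference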
def test_chisq(self, loci, genos1, genos2):
    """compare counts of observed to expected haplotypes at a locus"""
    if loci[0] == loci[1]:
        return (loci[0], loci[1], 0.0, 1.0)

    # count alleles and haplotypes
    ac1 = {0: 0, 1: 0}
    ac2 = {0: 0, 1: 0}
    obs = {'00': 0, '01': 0, '10': 0, '11': 0}
    n_loci = len(genos1)
    for i in range(n_loci):
        allele1 = genos1[i]
        allele2 = genos2[i]
        haplo = '{0}{1}'.format(allele1, allele2)
        obs[haplo] += 1
        ac1[allele1] += 1
        ac2[allele2] += 1

    # observed haplotype counts
    observed = np.array(list(obs.values()))

    # expected haplotype counts
    allele_counts = [ac1[0], ac1[1], ac2[0], ac2[1]]
    af = np.array(allele_counts) / float(n_loci)
    expected = np.array([
        af[0] * af[2], af[0] * af[3],
        af[1] * af[2], af[1] * af[3]
    ]) * n_loci

    # perform chisquare test
    hap_chisq = ((observed - expected)**2) / expected
    chisq_tot = hap_chisq.sum()
    p = chi2.sf(chisq_tot, 1)
    return (loci[0], loci[1], chisq_tot, p)
def chi2_calc(f, par, X, Y, dY, dX, cov):
    """
    Chi-square of a fit, its p-value, the parameter uncertainties and the
    normalized covariance matrix.

    Parameters
    ----------
    f : callable, model function f(X, *par)
    par : fitted parameters
    X, Y : data points
    dY, dX : uncertainties on Y and X
    cov : covariance matrix of the fitted parameters

    Returns
    -------
    chi, sigma, normcov, p
    """
    from numpy import sqrt, diag, zeros

    # numerical derivative of f, used to propagate the X uncertainty onto Y
    df = (f(X + dX / 1e6, *par) - f(X, *par)) / (dX / 1e6)
    chi = sum((Y - f(X, *par))**2 / (dY**2 + (df * dX)**2))
    p = chi2.sf(chi, len(X) - len(par))
    sigma = sqrt(diag(cov))
    normcov = zeros((len(par), len(par)))
    for i in range(len(par)):
        for j in range(len(par)):
            normcov[i, j] = cov[i, j] / (sigma[i] * sigma[j])
    return chi, sigma, normcov, p
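# Hedged usage sketch for chi2_calc() above, with scipy.optimize.curve_fit and an
# illustrative linear model; the data, the model f and the uncertainties are
# assumptions made only for this example.
import numpy as np
from scipy.optimize import curve_fit


def f(x, a, b):
    return a * x + b


rng = np.random.default_rng(6)
X = np.linspace(0.0, 10.0, 30)
dX = np.full_like(X, 0.05)
dY = np.full_like(X, 0.2)
Y = f(X, 2.0, 1.0) + rng.normal(0.0, 0.2, size=X.size)

par, cov = curve_fit(f, X, Y, sigma=dY)
chi, sigma, normcov, p = chi2_calc(f, par, X, Y, dY, dX, cov)
print(chi, p)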
def plotcurve(self):
    plotx = np.linspace(self.x[0], self.x[-1], 500)
    plotmodel = batman.TransitModel(self.params, plotx)
    model = batman.TransitModel(self.params, self.x)
    alpha = ''
    dof = 3 + len(self.params.u)
    if self.params.ecc == self.initialecc:
        alpha = '_fixedecc'
        dof += 2
    nu = len(self.y.tolist()) - dof
    print(nu)
    sigma2 = self.err**2.
    sum = (self.y - model.light_curve(self.params))**2.
    sum /= sigma2
    chisq = np.sum(sum)
    chisq_prob = chi2.sf(chisq, nu)
    print(chisq)
    print(chisq_prob)
    print(chisq / nu)
    fig = plt.figure()
    ax1 = fig.add_axes((.1, .3, .8, .6))
    ax2 = fig.add_axes((.1, .1, .8, .2))
    ax2.set_xlabel(r'$\textrm{Days from}~t_0$')
    ax1.set_ylabel(r'$\textrm{Relative flux}$')
    ax2.set_ylabel(r"$\textrm{Residuals}$")
    ax1.get_xaxis().set_ticks([])
    ax1.plot(plotx, plotmodel.light_curve(self.params), color='black', zorder=10)
    ax2.plot(plotx, np.zeros_like(plotmodel.light_curve(self.params)), ':', color='black', zorder=10)
    ax1.errorbar(self.x, self.y, yerr=self.err, fmt='o', mfc='darkgray', mec='darkgray',
                 ecolor='darkgray', markersize=5, zorder=0)
    ax2.errorbar(self.x, -(model.light_curve(self.params) - self.y), yerr=self.err, fmt='o',
                 mfc='darkgray', mec='darkgray', ecolor='darkgray', markersize=5, zorder=0)
    plt.savefig('finalplots/lightcurve_' + self.params.limb_dark + alpha + '.eps')
    plt.savefig('finalplots/lightcurve_' + self.params.limb_dark + alpha + '.png')
    plt.show()
def score_test(Xtest, y_true, y_predict):
    """Score test for a candidate variable entering during forward stepwise selection.
    The function assumes the newly entered variable is the LAST column of Xtest.
    Xtest contains vars_old (the variables of the fitted model that produced y_predict)
    plus var_new (the single new variable to be tested).
    The score test assumes the coefficient of the new variable is 0, so although Xtest
    includes its data, the fitted parameters were estimated without it."""
    u = np.dot(Xtest.T, y_true - y_predict)  # first derivative (score vector)
    h = np.dot(Xtest.T * (y_predict * (1 - y_predict)).values.reshape(len(y_predict)), Xtest)  # second derivative (information matrix)
    score = np.dot(np.dot(u.T, np.linalg.inv(h)), u)  # score is a 1x1 array
    p_value = chi2.sf(score, 1)  # the score statistic follows a chi-square distribution with 1 df
    return score, p_value
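# Hedged usage sketch for score_test() above with synthetic data; the variable names
# and the statsmodels Logit fit are assumptions for illustration only. The reduced
# model is fitted without x2, and x2 is appended as the LAST column of Xtest.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(1)
X_old = pd.DataFrame({'const': 1.0, 'x1': rng.normal(size=300)})
x_new = pd.Series(rng.normal(size=300), name='x2')
y_true = ((X_old['x1'] + rng.normal(size=300)) > 0).astype(int)

fit_old = sm.Logit(y_true, X_old).fit(disp=0)
y_predict = pd.Series(fit_old.predict(X_old))   # predictions from the model without x2
Xtest = pd.concat([X_old, x_new], axis=1)       # old variables plus the candidate variable last
print(score_test(Xtest, y_true, y_predict))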
def __init__(self, data, attr1, attr2):
    self.observed = get_contingency(data, attr1, attr2)
    self.n = np.sum(self.observed)
    self.probs_x = self.observed.sum(axis=0) / self.n
    self.probs_y = self.observed.sum(axis=1) / self.n
    self.expected = np.outer(self.probs_y, self.probs_x) * self.n
    self.residuals = (self.observed - self.expected) / np.sqrt(self.expected)
    self.chisqs = self.residuals ** 2
    self.chisq = float(np.sum(self.chisqs))
    self.p = chi2.sf(self.chisq,
                     (len(self.probs_x) - 1) * (len(self.probs_y) - 1))
def k2_lrt(dat, init_="k-means++", alpha=float(args.alpha)):
    # init_ is a vector of 0 and 1, describing the a priori groups for initialisation
    # Perform K=2 clustering:
    dat = np.array(dat)
    dat = dat.reshape(-1, 1)

    # Parse the initialisation parameters
    if type(init_) is list and len(init_) == len(dat):
        init_ = np.array(init_)
        mean_0 = np.mean(dat[np.where(init_ == 0)])
        mean_1 = np.mean(dat[np.where(init_ == 1)])
        init_ = np.array([[mean_0], [mean_1]])
        kmeans = KMeans(init=init_, n_clusters=2, n_init=1, max_iter=300, random_state=42)
    else:
        kmeans = KMeans(init="k-means++", n_clusters=2, n_init=10, max_iter=300, random_state=42)
    kmeans.fit(dat)

    # One cluster model log-likelihood:
    mu_1 = np.mean(dat)
    sigma_1 = np.std(dat)
    lnL_1 = np.sum(np.log(norm.pdf(dat, mu_1, sigma_1)))

    # Two cluster model log-likelihood:
    # Cluster 0:
    dat_0 = dat[np.where(kmeans.labels_ == 0)]
    mu_2_0 = np.mean(dat_0)
    sigma_2_0 = np.std(dat_0)
    lnL_2_0 = np.sum(np.log(norm.pdf(dat_0, mu_2_0, sigma_2_0)))
    # Cluster 1:
    dat_1 = dat[np.where(kmeans.labels_ == 1)]
    mu_2_1 = np.mean(dat_1)
    sigma_2_1 = np.std(dat_1)
    lnL_2_1 = np.sum(np.log(norm.pdf(dat_1, mu_2_1, sigma_2_1)))

    # Likelihood-ratio test:
    lnL_2 = lnL_2_0 + lnL_2_1
    LRT = -2 * (lnL_1 - lnL_2)
    pval = chi2.sf(LRT, 2 + len(dat))  # Degrees of freedom: I consider the base model has 1 df (the mean)

    # If we declare the test significant:
    if pval <= alpha:
        isBimodal = 1
        # We polarise the clusters: 0 for low het, 1 for high het
        if mu_2_0 >= mu_2_1:
            assign = [1 if x == 0 else 0 for x in kmeans.labels_]
        else:
            assign = [x for x in kmeans.labels_]
    else:
        isBimodal = 0
        assign = [0] * dat.shape[0]

    return ([assign, pval, isBimodal])
def extract_traitrelax_parameters(input_path):
    with open(input_path, "r") as infile:
        content = infile.read()
    dictionary = dict()
    dataset_id_regex = re.compile("Parsing file .*?([^\/]*?).bpp for options", re.MULTILINE | re.DOTALL)
    dictionary["dataset_id"] = dataset_id_regex.search(content).group(1)
    print("input_path: ", input_path)
    print("dataset: ", dictionary["dataset_id"])
    regex_strings = {"null_logl": "Null model fitting.*?Overall Log likelihood\.*?\s*\:\s*(-\d*\.?\d*)",
                     "null_kappa": "Null model fitting.*?RELAX.kappa_1\.*?\s*\:\s*(\d*\.?\d*)",
                     "null_p": "Null model fitting.*?RELAX.p_1\.*?\s*\:\s*(\d*\.?\d*)",
                     "null_omega1": "Null model fitting.*?RELAX.omega1_1\.*?\s*\:\s*(\d*\.?\d*)",
                     "null_omega2": "Null model fitting.*?RELAX.omega2_1\.*?\s*\:\s*(\d*\.?\d*)",
                     "null_theta1": "Null model fitting.*?RELAX.theta1_1\.*?\s*\:\s*(\d*\.?\d*)",
                     "null_theta2": "Null model fitting.*?RELAX.theta2_1\.*?\s*\:\s*(\d*\.?\d*)",
                     "null_k": "Null model fitting.*?RELAX.k_2\.*?\s*\:\s*(\d*\.?\d*)",
                     "null_mu": "Null model fitting.*?TwoParameterBinary\.mu\.*?\s*\:\s*(\d*\.?\d*)",
                     "null_pi0": "Null model fitting.*?TwoParameterBinary\.pi0\.*?\s*\:\s*(\d*\.?\d*)",
                     "alternative_logl": "Alternative model fitting.*Overall Log likelihood\.*?\s*\:\s*(-\d*\.?\d*)",
                     "alternative_kappa": "Alternative model fitting.*RELAX.kappa_1\.*?\s*\:\s*(\d*\.?\d*)",
                     "alternative_p": "Alternative model fitting.*RELAX.p_1\.*?\s*\:\s*(\d*\.?\d*)",  # was anchored to "Null model fitting", which extracted the null-model value
                     "alternative_omega1": "Alternative model fitting.*RELAX.omega1_1\.*?\s*\:\s*(\d*\.?\d*)",
                     "alternative_omega2": "Alternative model fitting.*RELAX.omega2_1\.*?\s*\:\s*(\d*\.?\d*)",
                     "alternative_theta1": "Alternative model fitting.*RELAX.theta1_1\.*?\s*\:\s*(\d*\.?\d*)",
                     "alternative_theta2": "Alternative model fitting.*RELAX.theta2_1\.*?\s*\:\s*(\d*\.?\d*)",
                     "alternative_k": "Alternative model fitting.*RELAX.k_2\.*?\s*\:\s*(\d*\.?\d*)",
                     "alternative_mu": "Alternative model fitting.*TwoParameterBinary\.mu\.*?\s*\:\s*(\d*\.?\d*)",
                     "alternative_pi0": "Alternative model fitting.*TwoParameterBinary\.pi0\.*?\s*\:\s*(\d*\.?\d*)"}

    # extract the basic fields
    for field in regex_strings.keys():
        try:
            regex = re.compile(regex_strings[field], re.MULTILINE | re.DOTALL)
            dictionary[field] = regex.search(content).group(1)
        except:
            print("failed to extract ", field, " for dataset ", dictionary["dataset_id"])
            print("regex: ", regex_strings[field])
            print("function: extract_traitrelax_parameters")
            exit(1)

    # compute the induced parameters
    dictionary["null_omega0"] = str(float(dictionary["null_p"]) * float(dictionary["null_omega1"]))
    dictionary["null_p0"] = dictionary["null_theta1"]
    dictionary["null_p1"] = str(float(dictionary["null_theta2"]) * (1 - float(dictionary["null_theta1"])))
    dictionary["alternative_omega0"] = str(float(dictionary["alternative_p"]) * float(dictionary["alternative_omega1"]))
    dictionary["alternative_p0"] = dictionary["alternative_theta1"]
    dictionary["alternative_p1"] = str(float(dictionary["alternative_theta2"]) * (1 - float(dictionary["alternative_theta1"])))

    # LR and p-value
    dictionary["LRT_statistic"] = 2 * (float(dictionary["alternative_logl"]) - float(dictionary["null_logl"]))
    dictionary["pvalue"] = chi2.sf(dictionary["LRT_statistic"], 1)  # 1 degree of freedom: the difference in the number of parameters between the alternative and null models
    return dictionary
def kendallsW(datas):
    dim = datas.shape
    S = np.var(list(datas.apply(sum, axis=0))) * dim[1]
    d = (dim[0]**2) * (dim[1]**3 - dim[1])
    w = S / d
    xx = dim[1] * (dim[0] - 1) * w
    df = dim[0] - 1
    pv = chi2.sf(xx, df)
    return({'type': 'Kendall\'s W Test',
            'value': w,
            'p-value': pv})
def ej4():
    tiradas = 101
    N = [48, 35, 15, 3]
    P = [.67, .05, .11, .17]
    T = 0
    for i in range(4):
        print(T)
        T += (N[i] - tiradas * P[i])**2 / float(tiradas * P[i])
    return T, chi2.sf(T, 3)
def find_relative_keywords(a_dict, b_dict):
    keywords = set()
    for key in a_dict:
        if key in b_dict:
            a_freq = math.log(a_dict[key])
            b_freq = math.log(b_dict[key])
            lr = likelihood_ratio(a_freq, b_freq)
            p = chi2.sf(lr, 1)
            if p < 0.001:
                keywords.add(key.lower())
    return keywords
def ConductLikelyhoodRatioTest(self, resulting_LLH, hypothesis_value):
    lr = 2 * (hypothesis_value - resulting_LLH)
    p = chi2.sf(lr, 0)
    if p > 0.9772:
        print("Likelihood ratio test has passed")
    else:
        print("WARNING, likelihood ratio test has failed")
    return chi2
def compare_model(first, second):
    LR = likelihood_ratio(first, second)  # min, max

    # degrees of freedom: difference in the number of parameters
    df = abs(d1_num - d2_num)
    if df == 0:
        df = 1

    # H0: the reduced model fits adequately
    # assumes asymptotic normality
    p = chi2.sf(LR, df)
    return p
def km_plot_data(self, name, time, censor, values):
    values_df = pd.DataFrame(
        {
            'time': time,
            'censor': censor,
            'value': values
        }, dtype=float)
    mean_value = values_df.value.mean()
    values_df['high'] = values_df.value >= mean_value
    data = {
        'time': robjects.FloatVector(values_df['time']),
        'censor': robjects.IntVector(values_df['censor']),
        'high': robjects.IntVector(values_df['high'])
    }
    df = robjects.DataFrame(data)

    # p value
    km_diff = self.surv.survdiff(
        robjects.Formula('Surv(time, censor) ~ high'), data=df)
    chisq_ind = list(km_diff.names).index('chisq')
    pvalue = chi2.sf(km_diff[chisq_ind][0], 1)

    km = self.surv.survfit(robjects.Formula('Surv(time, censor) ~ high'),
                           data=df)
    summary = pandas2ri.ri2py(r.summary(km, extend=True))
    r.assign('km', km)
    r.assign('times', data['time'])
    r.assign('res', r('summary(km, times=times)'))
    cols = r('lapply(c(2:6, 8:11), function(x) res[x])')
    r.assign('cols', cols)
    km_results = r('do.call(data.frame, cols)')
    km_results = pd.DataFrame(km_results)
    low_km = km_results[km_results['strata'] == 'high=0']
    high_km = km_results[km_results['strata'] == 'high=1']
    high_time, high_percent = self.make_plottable_kms(
        high_km['time'], high_km['surv'])
    low_time, low_percent = self.make_plottable_kms(
        low_km['time'], low_km['surv'])
    high = [{
        'percent': i[0],
        'time': i[1]
    } for i in zip(high_percent, high_time)]
    low = [{
        'percent': i[0],
        'time': i[1]
    } for i in zip(low_percent, low_time)]
    return {'high': high, 'low': low, 'p': float('%.4g' % pvalue)}
def chi2_test(a, b, chi2_p_thresh, label):
    sum_ = a + b
    chi2_val = (((a - sum_ / 2.) ** 2) + ((b - sum_ / 2.) ** 2)) / sum_
    chi2_p = chi2.sf(chi2_val, 1)
    if chi2_p <= chi2_p_thresh:
        logger.warning("{} Forward/Reverse read count imbalance.".format(label))
        logger.warning("+/- = {} / {}, Chi-squared test p-val = {} <= {}".format(
            a, b, chi2_p, chi2_p_thresh
        ))
    else:
        logger.info("{} Forward/Reverse read count +/- = {} / {}".format(label, a, b))
        logger.info("Chi-squared test p-val = {} > {}".format(chi2_p, chi2_p_thresh))
def __init__(self, LL1, LL2, df, verbose=True):
    """
    :param LL1: log-likelihood with H0
    :param LL2: log-likelihood with H1/fitted parameters
    :param df: specify dfs, # of tested params
    :param verbose: display results in table
    """
    self.LR = LR = 2 * (LL2 - LL1)
    self.pval = pval = chi2.sf(LR, df=df)
    tbl = PrettyTable()
    tbl.field_names = ['LR test', '']
    tbl.add_row(['chi2({}) = '.format(df), '{:.4f}'.format(LR)])
    tbl.add_row(['Prob > chi2', '{:.4f}'.format(pval)])
    if verbose:
        print(tbl)
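# Hedged usage sketch for the LR-test __init__ above; the enclosing class name
# ("LRTest") and the log-likelihood values are assumptions for illustration only.
# In practice LL1/LL2 would come from two nested fitted models (e.g. statsmodels .llf),
# and the prettytable package used by the snippet must be installed.
test = LRTest(LL1=-1203.4, LL2=-1198.9, df=2)  # prints the PrettyTable when verbose=True
print(test.LR, test.pval)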
def llr_test(model1: pm.ARIMA, model2: pm.ARIMA, significance=0.05) -> bool:
    """
    Likelihood ratio test
    :param model1: H0 model
    :param model2: HA model
    :param significance: significance level
    :return: True if H0 is not rejected at the given level
    """
    k1 = len(model1.params())
    k2 = len(model2.params())
    # Recover the log-likelihoods from the AIC values (AIC = 2k - 2 ln L), so the
    # LR statistic is 2*(lnL2 - lnL1) = 2*(k2 - k1) + AIC1 - AIC2.
    lr = 2 * (k2 - k1) + model1.aic() - model2.aic()
    return chi2.sf(lr, k2 - k1) > significance
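# Hedged usage sketch for llr_test() above; assumes pmdarima's ARIMA estimator API
# (pm.ARIMA(order=...).fit(y)) and uses a synthetic random-walk series for illustration.
import numpy as np
import pmdarima as pm

y = np.random.default_rng(2).normal(size=200).cumsum()
m0 = pm.ARIMA(order=(1, 1, 0)).fit(y)   # H0: smaller model
m1 = pm.ARIMA(order=(2, 1, 1)).fit(y)   # HA: larger model
print(llr_test(m0, m1))                 # True -> H0 is not rejected at the 5% level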
def wald_test(result):
    """Wald test for backward stepwise regression; result.wald_test_terms also implements this.

    Parameters
    ----------
    result: results object returned by statsmodels.api.Logit.fit()

    Returns
    -------
    test_df: DataFrame with the Wald test results in 2 columns: wald_chi2 and its p-value
    """
    wald_chi2 = (result.params / result.bse) ** 2  # Wald statistic for backward elimination, chi-square with 1 df
    wald_chi2.name = 'wald_chi2'
    pvalue_chi2 = pd.Series(chi2.sf(wald_chi2, 1), index=wald_chi2.index, name='P>chi2')  # Wald test p-value
    test = pd.concat([wald_chi2, pvalue_chi2], axis=1)
    return test
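# Hedged usage sketch for wald_test() above; the synthetic data and the statsmodels
# Logit fit (with pandas exog, so result.params/result.bse are Series) are assumptions.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(3)
X = sm.add_constant(pd.DataFrame(rng.normal(size=(300, 2)), columns=['x1', 'x2']))
y = ((X['x1'] + rng.normal(size=300)) > 0).astype(int)
result = sm.Logit(y, X).fit(disp=0)
print(wald_test(result))  # one row per coefficient: wald_chi2 and its chi2(1) p-value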
def chi_squared_test(observed, mu, total):
    expected = []
    sum = 0
    for value in range(len(observed) - 1):
        expected.append(total * (exp(-mu) * mu**value / factorial(value)))
        sum += total * (exp(-mu) * mu**value / factorial(value))
    expected.append(total - sum)
    (testStatistic, pValue) = chisquare(observed, f_exp=expected, ddof=len(observed) - 1)
    # note: the p-value returned by the chisquare function seems to be
    # incorrect for the test statistic; I verified the return value of chi2.sf
    # with both online tables of chi-squared values as well as Mathematica and
    # MATLAB
    p = chi2.sf(testStatistic, len(observed) - 1)
    return (testStatistic, p)
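# Hedged usage sketch for chi_squared_test() above: Poisson goodness-of-fit on
# simulated counts (the data are illustrative). 'observed' holds counts of
# 0..k-2 events plus a final ">= k-1" bin, matching how the expected bins are built.
import numpy as np

rng = np.random.default_rng(4)
draws = rng.poisson(2.0, size=500)
k = 6
observed = [int(np.sum(draws == v)) for v in range(k - 1)] + [int(np.sum(draws >= k - 1))]
print(chi_squared_test(observed, mu=2.0, total=500))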
def independence_test_binary(tar, data, state):
    tar_size = len(state[tar])
    num_tar = np.zeros((tar_size, 1))
    num_che = {}
    num_co = {}
    for che in state.keys():
        if not (che is tar):
            che_size = len(state[che])
            num_che[che] = np.zeros((che_size, 1))
            num_co[che] = np.zeros((che_size, tar_size))
    for i in range(data.shape[0]):
        tar_state = state[tar].index(data[tar][i])
        num_tar[tar_state] = num_tar[tar_state] + 1
        for che in num_che.keys():
            che_state = state[che].index(data[che][i])
            num_che[che][che_state] = num_che[che][che_state] + 1
            num_co[che][che_state][tar_state] = num_co[che][che_state][tar_state] + 1
    p = {}
    for che in num_che.keys():
        G_temp = num_co[che] * np.log(
            num_co[che] * data.shape[0] / num_che[che].dot(num_tar.T))
        G_temp = G_temp.ravel()
        G = 2 * sum(G_temp[i] for i in range(len(G_temp)) if not np.isnan(G_temp[i]))
        # p_temp = 1 - stats.chi2.cdf(G, (len(state[tar]) - 1) * (len(state[che]) - 1))
        dof = (len(state[tar]) - 1) * (len(state[che]) - 1)
        p_temp = chi2.sf(G, dof)
        if p_temp < 0.05:
            p[che] = p_temp
    pc_con = []
    if p:
        pc_con.append(min(p, key=p.get))
        p.pop(min(p, key=p.get))
    pc_rest = []
    while p:
        pc_rest.append(min(p, key=p.get))
        p.pop(min(p, key=p.get))
    return pc_con, pc_rest
def find_aggregate_keywords():
    data = get_data()
    with open("data/aggregate_frequency.json", "r", encoding="utf8") as file:
        agg_freq = json.load(file)
    key_words = set()
    for person_data in data:
        for word in person_data["freq"].keys():
            if word.lower() in agg_freq.keys():
                local_freq = math.log(person_data["freq"][word])
                act_freq = math.log(agg_freq[word.lower()])
                lr = likelihood_ratio(local_freq, act_freq)
                p = chi2.sf(lr, 1)
                if p < 0.001:
                    key_words.add(word)
    with open("data/aggregate_keywords.txt", "w", encoding="utf8") as keyword_file:
        for key in key_words:
            keyword_file.write(key + "\n")
def chi_sq_test(p_val, c_val, m_val, n_val, signif):
    """Gives the Chi squared test results between groups one and two in a 2x2
    contingency table.

    Parameters
    ==========
    p_val : Number of exposed in group one
    c_val : Number of exposed in group two
    m_val : Total number in group one
    n_val : Total number in group two
    signif : Significance cut off desired

    Returns
    =======
    The p-value of the Chi square test and whether it falls below the
    significance cut off

    Raises
    ======
    TypeError
        Count inputs must be integers
    ValueError
        Significance level must be between 0 and 1

    See Also
    ========
    chi_sq_stat : Chi squared statistic

    Examples
    ========
    >>> chi_sq_test(56, 126, 366, 354, 0.05)
    (3.762993555770853e-10, True)
    >>> chi_sq_test(25, 108, 123, 313, 0.05)
    (0.0038048156707230687, True)
    """
    if not (isinstance(p_val, int) and isinstance(c_val, int) and
            isinstance(m_val, int) and isinstance(n_val, int)):
        raise TypeError('Count inputs must be integers')
    if not 0 <= signif <= 1:
        raise ValueError('Significance level must be between 0 and 1')
    stat = chi_sq_stat(p_val, c_val, m_val, n_val)
    prob = chi2.sf(stat, 1)
    return prob, prob < signif
def _hosmer_lemeshow(y_true, predict_probas, num_groups=10, labels=None):
    df = pd.DataFrame(data=predict_probas, columns=['prediction_proba'])
    if labels is None:
        labels = np.unique(y_true)
    y_true = label_binarize(y_true, classes=labels)[:, 0]
    df['label'] = y_true
    df['quantile_rank'] = pd.qcut(df['prediction_proba'], num_groups,
                                  labels=False, duplicates='drop')
    h = 0
    results = pd.DataFrame(columns=[
        'decile', 'lower_bound', 'upper_bound', 'num_observations',
        'num_failures', 'predicted_failures'
    ])
    for i in range(num_groups):
        pcat_predictions = df[df['quantile_rank'] == i]
        num_observations = len(pcat_predictions)
        if num_observations == 0:
            continue
        obs1 = len(pcat_predictions[pcat_predictions['label'] == 1])  # how many were in category 1
        exp1 = pcat_predictions['prediction_proba'].mean() * num_observations
        lower_bound = pcat_predictions['prediction_proba'].min()
        upper_bound = pcat_predictions['prediction_proba'].max()
        obs0 = num_observations - obs1
        exp0 = num_observations - exp1
        h += ((obs1 - exp1)**2) / exp1 + ((obs0 - exp0)**2) / exp0
        results = results.append(
            {
                'decile': i + 1,
                'lower_bound': lower_bound,
                'upper_bound': upper_bound,
                'num_observations': num_observations,
                'num_failures': obs1,
                'predicted_failures': exp1
            },
            ignore_index=True)
    p = chi2.sf(h, num_groups - 2)
    return h, p, results
def __init__(self, data, attr1, attr2):
    attr1 = data.domain[attr1]
    attr2 = data.domain[attr2]
    if attr1.is_discrete and not attr1.values or \
            attr2.is_discrete and not attr2.values:
        self.p = np.nan
        return
    self.observed = get_contingency(data, attr1, attr2)
    self.n = np.sum(self.observed)
    self.probs_x = self.observed.sum(axis=0) / self.n
    self.probs_y = self.observed.sum(axis=1) / self.n
    self.expected = np.outer(self.probs_y, self.probs_x) * self.n
    self.residuals = \
        (self.observed - self.expected) / np.sqrt(self.expected)
    self.chisqs = self.residuals**2
    self.chisq = float(np.sum(self.chisqs))
    self.p = chi2.sf(self.chisq,
                     (len(self.probs_x) - 1) * (len(self.probs_y) - 1))
def gtest(f_obs, f_exp=None, ddof=0):
    """
    http://en.wikipedia.org/wiki/G-test

    The G test can test for goodness of fit to a distribution.

    Parameters
    ----------
    f_obs : array
        observed frequencies in each category
    f_exp : array, optional
        expected frequencies in each category. By default the categories are
        assumed to be equally likely.
    ddof : int, optional
        adjustment to the degrees of freedom for the p-value

    Returns
    -------
    chisquare statistic : float
        The chisquare test statistic
    p : float
        The p-value of the test.

    Notes
    -----
    The p-value indicates the probability that the observed distribution is
    drawn from a distribution given frequencies in expected. So a low p-value
    indicates the distributions are different.

    Examples
    --------
    >>> gtest([9.0, 8.1, 2, 1, 0.1, 20.0], [10, 5.01, 6, 4, 2, 1])
    (117.94955444335938, 8.5298516190930345e-24)
    >>> gtest([1.01, 1.01, 4.01], [1.00, 1.00, 4.00])
    (0.060224734246730804, 0.97033649350189344)
    >>> gtest([2, 1, 6], [4, 3, 2])
    (8.2135343551635742, 0.016460903780063787)

    References
    ----------
    http://en.wikipedia.org/wiki/G-test
    """
    f_obs = [i if i != 0 else 1e-10 for i in f_obs]
    f_obs = np.asarray(f_obs, 'f')
    k = f_obs.shape[0]
    f_exp = np.array([np.sum(f_obs, axis=0) / float(k)] * k, 'f') \
        if f_exp is None \
        else np.asarray(f_exp, 'f')
    g = 2 * np.add.reduce(f_obs * np.log(f_obs / f_exp))
    return g, chi2.sf(g, k - 1 - ddof)
def ScafLRT(geno, posdf, groups, scaf):
    these_idx = [x for x in posdf.loc[posdf["chrom"] == scaf].index]
    these_geno = geno[:, these_idx]
    these_geno = ma.array(these_geno, mask=[these_geno == -1])

    # Ref allele frequency
    nChroms, nRef = [], []
    for x in range(these_geno.shape[1]):
        nChroms.append(2 * these_geno[:, x].count(axis=0))
        nRef.append(np.where(these_geno[:, x] == 0)[0].shape[0] * 2 +
                    np.where(these_geno[:, x] == 1)[0].shape[0])
    expHet = np.mean(np.array([2 * x * (1 - x) for x in [nRef[i] / c for i, c in enumerate(nChroms)]]))
    these_geno[these_geno == 2] = 0
    obsHet = these_geno.mean(axis=1)
    F = np.array([(1 - obsHet[i] / expHet) for i in range(obsHet.shape[0])])

    # One cluster model log-likelihood:
    mu_1 = np.mean(F)
    sigma_1 = np.std(F)
    lnL_1 = np.sum(np.log(norm.pdf(F, mu_1, sigma_1)))

    # Two cluster model log-likelihood:
    # Cluster 0:
    dat_0 = F[np.where(np.array(groups) == 0)]
    mu_2_0 = np.mean(dat_0)
    sigma_2_0 = np.std(dat_0)
    lnL_2_0 = np.sum(np.log(norm.pdf(dat_0, mu_2_0, sigma_2_0)))
    # Cluster 1:
    dat_1 = F[np.where(np.array(groups) == 1)]
    mu_2_1 = np.mean(dat_1)
    sigma_2_1 = np.std(dat_1)
    lnL_2_1 = np.sum(np.log(norm.pdf(dat_1, mu_2_1, sigma_2_1)))
    lnL_2 = lnL_2_0 + lnL_2_1

    # Likelihood-ratio test:
    LRT = -2 * (lnL_1 - lnL_2)
    dof = 2 + len(groups)
    pval = chi2.sf(LRT, dof)
    return(pval)
def __init__(self, data, attr1, attr2):
    attr1 = data.domain[attr1]
    attr2 = data.domain[attr2]
    if attr1.is_discrete and not attr1.values or \
            attr2.is_discrete and not attr2.values:
        self.p = np.nan
        return
    self.observed = get_contingency(data, attr1, attr2)
    self.n = np.sum(self.observed)
    self.probs_x = self.observed.sum(axis=0) / self.n
    self.probs_y = self.observed.sum(axis=1) / self.n
    self.expected = np.outer(self.probs_y, self.probs_x) * self.n
    self.residuals = \
        (self.observed - self.expected) / np.sqrt(self.expected)
    self.residuals = np.nan_to_num(self.residuals)
    self.chisqs = self.residuals ** 2
    self.chisq = float(np.sum(self.chisqs))
    self.p = chi2.sf(
        self.chisq, (len(self.probs_x) - 1) * (len(self.probs_y) - 1))
def chi2_approx(calc_stat, x, y):
    """
    Calculate the p-value for Dcorr and Hsic via a chi-squared approximation.

    In the case of distance and kernel methods, Dcorr (and by extension Hsic
    [#2ChiSq]_) can be approximated via a chi-squared distribution [#1ChiSq]_.
    This approximation is also applicable for the nonparametric MANOVA via
    independence testing method in our package [#3ChiSq]_.

    Parameters
    ----------
    calc_stat : callable()
        The method used to calculate the test statistic (must use hyppo API).
    x, y : ndarray
        Input data matrices. `x` and `y` must have the same number of
        samples. That is, the shapes must be `(n, p)` and `(n, q)` where `n`
        is the number of samples and `p` and `q` are the number of
        dimensions. Alternatively, `x` and `y` can be distance matrices,
        where the shapes must both be `(n, n)`.

    Returns
    -------
    stat : float
        The computed test statistic.
    pvalue : float
        The computed p-value.

    References
    ----------
    .. [#1ChiSq] Shen, C., & Vogelstein, J. T. (2019). The Chi-Square Test of
                 Distance Correlation. arXiv preprint arXiv:1912.12150.
    .. [#2ChiSq] Shen, C., & Vogelstein, J. T. (2018). The exact equivalence
                 of distance and kernel methods for hypothesis testing. arXiv
                 preprint arXiv:1806.05514.
    .. [#3ChiSq] Panda, S., Shen, C., Perry, R., Zorn, J., Lutz, A., Priebe,
                 C. E., & Vogelstein, J. T. (2019). Nonparametric MANOVA via
                 Independence Testing. arXiv e-prints, arXiv-1910.
    """
    n = x.shape[0]
    stat = calc_stat(x, y)
    pvalue = chi2.sf(stat * n + 1, 1)
    return stat, pvalue
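# Hedged usage sketch for chi2_approx() above. A squared Pearson correlation is used
# as a stand-in callable purely to show the call pattern; in hyppo the callable would
# be a Dcorr/Hsic statistic, for which the chi-squared approximation is actually derived.
import numpy as np


def corr_sq_stat(x, y):
    return float(np.corrcoef(x.ravel(), y.ravel())[0, 1] ** 2)


rng = np.random.default_rng(5)
x = rng.normal(size=(100, 1))
y = 0.5 * x + rng.normal(size=(100, 1))
print(chi2_approx(corr_sq_stat, x, y))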
def p_value_calculator(N_kij, pc, dof):
    if pc:
        G = 0
        for k in range(N_kij.shape[2]):
            N_div = np.ones(N_kij.shape[0:2])
            N_div = np.multiply(N_div, N_kij[:, :, k].sum(axis=0))
            N_div = np.multiply(N_div, N_kij[:, :, k].sum(axis=1).reshape(N_kij[:, :, k].shape[0], 1))
            np.seterr(all='ignore')
            G = G + np.nansum(np.multiply(2 * N_kij[:, :, k],
                                          np.log(np.divide(N_kij[:, :, k] * N_kij[:, :, k].sum(), N_div))))
    else:
        N_div = np.ones(N_kij.shape)
        N_div = np.multiply(N_div, N_kij.sum(axis=0))
        N_div = np.multiply(N_div, N_kij.sum(axis=1).reshape(N_kij.shape[0], 1))
        np.seterr(all='ignore')
        G = np.nansum(np.multiply(2 * N_kij, np.log(np.divide(N_kij * N_kij.sum(), N_div))))
    p_value = chi2.sf(G, dof)
    return p_value
def __init__(self, data, attr1, attr2):
    attr1 = data.domain[attr1]
    attr2 = data.domain[attr2]
    if attr1.is_discrete and not attr1.values or \
            attr2.is_discrete and not attr2.values:
        self.p = np.nan
        return
    self.observed = get_contingency(data, attr1, attr2)
    self.n = np.sum(self.observed)
    # pylint: disable=unexpected-keyword-arg
    self.probs_x = self.observed.sum(axis=0) / self.n
    self.probs_y = self.observed.sum(axis=1) / self.n
    self.expected = np.outer(self.probs_y, self.probs_x) * self.n
    with np.errstate(divide="ignore", invalid="ignore"):
        self.residuals = \
            (self.observed - self.expected) / np.sqrt(self.expected)
    self.residuals = np.nan_to_num(self.residuals)
    self.chisqs = self.residuals ** 2
    self.chisq = float(np.sum(self.chisqs))
    self.p = chi2.sf(
        self.chisq, (len(self.probs_x) - 1) * (len(self.probs_y) - 1))
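# Standalone sketch of the same Pearson chi-square computation on a hand-built
# contingency table, independent of the get_contingency helper used above; the
# table values are illustrative only.
import numpy as np
from scipy.stats import chi2

observed = np.array([[30.0, 10.0], [20.0, 40.0]])
n = observed.sum()
probs_x = observed.sum(axis=0) / n
probs_y = observed.sum(axis=1) / n
expected = np.outer(probs_y, probs_x) * n
stat = float(((observed - expected) ** 2 / expected).sum())
p = chi2.sf(stat, (len(probs_x) - 1) * (len(probs_y) - 1))
print(stat, p)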
def valor_p(T, k):
    return chi2.sf(T, k - 1)


def valor_p(T, m):
    return chi2.sf(T, m - 1)


def valor_p(T):
    return chi2.sf(T, N - 1)


def valor_p_ji(y, k):
    return 2 * min(chi2.sf(y, k - 1), 1 - chi2.sf(y, k - 1))
def ehabitat(ecor,nw,nwpathout): global nwpath if nw=='': nwpath = os.getcwd() else: nwpath = nw if gmaps == 0: initglobalmaps() if nwpathout=='': #outdir = 'results' # ToDo: locally create folder "results" if it does not exist! outdir = os.path.join(os.path.sep, os.getcwd(), 'results') safelyMakeDir(outdir) else: #outdir = nwpathout+'/results' # SHARED FOLDER PATH outdir = os.path.join(os.path.sep, nwpathout, 'results') safelyMakeDir(outdir) treepamin = treepamax = eprpamin = eprpamax = prepamin = prepamax = biopamin = biopamax = slopepamin = slopepamax = ndwipamin = ndwipamax = ndvimaxpamin = ndvimaxpamax = ndviminpamin = ndviminpamax = hpamin = hpamax = None s = nd.generate_binary_structure(2,2) # most restrictive pattern for the landscape patches # LOCAL FOLDER csvname1 = os.path.join(os.path.sep, outdir, 'ecoregs_done.csv') print csvname1 if os.path.isfile(csvname1) == False: wb = open(csvname1,'a') wb.write('None') wb.write('\n') wb.close() # LOCAL FOLDER csvname = os.path.join(os.path.sep, outdir, 'hri_results.csv') print csvname if os.path.isfile(csvname) == False: wb = open(csvname,'a') wb.write('ecoregion wdpaid averpasim hr2aver pxpa hr1insumaver hriaver nfeatsaver lpratio lpratio2 numpszok lpmaxsize aggregation treepamin treepamax eprpamin eprpamax prepamin prepamax biopamin biopamax slopepamin slopepamax ndwipamin ndwipamax ndvimaxpamin ndvimaxpamax ndviminpamin ndviminpamax hpamin hpamax treepamean eprpamean prepamean biopamean slopepamean ndwipamean ndvimaxpamean ndviminpamean hpamean') wb.write('\n') wb.close() treepamean = eprpamean = prepamean = biopamean = slopepamean = ndwipamean = ndvimaxpamean = ndviminpamean = hpamean = None ef = 'eco_'+str(ecor)+'.tif' ecofile = os.path.join(os.path.sep, nwpath, 'ecoregs', ef) #ecofile = os.path.join(os.path.sep, nwpath, os.path.sep,'ecoregs', os.path.sep, ef) print ecofile avail = os.path.isfile(ecofile) if avail == True: eco_csv = str(ecor)+'.csv' print eco_csv ecoparksf = os.path.join(os.path.sep, nwpath, 'pas', eco_csv) #ecoparksf = os.path.join(os.path.sep, nwpath, os.path.sep, 'pas', os.path.sep, eco_csv) print ecoparksf #ecoparksf = nwpath+'/pas/'+str(ecor)+'.csv' src_ds_eco = gdal.Open(ecofile) eco = src_ds_eco.GetRasterBand(1) eco_mask0 = eco.ReadAsArray(0,0,eco.XSize,eco.YSize).astype(np.int32) eco_mask = eco_mask0.flatten() gt_eco = src_ds_eco.GetGeoTransform() print 'eco mask' xoff = int((gt_eco[0]-gt_epr_global[0])/1000) yoff = int((gt_epr_global[3]-gt_eco[3])/1000) epr_eco_bb0 = epr_global.ReadAsArray(xoff,yoff,eco.XSize,eco.YSize).astype(np.float32) epr_eco_bb = epr_eco_bb0.flatten() epr_eco0 = np.where(eco_mask == 1, (epr_eco_bb),(0)) epr_eco = np.where(epr_eco0 == 65535.0, (float('NaN')),(epr_eco0)) maskepr = np.isnan(epr_eco) epr_eco[maskepr] = np.interp(np.flatnonzero(maskepr), np.flatnonzero(~maskepr), epr_eco[~maskepr]) print 'eco epr' xoff = int((gt_eco[0]-gt_slope_global[0])/1000) yoff = int((gt_slope_global[3]-gt_eco[3])/1000) slope_eco_bb0 = slope_global.ReadAsArray(xoff,yoff,eco.XSize,eco.YSize).astype(np.float32) slope_eco_bb = slope_eco_bb0.flatten() slope_eco0 = np.where(eco_mask == 1, (slope_eco_bb),(0)) slope_eco = np.where(slope_eco0 == 65535.0, (float('NaN')),(slope_eco0)) maskslope = np.isnan(slope_eco) slope_eco[maskslope] = np.interp(np.flatnonzero(maskslope), np.flatnonzero(~maskslope), slope_eco[~maskslope]) print 'eco slope' xoff = int((gt_eco[0]-gt_ndvimax_global[0])/1000) yoff = int((gt_ndvimax_global[3]-gt_eco[3])/1000) ndvimax_eco_bb0 = 
ndvimax_global.ReadAsArray(xoff,yoff,eco.XSize,eco.YSize).astype(np.float32) ndvimax_eco_bb = ndvimax_eco_bb0.flatten() ndvimax_eco0 = np.where(eco_mask == 1, (ndvimax_eco_bb),(0)) ndvimax_eco = np.where(ndvimax_eco0 == 65535.0, (float('NaN')),(ndvimax_eco0)) maskndvimax = np.isnan(ndvimax_eco) ndvimax_eco[maskndvimax] = np.interp(np.flatnonzero(maskndvimax), np.flatnonzero(~maskndvimax), ndvimax_eco[~maskndvimax]) print 'eco ndvimax' xoff = int((gt_eco[0]-gt_ndvimin_global[0])/1000) yoff = int((gt_ndvimin_global[3]-gt_eco[3])/1000) ndvimin_eco_bb0 = ndvimin_global.ReadAsArray(xoff,yoff,eco.XSize,eco.YSize).astype(np.float32) ndvimin_eco_bb = ndvimin_eco_bb0.flatten() ndvimin_eco0 = np.where(eco_mask == 1, (ndvimin_eco_bb),(0)) ndvimin_eco = np.where(ndvimin_eco0 == 65535.0, (float('NaN')),(ndvimin_eco0)) maskndvimin = np.isnan(ndvimin_eco) ndvimin_eco[maskndvimin] = np.interp(np.flatnonzero(maskndvimin), np.flatnonzero(~maskndvimin), ndvimin_eco[~maskndvimin]) print 'eco ndvimin' xoff = int((gt_eco[0]-gt_ndwi_global[0])/1000) yoff = int((gt_ndwi_global[3]-gt_eco[3])/1000) ndwi_eco_bb0 = ndwi_global.ReadAsArray(xoff,yoff,eco.XSize,eco.YSize).astype(np.float32) ndwi_eco_bb = ndwi_eco_bb0.flatten() ndwi_eco0 = np.where(eco_mask == 1, (ndwi_eco_bb),(0)) ndwi_eco = np.where(ndwi_eco0 == 255.0, (float('NaN')),(ndwi_eco0)) maskndwi = np.isnan(ndwi_eco) ndwi_eco[maskndwi] = np.interp(np.flatnonzero(maskndwi), np.flatnonzero(~maskndwi), ndwi_eco[~maskndwi]) print 'eco ndwi' xoff = int((gt_eco[0]-gt_pre_global[0])/1000) yoff = int((gt_pre_global[3]-gt_eco[3])/1000) pre_eco_bb0 = pre_global.ReadAsArray(xoff,yoff,eco.XSize,eco.YSize).astype(np.float32) pre_eco_bb = pre_eco_bb0.flatten() pre_eco0 = np.where(eco_mask == 1, (pre_eco_bb),(0)) pre_eco = np.where(pre_eco0 == 65535.0, (float('NaN')),(pre_eco0)) maskpre = np.isnan(pre_eco) pre_eco[maskpre] = np.interp(np.flatnonzero(maskpre), np.flatnonzero(~maskpre), pre_eco[~maskpre]) print 'eco pre' xoff = int((gt_eco[0]-gt_bio_global[0])/1000) yoff = int((gt_bio_global[3]-gt_eco[3])/1000) bio_eco_bb0 = bio_global.ReadAsArray(xoff,yoff,eco.XSize,eco.YSize).astype(np.float32) bio_eco_bb = bio_eco_bb0.flatten() bio_eco0 = np.where(eco_mask == 1, (bio_eco_bb),(0)) bio_eco = np.where(bio_eco0 == 65535.0, (float('NaN')),(bio_eco0)) maskbio = np.isnan(bio_eco) bio_eco[maskbio] = np.interp(np.flatnonzero(maskbio), np.flatnonzero(~maskbio), bio_eco[~maskbio]) print 'eco bio' xoff = int((gt_eco[0]-gt_tree_global[0])/1000) yoff = int((gt_tree_global[3]-gt_eco[3])/1000) tree_eco_bb0 = tree_global.ReadAsArray(xoff,yoff,eco.XSize,eco.YSize).astype(np.float32) tree_eco_bb = tree_eco_bb0.flatten() tree_eco0 = np.where(eco_mask == 1, (tree_eco_bb),(0)) tree_eco = np.where(tree_eco0 == 255.0, (float('NaN')),(tree_eco0)) masktree = np.isnan(tree_eco) tree_eco[masktree] = np.interp(np.flatnonzero(masktree), np.flatnonzero(~masktree), tree_eco[~masktree]) print 'eco tree' xoff = int((gt_eco[0]-gt_herb_global[0])/1000) yoff = int((gt_herb_global[3]-gt_eco[3])/1000) herb_eco_bb0 = herb_global.ReadAsArray(xoff,yoff,eco.XSize,eco.YSize).astype(np.float32) herb_eco_bb = herb_eco_bb0.flatten() herb_eco0 = np.where(eco_mask == 1, (herb_eco_bb),(0)) herb_eco = np.where(herb_eco0 == 255.0, (float('NaN')),(herb_eco0)) maskherb = np.isnan(herb_eco) herb_eco[maskherb] = np.interp(np.flatnonzero(maskherb), np.flatnonzero(~maskherb), herb_eco[~maskherb]) print 'eco herb' ind_eco0 = np.column_stack((bio_eco,pre_eco,epr_eco,herb_eco,ndvimax_eco,ndvimin_eco,ndwi_eco,slope_eco,tree_eco)) 
print 'ecovars stacked' print ecoparksf pa_list0 = np.genfromtxt(ecoparksf,dtype='string') # crear este archivo en subpas! pa_list = np.unique(pa_list0) n = len(pa_list) for px in range(0,n): # 0,n pa = pa_list[px] print pa outfile = os.path.join(os.path.sep, outdir, str(ecor)+'_'+str(pa)+'.tif') outfile2 = os.path.join(os.path.sep, outdir, str(ecor)+'_'+str(pa)+'_lp.tif') outfile3 = os.path.join(os.path.sep, outdir, str(ecor)+'_'+str(pa)+'_mask.tif') #outfile = outdir+'/'+str(ecor)+'_'+str(pa)+'.tif' # LOCAL FOLDER pa_infile = 'pa_'+str(pa)+'.tif' pa4 = os.path.join(os.path.sep, nwpath, 'pas', pa_infile) #pa4 = os.path.join(os.path.sep, nwpath, os.path.sep, 'pas', os.path.sep, pa_infile) print pa4 #pa4 = nwpath+'/pas/pa_'+str(pa)+'.tif' dropcols = np.arange(9,dtype=int) done = os.path.isfile(outfile) avail2 = os.path.isfile(pa4) if done == False and avail2 == True: pafile=pa4 src_ds_pa = gdal.Open(pafile) par = src_ds_pa.GetRasterBand(1) pa_mask0 = par.ReadAsArray(0,0,par.XSize,par.YSize).astype(np.int32) pa_mask = pa_mask0.flatten() ind = pa_mask > 0 #==int(pa) go = 1 sum_pa_mask = sum(pa_mask[ind])#/int(pa) if sum_pa_mask < 3: go = 0 # not processing areas smaller than 3 pixels print sum_pa_mask sum_pa_mask_inv = len(pa_mask[pa_mask == 0]) print sum_pa_mask_inv print len(pa_mask) ratiogeom = 10000 if sum_pa_mask > 0: ratiogeom = sum_pa_mask_inv/sum_pa_mask #print ratiogeom gt_pa = src_ds_pa.GetGeoTransform() xoff = int((gt_pa[0]-gt_pre_global[0])/1000) yoff = int((gt_pre_global[3]-gt_pa[3])/1000) if xoff>0 and yoff>0 and go == 1: num_bands=src_ds_eco.RasterCount driver = gdal.GetDriverByName("GTiff") dst_options = ['COMPRESS=LZW'] dst_ds = driver.Create( outfile,src_ds_eco.RasterXSize,src_ds_eco.RasterYSize,num_bands,gdal.GDT_Float32,dst_options) dst_ds.SetGeoTransform( src_ds_eco.GetGeoTransform()) dst_ds.SetProjection( src_ds_eco.GetProjectionRef()) xoff = int((gt_pa[0]-gt_tree_global[0])/1000) yoff = int((gt_tree_global[3]-gt_pa[3])/1000) tree_pa_bb0 = tree_global.ReadAsArray(xoff,yoff,par.XSize,par.YSize).astype(np.float32) tree_pa_bb = tree_pa_bb0.flatten() tree_pa0 = tree_pa_bb[ind] tree_pa = np.where(tree_pa0 == 255.0, (float('NaN')),(tree_pa0)) mask2tree = np.isnan(tree_pa) if mask2tree.all() == True: dropcols[8] = -8 else: tree_pa[mask2tree] = np.interp(np.flatnonzero(mask2tree), np.flatnonzero(~mask2tree), tree_pa[~mask2tree]) tree_pa = np.random.random_sample(len(tree_pa),)/1000 + tree_pa print 'pa tree' treepamin = round(tree_pa.min(),2) treepamax = round(tree_pa.max(),2) treepamean = round(np.mean(tree_pa),2) print treepamin print treepamax treediff = abs(tree_pa.min()-tree_pa.max()) if treediff < 0.001: dropcols[8] = -8 xoff = int((gt_pa[0]-gt_epr_global[0])/1000) yoff = int((gt_epr_global[3]-gt_pa[3])/1000) epr_pa_bb0 = epr_global.ReadAsArray(xoff,yoff,par.XSize,par.YSize).astype(np.float32) epr_pa_bb = epr_pa_bb0.flatten() epr_pa0 = epr_pa_bb[ind] epr_pa = np.where(epr_pa0 == 65535.0, (float('NaN')),(epr_pa0)) mask2epr = np.isnan(epr_pa) if mask2epr.all() == True: dropcols[2] = -2 else: epr_pa[mask2epr] = np.interp(np.flatnonzero(mask2epr), np.flatnonzero(~mask2epr), epr_pa[~mask2epr]) epr_pa = np.random.random_sample(len(epr_pa),)/1000 + epr_pa print 'pa epr' eprpamin = round(epr_pa.min(),2) eprpamax = round(epr_pa.max(),2) eprpamean = round(np.mean(epr_pa),2) print eprpamin print eprpamax eprdiff = abs(epr_pa.min()-epr_pa.max()) if eprdiff < 0.001: dropcols[2] = -2 xoff = int((gt_pa[0]-gt_pre_global[0])/1000) yoff = int((gt_pre_global[3]-gt_pa[3])/1000) pre_pa_bb0 = 
pre_global.ReadAsArray(xoff,yoff,par.XSize,par.YSize).astype(np.float32) pre_pa_bb = pre_pa_bb0.flatten() pre_pa0 = pre_pa_bb[ind] pre_pa = np.where(pre_pa0 == 65535.0, (float('NaN')),(pre_pa0)) mask2pre = np.isnan(pre_pa) if mask2pre.all() == True: dropcols[1] = -1 else: pre_pa[mask2pre] = np.interp(np.flatnonzero(mask2pre), np.flatnonzero(~mask2pre), pre_pa[~mask2pre]) pre_pa = np.random.random_sample(len(pre_pa),)/1000 + pre_pa print 'pa pre' prepamin = round(pre_pa.min(),2) prepamax = round(pre_pa.max(),2) prepamean = round(np.mean(pre_pa),2) print prepamin print prepamax prediff = abs(pre_pa.min()-pre_pa.max()) if prediff < 0.001: dropcols[1] = -1 xoff = int((gt_pa[0]-gt_bio_global[0])/1000) yoff = int((gt_bio_global[3]-gt_pa[3])/1000) bio_pa_bb0 = bio_global.ReadAsArray(xoff,yoff,par.XSize,par.YSize).astype(np.float32) bio_pa_bb = bio_pa_bb0.flatten() bio_pa0 = bio_pa_bb[ind] bio_pa = np.where(bio_pa0 == 65535.0, (float('NaN')),(bio_pa0)) mask2bio = np.isnan(bio_pa) if mask2bio.all() == True: dropcols[0] = -0 else: bio_pa[mask2bio] = np.interp(np.flatnonzero(mask2bio), np.flatnonzero(~mask2bio), bio_pa[~mask2bio]) bio_pa = np.random.random_sample(len(bio_pa),)/1000 + bio_pa print 'pa bio' biopamin = round(bio_pa.min(),2) biopamax = round(bio_pa.max(),2) biopamean = round(np.mean(bio_pa),2) print biopamin print biopamax biodiff = abs(bio_pa.min()-bio_pa.max()) if biodiff < 0.001: dropcols[0] = -0 xoff = int((gt_pa[0]-gt_slope_global[0])/1000) yoff = int((gt_slope_global[3]-gt_pa[3])/1000) slope_pa_bb0 = slope_global.ReadAsArray(xoff,yoff,par.XSize,par.YSize).astype(np.float32) slope_pa_bb = slope_pa_bb0.flatten() slope_pa0 = slope_pa_bb[ind] slope_pa = np.where(slope_pa0 == 65535.0, (float('NaN')),(slope_pa0)) mask2slope = np.isnan(slope_pa) if mask2slope.all() == True: dropcols[7] = -7 else: slope_pa[mask2slope] = np.interp(np.flatnonzero(mask2slope), np.flatnonzero(~mask2slope), slope_pa[~mask2slope]) slope_pa = np.random.random_sample(len(slope_pa),)/1000 + slope_pa print 'pa slope' slopepamin = round(slope_pa.min(),2) slopepamax = round(slope_pa.max(),2) slopepamean = round(np.mean(slope_pa),2) print slopepamin print slopepamax slopediff = abs(slope_pa.min()-slope_pa.max()) if slopediff < 0.001: dropcols[7] = -7 xoff = int((gt_pa[0]-gt_ndwi_global[0])/1000) yoff = int((gt_ndwi_global[3]-gt_pa[3])/1000) ndwi_pa_bb0 = ndwi_global.ReadAsArray(xoff,yoff,par.XSize,par.YSize).astype(np.float32) ndwi_pa_bb = ndwi_pa_bb0.flatten() ndwi_pa0 = ndwi_pa_bb[ind] ndwi_pa = np.where(ndwi_pa0 == 255.0, (float('NaN')),(ndwi_pa0)) mask2ndwi = np.isnan(ndwi_pa) if mask2ndwi.all() == True: dropcols[6] = -6 else: ndwi_pa[mask2ndwi] = np.interp(np.flatnonzero(mask2ndwi), np.flatnonzero(~mask2ndwi), ndwi_pa[~mask2ndwi]) ndwi_pa = np.random.random_sample(len(ndwi_pa),)/1000 + ndwi_pa print 'pa ndwi' ndwipamin = round(ndwi_pa.min(),2) ndwipamax = round(ndwi_pa.max(),2) ndwipamean = round(np.mean(ndwi_pa),2) print ndwipamin print ndwipamax ndwidiff = abs(ndwi_pa.min()-ndwi_pa.max()) if ndwidiff < 0.001: dropcols[6] = -6 xoff = int((gt_pa[0]-gt_ndvimax_global[0])/1000) yoff = int((gt_ndvimax_global[3]-gt_pa[3])/1000) ndvimax_pa_bb0 = ndvimax_global.ReadAsArray(xoff,yoff,par.XSize,par.YSize).astype(np.float32) ndvimax_pa_bb = ndvimax_pa_bb0.flatten() ndvimax_pa0 = ndvimax_pa_bb[ind] ndvimax_pa = np.where(ndvimax_pa0 == 65535.0, (float('NaN')),(ndvimax_pa0)) mask2ndvimax = np.isnan(ndvimax_pa) if mask2ndvimax.all() == True: dropcols[4] = -4 else: ndvimax_pa[mask2ndvimax] = 
np.interp(np.flatnonzero(mask2ndvimax), np.flatnonzero(~mask2ndvimax), ndvimax_pa[~mask2ndvimax]) ndvimax_pa = np.random.random_sample(len(ndvimax_pa),)/1000 + ndvimax_pa print 'pa ndvimax' ndvimaxpamin = round(ndvimax_pa.min(),2) ndvimaxpamax = round(ndvimax_pa.max(),2) ndvimaxpamean = round(np.mean(ndvimax_pa),2) print ndvimaxpamin print ndvimaxpamax ndvimaxdiff = abs(ndvimax_pa.min()-ndvimax_pa.max()) if ndvimaxdiff < 0.001: dropcols[4] = -4 xoff = int((gt_pa[0]-gt_ndvimin_global[0])/1000) yoff = int((gt_ndvimin_global[3]-gt_pa[3])/1000) ndvimin_pa_bb0 = ndvimin_global.ReadAsArray(xoff,yoff,par.XSize,par.YSize).astype(np.float32) ndvimin_pa_bb = ndvimin_pa_bb0.flatten() ndvimin_pa0 = ndvimin_pa_bb[ind] ndvimin_pa = np.where(ndvimin_pa0 == 65535.0, (float('NaN')),(ndvimin_pa0)) mask2ndvimin = np.isnan(ndvimin_pa) if mask2ndvimin.all() == True: dropcols[5] = -5 else: ndvimin_pa[mask2ndvimin] = np.interp(np.flatnonzero(mask2ndvimin), np.flatnonzero(~mask2ndvimin), ndvimin_pa[~mask2ndvimin]) ndvimin_pa = np.random.random_sample(len(ndvimin_pa),)/1000 + ndvimin_pa print 'pa ndvimin' ndviminpamin = round(ndvimin_pa.min(),2) ndviminpamax = round(ndvimin_pa.max(),2) ndviminpamean = round(np.mean(ndvimin_pa),2) print ndviminpamin print ndviminpamax ndvimindiff = abs(ndvimin_pa.min()-ndvimin_pa.max()) if ndvimindiff < 0.001: dropcols[5] = -5 xoff = int((gt_pa[0]-gt_herb_global[0])/1000) yoff = int((gt_herb_global[3]-gt_pa[3])/1000) herb_pa_bb0 = herb_global.ReadAsArray(xoff,yoff,par.XSize,par.YSize).astype(np.float32) herb_pa_bb = herb_pa_bb0.flatten() herb_pa0 = herb_pa_bb[ind] herb_pa = np.where(herb_pa0 == 255.0, (float('NaN')),(herb_pa0)) mask2herb = np.isnan(herb_pa) if mask2herb.all() == True: dropcols[3] = -3 else: herb_pa[mask2herb] = np.interp(np.flatnonzero(mask2herb), np.flatnonzero(~mask2herb), herb_pa[~mask2herb]) herb_pa = np.random.random_sample(len(herb_pa),)/1000 + herb_pa print 'pa herb' hpamin = round(herb_pa.min(),2) hpamax = round(herb_pa.max(),2) hpamean = round(np.mean(herb_pa),2) print hpamin print hpamax hdiff = abs(herb_pa.min()-herb_pa.max()) if hdiff < 0.001: dropcols[3] = -3 cols = dropcols[dropcols>=0] ind_pa0 = np.column_stack((bio_pa,pre_pa,epr_pa,herb_pa,ndvimax_pa,ndvimin_pa,ndwi_pa,slope_pa,tree_pa)) ind_pa = ind_pa0[:,cols] ind_eco = ind_eco0[:,cols] print ind_pa.shape hr1sum = hr1insum = indokpsz = pszok = sumpszok = lpratio2 = numpszok = hr1averpa = hr3aver = hr2aver = pszmax = num_featuresaver = lpratio = hr1medianpa = hr1insumaver = pxpa = aggregation = None print "PA masked" #print ind_pa if ind_pa.shape[0]>4 and ind_pa.shape[1]>1: Ymean = np.mean(ind_pa,axis=0) print 'Max. mean value is '+ str(Ymean.max()) print "Ymean ok" Ycov = np.cov(ind_pa,rowvar=False) print 'Max. cov value is '+ str(Ycov.max()) print "Ycov ok" #mh = mahalanobis_distances(Ymean, Ycov, ind_eco, parallel=False) #mh2 = mahalanobis_distances(Ymean, Ycov, ind_eco, parallel=True) mh2 = mahalanobis_distances_scipy(Ymean, Ycov, ind_eco, parallel=True) # previous working version #mh2 = mahalanobis_distances_scipy(Ymean, Ycov, ind_eco, parallel=False) maxmh=mh2.max() print 'Max. mh value is '+ str(maxmh) print 'Max. mh value is nan: '+ str(np.isnan(maxmh)) mh = mh2*mh2 print "mh ok" pmh = chi2.sf(mh,len(cols)).reshape((eco.YSize,eco.XSize)) # chisqprob pmhh = np.where(pmh <= 0.001,None, pmh) print "pmh ok" # quitar valores muy bajos! pmhhmax = pmhh.max() print 'Max. 
similarity value is '+ str(pmhhmax) dst_ds.GetRasterBand(1).WriteArray(pmhh) dst_ds = None hr11 = np.where(pmhh>0,1,0) # 0.5 hr1 = hr11.flatten() hr1sum = sum(hr1) print 'Number of pixels with similarity higher than 0 is '+str(hr1sum) hr1insumaver = hr1insum = 0 hr1sumaver = hr1sum src_ds_sim = gdal.Open(outfile) sim = src_ds_sim.GetRasterBand(1) gt_sim = src_ds_sim.GetGeoTransform() xoff = int((gt_pa[0]-gt_sim[0])/1000) yoff = int((gt_sim[3]-gt_pa[3])/1000) xextentpa = xoff + par.XSize yextentpa = yoff + par.YSize xless = sim.XSize - xextentpa yless = sim.YSize - yextentpa xsize = par.XSize ysize = par.YSize if xoff>0 and yoff>0 and pmhhmax>0.01 and hr1sum>1 and maxmh!=float('NaN'):#and ratiogeom < 100: # also checks if results are not empty # reading the similarity ecoregion without the PA (tmp mask) os.system('gdal_merge.py '+str(ecofile)+' '+str(pa4)+' -o '+str(outfile3)+' -ot Int32') hri_pa_bb03 = sim.ReadAsArray().astype(np.float32) hri_pa_bb3 = hri_pa_bb03.flatten() src_ds_sim2 = gdal.Open(outfile3) sim2 = src_ds_sim2.GetRasterBand(1) gt_sim2 = src_ds_sim2.GetGeoTransform() hri_pa_bb02 = sim2.ReadAsArray().astype(np.int32) #hri_pa_bb2 = hri_pa_bb02.flatten() hri_pa_bb02_max = hri_pa_bb02.max() print 'PA: '+str(pa) print 'PA (= max) value from mask = '+str(hri_pa_bb02_max) if hri_pa_bb02.shape == hri_pa_bb03.shape: hri_pa02 = np.where(hri_pa_bb02 == pa,0,hri_pa_bb03) # hri_pa_bb02_max if xless < 0: xsize = xsize + xless if yless < 0: ysize = ysize + yless hri_pa_bb0 = sim.ReadAsArray(xoff,yoff,xsize,ysize).astype(np.float32) hri_pa_bb = hri_pa_bb0.flatten() indd = hri_pa_bb > 0 hri_pa0 = hri_pa_bb[indd] print 'Total number of pixels with similarity values in PA: '+str(len(hri_pa0)) hr1averpa = round(np.mean(hri_pa0[~np.isnan(hri_pa0)]),2) #print hr1averpa #hr1medianpa = np.median(hri_pa0[~np.isnan(hri_pa0)]) print 'mean similarity in the park is '+str(hr1averpa) #hr1insum = sum(np.where(hri_pa0 >= 0.5, 1,0)) # use hr1averpa as threshold instead! 
hr1inaver = np.where(hri_pa0 >= hr1averpa, 1,0) hr1insumaver = sum(hr1inaver) #print hr1insum ##labeled_arrayin, num_featuresin = nd.label(hr1inaver, structure=s) hr1averr = np.where(hri_pa02 >= hr1averpa, 1,0) # pmhh hr1aver = hr1averr.flatten() print 'Total number of pixels with similarity values in ECO: '+str(sum(hr1aver)) labeled_arrayaver, num_featuresaver = nd.label(hr1averr, structure=s) print 'Nr of similar patches found: '+str(num_featuresaver) if num_featuresaver > 0: lbls = np.arange(1, num_featuresaver+1) psizes = nd.labeled_comprehension(labeled_arrayaver, labeled_arrayaver, lbls, np.count_nonzero, float, 0) #-1 pszmax = psizes.max()#-hr1insumaver dst_ds2 = driver.Create(outfile2,src_ds_eco.RasterXSize,src_ds_eco.RasterYSize,num_bands,gdal.GDT_Int32,dst_options) dst_ds2.SetGeoTransform(src_ds_eco.GetGeoTransform()) dst_ds2.SetProjection(src_ds_eco.GetProjectionRef()) dst_ds2.GetRasterBand(1).WriteArray(labeled_arrayaver) dst_ds2 = None #num_feats = num_features - num_featuresaver hr1sumaver = sum(hr1aver) hr2aver = hr1sumaver #- hr1insumaver pxpa = ind_pa.shape[0] indokpsz = psizes >= pxpa pszsok = psizes[indokpsz] # NEW sumpszok = sum(pszsok) lpratio=round(float(pszmax/pxpa),2) lpratio2=round(float(sumpszok/pxpa),2) numpszok = len(pszsok) hr3aver = round(float(hr2aver/pxpa),2) aggregation = round(float(hr2aver/num_featuresaver),2) #hr2 = hr1sumaver - hr1insumaver #print hr2 #hr3 = float(hr2/ind_pa.shape[0]) #print hr3 wb = open(csvname,'a') var = str(ecor)+' '+str(pa)+' '+str(hr1averpa)+' '+str(hr2aver)+' '+str(pxpa)+' '+str(hr1insumaver)+' '+str(hr3aver)+' '+str(num_featuresaver)+' '+str(lpratio)+' '+str(lpratio2)+' '+str(numpszok)+' '+str(pszmax)+' '+str(aggregation)+' '+str(treepamin)+' '+str(treepamax)+' '+str(eprpamin)+' '+str(eprpamax)+' '+str(prepamin)+' '+str(prepamax)+' '+str(biopamin)+' '+str(biopamax)+' '+str(slopepamin)+' '+str(slopepamax)+' '+str(ndwipamin)+' '+str(ndwipamax)+' '+str(ndvimaxpamin)+' '+str(ndvimaxpamax)+' '+str(ndviminpamin)+' '+str(ndviminpamax)+' '+str(hpamin)+' '+str(hpamax)+' '+str(treepamean)+' '+str(eprpamean)+' '+str(prepamean)+' '+str(biopamean)+' '+str(slopepamean)+' '+str(ndwipamean)+' '+str(ndvimaxpamean)+' '+str(ndviminpamean)+' '+str(hpamean)# exclude PA! #+' '+str(hr1p25pa)# '+str(hr3)+' +' '+str(hr1medianpa)+' '+str(num_features)+' ' wb.write(var) wb.write('\n') wb.close() print "results exported" os.system('rm '+str(outfile3)) wb = open(csvname1,'a') # LOCAL FOLDER var = str(ecor) wb.write(var) wb.write('\n') wb.close() print "END ECOREG: " + str(ecor)