def alpha_on_determinist_compound_closed_form(lmb=10.0, t1=10, t2=10, l=3,
                                              verbose=False):
    alpha_hats = np.arange(0.00001, 1.0, 0.01)
    # alpha_hats = np.array([0.05])
    p = t2 / (t1 + t2)
    alphas = np.zeros(len(alpha_hats))
    k = int(lmb * (t1 + t2))
    alpha_dels = np.ones(len(alpha_hats))
    total_pois_mass = 0.0
    # TODO: Replace this with other condition.
    while sum(alpha_dels) > 1e-7 * len(alpha_dels):
        isfs = binom.isf(alpha_hats, k * l, p)
        cdfs = binom.sf((isfs / l).astype(int), k, p)
        pmf = poisson.pmf(k, lmb * (t1 + t2))
        total_pois_mass += pmf
        alpha_dels = pmf * cdfs
        alphas += alpha_dels
        if verbose and (k - int(lmb * (t1 + t2))) % 100 == 0:
            print("k=" + str(k - int(lmb * (t1 + t2)))
                  + " alpha_dels sum: " + str(sum(alpha_dels)))
        k += 1
    if verbose:
        print("Completed first loop")
    k = int(lmb * (t1 + t2)) - 1
    while k >= 0:
        isfs = binom.isf(alpha_hats, k * l, p)
        cdfs = binom.sf((isfs / l).astype(int), k, p)
        pmf = poisson.pmf(k, lmb * (t1 + t2))
        total_pois_mass += pmf
        alpha_dels = pmf * cdfs
        if np.isnan(sum(alpha_dels)):
            print(k)
        alphas += alpha_dels
        k -= 1
    return alphas, alpha_hats, total_pois_mass
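# A minimal usage sketch (hypothetical parameter values; assumes numpy,
# scipy.stats.binom and scipy.stats.poisson are imported as the function
# above requires). The returned actual rejection rates can be compared
# against the nominal levels, as in concrete_case_rising_beta further below.
alphas, alpha_hats, mass = alpha_on_determinist_compound_closed_form(
    lmb=10.0, t1=10, t2=10, l=3)
# e.g. plt.plot(alpha_hats, alphas) against the y = x diagonal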
def transition_prob_naive(result, rollout, pre_bid, bid, call_belief):
    r = rollout[0] + rollout
    r[0] = rollout[0]
    N = len(result) - 1  # num of total dice
    other_dice = N - sum(rollout)
    odds = np.zeros((1 + N, 6))
    if pre_bid is not None:
        lower_lim = get_legit_bids(pre_bid)
    for i in range(6):
        p = 1 / 6 + (i != 0) / 6
        upper = int(binom.isf(0.15, other_dice, p)) + r[i]
        if pre_bid is None:
            lower = max(1, int(binom.isf(0.85, other_dice, p)) + r[i])
        else:
            lower = max(1, lower_lim[i])
        odds[lower:upper + 1, i] = (
            1 - binom.cdf(np.arange(-r[i] - 1, -r[i] + N), other_dice, p)
            * binom.cdf(np.arange(-1, N), N, p))[lower:upper + 1]
    odds = odds**(3 + int(9 / other_dice**2))
    # if sum(rollout) == 1:
    #     print(rollout, pre_bid, bid)
    #     print(odds)
    if odds[bid[0], bid[1]] == 0:
        return 0
    return odds[bid[0], bid[1]] / np.sum(odds)
def initial_bid_candidates(rollout, num_other_dice, aggresive):
    L = []
    for i in range(6):
        p = (2 - (i == 0)) / 6
        upper = int(binom.isf(0.5 - aggresive / 2, num_other_dice, p))
        lower = int(binom.isf(0.5 + aggresive / 2, num_other_dice, p))
        L += [[max(1, rollout[i] + k), i]
              for k in range(int(lower), int(upper) + 1)]
    return L
def preprocess(self):
    N = sum(self.rollout) + self.others_num_dice  # num of total dice
    self.vinilla_call_belief = np.zeros((1 + N, 6))
    self.bid_upper_lim = []
    for i in range(6):
        p = 1 / 6 + (i != 0) / 6
        crit = binom.isf(self.call_level, N, p)
        upper_dev = int(binom.isf(self.tolerance, N, p))
        self.bid_upper_lim.append(self.rollout[i] + upper_dev
                                  + self.rollout[0] * (i != 0))
        self.vinilla_call_belief[:, i] = binom.cdf(np.arange(-1, N) - crit, N, p)
def qbinom(q, size=1, prob=0.5, lowertail=True):
    """
    ==========================================================================
    qbinom()
    ==========================================================================
    The quantile function for the binomial distribution.
    You provide a quantile (e.g. q=0.75) or an array of quantiles, and it
    returns the value along the binomial distribution that corresponds to
    the qth quantile.

    USAGE:
    dbinom(x, size, prob=0.5, log=False)
    pbinom(q, size, prob=0.5, lowertail=True, log=False)
    qbinom(q, size, prob=0.5, lowertail=True)
    rbinom(n=1, size=1, prob=0.5)

    :param q: float or array of floats. The quantile(s).
    :param size: int. Number of trials.
    :param prob: float. Probability of a success.
    :param lowertail: bool. If True, use the lower tail (ppf); otherwise
        use the upper tail (isf).
    :return: an array of the value(s) corresponding to the quantiles q
    ==========================================================================
    """
    # TODO: BUG: qbinom(0, size=11, prob=0.3) gives -1. It should be 0
    # TODO: check that q is between 0.0 and 1.0
    if lowertail:
        return binom.ppf(q=q, n=size, p=prob)
    else:
        return binom.isf(q=q, n=size, p=prob)
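# A minimal usage sketch (assumes `from scipy.stats import binom` and the
# qbinom defined above; outputs follow scipy's discrete-quantile conventions):
print(qbinom(0.75, size=10, prob=0.5))                   # 6.0
print(qbinom(0.75, size=10, prob=0.5, lowertail=False))  # upper-tail quantile
print(qbinom(0, size=11, prob=0.3))                      # -1.0, the TODO bug above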
def naive_call(self, player_id):
    r = self.sim_rollout[player_id][0] + self.sim_rollout[player_id]
    r[0] = self.sim_rollout[player_id][0]
    N = sum(self.dice)
    other_dice = N - self.dice[player_id]
    if self.last_bid[0] > r[self.last_bid[1]] + other_dice:
        return [0]
    p_call_liar = binom.cdf(self.last_bid[0] - r[self.last_bid[1]] - 1,
                            other_dice,
                            1 / 6 + (self.last_bid[1] != 0) / 6)
    odds = np.zeros((1 + N, 6))
    lower_lim = get_legit_bids(self.last_bid)
    for i in range(6):
        p = 1 / 6 + (i != 0) / 6
        upper = int(binom.isf(0.15, other_dice, p)) + r[i]
        lower = lower_lim[i]
        odds[lower:upper + 1, i] = (
            1 - binom.cdf(np.arange(-r[i] - 1, -r[i] + N), other_dice, p)
            * binom.cdf(np.arange(-1, N), N, p))[lower:upper + 1]
    if p_call_liar > 0.7 or np.random.sample() < p_call_liar / (
            p_call_liar + np.sum(odds)):
        return [0]
    else:
        odds = (odds**3).flatten()
        odds /= np.sum(odds)
        # print('odd', odds)
        index = np.random.choice(np.arange(len(odds)), p=odds)
        return [index // 6, index % 6]
def simple_call(self, player_id):
    r = self.sim_rollout[player_id][0] + self.sim_rollout[player_id]
    r[0] = self.sim_rollout[player_id][0]
    N = sum(self.dice)
    if self.last_bid[0] > N - self.dice[player_id] + r[self.last_bid[1]]:
        return [0]
    result = np.ones((N + 1, 6))
    for i in range(6):
        result[r[i]:r[i] + len(self.belief[player_id].agg_info.others_agg_dist), i] = \
            self.belief[player_id].agg_info.others_agg_dist[:, i]
        result[r[i] + len(self.belief[player_id].agg_info.others_agg_dist):, i] = 0
    payoff_call_liar = -result[self.last_bid[0], self.last_bid[1]]
    # simple payoff call liar
    if payoff_call_liar > -1 / 4:
        return [0]
    candidate = get_legit_bids(self.last_bid)
    payoff = []
    for i in range(6):
        if candidate[i] <= N:
            p = (2 - (i == 0)) / 6
            crit = binom.isf(1 / 3, N, p)
            payoff.append((1 - result[candidate[i], i])
                          * binom.cdf(-crit + candidate[i], N, p))
    if payoff:
        index = np.argmin(np.array(payoff))
        return [candidate[index], index]
    return [0]
def calculate_alt_thresholds(Ds, min_alt=2, max_error_probability=1e-02,
                             pvalue=0.05):
    # returns num alts such that p-value < 0.05
    # in simple binomial model with max error probability
    return numpy.fmax(binom.isf(pvalue, Ds, max_error_probability), min_alt)
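# A minimal usage sketch (the depths are hypothetical; assumes numpy and
# scipy.stats.binom are imported as the function above requires):
import numpy
depths = numpy.array([10, 100, 1000])
print(calculate_alt_thresholds(depths))  # per-depth thresholds, never below min_alt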
def beta_on_poisson_closed_form2(t1=25, t2=25, lmb_base=12, effect=3,
                                 alpha=0.05):
    """
    Much, much slower than beta_on_poisson_closed_form. Included only for
    demonstration of alternate summation.
    """
    beta = 0
    beta_n = 0
    beta_del = 0
    # p = lmb_base*t1 / (lmb_base*t1 + (lmb_base+effect)*t2)
    q = t1 / (t1 + t2)
    # mu_1 = t1*(lmb_base+effect); mu_2 = t2*lmb_base
    poisson_mu = lmb_base * t1 + (lmb_base + effect) * t2
    int_poisson_mu = int(poisson_mu)
    n = int_poisson_mu - 1
    while beta_del > 1e-9 or n == int_poisson_mu - 1:
        n += 1
        surv_inv = int(binom.isf(alpha, n, q))
        beta_del = 0
        for j in range(surv_inv + 1):
            beta_n = (poisson.pmf(j, (lmb_base + effect) * t2)
                      * poisson.pmf(n - j, lmb_base * t1))
            beta_del += beta_n
            beta += beta_n
    n = int_poisson_mu
    while beta_del > 1e-9 or n == int_poisson_mu:
        n -= 1
        surv_inv = int(binom.isf(alpha, n, q))
        beta_del = 0
        for j in range(surv_inv + 1):
            beta_n = (poisson.pmf(j, (lmb_base + effect) * t2)
                      * poisson.pmf(n - j, lmb_base * t1))
            beta_del += beta_n
            beta += beta_n
    return beta
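# A minimal usage sketch (default parameters of the function above; assumes
# scipy.stats.binom and scipy.stats.poisson are imported):
beta = beta_on_poisson_closed_form2(t1=25, t2=25, lmb_base=12, effect=3,
                                    alpha=0.05)
print(beta)  # type II error of the conditional binomial test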
def concrete_case_rising_beta():
    UMPPoisson.beta_on_poisson(t1=0.695, t2=0.23, lmb_base=2,
                               alpha=0.1, effect=10.0)
    UMPPoisson.beta_on_poisson(t1=0.4988, t2=0.23, lmb_base=2,
                               alpha=0.1, effect=10.0)
    alpha_hats = np.arange(0.001, 1.0, 0.0001)
    alphas = binom.sf(binom.isf(alpha_hats, 10, 0.5), 10, 0.5)
    plt.plot(alpha_hats, alphas)
    # y = x reference diagonal
    x1, y1 = [0, 1], [0, 1]
    plt.plot(x1, y1, marker='o')
    plt.show()
def binomial_chance_level(test_set, p=0.05, n_classes=2):
    """Computes the chance level according to the binomial law.

    Parameters
    ----------
    test_set : array or int
        The array used for testing, or the total number of trials in the
        test set.
    p : float, optional (default=0.05)
        The p-value you want to reach.
    n_classes : int, optional (default=2)
        The number of different classes in your classification problem.

    Returns
    -------
    score : float
        The score threshold. Any score higher than or equal to this score
        is significant at the p-value that was given in input.
    """
    if not isinstance(test_set, int):
        test_set = len(test_set)
    return binom.isf(p, test_set, 1 / n_classes) / test_set
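# A minimal usage sketch (200 test trials is a hypothetical example size;
# assumes scipy.stats.binom is imported):
print(binomial_chance_level(200))               # ~0.56 for binary classification
print(binomial_chance_level(200, n_classes=4))  # threshold above the 0.25 base rate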
file_path = RAW_PATH + "matdata/CC120264_ICA_transdef_mf.mat"
tmp = loadmat(file_path)
ch_names, ch_pos = tmp["ch_info"][:306, 0], tmp["ch_pos"]
mags_index = np.arange(2, 306, 3)
if channel_type == "MAG":
    ch_names = ch_names[mags_index]
    ch_pos = ch_pos[mags_index]
elif channel_type == "GRAD":
    grads_index = np.array(list(set(np.arange(306)) - set(mags_index)))
    ch_names = ch_names[grads_index]
    ch_pos = ch_pos[grads_index]
if classif == "sex":
    n_trials = 17809
    chance_level = binom.isf(PVAL, n_trials, 0.5) / n_trials
    vmin = 0.45
    vmax = 0.70
if classif == "subject":
    n_subj = 315 / 2
    n_trials = 17980
    chance_level = binom.isf(PVAL, n_trials, 1 / n_subj) / n_trials
    vmin = 0.38
    vmax = 0.62
if classif == "age":
    n_trials = 17969
    chance_level = binom.isf(PVAL, n_trials, 1 / 7) / n_trials
    vmin = 0.10
    vmax = 0.26
all_scores = []
print(classif, chance_level)
def beta_on_negbinom_closed_form2(t1=25, t2=25, theta_base=10, m=100.0,
                                  effect=3, alpha=0.05, cut_dat=1e4):
    beta = 0
    beta_n = 0
    beta_del = 0
    q = t1 / (t1 + t2)
    lmb_base = m / theta_base
    # mu_1 = t1*(lmb_base+effect); mu_2 = t2*lmb_base
    p1 = theta_base / (theta_base + t1)
    del_theta = theta_base**2 * effect / (m + theta_base * effect)
    theta2 = theta_base - del_theta
    p2 = theta2 / (t2 + theta2)
    poisson_mu = lmb_base * t1 + (lmb_base + effect) * t2
    int_poisson_mu = int(poisson_mu)
    n = int_poisson_mu - 1
    dels1 = []
    ns1 = []
    if effect == 0:
        nbinom_s1 = {}
        nbinom_s2 = nbinom_s1
    else:
        nbinom_s1 = {}
        nbinom_s2 = {}
    while beta_del > 1e-9 or n == int_poisson_mu - 1:
        n += 1
        if n - int_poisson_mu > cut_dat:
            break
        surv_inv = int(binom.isf(alpha, n, q))
        beta_del = 0
        for j in range(surv_inv + 1):
            # for j in range(n+1):
            if j in nbinom_s1:
                nb1 = nbinom_s1[j]
            else:
                nb1 = nbinom.pmf(j, m, p2)
                nbinom_s1[j] = nb1
            if n - j in nbinom_s2:
                nb2 = nbinom_s2[n - j]
            else:
                nb2 = nbinom.pmf(n - j, m, p1)
                nbinom_s2[n - j] = nb2
            beta_n = nb1 * nb2
            beta_del += beta_n
            beta += beta_n
        dels1.append(beta_del)
        ns1.append(n)
    n = int_poisson_mu
    dels2 = []
    ns2 = []
    while beta_del > 1e-9 or n == int_poisson_mu:
        n -= 1
        if int_poisson_mu - n > cut_dat:
            break
        surv_inv = int(binom.isf(alpha, n, q))
        beta_del = 0
        for j in range(surv_inv + 1):
            # for j in range(n+1):
            if j in nbinom_s1:
                nb1 = nbinom_s1[j]
            else:
                nb1 = nbinom.pmf(j, m, p2)
                nbinom_s1[j] = nb1
            if n - j in nbinom_s2:
                nb2 = nbinom_s2[n - j]
            else:
                nb2 = nbinom.pmf(n - j, m, p1)
                nbinom_s2[n - j] = nb2
            beta_n = nb1 * nb2
            beta_del += beta_n
            beta += beta_n
        dels2.append(beta_del)
        ns2.append(n)
    dels1 = np.array(dels1)
    dels2 = np.array(dels2)[::-1]
    ns1 = np.array(ns1)
    ns2 = np.array(ns2)[::-1]
    ns = np.concatenate((ns2, ns1), axis=0)
    dels = np.concatenate((dels2, dels1), axis=0)
    return beta, dels, ns, int_poisson_mu
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)
    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = twobit.TwoBitFile(global_vars['2bit'])
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum(tbit.sequence_sizes().values())
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.sequence_sizes().keys()),
                                       bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print(("using {} processors for {} "
                   "number of tasks".format(args.numberOfProcessors,
                                            len(mp_args))))
            res = pool.map_async(writeCorrectedSam_wrapper,
                                 mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):
        _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        # concatenate intermediary bedgraph files
        _temp_bg_file = open(_temp_bg_file_name, 'wb')
        for tempFileName in res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                # bedgraph file
                shutil.copyfileobj(open(tempFileName, 'rb'), _temp_bg_file)
                os.remove(tempFileName)
        _temp_bg_file.close()

        args.correctedFile.close()
        if args.correctedFile.name.endswith('bg'):
            shutil.move(_temp_bg_file_name, args.correctedFile.name)
        else:
            chromSizes = [(k, v) for k, v in tbit.sequence_sizes().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, _temp_bg_file_name,
                                           args.correctedFile.name)
            os.remove(_temp_bg_file_name)
# Work out the copyright year range
year = datetime.today().year
if year != 2019:
    year = "2019-%d" % year

print("""/*
 * WARNING: do not edit!
 * Generated by statistics/bn_rand_range.py in the OpenSSL tool repository.
 *
 * Copyright %s The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

static const struct {
    unsigned int range;
    unsigned int iterations;
    double critical;
} rand_range_cases[] = {""" % year)

num_cases = len(list(map(do_case, test_cases)))
print("};\n")

# Finally, calculate and output the lower tail binomial threshold.
b_thresh = binom.isf(alpha_binomial, num_cases, alpha_chi2)
print("static const int binomial_critical = %d;\n" % b_thresh)
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)
    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True,
                                           nThreads=args.numberOfProcessors)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()),
                                       bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print(("using {} processors for {} "
                   "number of tasks".format(args.numberOfProcessors,
                                            len(mp_args))))
            res = pool.map_async(writeCorrectedSam_wrapper,
                                 mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        oname = args.correctedFile.name
        args.correctedFile.close()
        if oname.endswith('bg'):
            f = open(oname, 'wb')
            for tempFileName in res:
                if tempFileName:
                    shutil.copyfileobj(open(tempFileName, 'rb'), f)
                    os.remove(tempFileName)
            f.close()
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, res, oname)
def gen_figure(stage):
    k, j = 1, 1
    fig = plt.figure(figsize=(8, 10))
    stds_tot = []
    for freq in FREQS:
        scores, pscores_all_elec = [], []
        stds = []
        HR, LR = [], []
        for elec in CHANNEL_NAMES:
            file_name = (PREFIX + NAME + "_{}_{}_{}_{}_{:.2f}.mat".format(
                stage, freq, elec, WINDOW, OVERLAP))
            perm_file = (PERM_FILE_PREFIX + NAME
                         + "_{}_{}_{}_{}_{:.2f}.mat".format(
                             stage, freq, elec, WINDOW, OVERLAP))
            results = loadmat(RESULTS_PATH / file_name)
            perm_f = loadmat(RESULTS_PATH / perm_file)
            score_key = "acc"
            pscores_key = "acc_pscores"
            acc_scores = results[score_key].ravel()
            score = float(acc_scores.mean())
            std_acc = acc_scores.std()
            pscores = list(perm_f[pscores_key].squeeze())
            n_rep = int(results["n_rep"])
            scores.append(score)
            stds.append(std_acc)
            pscores_all_elec.append(pscores)
            file_name = NAME + "_{}_{}_{}_{}_{:.2f}.mat".format(
                stage, freq, elec, WINDOW, OVERLAP)
            try:
                PSD = loadmat(DATA_PATH / file_name)["data"].ravel()
                moy_PSD = [0] * 36
                for i, submat in enumerate(PSD):
                    if BOOTSTRAPPED:
                        for random_state in range(n_rep):
                            index = np.random.RandomState(random_state).choice(
                                range(len(submat.ravel())), N_TRIALS,
                                replace=False)
                            prep_submat = submat.ravel()[index]
                            mean_for_the_sub = np.mean(prep_submat)
                            moy_PSD[i] += mean_for_the_sub / n_rep
                    else:
                        moy_PSD[i] = np.mean(submat)
                # subject 10 has artefact on FC2, so we just remove it
                moy_PSD = np.delete(moy_PSD, 9, 0)
            except TypeError:
                print(file_name)
            HR.append(np.mean(moy_PSD[:17]))
            LR.append(np.mean(moy_PSD[17:]))
            print(stage, freq, elec, ":", score, "+/-", std_acc)

        pscores_all_elec = np.asarray(pscores_all_elec)
        if MAXSTAT_ELEC:
            pscores_all_elec = np.max(pscores_all_elec, axis=0)
        pvalues = []
        for i, score in enumerate(scores):
            if MAXSTAT_ELEC:
                pscores = pscores_all_elec
            else:
                pscores = pscores_all_elec[i]
            pvalues.append(compute_pval(score, pscores))
        print(pvalues)

        ttest = loadmat(TTEST_RESULTS_PATH /
                        "ttest_perm_{}_{}.mat".format(stage, freq))
        tt_pvalues = ttest["p_values"].ravel()
        t_values = zscore(ttest["t_values"].ravel())
        HR = np.asarray(HR)
        LR = np.asarray(LR)
        DA = 100 * np.asarray(scores)
        da_pvalues = np.asarray(pvalues)
        da_mask = np.full((len(CHANNEL_NAMES)), False, dtype=bool)
        tt_mask = np.full((len(CHANNEL_NAMES)), False, dtype=bool)
        tt_mask[tt_pvalues < PVAL] = True
        if BINOM:
            thresholds = [100 * binom.isf(PVAL, n_trials, .5) / n_trials
                          for n_trials in TRIALS]
            da_mask[DA > thresholds[j]] = True
        else:
            da_mask[da_pvalues < PVAL] = True

        mask_params = dict(marker="*", markerfacecolor="white", markersize=9,
                           markeredgecolor="white")
        data = [
            {
                "name": "PSD HR",
                "cmap": "jet",
                "mask": None,
                "cbarlim": [min(HR), max(HR)],
                "data": HR,
            },
            {
                "name": "PSD LR",
                "cmap": "jet",
                "mask": None,
                "cbarlim": [min(LR), max(LR)],
                "data": LR,
            },
            # {
            #     "name": "Relative Power Changes",
            #     "cmap": "inferno",
            #     "mask": None,
            #     "cbarlim": [min(RPC), max(RPC)],
            #     "data": RPC / max(RPC),
            # },
            {
                "name": "corrected T-values",
                "data": t_values,
                "cmap": "viridis",
                "mask": tt_mask,
                "cbarlim": [min(t_values), max(t_values)],
            },
            {
                "name": "Decoding Accuracies (%)",
                "cmap": "viridis",
                "mask": da_mask,
                "cbarlim": [50, 65],
                "data": DA,
            },
        ]
        for i, subset in enumerate(data):
            plt.subplot(len(FREQS), len(data), i + k)
            ch_show = False if i > 1 else True
            if subset["name"] == "Decoding Accuracies (%)":
                print(subset["data"])
            ax, _ = plot_topomap(
                subset["data"],
                SENSORS_POS,
                res=128,
                cmap=subset["cmap"],
                show=False,
                vmin=subset["cbarlim"][0],
                vmax=subset["cbarlim"][1],
                names=CHANNEL_NAMES,
                show_names=ch_show,
                mask=subset["mask"],
                mask_params=mask_params,
                contours=0,
            )
            if freq == FREQS[-1]:
                plt.xlabel(subset["name"])
            if freq == FREQS[-1]:
                pass
                # cb = fig.colorbar(ax, orientation="horizontal")
                # tick_locator = ticker.MaxNLocator(nbins=5)
                # cb.locator = tick_locator
                # cb.update_ticks()
            if i == 0:
                plt.ylabel(freq)
        j += 1
        k += len(data)
        print(np.mean(stds))
        stds_tot.append(np.mean(stds))
    print(np.mean(stds_tot))
    plt.subplots_adjust(left=None, bottom=0.05, right=None, top=None,
                        wspace=None, hspace=None)
    plt.tight_layout()
    file_name = "topomap_{}{}_{}_p{}".format(PREFIX, NAME, stage,
                                             str(PVAL)[2:])
    print(file_name)
    plt.savefig(SAVE_PATH / "../figures" / file_name, dpi=400)
def ExecuteSFFS(x, y, featureNames, featureList, clusters, clusterNames, svc,
                kFolds, nbOfSplit, featMaxNbrSFFS, standardizationType,
                removedData, permutation_flag, nbPermutation, balance_flag,
                currentDateTime, resultDir, debug_flag, verbose):
    import scipy
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split as tts
    from sklearn.metrics import confusion_matrix
    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
    from sklearn.model_selection import RandomizedSearchCV
    from slpClass_toolbox import BalanceClasses
    from slpClass_toolbox import Standardize
    from slpClass_toolbox import Permute
    from slpClass_toolbox import ComputePermutationAvgDA
    from slpClass_toolbox import PlotPermHist
    from slpClass_toolbox import ApplyStandardization
    from slpClass_toolbox import plot_confusion_matrix

    plt.rcParams.update({'figure.max_open_warning': 0})

    # Get features values since SFFS works only with numpy array!
    bestFeaturesHist = np.zeros([len(featureNames)])
    CvResult = pd.DataFrame()
    permResults = pd.DataFrame()
    tmpBest = []
    DA = []
    avg_perm_DA = []
    skipFS = False            # flag to skip feature selection
    fitFeatOverTresh = False  # fit classifier with most frequent features in best set

    #********************** TRAIN pre-processing ******************************
    for it in list(range(nbOfSplit)):
        print('\nSplit #{}'.format(str(it)))
        # Use all features or given ones only
        if len(featureList) == 0:
            xx = x
        elif isinstance(featureList[0], float):
            xx = x
            fitFeatOverTresh = True
        else:
            xx = x[featureList]
            skipFS = True

        # Balance the number of old woman and old man or not
        if balance_flag:
            X, Y = BalanceClasses(xx, y)
        else:
            X, Y = xx, y

        # split dataset into train and test random subset
        X_train, X_test, y_train, y_test = tts(X, Y['Cluster'],
                                               test_size=0.33,
                                               stratify=Y['Cluster'])
        # Data z-score standardisation
        xTrainSet, zPrm = Standardize(X_train, y_train, standardizationType,
                                      debug_flag)

        #**************************** SVM optimisation ************************
        params_dict = {'C': scipy.stats.expon(scale=100),
                       'kernel': ['linear'],
                       'class_weight': ['balanced', None]}
        n_iter_search = 20
        random_search = RandomizedSearchCV(svc,
                                           param_distributions=params_dict,
                                           n_iter=n_iter_search)
        random_search.fit(xTrainSet, y_train)
        optimClf = random_search.best_estimator_

        #*************************** TRAIN ************************************
        print('Fitting...')
        if skipFS:
            optimClf = optimClf.fit(xTrainSet.as_matrix(), y_train)
            yPred = optimClf.predict(xTrainSet.as_matrix())
            # Compute the accuracy of the test prediction
            acc = float((y_train == yPred).sum()) / yPred.shape[0]
            print('Train predicted accuracy: %.2f %%' % (acc * 100))
            fitRes = pd.DataFrame(data=[acc], columns=['CV_DA_' + str(it + 1)])
        else:
            # set k_features = (1, X.shape[1]) to test all possible combinations
            sffs = SFS(optimClf,
                       k_features=(1, featMaxNbrSFFS),
                       forward=True,
                       floating=False,
                       scoring='accuracy',
                       cv=kFolds,
                       n_jobs=-1)
            sffs = sffs.fit(xTrainSet.as_matrix(), y_train)
            print('Best combination for fit #%d (ACC: %.3f): %s' %
                  (it, sffs.k_score_, sffs.k_feature_idx_))
            # Fit the estimator using the new feature subset and make a
            # prediction on the test data
            X_train_sfs = sffs.transform(xTrainSet.as_matrix())
            optimClf.fit(X_train_sfs, y_train)

            fitRes = pd.DataFrame.from_dict(sffs.get_metric_dict()).T
            fitRes['avg_over_std'] = fitRes['avg_score'] / fitRes['std_dev']

            if featMaxNbrSFFS > 1:
                # plot feature selection process metrics
                fig1 = plot_sfs(sffs.get_metric_dict(), kind='std_err')
                savedPlotName = resultDir + 'Decoding_accuracy_' + clusters + \
                    '_' + str(it) + '_' + str(nbOfSplit) + '.png'
                tmpBest.append(sffs.k_feature_idx_)
                bestFeaturesHist[[tmpBest[-1]]] += 1
                fig1.set_dpi(300)
                plt.tight_layout()
                plt.savefig(savedPlotName, bbox_inches='tight')
                plt.clf()
                plt.close(fig1)

                # plot mean / std
                plt.figure(dpi=300)
                plt.title('Moyenne sur ecart-type')
                plt.xlabel("nb attributs dans combinaison")
                plt.xticks(range(featMaxNbrSFFS))
                plt.ylabel("Moyenne sur ecart-type")
                plt.plot(list(range(1, featMaxNbrSFFS + 1)),
                         fitRes['avg_over_std'])
                figName = resultDir + 'SFFS_' + clusters + '_bestSet_metric_' + \
                    str(it) + '_' + str(nbOfSplit)
                plt.savefig(figName, bbox_inches='tight')
                plt.clf()
                plt.close()

        # add metrics iteration identifier
        fitRes = fitRes.add_suffix('_' + str(it + 1))
        CvResult = pd.concat([CvResult, fitRes], axis=1)

        #***************************** TEST ***********************************
        print('Testing...')
        # standardize test set using trainset standardization parameters
        xTestSet = ApplyStandardization(X_test, zPrm)
        # prepare test data
        if skipFS:
            xTest = xTestSet
            savedPlotName = resultDir + clusters + '_ConfusionMatrix_' + \
                str(it + 1) + '_' + str(nbOfSplit)
        else:
            # Generate a new subset of data according to selected features
            xTest = sffs.transform(xTestSet.as_matrix())
            savedPlotName = resultDir + 'SFFS_' + clusters + \
                '_ConfusionMatrix_' + str(it + 1) + '_' + str(nbOfSplit)

        # actually test classifier and compute decoding accuracy on predictions
        y_pred = optimClf.predict(xTest)
        acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
        print('Test set accuracy: %.2f %%' % (acc * 100))
        DA.append(acc)  # stack test DA for further use

        # plot confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        fig_CM = plt.figure(dpi=300)
        plot_confusion_matrix(cm, clusterNames, title=savedPlotName,
                              normalize=True, precision=2)
        plt.clf()
        plt.close(fig_CM)

        #**************** STATISTICAL ASSESSMENT (PERMUTATION) ****************
        if permutation_flag:
            permResults['permutation_DA_' + str(it)] = Permute(
                clusters, xTrainSet, xTestSet, y_train, y_test,
                nbPermutation, standardizationType, debug_flag=0)
            avg_perm_DA.append(np.mean(permResults['permutation_DA_' + str(it)]))

    dfDA = pd.DataFrame(data=DA, columns=['DA_test'])
    # CvResult = pd.concat([CvResult, dfDA[:]], axis=1)
    CvResult = pd.concat([CvResult, dfDA[:],
                          pd.DataFrame(data=[np.mean(DA)],
                                       columns=['avg_DA'])], axis=1)

    #***************** COMPUTE STATISTICAL ASSESSMENT RESULTS *****************
    if permutation_flag:
        # compute permutation DA average and keep results in a dataframe
        print('\nAverage permutation DA')
        for i in list(range(len(avg_perm_DA))):
            print('\t' + str(avg_perm_DA[i]))
        savedHistName = resultDir + 'Average_Permutation_hist_' + clusters + '.png'
        PlotPermHist(permResults, CvResult['avg_DA'].iloc[0],
                     currentDateTime, savedHistName)
        # formatting permutation results to save in excel file
        permResults = pd.concat([permResults,
                                 ComputePermutationAvgDA(avg_perm_DA)], axis=1)
        print('Mean permutation decoding accuracy : {}'.format(
            np.mean(permResults['Avg_Permutation_DA_per_epoch'])))
    else:
        # binomial law
        from scipy.stats import binom
        q = 0.001                  # p value
        n = X.shape[0] + 1         # number of observations (subjects)
        p = 1 / len(clusterNames)  # probability of a correctly classified trial
        luckLvl = pd.DataFrame(data=[binom.isf(q, n, p) / n],
                               columns=['Chance_Level'])

    #****************************** Compute results ***************************
    if not skipFS:
        # Build structure of histogram data to save in excel
        hist = pd.DataFrame(data=featureNames, columns=['Features_Name'])
        hist['Occurence_Best'] = bestFeaturesHist

        # Search best set across every iteration best set
        best_Combination = tmpBest[np.argmax(DA)]

        # Compute average size of best combination
        l = 0
        for n in list(range(len(tmpBest))):
            l += len(tmpBest[n])
        avgBestCombSize = pd.DataFrame(data=[np.ceil(l / len(tmpBest))],
                                       columns=['avgBestCombSize'])

        # subsetHist = GetSubsetOccurence(tmpBest)
        # PlotHist(subsetHist[1], 'Subsets occurences', subsetHist[0],
        #          'Comb_Hist.png')

        # Get best set's feature names
        tmp = []
        tmp.append(np.max(DA))
        for i in best_Combination:
            tmp.append(featureNames[i])
            print('\t' + featureNames[i])
        bestFeatNames = pd.DataFrame(data=tmp, columns=['Best_Features_Set'])

        sffsRes = pd.concat([hist, bestFeatNames, avgBestCombSize], axis=1)

        # Plot best combination custom metric (mean / std_dev)
        from slpClass_toolbox import PlotBestCombinationMetrics
        filteredData = CvResult.filter(regex=r'avg_over_std_', axis=1)
        metrics = pd.DataFrame(data=filteredData)
        metrics.dropna(inplace=True)
        figName = resultDir + 'SFFS_' + clusters + '_bestSet_metric_aggreg.png'
        PlotBestCombinationMetrics(metrics, figName)

    # save training and permutation results in an excel file
    nbSubject = pd.DataFrame(data=[len(X)], columns=['Number_Of_Subjects'])

    #************************ Build results structure *************************
    excelResults = pd.concat([CvResult,
                              permResults if permutation_flag else luckLvl,
                              sffsRes if not skipFS else None,
                              removedData,
                              nbSubject], axis=1)

    print('Mean Decoding accuracy :{}'.format(np.mean(DA)))

    # compute occurence of every subset in bestsets of every iteration
    # from slpClass_toolbox import GetSubsetOccurence
    # subsetHist = GetSubsetOccurence(tmpBest)
    # excelResults = pd.concat([excelResults, subsetHist], axis=1)
    # excelResults.to_excel(saveTo, sheet_name=xlSheetName)

    if fitFeatOverTresh:
        tresh = featureList[0] * nbOfSplit
        bestFeatColumns = hist.iloc[:, 0][hist.iloc[:, 1] > tresh]
        bestDataSet = xx[bestFeatColumns]
        classes = y
        DABestFeat = []
        print('Fitting with features occurring over %d times in best sets' % tresh)
        for i in list(range(nbOfSplit)):
            print('\rFit #{} of {}\n'.format(i + 1, nbOfSplit), end='\r',
                  flush=True)
            # Balance the number of old woman and old man or not
            if balance_flag:
                XX, YY = BalanceClasses(bestDataSet, classes)
            else:
                XX, YY = bestDataSet, classes

            # split dataset into train and test random subset
            XXtrain, XXtest, yytrain, yytest = tts(XX, YY['Cluster'],
                                                   test_size=0.33,
                                                   stratify=YY['Cluster'])
            # Data z-score standardisation
            xxTrainSet, zzPrm = Standardize(XXtrain, yytrain,
                                            standardizationType, debug_flag)

            # fit and predict on training data
            optimClf = optimClf.fit(xxTrainSet.as_matrix(), yytrain)
            yPred = optimClf.predict(xxTrainSet.as_matrix())

            # Compute accuracy of prediction on training set
            acc = float((yytrain == yPred).sum()) / yPred.shape[0]
            print('Train predicted accuracy: %.2f %%' % (acc * 100))
            fitRes = pd.DataFrame(data=[acc], columns=['CV_DA_' + str(it + 1)])

            # test classifier and compute decoding accuracy on predictions
            xxTestSet = ApplyStandardization(XXtest, zzPrm)
            yypred = optimClf.predict(xxTestSet)
            acc = float((yytest == yypred).sum()) / yypred.shape[0]
            print('Test set accuracy: %.2f %%' % (acc * 100))
            DABestFeat.append(acc)  # stack test DA for further use

            # plot confusion matrix
            cm = confusion_matrix(yytest, yypred)
            fig_CM = plt.figure(dpi=300)
            plot_confusion_matrix(cm, clusterNames, title=savedPlotName,
                                  normalize=True, precision=2)
            plt.clf()
            plt.close(fig_CM)

        df = pd.DataFrame(data=DABestFeat, columns=['optim DA'])
        df = pd.concat([df, pd.DataFrame(data=[np.mean(DABestFeat)],
                                         columns=['optim avg DA'])], axis=1)
        print('Classifier trained with best features (occ > %d) only' % tresh)
        print(df)
        excelResults = pd.concat([excelResults, df], axis=1)

    return excelResults
def binom_tst_beta(p_null=0.5, p_alt=0.6, n=10, alpha_hat=0.05):
    if n == 0:
        return 1.0
    x_a = binom.isf(alpha_hat, n, p_null)
    return binom.cdf(x_a, n, p_alt)
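# A minimal usage sketch (assumes scipy.stats.binom is imported): the type II
# error of the one-sided binomial test of p_null=0.5 vs p_alt=0.6 shrinks as
# the sample size grows.
for n in (10, 50, 200):
    print(n, binom_tst_beta(p_null=0.5, p_alt=0.6, n=n, alpha_hat=0.05))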
def binom_tst_alpha(hat_alpha=0.5, p=0.4, n=10):
    # Actual significance level achieved by the discrete one-sided binomial
    # test when the nominal level is hat_alpha.
    return binom.sf(binom.isf(hat_alpha, p=p, n=n), p=p, n=n)
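# A minimal usage sketch: because the binomial is discrete, the achieved
# size is at most the nominal level.
for a_hat in (0.01, 0.05, 0.1):
    print(a_hat, binom_tst_alpha(hat_alpha=a_hat, p=0.4, n=10))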
def beta(null, alter, cut_dat=1e4, alpha=0.05):
    beta = 0
    beta_n = 0
    beta_del = 0
    q = null.t / (null.t + alter.t)
    poisson_mu = null.e_lmb * null.t + alter.e_lmb * alter.t
    int_poisson_mu = int(poisson_mu)
    n = int_poisson_mu - 1
    dels1 = []
    ns1 = []
    nbinom_s1 = {}
    nbinom_s2 = {}
    while beta_del > 1e-9 or n == int_poisson_mu - 1:
        n += 1
        if n - int_poisson_mu > cut_dat:
            break
        surv_inv = int(binom.isf(alpha, n, q))
        beta_del = 0
        for j in range(surv_inv + 1):
            if j in nbinom_s1:
                nb1 = nbinom_s1[j]
            else:
                nb1 = null.pmf(j)
                nbinom_s1[j] = nb1
            if n - j in nbinom_s2:
                nb2 = nbinom_s2[n - j]
            else:
                nb2 = alter.pmf(n - j)
                nbinom_s2[n - j] = nb2
            beta_n = nb1 * nb2
            beta_del += beta_n
            beta += beta_n
        dels1.append(beta_del)
        ns1.append(n)
    n = int_poisson_mu
    dels2 = []
    ns2 = []
    while beta_del > 1e-9 or n == int_poisson_mu:
        n -= 1
        if int_poisson_mu - n > cut_dat:
            break
        surv_inv = int(binom.isf(alpha, n, q))
        beta_del = 0
        for j in range(surv_inv + 1):
            if j in nbinom_s1:
                nb1 = nbinom_s1[j]
            else:
                nb1 = null.pmf(j)
                nbinom_s1[j] = nb1
            if n - j in nbinom_s2:
                nb2 = nbinom_s2[n - j]
            else:
                nb2 = alter.pmf(n - j)
                nbinom_s2[n - j] = nb2
            beta_n = nb1 * nb2
            beta_del += beta_n
            beta += beta_n
        dels2.append(beta_del)
        ns2.append(n)
    dels1 = np.array(dels1)
    dels2 = np.array(dels2)[::-1]
    ns1 = np.array(ns1)
    ns2 = np.array(ns2)[::-1]
    ns = np.concatenate((ns2, ns1), axis=0)
    dels = np.concatenate((dels2, dels1), axis=0)
    return beta, dels, ns, int_poisson_mu
def fn(alp, n=10):
    return binom.cdf(binom.isf(alp, n, .5), n, .5 + .1)
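# A minimal usage sketch (assumes numpy is available): trace the type II
# error of the test of p=0.5 against p=0.6 as the nominal level varies.
import numpy as np
alps = np.arange(0.01, 1.0, 0.01)
betas = [fn(alp) for alp in alps]
# plt.plot(alps, betas)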