def reduce_data(beta_path, df, is_nice):
    """
    Load a beta file and reduce it with the method selected by is_nice
    :param beta_path: beta file path
    :param df: blocks table; its 'startCpG' and 'endCpG' columns are read
               when is_nice is set
    :param is_nice: if true, only the site range spanned by df is loaded and
                    fast_method is used; otherwise slow_method is used
    :return: the value returned by fast_method / slow_method
    """
    if not is_nice:
        # No site range is given here and df is forwarded untouched
        return slow_method(load_beta_data(beta_path), df)

    # Keep only the two border columns, as integers
    borders = df[['startCpG', 'endCpG']].astype(int)
    # The loaded range runs from the first row's start to the last row's end
    first_site = borders['startCpG'].values[0]
    last_site = borders['endCpG'].values[-1]
    loaded = load_beta_data(beta_path, (first_site, last_site))
    return fast_method(loaded, borders)
def merge_betas(betas, opath):
    """
    Merge all betas by summing their values element-wise, while keeping
    the dimensions
    :param betas: list of beta files (must contain at least one path)
    :param opath: merged beta file
    :return: the merged table, after trim_to_uint8
    """
    # Accumulate in Python's integer dtype before trimming.
    # NOTE: builtin `int` replaces the `np.int` alias, which was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24 (it was an alias of builtin int).
    data = load_beta_data(betas[0]).astype(int)
    for beta in betas[1:]:
        data += load_beta_data(beta)

    # Trim / normalize to range [0, 256)
    data = trim_to_uint8(data)

    # Dump
    data.tofile(opath)
    return data
def apply_filter_wrapper(args, blocks_bins, finds, beta_path, df):
    """
    Reduce one beta file to blocks and dump the result to a binary file
    (and optionally to a bedGraph file)
    :param args: parsed arguments; blocks_file, out_dir and bedGraph are read
    :param blocks_bins: block border indices passed to np.add.reduceat.
                        It is no longer modified in place, so the same bins
                        can safely be reused for several beta files
    :param finds: index applied to the reduced table before its last row
                  is dropped
    :param beta_path: beta file path
    :param df: blocks table; when args.bedGraph is set a 'beta' column is
               assigned to it (in place) and it is written as a bedGraph
    :return: None. Any exception is caught and reported on stdout
    """
    try:
        # load beta file:
        data = load_beta_data(beta_path)

        # reduce to blocks.
        # Work on a private copy: decrementing the caller's array in place
        # would shift the last border again on every call sharing these bins
        bins = np.array(blocks_bins)
        bins[-1] -= 1
        reduced_data = np.add.reduceat(data, bins)[finds][:-1]

        # dump to file, named <beta name>_<blocks name>.bin
        # (the blocks file name is stripped of two extensions)
        blocks_name = splitext(splitext(basename(args.blocks_file))[0])[0]
        out_name = splitext(basename(beta_path))[0] + '_' + blocks_name + '.bin'
        out_name = out_name.replace('_genome', '')
        out_name = op.join(args.out_dir, out_name)
        trim_to_uint8(reduced_data).tofile(out_name)
        print(out_name)

        if args.bedGraph:
            # Division by zero is silenced here; missing values are written
            # as -1 by to_csv (na_rep)
            with np.errstate(divide='ignore', invalid='ignore'):
                beta_vals = reduced_data[:, 0] / reduced_data[:, 1]
            eprint(beta_vals.shape, df.shape)
            df['beta'] = beta_vals
            df.to_csv(out_name.replace('.bin', '.bedGraph'), sep='\t',
                      index=None, header=None, na_rep=-1,
                      float_format='%.2f')

    except Exception as e:
        # Best effort: report the failure and let the caller carry on
        print('Failed with beta', beta_path)
        print('Exception:', e)
def compare_all_paires(args):
    """
    Plot a grid of pairwise comparisons between beta files
    :param args: parsed arguments; betas, min_cov, outpath and show are read,
                 and args is also passed to GenomicRegion to get the sites

    Each beta file is compared (with comp2) to itself and to every file
    preceding it, filling the lower triangle and diagonal of an N x N grid
    of axes. The figure is saved to args.outpath if it is given, and shown
    if args.show is set or no output path was given.
    """
    betas = args.betas
    sites = GenomicRegion(args).sites
    tables = [load_beta_data(b, sites) for b in betas]
    names = [op.splitext(op.basename(b))[0] for b in betas]

    # Break names into lines of at most `width` characters, so they fit
    # as axis labels / titles
    width = 20
    wrapped = [
        '\n'.join(name[i:i + width] for i in range(0, len(name), width))
        for name in names
    ]

    nr_tables = len(tables)
    # squeeze=False keeps `axs` a 2D array even for a single beta file;
    # by default plt.subplots(1, 1) returns a bare Axes object, which
    # cannot be indexed as axs[i, j]
    fig, axs = plt.subplots(nr_tables, nr_tables, squeeze=False)
    for i in range(nr_tables):
        for j in range(i + 1):
            comp2(tables[i], tables[j], args.min_cov, axs[i, j])
        axs[i, 0].set_ylabel(wrapped[i], fontsize=8)
    for j in range(nr_tables):
        axs[0, j].set_title(wrapped[j], fontsize=8)

    for ax in axs.flat:
        ax.label_outer()
    fig.tight_layout()

    if args.outpath is not None:
        plt.savefig(args.outpath)
        eprint(f'[wt cmp] dumped figure to {args.outpath}')
    if args.show or args.outpath is None:
        plt.show()
def compare_all_paires(betas, min_cov, sites):
    """
    Open one figure per unordered pair of beta files and show them all
    :param betas: list of beta file paths
    :param min_cov: passed through to comp2
    :param sites: sites range passed to load_beta_data
    """
    tables = [load_beta_data(beta, sites) for beta in betas]
    names = [op.splitext(op.basename(beta))[0] for beta in betas]

    # Visit every pair (first < second) in lexicographic order
    nr_tables = len(tables)
    for first in range(nr_tables):
        for second in range(first + 1, nr_tables):
            plt.figure()
            pair_names = (names[first], names[second])
            comp2(tables[first], tables[second], pair_names, min_cov)
    plt.show()
def beta_cov(beta_path, sites=None, bed_wrapper=None, print_res=False):
    """
    Compute the mean of the second column of a beta file
    :param beta_path: beta file path
    :param sites: sites range passed to load_beta_data
                  (only used when no bed_wrapper is given)
    :param bed_wrapper: if given, the result is computed by beta_cov_by_bed
    :param print_res: if true, print "<name>\t<result>" with 2 decimals
    :return: the computed value
    """
    if not bed_wrapper:
        coverage = np.mean(load_beta_data(beta_path, sites)[:, 1])
    else:
        coverage = beta_cov_by_bed(beta_path, bed_wrapper)

    if print_res:
        report = '{}\t{:.2f}'.format(pretty_name(beta_path), coverage)
        print(report)
    return coverage
def beta_cov_by_bed(beta_path, bed_wrapper):
    """
    Average the second column of a beta file over all regions of a bed
    :param beta_path: beta file path
    :param bed_wrapper: object whose iter_grs() yields regions with a
                        `sites` attribute
    :return: sum of the column over all regions divided by the number of
             loaded rows, or 0 if no rows were loaded
    """
    site_count = 0
    coverage_sum = 0
    for region in bed_wrapper.iter_grs():
        cov_column = load_beta_data(beta_path, region.sites)[:, 1]
        site_count += cov_column.size
        coverage_sum += cov_column.sum()

    # Avoid dividing by zero when no sites were loaded at all
    if not site_count:
        return 0
    return coverage_sum / site_count
def view_beta(beta_path, gr, opath):
    """
    Write the part of a beta file covering a region as tab-separated text
    :param beta_path: beta file path
    :param gr: a GenomicRegion object (its `sites` attribute is used)
    :param opath: output path (or stdout)
    """
    np.savetxt(
        opath,
        load_beta_data(beta_path, gr.sites),
        delimiter='\t',
        fmt='%s',
    )
def mult_pat2beta(pat_path, out_beta, nr_sites, args):
    """
    Build a beta file by running chr_thread once per chromosome in a
    process pool and summing the per-chromosome results
    :param pat_path: passed through to chr_thread
    :param out_beta: output beta path; temporary per-chromosome files are
                     named "<out_beta without extension>.<chrom>.beta"
    :param nr_sites: number of rows of the output table
    :param args: parsed arguments; threads and genome are read
    :return: out_beta
    """
    with Pool(args.threads) as p:
        chrom_table = GenomeRefPaths(args.genome).get_chrom_cpg_size_table()
        chrom_names = list(chrom_table['chr'])
        # One asynchronous job per chromosome, submitted in sorted order
        pending = [
            p.apply_async(
                chr_thread,
                (chrom,
                 pat_path,
                 '{}.{}.beta'.format(op.splitext(out_beta)[0], chrom),
                 nr_sites),
            )
            for chrom in sorted(chrom_names)
        ]
        p.close()
        p.join()

    # Sum the per-chromosome tables; each job's return value is treated as
    # the path of a beta file, which is deleted once it has been added
    merged = np.zeros((nr_sites, 2), dtype=np.uint8)
    chrom_betas = [job.get() for job in pending]
    for chrom_beta in chrom_betas:
        merged += load_beta_data(chrom_beta)
        os.remove(chrom_beta)

    merged.tofile(out_beta)
    return out_beta
def single_beta(beta_path, indices, cov_thresh):
    """
    Load selected rows of a beta file as a float16 vector
    :param beta_path: beta file path
    :param indices: row numbers to keep; 1 is subtracted before indexing
                    the loaded table
    :param cov_thresh: passed to beta2vec as min_cov
    :return: (file name without directory and extension, float16 vector)
    """
    name = op.splitext(op.basename(beta_path))[0]
    selected_rows = load_beta_data(beta_path)[indices - 1]
    values = beta2vec(selected_rows, min_cov=cov_thresh)
    return name, values.astype(np.float16)
def load_data(self):
    """
    Load the (self.start, self.end) range of every file in self.files
    :return: array of shape (number of files, self.nr_sites, 2), created
             with np.zeros (default dtype) and filled file by file
    """
    # raw table from *beta files:
    nr_files = len(self.files)
    stacked = np.zeros((nr_files, self.nr_sites, 2))
    for idx, beta_file in enumerate(self.files):
        stacked[idx, :, :] = load_beta_data(beta_file, (self.start, self.end))
    return stacked
def load_data(self):
    """
    Load the self.gr.sites range of every file in self.files
    :return: array of shape (number of files, self.nr_sites, 2), created
             with np.zeros (default dtype) and filled file by file
    """
    # raw table from *beta files:
    table_shape = (len(self.files), self.nr_sites, 2)
    stacked = np.zeros(table_shape)
    for idx, beta_file in enumerate(self.files):
        stacked[idx, :, :] = load_beta_data(beta_file, self.gr.sites)
    return stacked
def load_beta(self, beta_path):
    """
    Load beta to a numpy array
    :param beta_path: beta file path
    :return: the loaded table; its number of rows is asserted to equal
             the number of rows of self.ref_dict
    """
    # In debug mode a fixed range of DEBUG_NR sites is used
    # instead of the region's sites
    if self.debug:
        sites = (1, DEBUG_NR + 1)
    else:
        sites = self.gr.sites

    beta_table = load_beta_data(beta_path, sites=sites)
    assert beta_table.shape[0] == self.ref_dict.shape[0]
    return beta_table