def main():
    """
    Calculate the average coverage of one or more beta files.
    Print the results.

    One line is printed per beta file: its pretty name, a tab, and the
    coverage formatted with two decimals. If args.bed_file is set, the
    blocks loaded from it are passed to beta_cov; otherwise None is passed.
    If args.plot is set, the names and coverages are also passed to
    plot_hist.
    """
    args = parse_args()

    sites = GenomicRegion(args).sites
    blocks_df = load_blocks_file(args.bed_file) if args.bed_file else None
    # NOTE(review): the trailing False is presumably beta_cov's print flag
    # (results are printed below instead) - confirm against beta_cov.
    params = [(beta, sites, blocks_df, False) for beta in args.betas]

    pool = Pool(args.threads)
    try:
        covs = pool.starmap(beta_cov, params)
    finally:
        # always release the worker processes, even when a worker raised
        pool.close()
        pool.join()

    for cov, beta_path in zip(covs, args.betas):
        print('{}\t{:.2f}'.format(pretty_name(beta_path), cov))

    if args.plot:
        plot_hist([pretty_name(b) for b in args.betas], covs)
def main():
    """
    Generate homog files. Given a blocks file and pat[s],
    count the number of U,X,M reads for each block for each file.

    Raises:
        IllegalArgumentError: if nr_bits is not 8 or 16, if rlen < 3,
            if thresholds is not exactly two comma-separated numbers
            satisfying 0 < first < second < 1, or if the blocks file
            is rejected by is_block_file_nice.
    """
    args = parse_args()
    if args.nr_bits not in (8, 16):
        raise IllegalArgumentError('nr_bits must be in {8, 16}')
    if args.rlen < 3:
        raise IllegalArgumentError('rlen must be >= 3')
    if args.thresholds is not None:
        fields = args.thresholds.split(',')
        if len(fields) != 2:
            raise IllegalArgumentError('Invalid thresholds')
        try:
            low, high = float(fields[0]), float(fields[1])
        except ValueError as err:
            # non-numeric input is reported like any other invalid threshold
            raise IllegalArgumentError('Invalid thresholds') from err
        if not (1 > high > low > 0):
            raise IllegalArgumentError('Invalid thresholds')

    # make sure homog tool is valid:
    validate_local_exe(homog_tool)

    pats = args.input_files
    validate_file_list(pats, '.pat.gz')

    outdir, prefix = parse_outdir_prefix(args)

    # load blocks:
    blocks_df = load_blocks_file(args.blocks_file)
    is_nice, msg = is_block_file_nice(blocks_df)
    if not is_nice:
        homog_log(msg)
        raise IllegalArgumentError(f'Invalid blocks file: {args.blocks_file}')

    # process the pat files one by one, in sorted order
    for pat in sorted(pats):
        homog_process(pat, blocks_df, args, outdir, prefix)
def load_blocks(self):
    """
    Load the blocks file and filter it by CpG and bp length.

    Reads self.args.blocks_path with load_blocks_file, adds two columns
    ('lenCpG' = endCpG - startCpG, 'len' = end - start) and keeps only
    the blocks with min_cpg <= lenCpG <= max_cpg and
    min_bp <= len <= max_bp (all bounds inclusive, taken from self.args).

    Returns:
        The filtered DataFrame, with a fresh 0..n-1 index.
    """
    df = load_blocks_file(self.args.blocks_path)
    orig_nr_blocks = df.shape[0]

    # Compute both length columns on the freshly loaded frame, before any
    # filtering: assigning a column to a boolean-filtered frame is what
    # triggers pandas' SettingWithCopyWarning.
    df['lenCpG'] = df['endCpG'] - df['startCpG']
    df['len'] = df['end'] - df['start']

    keep = (
        (df['lenCpG'] >= self.args.min_cpg)
        & (df['lenCpG'] <= self.args.max_cpg)
        & (df['len'] >= self.args.min_bp)
        & (df['len'] <= self.args.max_bp)
    )
    df = df[keep].reset_index(drop=True)

    # print stats
    if self.verbose:
        eprint(f'loaded {orig_nr_blocks:,} blocks')
        if df.shape[0] != orig_nr_blocks:
            eprint(f'droppd to {df.shape[0]:,} ')
    return df
def break_to_chunks(self):
    """
    Break range of sites to chunks of size 'step',
    while keeping chromosomes separated.

    The regions to chunk come from one of three places:
      * self.args.bed_file, when given (rows with missing startCpG/endCpG
        are dropped, and the file must pass is_block_file_nice);
      * the per-chromosome CpG size table of self.genome, when the
        GenomicRegion built from self.args is the whole genome;
      * otherwise, the single region in GenomicRegion(self.args).sites.

    Returns:
        (tags, starts, ends): three parallel lists with one entry per
        chunk. tags[i] is '<start>-<end>' of the region chunk i was cut
        from, so chunks sharing a tag belong to the same region.

    Raises:
        IllegalArgumentError: if the bed file is rejected by
            is_block_file_nice.
    """
    # print a warning in case chunk size is too small
    step = self.args.chunk_size
    if step < self.args.max_cpg:
        eprint(
            '[wt segment] WARNING: chunk_size is small compared to max_cpg and/or max_bp.\n'
            ' It may cause wt segment to fail. It\'s best setting\n'
            ' chunk_size > min{max_cpg, max_bp/2}'
        )

    if self.args.bed_file:
        df = load_blocks_file(self.args.bed_file)[['startCpG', 'endCpG']].dropna()

        # make sure bed file has no overlaps or duplicated regions
        is_nice, reason = is_block_file_nice(df)
        if not is_nice:
            eprint(
                '[wt segment] ERROR: invalid bed file.\n'
                f' {reason}\n'
                f' Try: sort -k1,1 -k2,2n {self.args.bed_file} | '
                'bedtools merge -i - | wgbstools convert --drop_empty -p -L -'
            )
            raise IllegalArgumentError('Invalid bed file')

        if df.shape[0] > 2 * 1e4:
            eprint(
                '[wt segment] WARNING: bed file contains many regions.\n'
                ' Segmentation will take a long time.\n'
                ' Consider running w/o -L flag and intersect the results\n'
            )
    else:
        # No bed file provided
        gr = GenomicRegion(self.args)
        if gr.is_whole():
            # whole genome - make a dummy "bed file" of the full chromosomes
            cf = self.genome.get_chrom_cpg_size_table()
            cf['endCpG'] = np.cumsum(cf['size']) + 1
            cf['startCpG'] = cf['endCpG'] - cf['size']
            df = cf[['startCpG', 'endCpG']]
        else:
            # one region
            df = pd.DataFrame(columns=['startCpG', 'endCpG'], data=[gr.sites])

    # build the chunks, with a "tag"/label per chunk,
    # so we know which chunks to merge later on.
    tags = []
    starts = []
    ends = []
    for start, end in zip(df['startCpG'], df['endCpG']):
        # The columns may be float (pandas stores integer columns that
        # held missing values as float, and dropna() does not cast them
        # back); range() only accepts integers.
        start, end = int(start), int(end)
        bords = list(range(start, end, step)) + [end]
        tags += [f'{start}-{end}'] * (len(bords) - 1)
        starts += bords[:-1]
        ends += bords[1:]
    return tags, starts, ends
def beta2table_generator(betas, blocks, groups_file, min_cov, threads, chunk_size=None, verbose=False):
    """
    Yield the get_table() result for consecutive chunks of a blocks file.

    Args:
        betas: beta files, passed to groups_load_wrap together with
            groups_file.
        blocks: path of the blocks file; validated with
            validate_single_file and loaded with load_blocks_file.
        groups_file: passed to groups_load_wrap.
        min_cov, threads, verbose: forwarded unchanged to get_table.
        chunk_size: number of blocks per yielded table. None (default)
            means all blocks in a single chunk.

    Yields:
        get_table(...) for each slice of at most chunk_size blocks, in
        file order. Nothing is yielded when the blocks file has no rows.

    Note:
        This is a generator, so validation and loading only run once
        iteration starts.
    """
    validate_single_file(blocks)
    gf = groups_load_wrap(groups_file, betas)
    blocks_df = load_blocks_file(blocks)
    nr_blocks = blocks_df.shape[0]
    if chunk_size is None:
        # clamp to 1: range() rejects a zero step, which an empty blocks
        # table would otherwise produce
        chunk_size = max(nr_blocks, 1)
    for start in range(0, nr_blocks, chunk_size):
        subset_blocks = blocks_df.iloc[start:start + chunk_size].copy()
        yield get_table(subset_blocks, gf, min_cov, threads, verbose)
def __init__(self, args):
    """
    Set up the mixing state from the parsed arguments.

    Stores the arguments, the GenomicRegion built from them, the input
    pat files and the requested coverage; loads the optional bed file;
    creates an empty stats table indexed by the pat base names; and then
    validates labels and rates, reads the coverages, adjusts the rates and
    generates the output prefix.

    The order of the steps below matters: the later helper methods read
    attributes assigned by the earlier ones.
    """
    eprint('mixing...')
    self.args = args
    self.gr = GenomicRegion(args)
    self.pats = args.pat_files
    self.dest_cov = args.cov

    # optional bed file
    if args.bed_file:
        self.bed = load_blocks_file(args.bed_file)
    else:
        self.bed = None

    # one stats row per input pat, named by its base name without extension
    sample_names = []
    for pat_path in self.pats:
        sample_names.append(splitextgz(op.basename(pat_path))[0])
    self.stats = pd.DataFrame(index=sample_names)

    self.nr_pats = len(self.pats)
    self.labels = self.validate_labels(args.labels)
    self.dest_rates = self.validate_rates(args.rates)
    self.covs = self.read_covs()
    self.adj_rates = self.adjust_rates()
    self.prefix = self.generate_prefix(args.out_dir, args.prefix)
def read_covs(self):
    """
    Compute the coverage of every input pat file from its beta file.

    For each pat in self.pats, the matching '.beta' file ('.lbeta' when
    self.args.lbeta is set) is looked up next to it and generated with
    pat2beta if missing. The coverage is then taken, in order of
    precedence, over self.bed, over the blocks in self.args.bed_cov, or
    over self.gr.sites.

    The resulting list is also stored as the 'OrigCov' stats column.

    Returns:
        list of coverages, in the same order as self.pats.
    """
    # the suffix is the same for all pats
    suff = '.lbeta' if self.args.lbeta else '.beta'
    # blocks of args.bed_cov; loaded on first use and then shared by all
    # pats, the same way self.bed is shared
    bed_cov_blocks = None

    covs = []
    for pat in self.pats:
        beta = pat.replace('.pat.gz', suff)
        if not op.isfile(beta):
            eprint(f'No {suff} file compatible to {pat} was found. Generate it...')
            pat2beta(pat, op.dirname(pat), args=self.args, force=True)

        if self.bed is not None:
            cov = beta_cov_by_bed(beta, self.bed)
        elif self.args.bed_cov:
            # todo: this is messy. fix it. Better read coverage from pat file.
            if bed_cov_blocks is None:
                bed_cov_blocks = load_blocks_file(self.args.bed_cov)
            cov = beta_cov_by_bed(beta, bed_cov_blocks)
        else:
            cov = beta_cov(beta, self.gr.sites, print_res=True)
        covs.append(cov)

    self.add_stats_col('OrigCov', covs)
    return covs
def view_bed(pat, args):
    """
    Assemble a shell pipeline that views `pat` over the regions of
    args.bed_file, and hand it to subprocess_wrap_sigpipe.

    Columns 4-5 of args.bed_file are assumed to be startCpG, endCpG.

    The pipeline stages, in order:
      1. read the pat: either the whole file with gunzip, or only the
         reads tabix selects for the (extended) blocks;
      2. cview_tool with the flags from set_view_flags;
      3. optionally pat_sampler, when args.sub_sample is given;
      4. sort and collapse_pat_script;
      5. optionally a redirect to args.out_path.
    """
    blocks_path = args.bed_file

    # Peek at (at most) the first 1e6 rows of the blocks file. If it is
    # that long and starts with "chr1", use gunzip instead of tabix.
    max_rows = 1e6
    head = load_blocks_file(blocks_path, nrows=max_rows)
    read_whole_pat = head.shape[0] == max_rows and head.iloc[0, 0] in ('1', 'chr1')

    if read_whole_pat:
        reader = f'gunzip -c {pat} '
    else:
        # extended blocks:
        opener = 'gunzip -c' if blocks_path.endswith('.gz') else 'cat'
        reader = f'{opener} {blocks_path} | {cview_extend_blocks_script} | tabix -R - {pat} '

    flags = set_view_flags(args)
    stages = [reader, f' | {cview_tool} {flags} --blocks_path {blocks_path}']

    # sub-sample reads
    if args.sub_sample is not None:
        validate_local_exe(pat_sampler)
        stages.append(f' | {pat_sampler} {args.sub_sample} ')

    stages.append(f' | sort -k2,2n -k3,3 | {collapse_pat_script} - ')

    if args.out_path is not None:
        stages.append(f' > {args.out_path}')

    subprocess_wrap_sigpipe(''.join(stages))
def betas2table(betas, blocks, groups_file, min_cov, threads=8, verbose=False):
    """
    Build a single table for all blocks of a blocks file.

    The blocks path is validated with validate_single_file, the groups are
    loaded with groups_load_wrap(groups_file, betas), and the whole blocks
    table is handed to get_table in one call.

    Returns:
        whatever get_table returns for the full blocks table.
    """
    validate_single_file(blocks)
    groups = groups_load_wrap(groups_file, betas)
    return get_table(load_blocks_file(blocks), groups, min_cov, threads, verbose)