def main():
    """
    Merge files.
    Accumulate all reads / observations from multiple (>=2) input files,
    and output a single file of the same format.
    Supported formats: pat.gz, beta
    """
    args = parse_args()

    # a merge needs at least two inputs
    files = args.input_files
    validate_files_list(files, min_len=2)

    # the output reuses the extension of the first input file
    first_ext = splitextgz(files[0])[1]
    out_path = args.prefix + first_ext
    if not delete_or_skip(out_path, args.force):
        return

    # dispatch on the extension without its leading dot
    files_type = first_ext[1:]
    if files_type == 'pat.gz':
        MergePats(files, args.prefix + '.pat', args.labels, args).merge_pats()
    elif files_type == 'unq.gz':
        merge_unqs()
    elif files_type in ('beta', 'bin'):
        merge_betas(files, out_path)
    else:
        print('Unknown input format:', files[0])
    return
def main():
    """
    Merge files.
    Accumulate all reads / observations from multiple (>=2) input files,
    and output a single file of the same format.
    Supported formats: pat.gz, beta
    """
    args = parse_args()
    input_files = args.input_files

    # construct output path: prefix + extension of the first input file
    out_path = args.prefix + splitextgz(input_files[0])[1]

    # Refuse to overwrite an input. Paths are compared after realpath() so
    # that different spellings of the same file are still detected.
    if op.realpath(out_path) in [op.realpath(p) for p in input_files]:
        # the message must be an f-string, otherwise '{out_path}' is
        # printed literally instead of the offending path
        eprint('[wt merge] Error output path is identical '
               f'to one of the input files {out_path}')
        return

    if not delete_or_skip(out_path, args.force):
        return

    # dispatch on the extension without its leading dot
    files_type = splitextgz(input_files[0])[1][1:]
    if files_type in ('beta', 'bin'):
        merge_betas(input_files, out_path)
    elif files_type == 'pat.gz':
        MergePats(input_files, args.prefix + '.pat.gz',
                  args.labels, args).merge_pats()
    else:
        print('Unknown input format:', input_files[0])
    return
def __init__(self, input_file, args):
    """
    Store the input path and arguments, pick a file-type handler from the
    file's extension, and validate the file.

    :param input_file: path of the file to process
    :param args: arguments object, stored as-is on the instance
    """
    self.args = args
    self.in_file = input_file
    # extension as returned by splitextgz, without its first character
    self.suff = splitextgz(self.in_file)[1][1:]

    # 'bed' or 'tsv' anywhere in the suffix selects BedTsv; else PatUnq
    if 'bed' in self.suff or 'tsv' in self.suff:
        handler_cls = BedTsv
    else:
        handler_cls = PatUnq
    self.ftype = handler_cls(input_file)

    self.validate_file()
def bed2betas(args):
    """
    Convert each bed file in args.bed_paths to a '.beta' file in args.outdir.

    Every bed is left-merged onto the reference CpG table, so the output has
    one row per reference site (all 28217448 sites, per the original note);
    sites missing from the bed (NaN after the merge) are written as 0.
    A pandas ParserError stops the loop and is reported through eprint.
    """
    # in debug mode a row limit is forwarded to load_dict and load_bed
    nrows = None
    if args.debug:
        nrows = 100000

    reference = None  # reference table, loaded lazily and at most once
    try:
        for bed in args.bed_paths:
            bed_name = op.basename(bed)
            eprint('Converting {}...'.format(bed_name))

            # skip this bed if its output exists and must not be replaced
            stem = splitextgz(bed_name)[0]
            outpath = op.join(args.outdir, stem) + '.beta'
            if not delete_or_skip(outpath, args.force):
                continue

            if reference is None:
                reference = load_dict(nrows=nrows, genome_name=args.genome)
            bed_df = load_bed(bed, nrows, args.genome == 'mm9')

            # merge reference with bed, fill the gaps, then dump
            merged = reference.merge(bed_df, how='left', on=['chr', 'start'])
            merged = merged.fillna(0)
            counts = np.array(merged[['meth', 'total']])
            trim_to_uint8(counts).tofile(outpath)

    except pd.errors.ParserError as e:
        eprint('Invalid input file.\n{}'.format(e))
    return
def bed2betas(args):
    """
    Convert each bed file in args.bed_paths to a '.beta' file in args.outdir.

    Every bed is left-merged onto the reference CpG table, so the output has
    one row per reference site; sites missing from the bed (NaN after the
    merge) are written as 0. A pandas ParserError stops the loop and is
    reported through eprint.
    """
    # debug mode restricts the reference to one region and caps the bed rows
    if args.debug:
        region, nrows = 'chr1:10469-876225', 10000
    else:
        region, nrows = None, None

    reference = None  # reference table, loaded lazily and at most once
    try:
        for bed in args.bed_paths:
            bed_name = op.basename(bed)
            eprint(f'[wt bed] Converting {op.basename(bed)}...')

            # skip this bed if its output exists and must not be replaced
            stem = splitextgz(bed_name)[0]
            outpath = op.join(args.outdir, stem + '.beta')
            if not delete_or_skip(outpath, args.force):
                continue

            if reference is None:
                reference = load_dict_section(region, args.genome)
            bed_df = load_bed(bed, nrows, args.add_one)

            # todo: implement in C++.
            # merge reference with bed, fill the gaps, then dump
            merged = reference.merge(bed_df, how='left', on=['chr', 'start'])
            merged = merged.fillna(0)
            counts = np.array(merged[['meth', 'total']])
            trim_to_uint8(counts).tofile(outpath)

    except pd.errors.ParserError as e:
        eprint(f'[wt bed] Invalid input file.\n{e}')
    return
def homog_process(pat, blocks, args, outdir, prefix):
    """
    Run the homog tool on one pat file and write its U/X/M table.

    :param pat: path of the pat file
    :param blocks: DataFrame of blocks; its row count decides the view mode
    :param args: namespace providing bed, binary, force, rlen, thresholds,
                 blocks_file, verbose and nr_bits
    :param outdir: output directory, used only when prefix is None
    :param prefix: output path prefix, or None to derive it from the pat name
    :return: the merged DataFrame, or None when the sample is skipped
    """
    name = splitextgz(op.basename(pat))[0]
    if prefix is None:
        prefix = op.join(outdir, name)
    bin_path = prefix + '.uxm'
    bed_path = prefix + '.uxm.bed.gz'

    # binary output is the default: it is on unless only bed was requested
    bed = args.bed
    binary = args.binary or not bed
    if should_be_skipped(args.force, bin_path, bed_path, binary, bed):
        homog_log(f'skipping {name}. Use -f to overwrite')
        return

    # generate rate_cmd: user thresholds if given, else derived from rlen
    rlen = args.rlen
    rate_cmd = f' -l {rlen} -r '
    if not args.thresholds:
        th1 = round(1 - (rlen - 1) / rlen, 3) + 0.001
        th2 = round((rlen - 1) / rlen, 3)
        rate_cmd += f'0,{th1},{th2},1 '
    else:
        rate_cmd += f'0,{args.thresholds},1'

    # for a long marker file (>10K marker),
    # parse the whole pat file instead of running "cview -L BED"
    view_full = blocks.shape[0] > 1e4
    tool_df = ctool_wrap(pat, name, args.blocks_file, rate_cmd,
                         view_full, args.verbose)
    side_by_side = pd.concat([blocks.reset_index(drop=True), tool_df], axis=1)
    df = blocks.merge(side_by_side, how='left', on=COORDS_COLS5)

    if binary:
        uxm = df[list('UXM')].values
        trim_uxm_to_uint8(uxm, args.nr_bits).tofile(bin_path)
    if bed:
        df.to_csv(bed_path, sep='\t', header=None, index=None)
    return df
def pat2beta(pat_path, out_dir, args, force=True):
    """
    Convert a single pat file to a beta (or lbeta) file.

    :param pat_path: input file; must end with '.pat.gz' or '.pat'
    :param out_dir: directory in which the output file is written
    :param args: namespace providing lbeta, threads and genome
    :param force: forwarded to delete_or_skip for an existing output
    :return: path of the written file, or None when delete_or_skip declines
    :raises IllegalArgumentError: if pat_path has an unsupported suffix
    """
    import shlex  # local import: only needed to quote the shell command

    validate_single_file(pat_path)

    # choose how to stream the file into the converter
    is_gz = pat_path.endswith('.pat.gz')
    if is_gz:
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError(f'Invalid pat suffix: {pat_path}')

    suff = '.lbeta' if args.lbeta else '.beta'
    out_beta = op.join(out_dir, splitextgz(op.basename(pat_path))[0] + suff)
    if not delete_or_skip(out_beta, force):
        return

    # multi-threaded path requires a gzipped pat with a '.csi' index beside it
    if args.threads > 1 and is_gz and op.isfile(pat_path + '.csi'):
        arr = mult_pat2beta(pat_path, args)
    else:
        nr_sites = GenomeRefPaths(args.genome).get_nr_sites()
        # The command runs through the shell, so the path is quoted: an
        # unquoted path with spaces or metacharacters would break or be
        # interpreted by the shell.
        cmd += f' {shlex.quote(pat_path)} | {pat2beta_tool} 1 {nr_sites + 1}'
        x = subprocess.check_output(cmd, shell=True).decode()
        # the tool's output is parsed as whitespace-separated integer pairs
        arr = np.fromstring(x, dtype=int, sep=' ').reshape((-1, 2))

    trim_to_uint8(arr, args.lbeta).tofile(out_beta)
    return out_beta
def main(args):
    """
    Print the genomic region built from args, then, for every input pat file,
    its base name followed by its PatVis results.
    """
    pat_files = args.input_files
    validate_files_list(pat_files, '.pat.gz')

    region = GenomicRegion(args)
    print(region)

    for pat_file in pat_files:
        # print file name (base name without its extension)
        sample_name = splitextgz(op.basename(pat_file))[0]
        print(sample_name)
        PatVis(args, pat_file).print_results()
def __init__(self, input_file, force=True, threads=multiprocessing.cpu_count()):
    """
    Store the input path and options, pick a file-type handler from the
    file's extension, and validate the file.

    :param input_file: path of the file to process
    :param force: stored on the instance as-is
    :param threads: stored on the instance; the default is the CPU count,
                    evaluated once when the function is defined
    """
    self.force = force
    self.threads = threads
    self.in_file = input_file
    # extension as returned by splitextgz, without its first character
    self.suff = splitextgz(self.in_file)[1][1:]

    # 'bed' anywhere in the suffix selects Bed; anything else is Pat
    if 'bed' in self.suff:
        self.ftype = Bed()
    else:
        self.ftype = Pat()

    self.validate_file()
def compose_fig_path(unq, outdir, grs):
    """
    Build the '.png' path for a figure of the given file.

    :param unq: input file path; its base name (extension removed) names the
                figure
    :param outdir: output directory; when falsy, None is returned
    :param grs: regions; when exactly one is given, its region_str is
                inserted before the '.png' suffix
    :return: the figure path, or None when outdir is falsy
    """
    if not outdir:
        return

    pieces = [op.join(outdir, op.basename(splitextgz(unq)[0]))]
    if grs and len(grs) == 1:
        pieces.append('.{}'.format(grs[0].region_str))
    pieces.append('.png')
    return ''.join(pieces)
def filter_existing_files(files, out_dir, lbeta):
    """
    Return the input files whose output does not exist yet in out_dir.

    The expected output is the input's base name (extension removed) plus
    '.lbeta' when lbeta is truthy, else '.bin'. Inputs whose output already
    exists are reported through b2b_log and left out of the result.
    """
    suff = '.lbeta' if lbeta else '.bin'
    pending = []
    for beta in files:
        target = op.join(out_dir, splitextgz(op.basename(beta))[0]) + suff
        if op.isfile(target):
            b2b_log(f'Skipping {beta}. Use -f flag to overwrite')
            continue
        pending.append(beta)
    return pending
def validate_labels(self, labels):
    """
    Return one label per pat file.

    When labels is None, each label is derived from the file's base name
    (extension removed): the part before the first '-', lower-cased.

    :raises IllegalArgumentError: if the number of labels differs from
                                  self.nr_pats
    """
    if labels is None:
        labels = []
        for pat in self.pats:
            stem = splitextgz(op.basename(pat))[0]
            labels.append(stem.split('-')[0].lower())

    if len(labels) == self.nr_pats:
        return labels
    raise IllegalArgumentError('len(labels) != len(files)')
def main(args):
    """
    Print the genomic region built from args, then, for every distinct input
    pat file, its base name followed by its PatVis results.
    """
    validate_file_list(args.input_files, '.pat.gz')

    # drop duplicated files, while keeping original order
    unique_files = drop_dup_keep_order(args.input_files)

    region = GenomicRegion(args)
    print(region)

    for pat_file in unique_files:
        # print file name (base name without its extension)
        sample_name = splitextgz(op.basename(pat_file))[0]
        print(sample_name)
        PatVis(args, pat_file).print_results()
def plot_hist(data, max_frag_size, pat):
    """
    Plot data against the positions 1..data.size on a new figure.

    :param data: array of values; data.size sets the x range
    :param max_frag_size: exclusive upper bound for the x tick positions
    :param pat: input file path; its base name (extension removed) is
                appended to the title
    """
    axes = plt.figure().add_subplot(111)
    positions = np.arange(1, data.size + 1)
    axes.plot(positions, data)

    # major ticks every 5 units, minor ticks every unit, both starting at 1
    axes.set_xticks(np.arange(1, max_frag_size, 5))
    axes.set_xticks(np.arange(1, max_frag_size, 1), minor=True)

    # the minor grid is drawn fainter than the major one
    for which, alpha in (('minor', 0.2), ('major', 0.5)):
        axes.grid(which=which, alpha=alpha)

    plt.ylim(bottom=0)
    plt.xlim(left=1)
    sample_name = op.basename(splitextgz(pat)[0])
    plt.title('Fragment lengths (CpGs)\n' + sample_name)
def generate_prefix(self, outdir, prefix):
    """
    Return the output prefix for the mixed file.

    A non-empty prefix is returned unchanged, after validating its directory
    part if it has one. Otherwise outdir is validated and a name is composed
    from the pat base names interleaved with their destination rates, the
    destination coverage, and the region string when self.gr.sites is set.
    """
    if prefix:
        parent = op.dirname(prefix)
        if parent:
            validate_dir(parent)
        return prefix

    validate_dir(outdir)

    # compose output path: name1_rate1_name2_rate2_...
    pats_bnames = [splitextgz(op.basename(f))[0] for f in self.pats]
    tokens = []
    for bname, rate in zip(pats_bnames, self.dest_rates):
        tokens.append(str(bname))
        tokens.append(str(rate))
    res = '_'.join(tokens)

    if self.gr.sites is None:
        region = ''
    else:
        region = '_{}'.format(self.gr.region_str)
    res += '_cov_{:.2f}{}'.format(self.dest_cov, region)
    return op.join(outdir, res)
def __init__(self, args):
    """
    Set up a mixing run from the parsed arguments.

    Reads pat_files, cov, bed_file, labels, rates, out_dir and prefix from
    args. The later attributes are computed by helper methods that use the
    earlier ones, so the assignment order below matters.
    """
    print('in mixer')
    self.args = args
    self.gr = GenomicRegion(args)
    self.pats = args.pat_files
    self.dest_cov = args.cov

    # the bed wrapper is only built when a bed file was given
    if not args.bed_file:
        self.bed = None
    else:
        self.bed = BedFileWrap(args.bed_file)

    # empty stats table with one row per pat file, indexed by base name
    sample_names = [splitextgz(op.basename(f))[0] for f in self.pats]
    self.stats = pd.DataFrame(index=sample_names)

    self.nr_pats = len(self.pats)
    self.labels = self.validate_labels(args.labels)
    self.dest_rates = self.validate_rates(args.rates)
    self.covs = self.read_covs()
    self.adj_rates = self.adjust_rates()
    self.prefix = self.generate_prefix(args.out_dir, args.prefix)
def __init__(self, args):
    """
    Set up a mixing run from the parsed arguments.

    Reads pat_files, cov, bed_file, labels, rates, out_dir and prefix from
    args. The later attributes are computed by helper methods that use the
    earlier ones, so the assignment order below matters.
    """
    eprint('mixing...')
    self.args = args
    self.gr = GenomicRegion(args)
    self.pats = args.pat_files
    self.dest_cov = args.cov

    # blocks are only loaded when a bed file was given
    if args.bed_file:
        self.bed = load_blocks_file(args.bed_file)
    else:
        self.bed = None

    # empty stats table with one row per pat file, indexed by base name
    sample_names = [splitextgz(op.basename(f))[0] for f in self.pats]
    self.stats = pd.DataFrame(index=sample_names)

    self.nr_pats = len(self.pats)
    self.labels = self.validate_labels(args.labels)
    self.dest_rates = self.validate_rates(args.rates)
    self.covs = self.read_covs()
    self.adj_rates = self.adjust_rates()
    self.prefix = self.generate_prefix(args.out_dir, args.prefix)
def run_single_unq(unq, grs, args):
    """
    Compute and plot the fragment-length values of one unq file.

    :param unq: path of the unq file
    :param grs: regions to process; when empty or None the whole file is
                processed, otherwise the per-region results are summed
    :param args: namespace providing verbose and outdir
    """
    eprint(unq)

    if grs:
        per_region = [FragLen(unq, args, gr).run_small_region() for gr in grs]
        x = np.sum(per_region, axis=0)
    else:
        # process the whole unq file (no -L,-s,-r was specified)
        x = FragLen(unq, args).run_whole_genome()

    # print values to stdout, as a single space-separated row
    if args.verbose:
        np.savetxt(sys.stdout, x.reshape((1, -1)), fmt='%s', delimiter=' ')

    # plot
    plt.figure()
    plt.plot(np.arange(x.size), x.flatten())
    sample_name = op.basename(splitextgz(unq)[0])
    plt.title('Fragment lengths\n' + sample_name)

    # dump figure
    if args.outdir:
        fig_path = compose_fig_path(unq, args.outdir, grs)
        plt.savefig(fig_path)
def main():
    """
    Visualize wgbs files
    Possible inputs:
        - a pat.gz file
        - One or more beta files
    """
    args = parse_args()

    # the first input's extension decides which visualizer runs
    file_type = splitextgz(args.input_files[0])[1]

    # print title
    if args.title:
        print('{}'.format(args.title))

    if file_type == '.pat.gz':
        pat_vis_main(args)
    elif file_type in ('.beta', '.bin'):
        beta_vis_main(args)
    else:
        print('Unsupported file type:', file_type)
def pat2beta(pat_path, out_dir, args, force=True):
    """
    Convert a single pat file to a beta file.

    :param pat_path: input file; must end with '.pat.gz' or '.pat'
    :param out_dir: directory in which the '.beta' file is written
    :param args: namespace providing genome and threads
    :param force: forwarded to delete_or_skip for an existing output
    :return: the result of mult_pat2beta on the multi-threaded path,
             otherwise the output path; None when delete_or_skip declines
    :raises IllegalArgumentError: if pat_path has an unsupported suffix
    """
    import shlex  # local import: only needed to quote the shell command

    validate_single_file(pat_path)

    # choose how to stream the file into the converter
    is_gz = pat_path.endswith('.pat.gz')
    if is_gz:
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError('Invalid pat suffix: {}'.format(pat_path))

    out_beta = op.join(out_dir, splitextgz(op.basename(pat_path))[0] + '.beta')
    if not delete_or_skip(out_beta, force):
        return

    nr_sites = GenomeRefPaths(args.genome).nr_sites

    # multi-threaded path requires a gzipped pat with a '.csi' index beside it
    if args.threads > 1 and is_gz and op.isfile(pat_path + '.csi'):
        return mult_pat2beta(pat_path, out_beta, nr_sites, args)

    # The command runs through the shell, so both paths are quoted: an
    # unquoted path with spaces or metacharacters would break or be
    # interpreted by the shell.
    cmd += ' {} | {} {} {}'.format(shlex.quote(pat_path), PAT2BETA_TOOL,
                                   shlex.quote(out_beta), nr_sites)
    subprocess.check_call(cmd, shell=True)
    return out_beta
def compose_fig_path(pat, outdir):
    """
    Build the '.png' path for a figure of the given pat file.

    :param pat: input file path; its base name (extension removed) names the
                figure
    :param outdir: output directory; when falsy, None is returned
    :return: the figure path, or None when outdir is falsy
    """
    if not outdir:
        return None
    stem = op.basename(splitextgz(pat)[0])
    return op.join(outdir, stem) + '.png'