def get_fasta(self):
    """Make sure a reference FASTA is available, downloading it if necessary.

    If ``self.ref_path`` is already set it is only checked with
    ``validate_single_file`` and the method returns. Otherwise
    ``{self.name}.fa.gz`` is fetched from the UCSC goldenPath server with
    ``curl`` into ``self.out_dir``, decompressed with ``gunzip``,
    re-compressed with ``bgzip`` (``self.args.threads`` threads), and
    ``self.ref_path`` is set to the resulting ``.fa.gz`` path.

    Raises:
        IllegalArgumentError: the download command exited with non-zero status.
        subprocess.CalledProcessError: the gunzip/bgzip step failed.
    """
    # download fasta from UCSC, unless the fasta file is provided
    if self.ref_path is not None:
        validate_single_file(self.ref_path)
        return

    # no FASTA path provided. Attempt to download one
    ref_path = op.join(self.out_dir, f'{self.name}.fa.gz')
    url = f'https://hgdownload.soe.ucsc.edu/goldenPath/{self.name}/bigZips/{self.name}.fa.gz'
    # --fail makes curl exit non-zero on HTTP errors (e.g. 404 for an unknown
    # genome name) instead of saving the server's error page as the FASTA.
    cmd = f'curl --fail {url} -o {ref_path}'
    eprint(
        f'[wt init] No reference FASTA provided. Attempting to download from\n\t{url}'
    )
    # Only stdout is captured; stderr is left attached to the terminal so the
    # curl progress meter stays visible. communicate() therefore returns None
    # for stderr, which must not be decoded blindly.
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    output, error = p.communicate()
    if p.returncode:
        out_txt = (output or b'').decode()
        err_txt = (error or b'').decode()
        eprint(
            f'[wt init] Failed downloading reference for genome {self.name}: '
            f'{p.returncode}\n{out_txt}\n{err_txt}')
        eprint(
            f'[wt init] Try downloading yourself and use --fasta_name flag, or check the "name" parameter'
        )
        raise IllegalArgumentError(f'[wt init] No reference FASTA found')

    eprint(
        f'[wt init] successfully downloaded FASTA. Now gunzip and bgzip it...'
    )
    # gunzip strips the ".gz" suffix; bgzip then re-creates <name>.fa.gz,
    # so the final file sits at ref_path again.
    cmd = f'gunzip {ref_path} && bgzip -@ {self.args.threads} {ref_path[:-3]}'
    subprocess.check_call(cmd, shell=True)
    self.ref_path = ref_path
def main():
    """
    Print the content of an input file as plain text.

    Dispatches on the file suffix: '.beta' goes to view_beta, '.lbeta' and
    '.bin' go to view_other_bin, '.pat.gz' goes to cview. Any other suffix
    raises IllegalArgumentError. A BrokenPipeError raised while viewing is
    handed to catch_BrokenPipeError.
    """
    parser = parse_args()
    args = parser.parse_args()

    rate = args.sub_sample
    if rate is not None and not (0 <= rate <= 1):
        parser.error('[wt view] sub-sampling rate must be within [0.0, 1.0]')

    # validate input file
    path = args.input_file
    validate_single_file(path)

    try:
        if path.endswith('.beta'):
            view_beta(path, GenomicRegion(args), args.out_path, args.bed_file)
        elif op.splitext(path)[1] in ('.lbeta', '.bin'):
            view_other_bin(path, args)
        elif path.endswith('.pat.gz'):
            cview(path, args)
        else:
            raise IllegalArgumentError('Unknown input format:', path)
    except BrokenPipeError:
        catch_BrokenPipeError()
def pat2beta(pat_path, out_dir, args, force=True):
    """Convert a pat file to a beta (or lbeta) file written under out_dir.

    The output name is the pat basename with its suffix replaced by '.lbeta'
    when args.lbeta is truthy, '.beta' otherwise. Returns the output path, or
    None when delete_or_skip reports that the output should not be produced.

    Raises IllegalArgumentError when pat_path ends with neither '.pat.gz'
    nor '.pat'.
    """
    validate_single_file(pat_path)

    gzipped = pat_path.endswith('.pat.gz')
    if gzipped:
        reader = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        reader = 'cat'
    else:
        raise IllegalArgumentError(f'Invalid pat suffix: {pat_path}')

    ext = '.lbeta' if args.lbeta else '.beta'
    prefix = splitextgz(op.basename(pat_path))[0]
    out_beta = op.join(out_dir, prefix + ext)
    if not delete_or_skip(out_beta, force):
        return

    # The multi-threaded path needs a gzipped pat with a '.csi' index next to it
    indexed = gzipped and op.isfile(pat_path + '.csi')
    if args.threads > 1 and indexed:
        arr = mult_pat2beta(pat_path, args)
    else:
        nr_sites = GenomeRefPaths(args.genome).get_nr_sites()
        cmd = reader + f' {pat_path} | {pat2beta_tool} {1} {nr_sites + 1}'
        text = subprocess.check_output(cmd, shell=True).decode()
        # the tool's stdout is parsed as whitespace-separated integers, two per row
        arr = np.fromstring(text, dtype=int, sep=' ').reshape((-1, 2))

    trim_to_uint8(arr, args.lbeta).tofile(out_beta)
    return out_beta
def view_gr(pat, args, get_cmd=False):
    """Build, and unless get_cmd is set run, the shell pipeline that views a pat file.

    The pipeline starts with 'gunzip -c' for a whole-genome view or 'tabix'
    for a region, pipes into cview_tool, optionally into pat_sampler when
    args.sub_sample is set, sorts region output, and finishes with
    collapse_pat_script.

    Returns the command string when get_cmd is truthy. Otherwise the command
    is executed through subprocess_wrap_sigpipe (redirected to args.out_path
    when that is not None) and None is returned.
    """
    validate_single_file(pat, '.pat.gz')
    gr = GenomicRegion(args)
    whole = gr.is_whole()

    if whole:
        s, e = 1, gr.genome.get_nr_sites() + 1
        cmd = f'gunzip -c {pat} '
    else:
        s, e = gr.sites
        # start the tabix query MAX_PAT_LEN sites earlier, but never below site 1
        ms = max(1, s - MAX_PAT_LEN)
        cmd = f'tabix {pat} {gr.chrom}:{ms}-{e - 1} '

    view_flags = set_view_flags(args)
    cmd += f' | {cview_tool} --sites "{s}\t{e}" ' + view_flags

    # sub-sample reads
    if getattr(args, 'sub_sample', None) is not None:
        validate_local_exe(pat_sampler)
        cmd += f' | {pat_sampler} {args.sub_sample} '

    if not whole:
        cmd += f' | sort -k2,2n -k3,3 '
    cmd += f' | {collapse_pat_script} - '

    if get_cmd:
        return cmd
    if args.out_path is not None:
        cmd += f' > {args.out_path}'
    subprocess_wrap_sigpipe(cmd)
def main():
    """
    Convert beta file to bed file.

    Parses the command line, checks that the beta path is a single '.beta'
    file, and forwards the parsed options to beta_to_bed.
    """
    args = parse_args()
    beta = args.beta_path
    validate_single_file(beta, '.beta')
    region = GenomicRegion(args)
    beta_to_bed(
        beta,
        region,
        args.bed_file,
        args.min_cov,
        args.mean,
        args.keep_na,
        args.force,
        args.outpath,
    )
def bview_build_cmd(beta_path, gr, bed_path):
    """Compose a shell command that outputs a beta file to stdout.

    The base command runs view_beta_script on gr.genome.revdict_path and
    beta_path. When gr is not the whole genome, the chromosome, first site
    and number of sites are appended. When bed_path is truthy it is validated
    and a 'bedtools intersect' stage is piped on.
    """
    pieces = [f'{view_beta_script} {gr.genome.revdict_path} {beta_path} ']
    if not gr.is_whole():
        pieces.append(f' {gr.chrom} {gr.sites[0]} {gr.nr_sites}')
    if bed_path:
        validate_single_file(bed_path)
        pieces.append(f' | bedtools intersect -b {bed_path} -a stdin -wa ')
    return ''.join(pieces)
def main():
    """
    view pat file with the c++ engine

    Validates the pat path first, then rejects a negative sub-sampling rate,
    checks that the cview executable is available, and calls cview.
    """
    parser = parse_args()
    args = parser.parse_args()

    # validate input file
    pat = args.pat
    validate_single_file(pat)

    rate = args.sub_sample
    if rate is not None and rate < 0:
        parser.error('[wt view] sub-sampling rate must be >= 0')

    validate_local_exe(cview_tool)
    cview(pat, args)
def load_from_file(param_file):
    """Read a 'key: value' parameter file and return it via MFParams.set_param_type.

    Returns None when param_file is falsy. Lines starting with '#' are
    treated as comments. The text before the ':' becomes the key and the
    rest becomes the value.
    """
    if not param_file:
        return None

    validate_single_file(param_file)
    table = pd.read_csv(
        param_file,
        sep=':',
        comment='#',
        header=None,
        names=['val'],
        index_col=0,
        skipinitialspace=True,
    )
    raw_params = table.to_dict()['val']
    return MFParams.set_param_type(raw_params)
def validate_args(self):
    """Validate the parameters stored on this object and normalise its paths.

    Checks, in order: integer lower bounds, fractions within [0.0, 1.0],
    that only_hyper and only_hypo are not both set, that blocks_path and
    groups_file are present and valid (both are replaced by their absolute
    path), and that exactly one of betas / beta_list_file is given. When
    beta_list_file is used, self.betas is filled from its non-blank lines.

    Raises:
        IllegalArgumentError: on the first check that fails.
    """
    # validate integers
    if self.min_cpg < 0:
        raise IllegalArgumentError('min_cpg must be non negative')
    if self.max_cpg < 1:
        raise IllegalArgumentError('max_cpg must larger than 0')
    if self.min_bp < 0:
        raise IllegalArgumentError('min_bp must be non negative')
    if self.max_bp < 2:
        raise IllegalArgumentError('max_bp must larger than 1')
    if self.chunk_size < 1:
        # the check accepts 1, so the message must say "larger than 0"
        raise IllegalArgumentError('chunk_size must be larger than 0')

    # validate the [0.0, 1.0] fractions
    for key in ('na_rate_tg', 'na_rate_bg', 'delta', 'tg_quant',
                'bg_quant', 'unmeth_thresh', 'meth_thresh',
                'unmeth_mean_thresh', 'meth_mean_thresh'):
        val = getattr(self, key)
        if not (1.0 >= val >= 0):
            eprint(f'Invalid value for {key} ({val}): must be in [0.0, 1.0]')
            raise IllegalArgumentError()

    # validate hyper hypo:
    if self.only_hyper and self.only_hypo:
        eprint(f'at most one of (only_hyper, only_hypo) can be specified')
        raise IllegalArgumentError()

    # validate input files
    for key in ('blocks_path', 'groups_file'):
        val = getattr(self, key)
        if val is None:
            eprint(f'[wt fm] missing required parameter: {key}')
            raise IllegalArgumentError()
        validate_single_file(val)
        # change path to absolute path
        setattr(self, key, op.abspath(val))

    # validate betas
    if (self.betas is None) == (self.beta_list_file is None):
        eprint(
            f'[wt fm] Exactly one of the following must be specified: betas, beta_list_file'
        )
        raise IllegalArgumentError()

    if self.beta_list_file:
        validate_single_file(self.beta_list_file)
        with open(self.beta_list_file, 'r') as f:
            # skip blank lines so that e.g. a trailing newline does not
            # turn into an empty path
            self.betas = [line.strip() for line in f if line.strip()]
    validate_file_list(self.betas)
def groups_load_wrap(groups_file, betas):
    """Return a groups DataFrame with 'fname', 'group' and 'full_path' columns.

    With a groups file, the file and the beta list are validated and the
    table comes from load_gfile_helper (its columns are presumably 'fname'
    and 'group'; only 'fname' is used here). Without one, a dummy table is
    generated in which every de-duplicated beta file is its own group, named
    after the file's basename without extension.
    """
    if groups_file is None:
        # generate a dummy group file: drop duplicated files while keeping
        # the original order, then use each file stem as its own group
        betas = drop_dup_keep_order(betas.copy())
        stems = [op.splitext(op.basename(path))[0] for path in betas]
        gf = pd.DataFrame(columns=['fname'], data=stems)
        gf['group'] = gf['fname']
    else:
        validate_single_file(groups_file)
        validate_file_list(betas)
        gf = load_gfile_helper(groups_file)

    gf['full_path'] = match_prefix_to_bin(gf['fname'], betas, '.beta')
    return gf
def load_blocks_file(blocks_path, nrows=None):
    """Load a tab-separated blocks file into a DataFrame.

    Only the first len(COORDS_COLS5) columns are read and they are named
    after COORDS_COLS5. A header row is detected by checking whether the
    second field of the first non-comment line is all digits. Lines starting
    with '#' are ignored.

    Args:
        blocks_path: path to the blocks file.
        nrows: optional maximal number of rows to load (None loads all).

    Returns:
        The loaded DataFrame, or an empty DataFrame when the file is empty
        or cannot be parsed (a message is printed via eprint).

    Raises:
        IllegalArgumentError: the file has too few columns, or some block
            has endCpG < startCpG.
    """
    # validate blocks_path
    validate_single_file(blocks_path)

    try:
        # peek at the first row to learn the number of columns and whether
        # there is a header
        peek_df = pd.read_csv(blocks_path, sep='\t', nrows=1, header=None,
                              comment='#')
        names = COORDS_COLS5
        # check the column count before touching column 1, so a one-column
        # file gets the explanatory error instead of an IndexError
        if len(peek_df.columns) < len(names):
            msg = f'Invalid blocks file: {blocks_path}. less than {len(names)} columns.\n'
            msg += f'Run wgbstools convert -L {blocks_path} -o OUTPUT_REGION_FILE to add the CpG columns'
            raise IllegalArgumentError(msg)
        header = None if str(peek_df.iloc[0, 1]).isdigit() else 0

        # load; nullable Int64 lets the CpG columns hold missing values
        dtypes = {'startCpG': 'Int64', 'endCpG': 'Int64'}
        df = pd.read_csv(blocks_path, sep='\t', usecols=range(len(names)),
                         dtype=dtypes, header=header, names=names,
                         nrows=nrows, comment='#')

        # blocks start before they end - invalid file
        dfnona = df.dropna()  # allow blocks with missing values
        if not ((dfnona['endCpG'] - dfnona['startCpG']) >= 0).all():
            raise IllegalArgumentError(
                f'Invalid CpG columns in blocks file {blocks_path}')

        # no missing values at all: fall back to plain int columns
        if dfnona.shape[0] == df.shape[0]:
            df['startCpG'] = df['startCpG'].astype(int)
            df['endCpG'] = df['endCpG'].astype(int)

    except pd.errors.ParserError as e:
        eprint(f'Invalid input file.\n{e}')
        return pd.DataFrame()
    except pd.errors.EmptyDataError as e:
        eprint(f'Empty blocks file.\n{e}')
        return pd.DataFrame()

    return df
def beta2table_generator(betas, blocks, groups_file, min_cov, threads,
                         chunk_size=None, verbose=False):
    """Yield the result of get_table for consecutive chunks of the blocks file.

    Args:
        betas: beta files, passed on to groups_load_wrap.
        blocks: path to the blocks file.
        groups_file: groups file path or None, passed on to groups_load_wrap.
        min_cov, threads, verbose: forwarded unchanged to get_table.
        chunk_size: number of blocks per chunk; None means one chunk holding
            all blocks.

    Yields:
        Whatever get_table returns for each chunk. Nothing is yielded when
        the blocks table is empty.
    """
    validate_single_file(blocks)
    gf = groups_load_wrap(groups_file, betas)
    blocks_df = load_blocks_file(blocks)
    nr_blocks = blocks_df.shape[0]
    if chunk_size is None:
        # at least 1: range() rejects a zero step, which is what an empty
        # blocks table would otherwise produce
        chunk_size = max(nr_blocks, 1)
    for start in range(0, nr_blocks, chunk_size):
        subset_blocks = blocks_df.iloc[start:start + chunk_size].copy()
        yield get_table(subset_blocks, gf, min_cov, threads, verbose)
def __init__(self, args):
    """Store the parsed arguments and prepare the genome output directory.

    Reads genome_ref, force and name from args, creates the output directory
    via make_output_dir, and validates the reference FASTA path. self.fai_df
    is loaded only when delete_or_skip allows 'CpG.bed.gz' in the output
    directory to be (re)created.
    """
    self.args = args
    self.ref_path = args.genome_ref
    self.force = args.force
    self.name = args.name
    self.out_dir = self.make_output_dir()

    # validate input files
    validate_single_file(self.ref_path, '.fa')

    eprint('Setting up genome reference files in {}'.format(self.out_dir))

    # abort if files exists and --force was not specified
    cpg_bed = op.join(self.out_dir, 'CpG.bed.gz')
    if delete_or_skip(cpg_bed, self.force):
        self.fai_df = self.load_fai()
def set_lists(self):
    """Resolve the blacklist / whitelist arguments and return them as a pair.

    A value equal to True is replaced by the corresponding default path from
    GenomeRefPaths(self.args.genome). The blacklist takes precedence: the
    whitelist is only resolved or validated when the blacklist is not.
    """
    # black/white lists:
    args = self.args
    blacklist, whitelist = args.blacklist, args.whitelist

    # '== True' is kept on purpose: a path string must not match
    if blacklist == True:  # noqa: E712
        blacklist = GenomeRefPaths(args.genome).blacklist
    elif whitelist == True:  # noqa: E712
        whitelist = GenomeRefPaths(args.genome).whitelist

    # validate the blacklist when given, else the whitelist
    to_check = blacklist or whitelist
    if to_check:
        validate_single_file(to_check)

    if self.verbose:
        eprint(f'[wt bam2pat] blacklist: {blacklist}')
        eprint(f'[wt bam2pat] whitelist: {whitelist}')
    return blacklist, whitelist
def main():
    """
    View the content of input file (pat/unq/beta) as plain text.
    Possible filter by genomic region or sites range
    Output to stdout as default

    Returns early, after printing a message, when the sub-sampling rate is
    outside (0.0, 1.0) or when a bed file is combined with a region or
    sites argument.
    """
    args = parse_args()

    # validate input file
    input_file = args.input_file
    validate_single_file(input_file)

    rate = args.sub_sample
    if rate is not None and not (0 < rate < 1):
        eprint('sub-sampling rate must be within (0.0, 1.0)')
        return

    if args.bed_file and (args.region or args.sites):
        eprint('-L, -s and -r are mutually exclusive')
        return

    bed_wrapper = BedFileWrap(args.bed_file) if args.bed_file else None
    gr = GenomicRegion(args)

    try:
        if input_file.endswith(('.beta', '.bin')):
            view_beta(input_file, gr, args.out_path)

        elif input_file.endswith('.pat.gz'):
            if bed_wrapper:
                view_pat_bed_multiprocess(args, bed_wrapper)
            else:
                viewer = ViewPat(input_file, args.out_path, gr, args.strict,
                                 args.sub_sample, bed_wrapper, args.min_len)
                viewer.view_pat(args.awk_engine)

        elif input_file.endswith('.unq.gz'):
            # one view per region of the bed file, or the single requested region
            regions = bed_wrapper.iter_grs() if bed_wrapper else [gr]
            for region in regions:
                ViewUnq(input_file, args.out_path, region, args.inflate).view()

        else:
            raise IllegalArgumentError('Unknown input format:', input_file)

    except BrokenPipeError:
        # Python flushes standard streams on exit; redirect remaining output
        # to devnull to avoid another BrokenPipeError at shutdown
        null_fd = os.open(os.devnull, os.O_WRONLY)
        os.dup2(null_fd, sys.stdout.fileno())
        sys.exit(1)  # Python exits with error code 1 on EPIPE
def parse_betas_input(args):
    """
    parse user input to get the list of beta files to segment
    Either args.betas is a list of beta files, or args.beta_file is
    a text file in which each line is a beta file.
    Blank lines and lines starting with '#' in that text file are ignored.

    return: list of beta files

    Raises IllegalArgumentError when neither input is given, or when the
    text file lists no beta files.
    """
    if args.betas:
        betas = args.betas
    elif args.beta_file:
        validate_single_file(args.beta_file)
        with open(args.beta_file, 'r') as f:
            # strip first, so that indented comment lines are skipped too
            stripped = (line.strip() for line in f)
            betas = [b for b in stripped if b and not b.startswith('#')]
        if not betas:
            raise IllegalArgumentError(
                f'no beta files found in file {args.beta_file}')
    else:
        # without this branch `betas` would be unbound below
        raise IllegalArgumentError(
            'no beta files specified: provide either betas or beta_file')
    validate_file_list(betas)
    return betas
def pat2beta(pat_path, out_dir, args, force=True):
    """Convert a pat file into '<out_dir>/<pat basename>.beta'.

    Returns the beta path, the return value of mult_pat2beta when the
    multi-threaded path is taken, or None when delete_or_skip reports that
    the output should not be produced.

    Raises IllegalArgumentError when pat_path ends with neither '.pat.gz'
    nor '.pat'.
    """
    validate_single_file(pat_path)

    gzipped = pat_path.endswith('.pat.gz')
    if not gzipped and not pat_path.endswith('.pat'):
        raise IllegalArgumentError('Invalid pat suffix: {}'.format(pat_path))
    reader = 'gunzip -cd' if gzipped else 'cat'

    beta_name = splitextgz(op.basename(pat_path))[0] + '.beta'
    out_beta = op.join(out_dir, beta_name)
    if not delete_or_skip(out_beta, force):
        return

    nr_sites = GenomeRefPaths(args.genome).nr_sites

    # The multi-threaded path needs a gzipped pat with a '.csi' index next to it
    if args.threads > 1 and gzipped and op.isfile(pat_path + '.csi'):
        return mult_pat2beta(pat_path, out_beta, nr_sites, args)

    cmd = reader + ' {} | {} {} {}'.format(pat_path, PAT2BETA_TOOL, out_beta, nr_sites)
    subprocess.check_call(cmd, shell=True)
    return out_beta
def read_reference(ref):
    """Load the Illumina-ID to CpG-index table, optionally restricted to a reference list.

    Args:
        ref: path to a reference file whose first column holds Illumina IDs,
            or None.

    Returns:
        With ref=None, the full map table (columns 'ilmn', 'cpg') read from
        ilmn2cpg_dict. Otherwise the map table right-merged onto the
        reference IDs, with IDs missing from the map table dropped (a
        warning listing them is printed) and 'cpg' cast to int.
    """
    # read Illumina-to-CpG_Index table:
    validate_single_file(ilmn2cpg_dict)
    df = pd.read_csv(ilmn2cpg_dict, sep='\t', header=None,
                     names=['ilmn', 'cpg'])
    if ref is None:
        return df

    # validate and read reference file
    validate_single_file(ref)
    rf = pd.read_csv(ref, header=None, usecols=[0], names=['ilmn'])

    # remove first row if it's not a cg entry
    # (str() guards against a non-string first cell, e.g. a numeric header)
    if not rf.empty:
        first = rf['ilmn'].iloc[0]
        if pd.isna(first) or not str(first).startswith('cg'):
            rf = rf.iloc[1:, :]

    # merge reference file with map table
    mf = df.merge(rf, how='right', on='ilmn')

    # if there are sites that appear in the reference but not in the map table,
    # remove them and print a warning
    missing = mf['cpg'].isna()
    if missing.any():
        msg = 'WARNING: Skipping some unrecognized Illumina IDs \n'
        msg += f'(not found in the map table {ilmn2cpg_dict})\n'
        msg += 'The missing sites: {}'.format(','.join(
            map(str, mf.loc[missing, 'ilmn'])))
        eprint(msg)

    # copy the filtered frame so the column assignment below does not write
    # into a slice of the merged frame
    mf = mf[~missing].copy()
    mf['cpg'] = mf['cpg'].astype(int)
    return mf
def validate_args(self):
    """Check min_cpg and the two required input files held in self.args.

    Raises IllegalArgumentError when min_cpg is smaller than 1. The blocks
    path and then the groups file are handed to validate_single_file.
    """
    args = self.args
    if args.min_cpg < 1:
        raise IllegalArgumentError('min_cpg must be a positive integer')
    for attr in ('blocks_path', 'groups_file'):
        validate_single_file(getattr(args, attr))
def betas2table(betas, blocks, groups_file, min_cov, threads=8,
                verbose=False):
    """Return get_table's result for all blocks of a blocks file.

    The blocks path is validated, the groups table is built with
    groups_load_wrap, the blocks are loaded with load_blocks_file, and
    everything is forwarded to get_table in a single call.
    """
    validate_single_file(blocks)
    groups = groups_load_wrap(groups_file, betas)
    return get_table(load_blocks_file(blocks), groups, min_cov, threads,
                     verbose)
def load_group_file(groups_file, betas):
    """Load a groups file and attach the matching beta path to every row.

    Both inputs are validated first. The table returned by load_gfile_helper
    gets a 'full_path' column computed by match_prefix_to_bin from its
    'fname' column and the given beta files.
    """
    validate_single_file(groups_file)
    validate_file_list(betas)

    groups = load_gfile_helper(groups_file)
    full_paths = match_prefix_to_bin(groups['fname'], betas, '.beta')
    groups['full_path'] = full_paths
    return groups