def validate_input(self):
    """
    Validate the input bam file and the output directory.

    Checks, in order:
      1. self.bam_path exists and has a '.bam' suffix.
      2. The bam is coordinate-sorted (per the first @HD header line).
      3. A .bai index exists; if not, generate one with 'samtools index'.
      4. self.out_dir is an existing directory.

    Raises:
        IllegalArgumentError: on any failed check.
    """
    # validate bam path:
    print('bam:', self.bam_path)  # NOTE(review): looks like a debug leftover — confirm it is intentional output
    if not (op.isfile(self.bam_path) and self.bam_path.endswith('.bam')):
        raise IllegalArgumentError('Invalid bam: {}'.format(self.bam_path))
    # check if bam is sorted by coordinate:
    # 'samtools view -H | head -1' prints the @HD line, which carries SO:coordinate when sorted
    peek_cmd = 'samtools view -H {} | head -1'.format(self.bam_path)
    if 'coordinate' not in subprocess.check_output(peek_cmd, shell=True).decode():
        raise IllegalArgumentError('bam file must be sorted by coordinate')
    # check if bam is indexed:
    if not (op.isfile(self.bam_path + '.bai')):
        print('bai file was not found! Generating...')
        # samtools index returns non-zero on failure
        r = subprocess.call(['samtools', 'index', self.bam_path])
        if r:
            raise IllegalArgumentError('Failed indexing bam: {}'.format(self.bam_path))
    # validate output dir:
    if not (op.isdir(self.out_dir)):
        raise IllegalArgumentError('Invalid output dir: {}'.format(self.out_dir))
def find_region_format(self, region): region = region.replace(',', '') # remove commas # In case region is a whole chromosome chrome_match = re.match(r'^(chr)?([\d]+|[XYM]|(MT))$', region) if chrome_match: if region not in self.genome.get_chroms(): raise IllegalArgumentError(f'Unknown chromosome: {region}') self.chrom = region return region, 1, self._chrome_size() # match region string to format chrom:from uni_region_match = re.match(r'^(chr)?([\d]+|[XYM]|(MT)):([\d]+)$', region) if uni_region_match: region_from = uni_region_match.group(4) region += f'-{int(region_from) + 1}' # match region string to format chrom:from-to region_match = re.match( r'^((chr)?([\d]+|[XYM]|(MT))):([\d]+)-([\d]+)$', region) if not region_match: raise IllegalArgumentError(f'Invalid genomic region: {region}') self.chrom = region_match.group(1) if self.chrom not in self.genome.get_chroms(): raise IllegalArgumentError(f'Unknown chromosome: {region}') region_from = int(region_match.group(5)) region_to = int(region_match.group(6)) return region, region_from, region_to
def generate_fai(fasta):
    """ Generate fai file if it does not exist.

    Runs 'samtools faidx' on the given FASTA path and returns the path of
    the resulting .fai index. If the FASTA is gzip- (not bgzip-) compressed,
    samtools cannot index it; an informative message is printed in that case.

    Raises:
        IllegalArgumentError: if indexing failed or no .fai file was produced.
    """
    fai_path = fasta + '.fai'
    # If the fai file already exists, there is nothing to do - return its path
    if op.isfile(fai_path):
        return fai_path
    # otherwise, generate it using samtools faidx:
    eprint(f'[wt init] Indexing {fasta}')
    cmd = f'samtools faidx {fasta}'
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, error = p.communicate()
    # if failed to generate fai, print informative message and raise exception
    if p.returncode:
        eprint("[wt init] Failed with samtools idxstats %d\n%s\n%s" % (p.returncode, output.decode(), error.decode()))
        # a gzipped FASTA must be bgzip-compressed for faidx; detect samtools' hint and explain
        if fasta.endswith('.gz') and 'please use bgzip' in error.decode():
            msg = f'[wt init] Seems like your reference FASTA cannot be indexed with samtools faidx.\n' \
                  f' Try one of the following:\n' \
                  f' 1. decompress it (gunzip {fasta}) and try again\n' \
                  f' 2. change the compression to bgzip:\n' \
                  f' gunzip {fasta} && bgzip {fasta[:-3]}'
            eprint(msg)
        raise IllegalArgumentError('[wt init] Invalid reference FASTA')
    if op.isfile(fai_path):
        eprint(f'[wt init] Generated index file: {fai_path}')
    else:
        # samtools exited 0 but produced no index - treat as failure
        raise IllegalArgumentError('[wt init] Failed to generate index file (fai)')
    return fai_path
def parse_region(self, region):
    """ Parse input of the type -r / --region (e.g chr11:200-300).

    Accepts either a whole chromosome ('chr11') or an explicit range
    ('chr11:200-300'); commas in numbers are ignored.
    Validates that the range is within the chromosome and start < end.
    Updates self.chrom, self.region_str, self.sites and self.bp_tuple.

    Raises:
        IllegalArgumentError: on malformed or out-of-range regions.
    """
    region = region.replace(',', '')  # remove commas (e.g. 1,000 -> 1000)
    chrome_match = re.match(r'^chr([\d]+|[XYM])$', region)
    # BUG FIX: the range pattern was not anchored at the end, so trailing
    # garbage (e.g. 'chr1:100-200xyz') was silently accepted and ignored.
    region_match = re.match(r'^chr([\d]+|[XYM]):([\d]+)-([\d]+)$', region)
    # In case region is a whole chromosome
    if chrome_match:
        self.chrom = 'chr' + chrome_match.group(1)
        region_from = 1
        region_to = self._chrome_size()
    # match region string to format chrom:from-to
    elif region_match:
        self.chrom = 'chr' + region_match.group(1)
        region_from = int(region_match.group(2))
        region_to = int(region_match.group(3))
        if region_to <= region_from:
            raise IllegalArgumentError(
                'Invalid genomic region: {}. end before start'.format(region))
        if region_to > self._chrome_size() or region_from < 1:
            raise IllegalArgumentError(
                'Invalid genomic region: {}. Out of range'.format(region))
    else:
        raise IllegalArgumentError('Invalid genomic region: {}'.format(region))
    # Update GR fields (region_str first: _region_str2sites may read it):
    self.region_str = region
    self.sites = self._region_str2sites()
    self.bp_tuple = (region_from, region_to)
def _sites_str_to_tuple(self, sites_str): """ extract integers tuple (e.g (120, 130)) from a sites string (e.g '120-130') """ if not sites_str: raise IllegalArgumentError(f'Empty sites string: {sites_str}') sites_str = sites_str.replace(',', '') # start-end syntax matchObj = re.match(r'([\d]+)-([\d]+)', sites_str) if matchObj: site1 = int(matchObj.group(1)) site2 = int(matchObj.group(2)) # single site syntax: elif '-' not in sites_str and sites_str.isdigit(): site1 = int(sites_str) site2 = site1 + 1 else: raise IllegalArgumentError( f'sites must be of format: "start-end" or "site" .\nGot: {sites_str}' ) # validate sites are in range: if not self.genome.get_nr_sites() + 1 >= site2 >= site1 >= 1: msg = 'sites violate the constraints: ' msg += f'{self.genome.get_nr_sites() + 1} >= {site2} > {site1} >= 1' raise IllegalArgumentError(msg) if site1 == site2: site2 += 1 return site1, site2
def stitch_2_dfs(b1, b2, params):
    """
    Stitch two adjacent segmentation chunks (b1, b2) into one block list.

    Re-segments a "patch" window straddling the boundary between the chunks
    and merges it with both sides once the patch overlaps each of them.
    Patch sizes start at up to 50 sites per side and grow (via
    increase_patch) until overlap is achieved or the chunk is exhausted.

    Raises:
        IllegalArgumentError: if the chunks are not adjacent, or no patch
            size yields an overlap on both sides.
    """
    # if b2 is not the direct extension of b1, we have a problem
    if b1[-1] != b2[0]:
        msg = '[wt segment] Patch stitching Failed! ' \
              ' patches are not supposed to be merged'
        raise IllegalArgumentError(msg)
    n1 = b1[-1] - b1[0]  # span (in sites) of the first chunk
    n2 = b2[-1] - b2[0]  # span (in sites) of the second chunk
    patch1_size = min(50, n1)
    patch2_size = min(50, n2)
    patch = np.array([], dtype=int)
    # NOTE(review): loop terminates only if increase_patch eventually exceeds n1/n2 — confirm
    while patch1_size <= n1 and patch2_size <= n2:
        # calculate blocks for patch:
        start = b1[-1] - patch1_size  #- 1
        end = b1[-1] + patch2_size
        cparams = dict(params, **{'sites': (start, end)})
        patch = segment_process(cparams)
        # find the overlaps
        if is_2_overlap(b1, patch) and is_2_overlap(patch, b2):
            # successful stitch with patches
            return merge2(merge2(b1, patch), b2)
        else:
            # failed stitch - increase patch sizes on whichever side failed
            if not is_2_overlap(b1, patch):
                patch1_size = increase_patch(patch1_size, n1)
            if not is_2_overlap(patch, b2):
                patch2_size = increase_patch(patch2_size, n2)
    # Failed: could not stich the two chuncks
    msg = '[wt segment] Patch stitching Failed! ' \
          ' Try increasing chunk size (--chunk_size flag)'
    raise IllegalArgumentError(msg)
def get_table(blocks_df, gf, min_cov, threads=8, verbose=False, group=True):
    """
    Reduce beta files to block values and attach them as columns to blocks_df.

    Each beta file listed in gf['full_path'] is reduced to one value per block
    (in parallel, 'threads' workers). If group is False, one column per file
    (named by gf['fname']) is added; otherwise columns are the per-group means
    (NaN-aware) over the group's files.

    Returns the augmented blocks_df.

    Raises:
        IllegalArgumentError: if reduction failed or returned a wrong number
            of values.
    """
    # BUG FIX: np.warnings was removed in NumPy >= 1.25; use the stdlib module.
    import warnings
    is_nice, _ = is_block_file_nice(blocks_df)
    if verbose:
        eprint(f'[wt table] reducing to {blocks_df.shape[0]:,} blocks')
    betas = drop_dup_keep_order(gf['full_path'])
    p = Pool(threads)
    params = [(b, blocks_df, is_nice, min_cov, verbose) for b in betas]
    arr = p.starmap(cwrap, params)
    p.close()
    p.join()
    # collect per-file results; workers may return None on failure
    dicts = [d for d in arr if d is not None]
    dres = {k: v for d in dicts for k, v in d.items()}
    if not group:
        # one column per input file, no averaging
        for b in gf['fname']:
            blocks_df[b] = dres[b]
        return blocks_df
    if not dres:
        eprint(f'[ wt table ] failed reducing {gf["fname"].tolist()} to blocks\n{blocks_df}')
        raise IllegalArgumentError()
    if dres[list(dres.keys())[0]].size != blocks_df.shape[0]:
        eprint(f'[ wt table] beta2block returned wrong number of values')
        raise IllegalArgumentError()
    groups = drop_dup_keep_order(gf['group'])
    # all-NaN blocks trigger a harmless 'Mean of empty slice' warning - suppress it
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', r'Mean of empty slice')
        for group in groups:
            blocks_df[group] = np.nanmean(np.concatenate([dres[k][None, :] for k \
                in gf['fname'][gf['group'] == group]]), axis=0).T
    return blocks_df
def main():
    """ Generate homog files. Given a blocks file and pat[s], count the number of U,X,M reads for each block for each file """
    args = parse_args()
    # nr_bits determines the per-count integer width of the output
    if args.nr_bits not in (8, 16):
        raise IllegalArgumentError('nr_bits must be in {8, 16}')
    if args.rlen < 3:
        raise IllegalArgumentError('rlen must be >= 3')
    # thresholds: comma-separated pair 0 < low < high < 1 separating U/X/M
    if args.thresholds is not None:
        th = args.thresholds.split(',')
        if not len(th) == 2:  # and th[0].is_number():
            raise IllegalArgumentError('Invalid thresholds')
        th = float(th[0]), float(th[1])
        if not (1 > th[1] > th[0] > 0):
            raise IllegalArgumentError('Invalid thresholds')
    # make sure homog tool is valid:
    validate_local_exe(homog_tool)
    pats = args.input_files
    validate_file_list(pats, '.pat.gz')
    outdir, prefix = parse_outdir_prefix(args)
    # load blocks:
    blocks_df = load_blocks_file(args.blocks_file)
    is_nice, msg = is_block_file_nice(blocks_df)
    if not is_nice:
        homog_log(msg)
        raise IllegalArgumentError(f'Invalid blocks file: {args.blocks_file}')
    # process each pat file independently, in a stable (sorted) order
    for pat in sorted(pats):
        homog_process(pat, blocks_df, args, outdir, prefix)
def validate_file(self):
    """
    Ensure the input file exists and carries a supported suffix.

    The accepted suffixes are the file type's own suffix and its
    gzipped variant (e.g. '.pat' / '.pat.gz').

    Raises:
        IllegalArgumentError: if the file is missing or has a bad suffix.
    """
    if not op.isfile(self.in_file):
        raise IllegalArgumentError(f'no such file: {self.in_file}')
    base_suff = self.ftype.suff
    allowed = (base_suff, base_suff + '.gz')
    if self.suff not in allowed:
        raise IllegalArgumentError('Index only supports pat, bed formats')
def validate_file(self):
    """
    Ensure the input file exists and carries a supported suffix.

    Every suffix listed by the file type is accepted, with or without
    a trailing '.gz' (pat / unq / bed / tsv).

    Raises:
        IllegalArgumentError: if the file is missing or has a bad suffix.
    """
    if not op.isfile(self.in_file):
        raise IllegalArgumentError("no such file: {}".format(self.in_file))
    base_suffs = self.ftype.suffixes
    allowed = [s + '.gz' for s in base_suffs] + base_suffs
    if self.suff not in allowed:
        raise IllegalArgumentError('Index only supports pat, unq, bed, tsv formats')
def load_blocks_file(blocks_path, nrows=None):
    """
    Load a tab-separated blocks file into a DataFrame with COORDS_COLS5 columns.

    Detects whether the file has a header line, validates that it has at
    least the expected number of columns and that endCpG >= startCpG for
    rows without missing values. startCpG/endCpG are loaded as nullable
    Int64 (cast to plain int when no values are missing).

    Args:
        blocks_path: path to the blocks file.
        nrows: optional cap on the number of rows to load.

    Returns:
        The loaded DataFrame, or an empty DataFrame on parse errors.

    Raises:
        IllegalArgumentError: on too few columns or invalid CpG columns.
    """
    # validate blocks_path
    validate_single_file(blocks_path)
    try:
        # see if blocks_path has a header: the 2nd field of the first row is
        # numeric for headerless files
        peek_df = pd.read_csv(blocks_path, sep='\t', nrows=1, header=None, comment='#')
        header = None if str(peek_df.iloc[0, 1]).isdigit() else 0
        names = COORDS_COLS5
        if len(peek_df.columns) < len(names):
            msg = f'Invalid blocks file: {blocks_path}. less than {len(names)} columns.\n'
            msg += f'Run wgbstools convert -L {blocks_path} -o OUTPUT_REGION_FILE to add the CpG columns'
            raise IllegalArgumentError(msg)
        # load; nullable Int64 allows blocks with missing CpG indices
        dtypes = {'startCpG': 'Int64', 'endCpG': 'Int64'}
        # BUG FIX: nrows was hard-coded to None, silently ignoring the
        # caller's nrows argument
        df = pd.read_csv(blocks_path, sep='\t', usecols=range(len(names)),
                         dtype=dtypes, header=header, names=names,
                         nrows=nrows, comment='#')
        # blocks start before they end - invalid file
        dfnona = df.dropna()  # allow blocks with missing values
        if not ((dfnona['endCpG'] - dfnona['startCpG']) >= 0).all():
            raise IllegalArgumentError(f'Invalid CpG columns in blocks file {blocks_path}')
        if dfnona.shape[0] == df.shape[0]:
            df['startCpG'] = df['startCpG'].astype(int)
            df['endCpG'] = df['endCpG'].astype(int)
    except pd.errors.ParserError as e:
        eprint(f'Invalid input file.\n{e}')
        return pd.DataFrame()
    except pd.errors.EmptyDataError as e:
        eprint(f'Empty blocks file.\n{e}')
        return pd.DataFrame()
    return df
def set_regions(self):
    """
    Determine the list of genomic regions to process.

    If the user specified a region, return it alone. Otherwise, list the
    chromosomes present in the bam (samtools idxstats) and intersect them
    with the reference genome's chromosomes.

    Returns:
        A list of region strings (possibly empty if idxstats failed).

    Raises:
        IllegalArgumentError: if the bam and the reference share no
            chromosome names.
    """
    # if user specified a region, just use it
    if self.gr.region_str:
        return [self.gr.region_str]
    # get all chromosomes present in the bam file header
    cmd = f'samtools idxstats {self.bam_path} | cut -f1 '
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, error = p.communicate()
    if p.returncode or not output:
        eprint("[wt bam2pat] Failed with samtools idxstats %d\n%s\n%s" %
               (p.returncode, output.decode(), error.decode()))
        eprint(cmd)
        # BUG FIX: message typo 'falied' -> 'failed'
        eprint('[wt bam2pat] failed to find chromosomes')
        return []
    bam_chroms = output.decode()[:-1].split('\n')
    # get all chromosomes from the reference genome:
    ref_chroms = self.gr.genome.get_chroms()
    # intersect the chromosomes from the bam and from the reference
    intersected_chroms = list(set(bam_chroms) & set(ref_chroms))
    if not intersected_chroms:
        msg = '[wt bam2pat] Failed retrieving valid chromosome names. '
        msg += 'Perhaps you are using a wrong genome reference. '
        msg += 'Try running:\n\t\twgbstools set_default_ref -ls'
        # BUG FIX: msg was built but never used - the exception carried
        # only 'Failed', hiding the actionable advice from the user
        raise IllegalArgumentError(msg)
    return list(
        sorted(intersected_chroms, key=chromosome_order)
    )  # todo use the same order as in ref_chroms instead of resorting it
def __init__(self, args):
    """
    Initialize the marker-finder state from parsed CLI args.

    Validates arguments, creates the output directory if missing, loads
    the groups file, and verifies that the requested target group exists.

    Raises:
        IllegalArgumentError: via validate_args, or if the target group is
            not listed in the groups file.
    """
    self.args = args
    self.dfU = pd.DataFrame()   # per-block hypo-methylation values (filled later)
    self.dfM = pd.DataFrame()   # per-block hyper-methylation values (filled later)
    self.blocks = pd.DataFrame()
    self.nr_blocks = 0
    self.orig_nr_blocks = 0
    self.keepinds = None
    self.groups = None
    self.verbose = args.verbose
    self.hyper, self.hypo = self.set_hypo_hyper(args.hyper, args.hypo)
    self.validate_args()
    # validate output dir:
    if not op.isdir(args.out_dir):
        os.mkdir(args.out_dir)
    # load groups
    self.gf = load_groups_file(args.groups_file, args.input_dir, args.verbose)
    # keep one row per file name for per-sample processing
    self.gf_nodup = self.gf.drop_duplicates(subset='fname').reset_index(drop=True)
    # validate target is in groups file
    target = self.args.target
    if target and target not in self.gf['group'].values:
        eprint(f'target {target} not in groups file {self.args.groups_file}')
        eprint('Possible targets:', sorted(self.gf['group'].unique()))
        raise IllegalArgumentError()
def load_bins(self):
    """
    Load per-sample homog binary files into U and M matrices.

    Infers the on-disk integer width (uint8/uint16) from the first file's
    size divided by the block count and column count, reads every sample's
    binary table, filters rows by self.keepinds, and converts counts to
    per-block U ('hypo') and M ('hyper') values via table2vec.

    Returns:
        (dfU, dfM): DataFrames (empty when the respective side is disabled).

    Raises:
        IllegalArgumentError: if the file size is inconsistent with the
            number of blocks.
    """
    if self.verbose:
        eprint('loading bins...')
    # uxm files carry 3 counters per block (U/X/M), others carry 2
    nr_cols = (3 if self.args.uxm else 2)
    # bytes per single counter, derived from the first sample's file size
    binsize = self.gf['binsize'][0] / self.orig_nr_blocks
    binsize /= nr_cols
    if binsize != int(binsize):
        raise IllegalArgumentError('Error: bin file size does not match blocks number')
    dtype = np.uint8 if binsize == 1 else np.uint16
    dfU = pd.DataFrame()
    dfM = pd.DataFrame()
    # BUG FIX: np.float was removed in NumPy 1.24; use the builtin float
    if self.hypo:
        dfU = np.zeros((self.nr_blocks, self.gf_nodup.shape[0]), dtype=float)
    if self.hyper:
        dfM = np.zeros((self.nr_blocks, self.gf_nodup.shape[0]), dtype=float)
    from tqdm import tqdm  # todo: only if installed
    for ind, row in tqdm(self.gf_nodup.iterrows(), total=self.gf_nodup.shape[0]):
        data = np.fromfile(row['full_path'], dtype).reshape((-1, nr_cols))[self.keepinds, :]
        # BUG FIX: 'self.arsg' typo (AttributeError at runtime) -> 'self.args'
        if self.hypo:
            dfU[:, ind] = table2vec(data, 'U', self.args.min_cov)
        if self.hyper:
            dfM[:, ind] = table2vec(data, 'M', self.args.min_cov)
    return self.array2df(dfU), self.array2df(dfM)
def main():
    """
    View the content of input file (pat/beta) as plain text.
    Possible filter by genomic region or sites range
    Output to stdout as default
    """
    parser = parse_args()
    args = parser.parse_args()
    # sub-sampling rate, when given, must be a valid probability
    if args.sub_sample is not None and not 1 >= args.sub_sample >= 0:
        parser.error('[wt view] sub-sampling rate must be within [0.0, 1.0]')
    # validate input file
    input_file = args.input_file
    validate_single_file(input_file)
    try:
        # dispatch on the input file's format by suffix
        if input_file.endswith('.beta'):
            gr = GenomicRegion(args)
            view_beta(input_file, gr, args.out_path, args.bed_file)
        elif op.splitext(input_file)[1] in ('.lbeta', '.bin'):
            view_other_bin(input_file, args)
        elif input_file.endswith('.pat.gz'):
            cview(input_file, args)
        else:
            raise IllegalArgumentError('Unknown input format:', input_file)
    except BrokenPipeError:
        # e.g. piping the output to 'head' closes stdout early - not an error
        catch_BrokenPipeError()
def __init__(self, args=None, region=None, name='hg19'):
    """
    Initialize a GenomicRegion, from CLI args or an explicit region string.

    Exactly one source is used: if args is given, its .sites or .region
    fields are parsed; otherwise an explicit region string is required.

    Raises:
        IllegalArgumentError: if neither args nor region is provided.
    """
    self.genome_name = name
    self.chrom = None
    self.sites = None        # (start, end) CpG-site indices, set by the parsers
    self.region_str = None   # normalized region string, set by the parsers
    self.bp_tuple = None     # (start, end) base-pair coordinates, set by the parsers
    self.chrs_sz = None  # DataFrame of chromosomes sizes (in number of sites)
    self.name = name
    self.args = args
    # todo: this could be prettier
    if args is not None:
        self.name = args.genome
        self.genome = GenomeRefPaths(self.name)
        # NOTE(review): when args has neither .sites nor .region, no region is
        # parsed and sites stay None - presumably whole-genome mode; confirm
        if args.sites:
            self.parse_sites(args.sites)
        elif args.region:
            self.parse_region(args.region)
    elif region is not None:
        self.genome = GenomeRefPaths(self.name)
        self.parse_region(region)
    else:
        raise IllegalArgumentError('Invalid GR init {}'.format(region))
    self.nr_sites = None if self.sites is None else self.sites[1] - self.sites[0]
    self.annotation = self.add_anno()
def get_fasta(self):
    """
    Obtain a reference FASTA for self.name.

    If self.ref_path was provided, just validate it. Otherwise download
    {name}.fa.gz from UCSC into self.out_dir, then re-compress it with
    bgzip (so samtools can index it) and store the path in self.ref_path.

    Raises:
        IllegalArgumentError: if the download fails.
    """
    # download fasta from UCSC, unless the fasta file is provided
    if self.ref_path is not None:
        validate_single_file(self.ref_path)
        return
    # no FASTA path provided. Attempt to download one
    ref_path = op.join(self.out_dir, f'{self.name}.fa.gz')
    url = f'https://hgdownload.soe.ucsc.edu/goldenPath/{self.name}/bigZips/{self.name}.fa.gz'
    cmd = f'curl {url} -o {ref_path}'
    eprint(f'[wt init] No reference FASTA provided. Attempting to download from\n\t{url}')
    # BUG FIX: stderr was not piped, so 'error' was None and error.decode()
    # raised AttributeError on the failure path
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, error = p.communicate()
    if p.returncode:
        eprint(f'[wt init] Failed downloading reference for genome {self.name}: %d\n%s\n%s' %
               (p.returncode, output.decode(), error.decode()))
        eprint(f'[wt init] Try downloading yourself and use --fasta_name flag, or check the "name" parameter')
        raise IllegalArgumentError(f'[wt init] No reference FASTA found')
    eprint(f'[wt init] successfully downloaded FASTA. Now gunzip and bgzip it...')
    # re-compress with bgzip: samtools faidx cannot index plain gzip
    cmd = f'gunzip {ref_path} && bgzip -@ {self.args.threads} {ref_path[:-3]}'
    subprocess.check_call(cmd, shell=True)
    self.ref_path = ref_path
def __init__(self, args=None, region=None, sites=None, genome_name=None):
    """
    Initialize a GenomicRegion from CLI args, a region string, or a sites range.

    Exactly one source is used, in this priority: args (its .sites or
    .region fields), then the explicit region string, then the explicit
    sites range.

    Raises:
        IllegalArgumentError: if none of args/region/sites is provided.
    """
    self.genome_name = get_genome_name(genome_name)
    self.chrom = None
    self.sites = sites           # (start, end) CpG-site indices
    self.region_str = region     # region string (normalized by the parsers)
    self.bp_tuple = None         # (start, end) base-pair coordinates
    self.args = args
    # todo: this could be prettier
    if args is not None:
        self.genome_name = get_genome_name(args.genome)
        self.genome = GenomeRefPaths(self.genome_name)
        # NOTE(review): when args has neither .sites nor .region, no region is
        # parsed and sites stay None - presumably whole-genome mode; confirm
        if args.sites:
            self.parse_sites(args.sites)
        elif args.region:
            self.parse_region(args.region)
    elif region is not None:
        self.genome = GenomeRefPaths(self.genome_name)
        self.parse_region(region)
    elif sites is not None:
        self.genome = GenomeRefPaths(self.genome_name)
        self.parse_sites(sites)
    else:
        raise IllegalArgumentError(f'Invalid GR init {region}')
    self.nr_sites = None if self.sites is None else self.sites[1] - self.sites[0]
    self.annotation = self.add_anno()
def load_fai(self):
    """ Generate, link and load the fai file to a DataFrame.

    Ensures a .fai index exists for self.ref_path, links the FASTA and its
    index files into the output dir as 'genome.fa[.gz]', and returns the
    parsed index as a DataFrame with columns chr/size/offset/width,
    restricted to valid chromosomes and (unless --no_sort) sorted in
    canonical chromosome order.

    Raises:
        IllegalArgumentError: if the fai file cannot be parsed.
    """
    fai_path = generate_fai(self.ref_path)
    # Link fa + fai (or fa.gz+fa.gz.gzi+fa.gz.fai) to the output dir
    fasta_name = 'genome.fa' + ('.gz' if self.ref_path.endswith('.gz') else '')
    # fasta_name = 'genome.fa'
    self.link_file(self.ref_path, fasta_name)
    self.link_file(fai_path, fasta_name + '.fai')
    if fasta_name.endswith('.gz'):
        # bgzipped FASTAs also carry a .gzi random-access index
        self.link_file(self.ref_path + '.gzi', fasta_name + '.gzi')
    # load fai file
    try:
        # fai columns: name, length, offset, linebases, linewidth;
        # column 3 (linebases) is skipped, 4 (linewidth) is kept as 'width'
        df = pd.read_csv(fai_path, sep='\t', header=None, usecols=[0, 1, 2, 4],
                         names=['chr', 'size', 'offset', 'width'])
        # filter invalid chromosomes
        df = df[df.apply(lambda x: is_valid_chrome(x['chr']), axis=1)]
        # sort chromosomes:
        if not self.args.no_sort:
            df = pd.DataFrame(sorted(df['chr'], key=chromosome_order),
                              columns=['chr']).merge(df, how='left')
        return df
    except pd.errors.ParserError as e:
        raise IllegalArgumentError(f'Invalid fai file.\n{e}')
def insert_read_to_table(self, read, table, shift):
    """
    Insert a single pat read into the visualization table.

    read is a pat record: (chrom, start, pattern, count); 'shift' is the
    first CpG index shown, so column = start - shift. The read's pattern is
    written into the first free row at that column (repeated up to
    self.max_reps times for count > 1), with 1s marking the blank cells
    before and just after it.

    Raises:
        IllegalArgumentError: if a read starts before 'shift' (unsorted pat).
    """
    read_start = int(read[1])
    patt = read[2]
    count = int(read[3])
    # skip empty (all dots) reads:
    if not patt.strip('.'):
        return
    if self.uxm:
        # collapse the pattern to its U/X/M classification
        patt = self.read_uxm(patt, count)
    patt_ints = [str2int[l] for l in patt]
    # perform multiple times for reads with count > 1, but no more than "max_reps" times:
    for c in range(min(self.max_reps, count)):
        # find the relative starting point of the current read
        col = read_start - shift
        if col < 0:
            raise IllegalArgumentError('Error: Pat is not sorted!')
        # find the first available row to insert current read:
        if self.args.no_dense:  # no_dense: present each read in a new line
            row = np.argmin(table.sum(axis=1))
        else:
            row = np.argmin(table[:, col])
        # make sure the slots are free
        assert (table[row, col:col + len(patt)].sum() == 0)
        assert (row < table.shape[0])
        # insert read and spaces:
        table[row, col:col + len(patt)] = patt_ints
        table[row, :col][table[row, :col] == 0] = 1  # before read
        # NOTE(review): if the read ends at the table's last column,
        # col + len(patt) indexes out of bounds - presumably callers
        # allocate an extra column; confirm
        table[row, col + len(patt)] = 1  # after read
def load_seq_by_chrom(chrom, ref_path, fai_df, debug): eprint(chrom) # get chromosome's location in the fasta chrom, size, offset, width = fai_df[fai_df['chr'] == chrom].values[0] # load the chromosome's subsequence from fasta with open(ref_path, 'r') as f: f.seek(offset) nr_lines = size // ( width - 1) + 1 # number of lines to read for current chromosome to_read = nr_lines * width if debug: to_read = min(to_read, 100 * width) txt = f.read(to_read) seq = ''.join(s.strip() for s in txt.split('\n')).upper() # remove possible trailing characters (belonging to the next chromosome) end_pos = seq.rfind('>') if end_pos != -1: seq = seq[:end_pos] # validate sequence length if len(seq) != size and not debug: raise IllegalArgumentError('Error while loading {} from fasta: ' 'read {} bases instead of {}'.format( chrom, len(seq), size)) # Find CpG sites loci tf = pd.DataFrame([m.start() + 1 for m in re.finditer('CG', seq)], columns=['loc']) tf['chr'] = chrom return tf[['chr', 'loc']]
def set_regions(self):
    """
    Determine the list of chromosomes (regions) to process.

    If the user specified a region, return it alone. Otherwise list the
    chromosomes in the bam via samtools idxstats and keep only canonical
    ones: chr1..chrN/chrX/chrY/chrM when the bam uses 'chr' prefixes,
    otherwise names present in CHROMS.

    Returns:
        Sorted list of chromosome names (empty if idxstats failed).

    Raises:
        IllegalArgumentError: if no valid chromosome name remains.
    """
    if self.gr.region_str:
        return [self.gr.region_str]
    # list all reference names present in the bam
    cmd = f'samtools idxstats {self.bam_path} | cut -f1 '
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, error = p.communicate()
    if p.returncode or not output:
        eprint("[wt bam2pat] Failed with samtools idxstats %d\n%s\n%s" %
               (p.returncode, output.decode(), error.decode()))
        eprint(cmd)
        eprint('[wt bam2pat] falied to find chromosomes')
        return []
    nofilt_chroms = output.decode()[:-1].split('\n')
    # bam uses 'chr' prefixes: keep only canonical chrN/chrX/chrY/chrM names
    filt_chroms = [c for c in nofilt_chroms if 'chr' in c]
    if filt_chroms:
        filt_chroms = [c for c in filt_chroms if re.match(r'^chr([\d]+|[XYM])$', c)]
    else:
        # no 'chr' prefix: fall back to the project's known chromosome list
        filt_chroms = [c for c in nofilt_chroms if c in CHROMS]
    chroms = list(sorted(filt_chroms, key=chromosome_order))
    if not chroms:
        eprint('[wt bam2pat] Failed retrieving valid chromosome names')
        raise IllegalArgumentError('Failed')
    return chroms
def pat2beta(pat_path, out_dir, args, force=True):
    """
    Convert a pat file to a beta (or lbeta) file.

    For indexed pat.gz inputs with args.threads > 1 the conversion runs in
    parallel (mult_pat2beta); otherwise the pat stream is piped through the
    pat2beta tool and its text output parsed into an (N, 2) count array.

    Args:
        pat_path: path to a .pat or .pat.gz file.
        out_dir: output directory.
        args: parsed CLI args (lbeta, threads, genome).
        force: overwrite an existing output file.

    Returns:
        Path of the written beta/lbeta file (None if skipped).

    Raises:
        IllegalArgumentError: for an unrecognized input suffix.
    """
    validate_single_file(pat_path)
    # choose the decompression command for the pipe below
    if pat_path.endswith('.pat.gz'):
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError(f'Invalid pat suffix: {pat_path}')
    suff = '.lbeta' if args.lbeta else '.beta'
    out_beta = op.join(out_dir, splitextgz(op.basename(pat_path))[0] + suff)
    if not delete_or_skip(out_beta, force):
        return
    if args.threads > 1 and pat_path.endswith('.pat.gz') and op.isfile(pat_path + '.csi'):
        arr = mult_pat2beta(pat_path, args)
    else:
        nr_sites = GenomeRefPaths(args.genome).get_nr_sites()
        cmd += f' {pat_path} | {pat2beta_tool} {1} {nr_sites + 1}'
        x = subprocess.check_output(cmd, shell=True).decode()
        # BUG FIX: np.fromstring is deprecated (and removed for text input in
        # recent NumPy); parse the whitespace-separated output explicitly
        arr = np.array(x.split(), dtype=int).reshape((-1, 2))
    trim_to_uint8(arr, args.lbeta).tofile(out_beta)
    return out_beta
def validate_args(self):
    """
    Validate find-markers parameters.

    Checks integer bounds, [0, 1] fractions, hyper/hypo exclusivity, the
    required input files, and that exactly one of betas/beta_list_file was
    given (loading the list file into self.betas when present).

    Raises:
        IllegalArgumentError: on any invalid parameter.
    """
    # validate integers
    if self.min_cpg < 0:
        raise IllegalArgumentError('min_cpg must be non negative')
    if self.max_cpg < 1:
        raise IllegalArgumentError('max_cpg must larger than 0')
    if self.min_bp < 0:
        raise IllegalArgumentError('min_bp must be non negative')
    if self.max_bp < 2:
        raise IllegalArgumentError('max_bp must larger than 1')
    if self.chunk_size < 1:
        raise IllegalArgumentError('chunk_size must larger than 1')
    # validate the [0.0, 1.0] fractions
    for key in ('na_rate_tg', 'na_rate_bg', 'delta', 'tg_quant',
                'bg_quant', 'unmeth_thresh', 'meth_thresh',
                'unmeth_mean_thresh', 'meth_mean_thresh'):
        val = getattr(self, key)
        if not (1.0 >= val >= 0):
            # BUG FIX: the original message referenced undefined names
            # (val/low/high), raising NameError instead of the intended error
            eprint(f'Invalid value for {key} ({val}): must be in [0.0, 1.0]')
            raise IllegalArgumentError()
    # validate hyper hypo:
    if self.only_hyper and self.only_hypo:
        eprint(f'at most one of (only_hyper, only_hypo) can be specified')
        raise IllegalArgumentError()
    # validate input files
    for key in ('blocks_path', 'groups_file'):
        val = getattr(self, key)
        if val is None:
            eprint(f'[wt fm] missing required parameter: {key}')
            raise IllegalArgumentError()
        validate_single_file(val)
        # change path to absolute path
        setattr(self, key, op.abspath(val))
    # validate betas: exactly one of (betas, beta_list_file) must be given
    if (self.betas is None and self.beta_list_file is None) or \
            (self.betas is not None and self.beta_list_file is not None):
        eprint(f'[wt fm] Exactly one of the following must be specified: betas, beta_list_file')
        raise IllegalArgumentError()
    if self.beta_list_file:
        validate_single_file(self.beta_list_file)
        with open(self.beta_list_file, 'r') as f:
            self.betas = [l.strip() for l in f.readlines()]
    validate_file_list(self.betas)
def _sites_str_to_tuple(self, sites_str): """ extract integers tuple (e.g (120, 130)) from a sites string (e.g '120-130') """ if sites_str: sites_str = sites_str.replace(',', '') matchObj = re.match(r'([\d]+)-([\d]+)', sites_str) if matchObj: site1 = int(matchObj.group(1)) site2 = int(matchObj.group(2)) if not self.genome.nr_sites + 1 >= site2 > site1 >= 1: msg = 'sites violate the constraints: ' msg += '{} >= {} > {} >= 1'.format( self.genome.nr_sites + 1, site2, site1) raise IllegalArgumentError(msg) return site1, site2 raise IllegalArgumentError( 'sites must be of format: ([\d])-([\d]).\nGot: {}'.format( sites_str))
def load_gfile_helper(groups_file):
    """
    Load and validate a groups csv file.

    The file must contain a 'group' column; its first column (whatever its
    header) is taken as the file name. If an 'include' boolean column is
    present, rows with include == False are dropped.

    Returns:
        DataFrame with columns ['fname', 'group'], NaN rows removed.

    Raises:
        IllegalArgumentError: on a missing 'group' column or a non-boolean
            'include' column.
    """
    # load and validate csv
    gf = pd.read_csv(groups_file, index_col=False, comment='#')
    if 'group' not in gf.columns:
        # BUG FIX: typo in user-facing message ('gropus' -> 'groups')
        raise IllegalArgumentError('groups file must have a column named "group"')
    # drop samples where include==False
    if 'include' in gf.columns:
        if gf['include'].dtype != bool:
            eprint('Invalid group file')
            raise IllegalArgumentError('Invalid group file. Include column must be boolean')
        gf = gf[gf['include']]
    # first column is the file name, regardless of its original header
    gf = gf.rename(columns={gf.columns[0]: 'fname'})
    gf = gf[['fname', 'group']].dropna().reset_index(drop=True)
    return gf
def parse_region(self, region):
    """ Parse input of the type -r / --region (e.g chr11:200-300).

    Delegates the string parsing to find_region_format, validates that the
    range is non-empty and inside the chromosome, then fills in the GR
    fields (region_str, bp_tuple, sites).

    Raises:
        IllegalArgumentError: for an empty or out-of-range region.
    """
    self.region_str, start, end = self.find_region_format(region)
    # validate region range:
    if end <= start:
        raise IllegalArgumentError(f'Invalid genomic region: {region}. end before start')
    if end > self._chrome_size() or start < 1:
        raise IllegalArgumentError(f'Invalid genomic region: {region}. Out of range')
    # Update GR fields (bp_tuple first: _region_str2sites may rely on it):
    self.bp_tuple = (start, end)
    self.sites = self._region_str2sites()
def validate_rates(self, rates):
    """
    Validate (and possibly complete) the requested per-pat mixing rates.

    If one rate is missing, it is inferred as 1 - sum(rates) (note: this
    appends to the caller's list in place). The final list must have one
    rate per pat, each in [0, 1], summing to 1 (within 1e-8). The rates
    are recorded in the stats table under 'ReqstRates'.

    Returns:
        The validated rates list.

    Raises:
        IllegalArgumentError: on a wrong count, bad sum, or out-of-range rate.
    """
    # allow the user to omit the last rate; infer it from the rest
    if len(rates) == self.nr_pats - 1:
        rates.append(1.0 - np.sum(rates))
    if len(rates) != self.nr_pats:
        raise IllegalArgumentError('len(rates) must be in {len(files), len(files) - 1}')
    if np.abs(np.sum(rates) - 1) > 1e-8:
        raise IllegalArgumentError('Sum(rates) == {} != 1'.format(np.sum(rates)))
    if np.min(rates) < 0 or np.max(rates) > 1:
        # BUG FIX: message claimed the half-open range [0, 1) while the
        # check permits exactly 1.0 (a single-pat mixture)
        raise IllegalArgumentError('rates must be in range [0, 1]')
    self.add_stats_col('ReqstRates', rates)
    return rates
def validate_input(self):
    """
    Validate the input bam file and the output directory.

    Raises:
        IllegalArgumentError: if the output directory does not exist
            (validate_bam raises its own errors for a bad bam).
    """
    # validate bam path:
    validate_bam(self.bam_path)
    # validate output dir:
    if not op.isdir(self.out_dir):
        raise IllegalArgumentError('Invalid output dir: {}'.format(self.out_dir))
def __init__(self, args):
    """
    Initialize state from parsed CLI args.

    Builds the GenomicRegion for the requested region/sites, validates the
    output directory, and loads the genome's chromosome sizes table.

    Raises:
        IllegalArgumentError: if the output directory does not exist.
    """
    self.args = args
    self.gr = GenomicRegion(args)
    self.outdir = args.outdir
    self.name = ''  # set later by callers
    if not op.isdir(self.outdir):
        raise IllegalArgumentError('Invalid output directory: ' + self.outdir)
    self.chrom_sizes = GenomeRefPaths(args.genome).chrom_sizes