Пример #1
0
    def validate_input(self):
        """
        Validate the input bam file and the output directory.

        Raises IllegalArgumentError if the bam path is invalid, the bam
        is not coordinate-sorted, indexing it fails, or out_dir does not
        exist.
        """

        # validate bam path: must be an existing file with a .bam suffix
        if not (op.isfile(self.bam_path) and self.bam_path.endswith('.bam')):
            raise IllegalArgumentError('Invalid bam: {}'.format(self.bam_path))

        # check if bam is sorted by coordinate:
        # inspect the first header line (@HD), which carries the SO: tag.
        # Use list arguments instead of a shell pipeline, so the bam path
        # is never interpreted by a shell.
        header = subprocess.check_output(
            ['samtools', 'view', '-H', self.bam_path]).decode()
        if 'coordinate' not in header.split('\n', 1)[0]:
            raise IllegalArgumentError('bam file must be sorted by coordinate')

        # check if bam is indexed; generate the index (bai) if missing:
        if not op.isfile(self.bam_path + '.bai'):
            print('bai file was not found! Generating...')
            r = subprocess.call(['samtools', 'index', self.bam_path])
            if r:
                raise IllegalArgumentError('Failed indexing bam: {}'.format(
                    self.bam_path))

        # validate output dir:
        if not op.isdir(self.out_dir):
            raise IllegalArgumentError('Invalid output dir: {}'.format(
                self.out_dir))
Пример #2
0
    def find_region_format(self, region):
        """
        Parse a genomic region string into (region_str, start, end).

        Accepts "chrN" (whole chromosome), "chrN:from" (expanded to a
        single-bp region), or "chrN:from-to". Sets self.chrom as a side
        effect. Raises IllegalArgumentError for unknown chromosomes or a
        malformed region string.
        """
        region = region.replace(',', '')  # drop thousands separators

        # whole chromosome, e.g. "chr7", "X", "MT"
        if re.match(r'^(chr)?([\d]+|[XYM]|(MT))$', region):
            if region not in self.genome.get_chroms():
                raise IllegalArgumentError(f'Unknown chromosome: {region}')
            self.chrom = region
            return region, 1, self._chrome_size()

        # single locus "chrom:from" - expand to a 1bp "chrom:from-(from+1)"
        single_locus = re.match(r'^(chr)?([\d]+|[XYM]|(MT)):([\d]+)$',
                                region)
        if single_locus:
            start_pos = single_locus.group(4)
            region += f'-{int(start_pos) + 1}'

        # full "chrom:from-to" form
        full_match = re.match(
            r'^((chr)?([\d]+|[XYM]|(MT))):([\d]+)-([\d]+)$', region)
        if full_match is None:
            raise IllegalArgumentError(f'Invalid genomic region: {region}')

        self.chrom = full_match.group(1)
        if self.chrom not in self.genome.get_chroms():
            raise IllegalArgumentError(f'Unknown chromosome: {region}')

        return region, int(full_match.group(5)), int(full_match.group(6))
Пример #3
0
def generate_fai(fasta):
    """
    Return the path of the fai index for `fasta`, generating it with
    `samtools faidx` if it does not already exist.

    Raises IllegalArgumentError if indexing fails or the index file is
    not created.
    """
    fai_path = fasta + '.fai'

    # If the fai file already exists, just return its path
    # (the original comment here said the opposite of what the code does)
    if op.isfile(fai_path):
        return fai_path

    # otherwise, generate it using samtools faidx:
    eprint(f'[wt init] Indexing {fasta}')
    cmd = f'samtools faidx {fasta}'
    p = subprocess.Popen(cmd,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, error = p.communicate()
    # if failed to generate fai, print informative message and raise exception
    if p.returncode:
        # the failing tool is faidx (the message previously said idxstats)
        eprint("[wt init] Failed with samtools faidx %d\n%s\n%s" %
               (p.returncode, output.decode(), error.decode()))
        if fasta.endswith('.gz') and 'please use bgzip' in error.decode():
            # gzip-compressed FASTA cannot be indexed; suggest workarounds
            msg = f'[wt init] Seems like your reference FASTA cannot be indexed with samtools faidx.\n' \
                    f'     Try one of the following:\n' \
                    f'     1. decompress it (gunzip {fasta}) and try again\n' \
                    f'     2. change the compression to bgzip:\n' \
                    f'        gunzip {fasta} && bgzip {fasta[:-3]}'
            eprint(msg)
        raise IllegalArgumentError('[wt init] Invalid reference FASTA')
    if op.isfile(fai_path):
        eprint(f'[wt init] Generated index file: {fai_path}')
    else:
        raise IllegalArgumentError(
            '[wt init] Failed to generate index file (fai)')
    return fai_path
Пример #4
0
    def parse_region(self, region):
        """
        Parse input of the type -r / --region (e.g chr11:200-300).

        Accepts either a whole chromosome ("chr11") or an explicit range
        ("chr11:200-300"). Updates self.chrom, self.region_str,
        self.sites and self.bp_tuple. Raises IllegalArgumentError for
        malformed, reversed, or out-of-range regions.
        """
        region = region.replace(',', '')  # remove commas
        chrome_match = re.match(r'^chr([\d]+|[XYM])$', region)
        # anchored with $ so trailing garbage (e.g "chr1:2-3xx") is rejected
        region_match = re.match(r'^chr([\d]+|[XYM]):([\d]+)-([\d]+)$', region)

        # In case region is a whole chromosome
        if chrome_match:
            self.chrom = 'chr' + chrome_match.group(1)
            region_from = 1
            region_to = self._chrome_size()

        # match region string to format chrom:from-to
        elif region_match:
            self.chrom = 'chr' + region_match.group(1)
            region_from = int(region_match.group(2))
            region_to = int(region_match.group(3))
            if region_to <= region_from:
                raise IllegalArgumentError(
                    'Invalid genomic region: {}. end before start'.format(
                        region))
            if region_to > self._chrome_size() or region_from < 1:
                raise IllegalArgumentError(
                    'Invalid genomic region: {}. Out of range'.format(region))

        else:
            raise IllegalArgumentError(
                'Invalid genomic region: {}'.format(region))

        # Update GR fields:
        self.region_str = region
        self.sites = self._region_str2sites()
        self.bp_tuple = (region_from, region_to)
Пример #5
0
    def _sites_str_to_tuple(self, sites_str):
        """ extract integers tuple (e.g (120, 130)) from a sites string (e.g '120-130') """
        if not sites_str:
            raise IllegalArgumentError(f'Empty sites string: {sites_str}')

        sites_str = sites_str.replace(',', '')
        # start-end syntax
        matchObj = re.match(r'([\d]+)-([\d]+)', sites_str)
        if matchObj:
            site1 = int(matchObj.group(1))
            site2 = int(matchObj.group(2))
        # single site syntax:
        elif '-' not in sites_str and sites_str.isdigit():
            site1 = int(sites_str)
            site2 = site1 + 1
        else:
            raise IllegalArgumentError(
                f'sites must be of format: "start-end" or "site" .\nGot: {sites_str}'
            )
        # validate sites are in range:
        if not self.genome.get_nr_sites() + 1 >= site2 >= site1 >= 1:
            msg = 'sites violate the constraints: '
            msg += f'{self.genome.get_nr_sites() + 1} >= {site2} > {site1} >= 1'
            raise IllegalArgumentError(msg)
        if site1 == site2:
            site2 += 1
        return site1, site2
Пример #6
0
def stitch_2_dfs(b1, b2, params):
    """
    Stitch two adjacent segmentation results (block border arrays) into
    one. A 'patch' of sites spanning the shared border is re-segmented
    with growing sizes until its blocks overlap both b1 and b2, then the
    three are merged. Raises IllegalArgumentError if b1 and b2 are not
    adjacent or no overlapping patch can be found.
    """

    # if b2 is not the direct extension of b1, we have a problem
    if b1[-1] != b2[0]:
        msg = '[wt segment] Patch stitching Failed! ' \
              '             patches are not supposed to be merged'
        raise IllegalArgumentError(msg)

    # number of sites on each side of the border
    n1 = b1[-1] - b1[0]
    n2 = b2[-1] - b2[0]
    # initial patch sizes (sites taken from each side), capped by each side
    patch1_size = min(50, n1)
    patch2_size = min(50, n2)
    patch = np.array([], dtype=int)
    while patch1_size <= n1 and patch2_size <= n2:
        # calculate blocks for patch:
        start = b1[-1] - patch1_size  #- 1
        end = b1[-1] + patch2_size
        # re-run segmentation on the patch region only
        cparams = dict(params, **{'sites': (start, end)})
        patch = segment_process(cparams)

        # find the overlaps
        if is_2_overlap(b1, patch) and is_2_overlap(patch, b2):
            # successful stitch with patches
            return merge2(merge2(b1, patch), b2)
        else:
            # failed stitch - increase patch sizes (only on the side that
            # did not overlap)
            if not is_2_overlap(b1, patch):
                patch1_size = increase_patch(patch1_size, n1)
            if not is_2_overlap(patch, b2):
                patch2_size = increase_patch(patch2_size, n2)

    # Failed: could not stich the two chuncks
    msg = '[wt segment] Patch stitching Failed! ' \
          '             Try increasing chunk size (--chunk_size flag)'
    raise IllegalArgumentError(msg)
Пример #7
0
def get_table(blocks_df, gf, min_cov, threads=8, verbose=False, group=True):
    """
    Reduce all beta files listed in the groups table `gf` to the blocks
    in `blocks_df`, in parallel.

    If `group` is False, a column per file (fname) is added to
    blocks_df; otherwise a column per group, holding the nan-mean over
    the group's files. Returns the augmented blocks_df. Raises
    IllegalArgumentError if the reduction failed or returned the wrong
    number of values.
    """
    import warnings  # stdlib; the np.warnings alias was removed in numpy>=1.24

    is_nice, _ = is_block_file_nice(blocks_df)
    if verbose:
        eprint(f'[wt table] reducing to {blocks_df.shape[0]:,} blocks')
    betas = drop_dup_keep_order(gf['full_path'])
    p = Pool(threads)
    params = [(b, blocks_df, is_nice, min_cov, verbose) for b in betas]
    arr = p.starmap(cwrap, params)
    p.close()
    p.join()

    # collect per-file results; workers may return None on failure
    dicts = [d for d in arr if d is not None]
    dres = {k: v for d in dicts for k, v in d.items()}
    if not group:
        for b in gf['fname']:
            blocks_df[b] = dres[b]
        return blocks_df

    if not dres:
        eprint(
            f'[ wt table ] failed reducing {gf["fname"].tolist()} to blocks\n{blocks_df}'
        )
        raise IllegalArgumentError()
    if dres[list(dres.keys())[0]].size != blocks_df.shape[0]:
        eprint('[ wt table] beta2block returned wrong number of values')
        raise IllegalArgumentError()

    # average each group's files, ignoring all-NaN slices quietly
    groups = drop_dup_keep_order(gf['group'])
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', r'Mean of empty slice')
        for group in groups:
            blocks_df[group] = np.nanmean(np.concatenate([dres[k][None, :] for k \
                in gf['fname'][gf['group'] == group]]), axis=0).T
    return blocks_df
Пример #8
0
def main():
    """
    Generate homog files. Given a blocks file and pat[s],
    count the number of U,X,M reads for each block for each file
    """

    args = parse_args()
    # nr_bits selects the integer width of the output counters
    if args.nr_bits not in (8 , 16):
        raise IllegalArgumentError('nr_bits must be in {8, 16}')
    if args.rlen < 3:
        raise IllegalArgumentError('rlen must be >= 3')
    # thresholds, if given, must be two fractions with 0 < t1 < t2 < 1
    if args.thresholds is not None:
        th = args.thresholds.split(',')
        if not len(th) == 2: # and th[0].is_number():
            raise IllegalArgumentError('Invalid thresholds')
        th = float(th[0]), float(th[1])
        if not (1 > th[1] > th[0] > 0):
            raise IllegalArgumentError('Invalid thresholds')
    # make sure homog tool is valid:
    validate_local_exe(homog_tool)

    # all inputs must be .pat.gz files
    pats = args.input_files
    validate_file_list(pats, '.pat.gz')

    # determine output directory and output file name prefix
    outdir, prefix = parse_outdir_prefix(args)

    # load blocks:
    blocks_df = load_blocks_file(args.blocks_file)
    is_nice, msg = is_block_file_nice(blocks_df)
    if not is_nice:
        homog_log(msg)
        raise IllegalArgumentError(f'Invalid blocks file: {args.blocks_file}')

    # process each pat file, in a deterministic (sorted) order
    for pat in sorted(pats):
        homog_process(pat, blocks_df, args, outdir, prefix)
Пример #9
0
    def validate_file(self):
        """
        Make sure file exists, and its suffix is one of the following:
        pat, bed [.gz]

        Raises IllegalArgumentError on a missing file or an unsupported
        suffix.
        """

        if not op.isfile(self.in_file):
            raise IllegalArgumentError(f'no such file: {self.in_file}')

        # accept the bare suffix or its gzipped variant
        suffs = self.ftype.suff
        if self.suff not in (suffs, suffs + '.gz'):
            raise IllegalArgumentError('Index only supports pat, bed formats')
Пример #10
0
    def validate_file(self):
        """
        Make sure file exists, and its suffix is one of the following:
        pat, unq, bed, tsv [.gz]

        Raises IllegalArgumentError on a missing file or an unsupported
        suffix.
        """

        if not op.isfile(self.in_file):
            raise IllegalArgumentError("no such file: {}".format(self.in_file))

        # accept each supported suffix either bare or gzipped
        suffs = self.ftype.suffixes
        if self.suff not in [x + '.gz' for x in suffs] + suffs:
            raise IllegalArgumentError(
                'Index only supports pat, unq, bed, tsv formats')
Пример #11
0
def load_blocks_file(blocks_path, nrows=None):
    """
    Load a blocks file into a DataFrame with the five coordinate columns
    (COORDS_COLS5: chr, start, end, startCpG, endCpG).

    blocks_path: path to a tab-separated blocks file, with or without a
                 header line; '#' comments are ignored.
    nrows: optionally limit the number of rows loaded.
    Returns an empty DataFrame on parsing errors; raises
    IllegalArgumentError for structurally invalid blocks files.
    """
    # validate blocks_path
    validate_single_file(blocks_path)

    try:
        # see if blocks_path has a header:
        peek_df = pd.read_csv(blocks_path,
                              sep='\t',
                              nrows=1,
                              header=None,
                              comment='#')
        # if the 2nd field of the first line is not numeric, it's a header
        header = None if str(peek_df.iloc[0, 1]).isdigit() else 0

        names = COORDS_COLS5
        if len(peek_df.columns) < len(names):
            msg = f'Invalid blocks file: {blocks_path}. less than {len(names)} columns.\n'
            msg += f'Run wgbstools convert -L {blocks_path} -o OUTPUT_REGION_FILE to add the CpG columns'
            raise IllegalArgumentError(msg)

        # load; startCpG/endCpG are nullable ints (missing values allowed)
        dtypes = {'startCpG': 'Int64', 'endCpG': 'Int64'}
        df = pd.read_csv(blocks_path,
                         sep='\t',
                         usecols=range(len(names)),
                         dtype=dtypes,
                         header=header,
                         names=names,
                         nrows=nrows,  # was hard-coded to None: the nrows parameter was ignored
                         comment='#')

        # blocks start before they end - invalid file
        dfnona = df.dropna()  # allow blocks with missing values
        if not ((dfnona['endCpG'] - dfnona['startCpG']) >= 0).all():
            raise IllegalArgumentError(
                f'Invalid CpG columns in blocks file {blocks_path}')

        # no missing values: downcast the nullable columns to plain int
        if dfnona.shape[0] == df.shape[0]:
            df['startCpG'] = df['startCpG'].astype(int)
            df['endCpG'] = df['endCpG'].astype(int)

    except pd.errors.ParserError as e:
        eprint(f'Invalid input file.\n{e}')
        return pd.DataFrame()
    except pd.errors.EmptyDataError as e:
        eprint(f'Empty blocks file.\n{e}')
        return pd.DataFrame()

    return df
Пример #12
0
    def set_regions(self):
        """
        Return the list of regions (chromosomes) to process.

        If the user specified a region, it is used as-is. Otherwise,
        intersect the chromosomes present in the bam header with those
        of the reference genome. Returns [] if samtools idxstats failed;
        raises IllegalArgumentError if the intersection is empty.
        """
        # if user specified a region, just use it
        if self.gr.region_str:
            return [self.gr.region_str]

        # get all chromosomes present in the bam file header
        cmd = f'samtools idxstats {self.bam_path} | cut -f1 '
        p = subprocess.Popen(cmd,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, error = p.communicate()
        if p.returncode or not output:
            eprint("[wt bam2pat] Failed with samtools idxstats %d\n%s\n%s" %
                   (p.returncode, output.decode(), error.decode()))
            eprint(cmd)
            eprint('[wt bam2pat] failed to find chromosomes')
            return []
        bam_chroms = output.decode()[:-1].split('\n')

        # get all chromosomes from the reference genome:
        ref_chroms = self.gr.genome.get_chroms()
        # intersect the chromosomes from the bam and from the reference
        intersected_chroms = list(set(bam_chroms) & set(ref_chroms))

        if not intersected_chroms:
            msg = '[wt bam2pat] Failed retrieving valid chromosome names. '
            msg += 'Perhaps you are using a wrong genome reference. '
            msg += 'Try running:\n\t\twgbstools set_default_ref -ls'
            # raise with the informative message (it was previously built
            # but never used - a bare 'Failed' was raised instead)
            raise IllegalArgumentError(msg)

        return list(
            sorted(intersected_chroms, key=chromosome_order)
        )  # todo use the same order as in ref_chroms instead of resorting it
Пример #13
0
 def __init__(self, args):
     """
     Initialize state from parsed CLI args, create the output directory
     if needed, and load + validate the groups file.
     """
     self.args = args
     # per-sample U (hypo) and M (hyper) tables, filled later
     self.dfU = pd.DataFrame()
     self.dfM = pd.DataFrame()
     self.blocks = pd.DataFrame()
     self.nr_blocks = 0
     self.orig_nr_blocks = 0
     self.keepinds = None
     self.groups = None
     self.verbose = args.verbose
     self.hyper, self.hypo = self.set_hypo_hyper(args.hyper, args.hypo)
     self.validate_args()
     # validate output dir:
     if not op.isdir(args.out_dir):
         os.mkdir(args.out_dir)
     # load groups
     self.gf = load_groups_file(args.groups_file, args.input_dir,
                                args.verbose)
     # drop duplicate file entries (a file may appear in several groups)
     self.gf_nodup = self.gf.drop_duplicates(subset='fname').reset_index(
         drop=True)
     # validate target is in groups file
     target = self.args.target
     if target and target not in self.gf['group'].values:
         eprint(
             f'target {target} not in groups file {self.args.groups_file}')
         eprint('Possible targets:', sorted(self.gf['group'].unique()))
         raise IllegalArgumentError()
Пример #14
0
    def load_bins(self):
        """
        Load the binary homog (bin) files listed in self.gf_nodup into
        two arrays: dfU (hypo) and dfM (hyper), one column per file,
        one row per kept block.

        Returns (dfU, dfM) as DataFrames (empty when hypo/hyper is off).
        Raises IllegalArgumentError if a bin file size does not match
        the number of blocks.
        """
        if self.verbose:
            eprint('loading bins...')
        # each block occupies nr_cols cells of 1 or 2 bytes each
        nr_cols = (3 if self.args.uxm else 2)
        binsize = self.gf['binsize'][0] / self.orig_nr_blocks
        binsize /= nr_cols
        if binsize != int(binsize):
            raise IllegalArgumentError(
                'Error: bin file size does not match blocks number')

        dtype = np.uint8 if binsize == 1 else np.uint16

        dfU = pd.DataFrame()
        dfM = pd.DataFrame()
        # np.float was removed from numpy; use the builtin float instead
        if self.hypo:
            dfU = np.zeros((self.nr_blocks, self.gf_nodup.shape[0]),
                           dtype=float)
        if self.hyper:
            dfM = np.zeros((self.nr_blocks, self.gf_nodup.shape[0]),
                           dtype=float)

        from tqdm import tqdm  # todo: only if installed
        for ind, row in tqdm(self.gf_nodup.iterrows(),
                             total=self.gf_nodup.shape[0]):
            data = np.fromfile(row['full_path'], dtype).reshape(
                (-1, nr_cols))[self.keepinds, :]
            # fixed typo: was self.arsg.min_cov (AttributeError at runtime)
            if self.hypo:
                dfU[:, ind] = table2vec(data, 'U', self.args.min_cov)
            if self.hyper:
                dfM[:, ind] = table2vec(data, 'M', self.args.min_cov)

        return self.array2df(dfU), self.array2df(dfM)
Пример #15
0
def main():
    """
    View the content of input file (pat/beta) as plain text.
    Possible filter by genomic region or sites range
    Output to stdout as default
    """
    parser = parse_args()
    args = parser.parse_args()

    # sub-sampling rate, when given, must be a fraction in [0, 1]
    if args.sub_sample is not None and not 1 >= args.sub_sample >= 0:
        parser.error('[wt view] sub-sampling rate must be within [0.0, 1.0]')

    # validate input file
    input_file = args.input_file
    validate_single_file(input_file)

    try:
        # dispatch on the input file suffix
        if input_file.endswith('.beta'):
            gr = GenomicRegion(args)
            view_beta(input_file, gr, args.out_path, args.bed_file)
        elif op.splitext(input_file)[1] in ('.lbeta', '.bin'):
            view_other_bin(input_file, args)
        elif input_file.endswith('.pat.gz'):
            cview(input_file, args)
        else:
            raise IllegalArgumentError('Unknown input format:', input_file)

    except BrokenPipeError:
        # e.g. when piping the output into `head` - exit quietly
        catch_BrokenPipeError()
Пример #16
0
    def __init__(self, args=None, region=None, name='hg19'):
        """
        Initialize a genomic region, either from parsed CLI args
        (args.sites / args.region / args.genome) or from an explicit
        region string. Raises IllegalArgumentError when neither source
        is provided.
        """
        self.genome_name = name
        self.chrom = None
        self.sites = None
        self.region_str = None
        self.bp_tuple = None
        self.chrs_sz = None  # DataFrame of chromosomes sizes (in number of sites)
        self.name = name
        self.args = args

        # todo: this could be prettier
        if args is not None:
            # args take precedence over the explicit region parameter
            self.name = args.genome
            self.genome = GenomeRefPaths(self.name)
            if args.sites:
                self.parse_sites(args.sites)
            elif args.region:
                self.parse_region(args.region)
        elif region is not None:
            self.genome = GenomeRefPaths(self.name)
            self.parse_region(region)
        else:
            raise IllegalArgumentError('Invalid GR init {}'.format(region))

        # total number of CpG sites in the region (None if sites unset)
        self.nr_sites = None if self.sites is None else self.sites[
            1] - self.sites[0]
        self.annotation = self.add_anno()
Пример #17
0
    def get_fasta(self):
        """
        Make sure a reference FASTA is available: validate the provided
        path, or download one from UCSC and re-compress it with bgzip.
        Raises IllegalArgumentError when the download fails.
        """
        # download fasta from UCSC, unless the fasta file is provided
        if self.ref_path is not None:
            validate_single_file(self.ref_path)
            return

        # no FASTA path provided. Attempt to download one
        ref_path = op.join(self.out_dir, f'{self.name}.fa.gz')
        url = f'https://hgdownload.soe.ucsc.edu/goldenPath/{self.name}/bigZips/{self.name}.fa.gz'
        cmd = f'curl {url} -o {ref_path}'
        eprint(
            f'[wt init] No reference FASTA provided. Attempting to download from\n\t{url}'
        )
        # capture stderr as well - without stderr=PIPE, `error` is None
        # and error.decode() below crashes exactly on the failure path
        p = subprocess.Popen(cmd,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, error = p.communicate()
        if p.returncode:
            eprint(
                f'[wt init] Failed downloading reference for genome {self.name}: %d\n%s\n%s'
                % (p.returncode, output.decode(), error.decode()))
            eprint(
                '[wt init] Try downloading yourself and use --fasta_name flag, or check the "name" parameter'
            )
            raise IllegalArgumentError('[wt init] No reference FASTA found')
        eprint(
            '[wt init] successfully downloaded FASTA. Now gunzip and bgzip it...'
        )
        # re-compress with bgzip so samtools faidx can index it later
        cmd = f'gunzip {ref_path} && bgzip -@ {self.args.threads} {ref_path[:-3]}'
        subprocess.check_call(cmd, shell=True)
        self.ref_path = ref_path
Пример #18
0
    def __init__(self, args=None, region=None, sites=None, genome_name=None):
        """
        Initialize a genomic region from one of three sources, in order
        of precedence: parsed CLI args (args.sites / args.region), an
        explicit region string, or an explicit sites tuple.
        Raises IllegalArgumentError if none of them is provided.
        """
        self.genome_name = get_genome_name(genome_name)
        self.chrom = None
        self.sites = sites
        self.region_str = region
        self.bp_tuple = None
        self.args = args

        # todo: this could be prettier
        if args is not None:
            # args take precedence over explicit region/sites parameters
            self.genome_name = get_genome_name(args.genome)
            self.genome = GenomeRefPaths(self.genome_name)
            if args.sites:
                self.parse_sites(args.sites)
            elif args.region:
                self.parse_region(args.region)
        elif region is not None:
            self.genome = GenomeRefPaths(self.genome_name)
            self.parse_region(region)
        elif sites is not None:
            self.genome = GenomeRefPaths(self.genome_name)
            self.parse_sites(sites)
        else:
            raise IllegalArgumentError(f'Invalid GR init {region}')

        # total number of CpG sites in the region (None if sites unset)
        self.nr_sites = None if self.sites is None else self.sites[
            1] - self.sites[0]
        self.annotation = self.add_anno()
Пример #19
0
    def load_fai(self):
        """ Generate, link and load the fai file to a DataFrame """
        fai_path = generate_fai(self.ref_path)

        # Link fa + fai (or fa.gz+fa.gz.gzi+fa.gz.fai) to the output dir
        fasta_name = 'genome.fa' + ('.gz'
                                    if self.ref_path.endswith('.gz') else '')
        # fasta_name = 'genome.fa'
        self.link_file(self.ref_path, fasta_name)
        self.link_file(fai_path, fasta_name + '.fai')
        if fasta_name.endswith('.gz'):
            # a bgzipped fasta also has a .gzi index that must be linked
            self.link_file(self.ref_path + '.gzi', fasta_name + '.gzi')

        # load fai file
        try:
            # fai columns: name, length, byte offset, linebases, linewidth;
            # column 4 (linewidth, incl. newline) is loaded as 'width'
            df = pd.read_csv(fai_path,
                             sep='\t',
                             header=None,
                             usecols=[0, 1, 2, 4],
                             names=['chr', 'size', 'offset', 'width'])
            # filter invalid chromosomes
            df = df[df.apply(lambda x: is_valid_chrome(x['chr']), axis=1)]

            # sort chromosomes:
            if not self.args.no_sort:
                df = pd.DataFrame(sorted(df['chr'], key=chromosome_order),
                                  columns=['chr']).merge(df, how='left')
            return df
        except pd.errors.ParserError as e:
            raise IllegalArgumentError(f'Invalid fai file.\n{e}')
Пример #20
0
    def insert_read_to_table(self, read, table, shift):
        """
        Insert a single pat read into the 2D visualization table.

        read: a pat record - (chrom, start site, pattern, count).
        table: int matrix of rows x sites; 0 marks a free slot.
        shift: CpG index of the table's first column.
        Raises IllegalArgumentError if a read starts before `shift`
        (i.e. the pat file is not sorted).
        """
        read_start = int(read[1])
        patt = read[2]
        count = int(read[3])

        # skip empty (all dots) reads:
        if not patt.strip('.'):
            return

        if self.uxm:
            # collapse the raw pattern via read_uxm before encoding
            patt = self.read_uxm(patt, count)
        patt_ints = [str2int[l] for l in patt]

        # perform multiple times for reads with count > 1, but no more than "max_reps" times:
        for c in range(min(self.max_reps, count)):
            # find the relative starting point of the current read
            col = read_start - shift
            if col < 0:
                raise IllegalArgumentError('Error: Pat is not sorted!')

            # find the first available row to insert current read:
            if self.args.no_dense:  # no_dense: present each read in a new line
                row = np.argmin(table.sum(axis=1))
            else:
                row = np.argmin(table[:, col])

            # make sure the slots are free
            assert (table[row, col:col + len(patt)].sum() == 0)
            assert (row < table.shape[0])

            # insert read and spaces:
            table[row, col:col + len(patt)] = patt_ints
            table[row, :col][table[row, :col] == 0] = 1  # before read
            table[row, col + len(patt)] = 1  # after read
Пример #21
0
def load_seq_by_chrom(chrom, ref_path, fai_df, debug):
    """
    Load the sequence of `chrom` from the FASTA at `ref_path`, using the
    offsets recorded in fai_df, and return a DataFrame of its CpG loci
    with columns ['chr', 'loc'] (1-based positions of the C in each CG).
    In debug mode only a prefix of the chromosome is read.
    """
    eprint(chrom)

    # locate the chromosome record inside the fasta
    chrom, size, offset, width = fai_df[fai_df['chr'] == chrom].values[0]

    # number of fasta lines covering the chromosome; `width` includes the
    # newline, so each line holds (width - 1) bases
    nr_lines = size // (width - 1) + 1
    to_read = nr_lines * width
    if debug:
        to_read = min(to_read, 100 * width)

    # read the chromosome's raw text starting at its byte offset
    with open(ref_path, 'r') as fasta:
        fasta.seek(offset)
        raw = fasta.read(to_read)

    # join the lines and normalize to upper case
    seq = ''.join(line.strip() for line in raw.split('\n')).upper()

    # drop trailing characters belonging to the next chromosome record
    next_header = seq.rfind('>')
    if next_header != -1:
        seq = seq[:next_header]

    # validate sequence length (skipped in debug mode, which reads a prefix)
    if len(seq) != size and not debug:
        raise IllegalArgumentError('Error while loading {} from fasta: '
                                   'read {} bases instead of {}'.format(
                                       chrom, len(seq), size))

    # Find CpG sites loci
    tf = pd.DataFrame([m.start() + 1 for m in re.finditer('CG', seq)],
                      columns=['loc'])
    tf['chr'] = chrom
    return tf[['chr', 'loc']]
Пример #22
0
    def set_regions(self):
        """
        Return the list of chromosomes to process.

        If the user specified a region, it is used as-is. Otherwise the
        chromosome names are taken from the bam index (samtools
        idxstats) and filtered to the canonical set. Returns [] if
        idxstats failed; raises IllegalArgumentError if no valid
        chromosome remains.
        """
        if self.gr.region_str:
            return [self.gr.region_str]

        # list all chromosomes present in the bam index
        cmd = f'samtools idxstats {self.bam_path} | cut -f1 '
        p = subprocess.Popen(cmd,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, error = p.communicate()
        if p.returncode or not output:
            eprint("[wt bam2pat] Failed with samtools idxstats %d\n%s\n%s" %
                   (p.returncode, output.decode(), error.decode()))
            eprint(cmd)
            # fixed log typo: was 'falied'
            eprint('[wt bam2pat] failed to find chromosomes')
            return []
        nofilt_chroms = output.decode()[:-1].split('\n')

        # prefer canonical "chr"-prefixed names; otherwise fall back to
        # the bare names listed in CHROMS
        filt_chroms = [c for c in nofilt_chroms if 'chr' in c]
        if filt_chroms:
            filt_chroms = [
                c for c in filt_chroms if re.match(r'^chr([\d]+|[XYM])$', c)
            ]
        else:
            filt_chroms = [c for c in nofilt_chroms if c in CHROMS]
        chroms = list(sorted(filt_chroms, key=chromosome_order))
        if not chroms:
            eprint('[wt bam2pat] Failed retrieving valid chromosome names')
            raise IllegalArgumentError('Failed')

        return chroms
Пример #23
0
def pat2beta(pat_path, out_dir, args, force=True):
    """
    Convert a pat file to a beta (or lbeta) file.

    pat_path: input .pat or .pat.gz file.
    out_dir: output directory for the generated beta file.
    args: parsed CLI args (threads, genome, lbeta flag).
    force: overwrite an existing output file.
    Returns the output beta path, or None if skipped.
    """
    validate_single_file(pat_path)

    # choose the command that streams the (possibly compressed) pat file
    if pat_path.endswith('.pat.gz'):
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError(f'Invalid pat suffix: {pat_path}')

    suff = '.lbeta' if args.lbeta else '.beta'
    out_beta = op.join(out_dir, splitextgz(op.basename(pat_path))[0] + suff)
    if not delete_or_skip(out_beta, force):
        return

    # use the indexed multi-threaded path when a .csi index is available
    if args.threads > 1 and pat_path.endswith('.pat.gz') and op.isfile(
            pat_path + '.csi'):
        arr = mult_pat2beta(pat_path, args)
    else:
        nr_sites = GenomeRefPaths(args.genome).get_nr_sites()
        cmd += f' {pat_path} | {pat2beta_tool} {1} {nr_sites + 1}'
        x = subprocess.check_output(cmd, shell=True).decode()
        # np.fromstring (text mode) is deprecated; parse via split instead
        arr = np.array(x.split(), dtype=int).reshape((-1, 2))

    trim_to_uint8(arr, args.lbeta).tofile(out_beta)
    return out_beta
Пример #24
0
    def validate_args(self):
        """
        Validate all CLI parameters: integer bounds, fraction ranges,
        hyper/hypo flags, required input files and the beta file list.
        Raises IllegalArgumentError on the first invalid parameter.
        """

        # validate integers
        if self.min_cpg < 0:
            raise IllegalArgumentError('min_cpg must be non negative')
        if self.max_cpg < 1:
            raise IllegalArgumentError('max_cpg must larger than 0')
        if self.min_bp < 0:
            raise IllegalArgumentError('min_bp must be non negative')
        if self.max_bp < 2:
            raise IllegalArgumentError('max_bp must larger than 1')
        if self.chunk_size < 1:
            raise IllegalArgumentError('chunk_size must larger than 1')

        # validate the [0.0, 1.0] fractions
        for key in ('na_rate_tg', 'na_rate_bg', 'delta', 'tg_quant', \
                    'bg_quant', 'unmeth_thresh', 'meth_thresh', \
                    'unmeth_mean_thresh', 'meth_mean_thresh'):
            # bind the value - it was referenced in the message below but
            # never defined (NameError whenever a fraction was invalid)
            val = getattr(self, key)
            if not (1.0 >= val >= 0):
                eprint(
                    f'Invalid value for {key} ({val}): must be in (0.0, 1.0)'
                )
                raise IllegalArgumentError()

        # validate hyper hypo:
        if self.only_hyper and self.only_hypo:
            eprint('at most one of (only_hyper, only_hypo) can be specified')
            raise IllegalArgumentError()

        # validate input files
        for key in ('blocks_path', 'groups_file'):
            val = getattr(self, key)
            if val is None:
                eprint(f'[wt fm] missing required parameter: {key}')
                raise IllegalArgumentError()
            validate_single_file(val)
            # change path to absolute path
            setattr(self, key, op.abspath(val))

        # validate betas: exactly one source of beta paths must be given
        if (self.betas is None and self.beta_list_file is None) or \
           (self.betas is not None and self.beta_list_file is not None):
            eprint(
                '[wt fm] Exactly one of the following must be specified: betas, beta_list_file'
            )
            raise IllegalArgumentError()

        if self.beta_list_file:
            validate_single_file(self.beta_list_file)
            with open(self.beta_list_file, 'r') as f:
                self.betas = [l.strip() for l in f.readlines()]
        validate_file_list(self.betas)
Пример #25
0
 def _sites_str_to_tuple(self, sites_str):
     """ extract integers tuple (e.g (120, 130)) from a sites string (e.g '120-130') """
     if sites_str:
         sites_str = sites_str.replace(',', '')
         matchObj = re.match(r'([\d]+)-([\d]+)', sites_str)
         if matchObj:
             site1 = int(matchObj.group(1))
             site2 = int(matchObj.group(2))
             if not self.genome.nr_sites + 1 >= site2 > site1 >= 1:
                 msg = 'sites violate the constraints: '
                 msg += '{} >= {} > {} >= 1'.format(
                     self.genome.nr_sites + 1, site2, site1)
                 raise IllegalArgumentError(msg)
             return site1, site2
     raise IllegalArgumentError(
         'sites must be of format: ([\d])-([\d]).\nGot: {}'.format(
             sites_str))
Пример #26
0
def load_gfile_helper(groups_file):
    """
    Load and validate a groups csv file.

    The first column is renamed 'fname'; a 'group' column is required.
    Rows with include==False (if an 'include' column exists) and rows
    with missing values are dropped. Returns a DataFrame with columns
    ['fname', 'group']. Raises IllegalArgumentError on an invalid file.
    """
    # load and validate csv
    gf = pd.read_csv(groups_file, index_col=False, comment='#')
    if 'group' not in gf.columns:
        # fixed typo in the error message: was 'gropus'
        raise IllegalArgumentError(
            'groups file must have a column named "group"')
    # drop samples where include==False
    if 'include' in gf.columns:
        if gf['include'].dtype != bool:
            eprint('Invalid group file')
            raise IllegalArgumentError(
                'Invalid group file. Include column must be boolean')
        gf = gf[gf['include']]
    # rename the first column to 'fname' and keep only the needed columns
    gf = gf.rename(columns={gf.columns[0]: 'fname'})
    gf = gf[['fname', 'group']].dropna().reset_index(drop=True)
    return gf
Пример #27
0
    def parse_region(self, region):
        """
        Parse input of the type -r / --region (e.g chr11:200-300).

        Delegates format parsing to find_region_format, validates the
        range against the chromosome size, then updates self.region_str,
        self.bp_tuple and self.sites.
        """
        parsed = self.find_region_format(region)
        self.region_str, region_from, region_to = parsed

        # validate region range:
        if region_to <= region_from:
            raise IllegalArgumentError(
                f'Invalid genomic region: {region}. end before start')
        if region_to > self._chrome_size() or region_from < 1:
            raise IllegalArgumentError(
                f'Invalid genomic region: {region}. Out of range')

        # Update GR fields:
        self.bp_tuple = (region_from, region_to)
        self.sites = self._region_str2sites()
Пример #28
0
    def validate_rates(self, rates):
        """
        Validate and complete the mixing rates list.

        If exactly one rate is missing, it is completed so the rates sum
        to 1. Returns the full rates list. Raises IllegalArgumentError
        when the number of rates is wrong, they do not sum to 1, or any
        rate is outside [0, 1].
        """
        # allow the user to omit the last rate; complete it to sum to 1
        if len(rates) == self.nr_pats - 1:
            rates.append(1.0 - np.sum(rates))

        if len(rates) != self.nr_pats:
            raise IllegalArgumentError(
                'len(rates) must be in {len(files), len(files) - 1}')

        if np.abs(np.sum(rates) - 1) > 1e-8:
            raise IllegalArgumentError('Sum(rates) == {} != 1'.format(
                np.sum(rates)))

        # the check permits a rate of exactly 1, so the range is inclusive
        # (message previously said [0, 1), contradicting the condition)
        if np.min(rates) < 0 or np.max(rates) > 1:
            raise IllegalArgumentError('rates must be in range [0, 1]')

        self.add_stats_col('ReqstRates', rates)
        return rates
Пример #29
0
    def validate_input(self):
        """Validate the input bam file and the output directory."""

        # make sure the bam file itself is valid:
        validate_bam(self.bam_path)

        # make sure the output directory exists:
        out_dir = self.out_dir
        if not op.isdir(out_dir):
            raise IllegalArgumentError('Invalid output dir: {}'.format(
                out_dir))
Пример #30
0
 def __init__(self, args):
     """
     Initialize from parsed CLI args: resolve the genomic region, the
     output directory, and the chromosome sizes of the chosen genome.
     Raises IllegalArgumentError if the output directory does not exist.
     """
     self.args = args
     self.gr = GenomicRegion(args)
     self.outdir = args.outdir
     self.name = ''
     if not op.isdir(self.outdir):
         raise IllegalArgumentError('Invalid output directory: ' +
                                    self.outdir)
     # chromosome sizes of the selected genome (from GenomeRefPaths)
     self.chrom_sizes = GenomeRefPaths(args.genome).chrom_sizes