示例#1
0
def pat2beta(pat_path, out_dir, args, force=True):
    """
    Build a beta (or lbeta) file from a single pat file.

    pat_path: input file; must end with '.pat' or '.pat.gz'
    out_dir:  directory the output file is written to
    args:     namespace; lbeta, threads and genome are read from it
    force:    forwarded to delete_or_skip() for the output path

    Returns the output path, or None when delete_or_skip() returns a
    falsy value. Raises IllegalArgumentError for any other suffix.
    """
    validate_single_file(pat_path)

    # choose the command that streams the pat content to stdout
    gzipped = pat_path.endswith('.pat.gz')
    if gzipped:
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError(f'Invalid pat suffix: {pat_path}')

    # output name: input basename with the pat suffix swapped
    if args.lbeta:
        suff = '.lbeta'
    else:
        suff = '.beta'
    base_name = splitextgz(op.basename(pat_path))[0]
    out_beta = op.join(out_dir, base_name + suff)
    if not delete_or_skip(out_beta, force):
        return

    # the multi-threaded path is taken only for a gzipped pat that has
    # a '.csi' file next to it
    use_mult = args.threads > 1 and gzipped and op.isfile(pat_path + '.csi')
    if use_mult:
        arr = mult_pat2beta(pat_path, args)
    else:
        nr_sites = GenomeRefPaths(args.genome).get_nr_sites()
        cmd += f' {pat_path} | {pat2beta_tool} {1} {nr_sites + 1}'
        raw = subprocess.check_output(cmd, shell=True)
        # the tool's stdout is parsed as whitespace-separated integers,
        # two values per row
        flat = np.fromstring(raw.decode(), dtype=int, sep=' ')
        arr = flat.reshape((-1, 2))

    trimmed = trim_to_uint8(arr, args.lbeta)
    trimmed.tofile(out_beta)
    return out_beta
示例#2
0
def convert_bed_file(args):
    """
    Add CpG-index columns to a tab-separated bed file.

    Input columns:
    chr    start    end    [...]
    Output columns:
    chr    start    end    startCpG    endCpG   [...]

    The result is written to args.out_path, or to stdout when that is
    None. A bed_file of '-' means the bed is read from stdin.
    """
    out_path = args.out_path
    if out_path is None:
        out_path = sys.stdout
    if not delete_or_skip(out_path, args.force):
        return

    # TODO: support stdin for -L in all wgbstools features, and add it to the help message
    bed_file = sys.stdin if args.bed_file == '-' else args.bed_file

    # annotations are added only when neither flag is set
    add_anno = not (args.parsable or args.no_anno)

    if check_executable('bedtools', verbose=False):
        converted = bedtools_conversion(bed_file, args.genome,
                                        args.drop_empty, add_anno,
                                        args.debug)
    else:
        # no bedtools executable: use the python implementation instead
        converted = add_cpgs_to_bed(bed_file=bed_file,
                                    genome=args.genome,
                                    drop_empty=args.drop_empty,
                                    threads=args.threads,
                                    add_anno=add_anno)
    converted.to_csv(out_path, sep='\t', header=None, index=None,
                     na_rep='NA')
示例#3
0
def main():
    """
    Merge files.
    Accumulate all reads / observations from multiple (>=2) input files,
    and output a single file of the same format.
    Supported formats: pat.gz, beta
    """
    args = parse_args()

    # NOTE: the input list is used as given; no validation is done here
    input_files = args.input_files

    # construct output path: <prefix> + the extension of the first input
    out_path = args.prefix + splitextgz(args.input_files[0])[1]

    # refuse to write the merged result over one of its own inputs
    if op.realpath(out_path) in [op.realpath(p) for p in args.input_files]:
        # the second fragment must be an f-string, otherwise the literal
        # text "{out_path}" is printed instead of the offending path
        eprint('[wt merge] Error output path is identical '
               f'to one of the input files {out_path}')
        return

    if not delete_or_skip(out_path, args.force):
        return

    # file type is the extension of the first input without its leading dot
    files_type = splitextgz(input_files[0])[1][1:]

    if files_type in ('beta', 'bin'):
        merge_betas(input_files, out_path)
    elif files_type == 'pat.gz':
        MergePats(input_files, args.prefix + '.pat.gz', args.labels,
                  args).merge_pats()
    else:
        print('Unknown input format:', input_files[0])
        return
示例#4
0
def bed2betas(args):
    """
    Convert every bed file in args.bed_paths to a '.beta' file in
    args.outdir.

    Each bed is left-merged onto the reference CpG table, so the output
    has one row per reference site; sites missing from the bed get 0.
    In debug mode only a fixed chr1 region of the reference and the
    first 10000 bed rows are loaded.
    """
    if args.debug:
        region, nrows = 'chr1:10469-876225', 10000
    else:
        region, nrows = None, None

    ref_table = None       # reference table, loaded lazily at most once
    try:
        for bed in args.bed_paths:
            bed_name = op.basename(bed)
            eprint(f'[wt bed] Converting {bed_name}...')

            # decide whether this bed is processed at all
            beta_name = splitextgz(bed_name)[0] + '.beta'
            outpath = op.join(args.outdir, beta_name)
            if not delete_or_skip(outpath, args.force):
                continue

            if ref_table is None:
                ref_table = load_dict_section(region, args.genome)
            bed_table = load_bed(bed, nrows, args.add_one)

            # todo: implement in C++.
            merged = ref_table.merge(bed_table, how='left',
                                     on=['chr', 'start'])
            merged = merged.fillna(0)
            counts = np.array(merged[['meth', 'total']])
            trim_to_uint8(counts).tofile(outpath)

    except pd.errors.ParserError as e:
        eprint(f'[wt bed] Invalid input file.\n{e}')
        return
示例#5
0
def main():
    """
    Merge files.
    Accumulate all reads / observations from multiple (>=2) input files,
    and output a single file of the same format.
    Supported formats: pat.gz, beta
    """
    args = parse_args()

    # at least two input files are required
    input_files = args.input_files
    validate_files_list(input_files, min_len=2)

    # the first input's extension decides both output name and merge type
    ext = splitextgz(input_files[0])[1]
    out_path = args.prefix + ext
    if not delete_or_skip(out_path, args.force):
        return

    files_type = ext[1:]    # extension without the leading dot

    if files_type in ('beta', 'bin'):
        merge_betas(input_files, out_path)
        return
    if files_type == 'pat.gz':
        merger = MergePats(input_files, args.prefix + '.pat', args.labels,
                           args)
        merger.merge_pats()
        return
    if files_type == 'unq.gz':
        merge_unqs()
        return

    print('Unknown input format:', input_files[0])
示例#6
0
def bed2betas(args):
    """
    Convert every bed file in args.bed_paths to a '.beta' file in
    args.outdir.

    Each bed is left-merged onto the reference CpG table, so the output
    has one row per reference site; sites missing from the bed get 0.
    In debug mode only the first 100000 rows of each table are loaded.
    """
    if args.debug:
        nrows = 100000
    else:
        nrows = None

    ref_table = None  # reference table, loaded lazily at most once
    try:
        for bed in args.bed_paths:
            bed_name = op.basename(bed)
            eprint('Converting {}...'.format(bed_name))

            # decide whether this bed is processed at all
            out_prefix = op.join(args.outdir, splitextgz(bed_name)[0])
            outpath = out_prefix + '.beta'
            if not delete_or_skip(outpath, args.force):
                continue

            if ref_table is None:
                ref_table = load_dict(nrows=nrows, genome_name=args.genome)
            # the third argument is True only for the 'mm9' genome
            bed_table = load_bed(bed, nrows, args.genome == 'mm9')

            merged = ref_table.merge(bed_table, how='left',
                                     on=['chr', 'start'])
            merged = merged.fillna(0)
            counts = np.array(merged[['meth', 'total']])
            trim_to_uint8(counts).tofile(outpath)

    except pd.errors.ParserError as e:
        eprint('Invalid input file.\n{}'.format(e))
        return
示例#7
0
def beta_to_bed(beta_path, gr, bed_file, min_cov, mean, keep_na, force, opath):
    """
    Run the beta-to-bed shell command for one beta file.

    When opath is not None the command's output is redirected to it
    (through gzip if opath ends with '.gz'); otherwise no redirection
    is appended to the command.
    Nothing is run when delete_or_skip() returns a falsy value for opath.
    """
    if not delete_or_skip(opath, force):
        return

    cmd = beta2bed_build_cmd(beta_path, gr, bed_file, min_cov, mean, keep_na)
    if opath is not None:
        # optional compression stage, then the redirect itself
        redirect = ' | gzip -c ' if opath.endswith('.gz') else ''
        redirect += f' > {opath}'
        cmd += redirect
    subprocess_wrap_sigpipe(cmd)
示例#8
0
def convert_site_file(args):
    """
    Add genomic loci columns to a tab-separated site file.

    Input columns:
    startCpG    [endCpG]
    Output columns:
    chr    start    end    startCpG    [endCpG]
    """
    # stdout stands in for the output path only for the delete/skip check
    target = args.out_path
    if target is None:
        target = sys.stdout

    if delete_or_skip(target, args.force):
        # note: args.out_path itself (possibly None) is what gets passed on
        add_bed_to_cpgs(args.site_file, args.genome, args.out_path)
示例#9
0
    def run(self):
        """
        Index self.in_file, (re)compressing it with bgzip first if needed.

        Does nothing when delete_or_skip() returns a falsy value for the
        index path (in_file + the file type's index suffix).
        """
        index_path = self.in_file + self.ftype.ind_suff
        if not delete_or_skip(index_path, self.force):
            return

        # a plain-gzipped file is decompressed; the trailing '.gz' is
        # dropped from the stored path
        if self.is_file_gzipped():
            sp.check_call(['gunzip', self.in_file])
            self.in_file = self.in_file[:-3]

        needs_bgzip = not self.in_file.endswith('.gz')
        if needs_bgzip:
            self.bgzip()
        self.index_bgzipped_file()
示例#10
0
    def __init__(self, args):
        """
        Store the arguments, load the bed file and the per-chromosome
        CpG sizes, then call proc_bed().

        If delete_or_skip() returns a falsy value for the output path,
        construction stops right after the basic attributes are set.
        """
        self.args = args
        self.debug = args.debug
        self.out_path = args.out_path
        if not delete_or_skip(self.out_path, self.args.force):
            return

        # bed file (only the first 100000 rows in debug mode)
        nrows = 100000 if self.debug else None
        self.df = load_bed(args.bed_path, nrows)

        self.genome = GenomeRefPaths(args.genome)

        # chromosome sizes (in CpGs), turned into a running total
        self.cf = self.genome.get_chrom_cpg_size_table()
        cumulative = np.cumsum(self.cf['size'])
        self.cf['size'] = cumulative
        self.proc_bed()
示例#11
0
    def __init__(self, args):
        """
        Store the arguments, create the output directory, validate the
        reference fasta path and load its fai table.

        The fai table is loaded only when delete_or_skip() returns a
        truthy value for '<out_dir>/CpG.bed.gz'.
        """
        self.args = args
        self.name = args.name
        self.force = args.force
        self.ref_path = args.genome_ref
        self.out_dir = self.make_output_dir()

        # the reference is checked against the '.fa' suffix
        validate_single_file(self.ref_path, '.fa')

        eprint('Setting up genome reference files in {}'.format(self.out_dir))
        cpg_bed = op.join(self.out_dir, 'CpG.bed.gz')
        if delete_or_skip(cpg_bed, self.force):
            self.fai_df = self.load_fai()
示例#12
0
    def run_beta_to_bed(self, beta_path):
        """
        Dump one beta file as a headerless, tab-separated '.bed' file.

        The beta columns are attached temporarily to self.ref_dict, rows
        with total > 0 are written, and the columns are removed again.
        """
        eprint('{}'.format(op.basename(beta_path)))
        out_bed = self.set_prefix(beta_path) + '.bed'
        if not delete_or_skip(out_bed, self.args.force):
            return

        barr = self.load_beta(beta_path)
        columns = ('meth', 'total')

        # paste the beta columns next to the reference table
        for idx, col in enumerate(columns):
            self.ref_dict[col] = barr[:, idx]

        covered = self.ref_dict[self.ref_dict['total'] > 0]
        covered.to_csv(out_bed, sep='\t', header=None, index=None)

        # restore the reference table to its original columns
        for col in columns:
            del self.ref_dict[col]
示例#13
0
    def run(self):
        """
        Index self.in_file, converting it to bgzip first when required.

        A '.gz' input is first indexed as-is; only if that attempt
        returns a truthy value is it gunzipped, bgzipped and indexed
        again. Does nothing when delete_or_skip() returns a falsy value
        for the index path.
        """
        index_path = self.in_file + self.ftype.ind_suff
        if not delete_or_skip(index_path, self.args.force):
            return

        if self.in_file.endswith('.gz'):
            # from here on in_file holds the path without the '.gz' suffix
            self.in_file = op.splitext(self.in_file)[0]
            status = self.index_bgzipped_file()
            if not status:
                return  # a falsy status means indexing succeeded
            # indexing failed because the file is gzipped, not bgzipped
            subprocess.check_call(['gunzip', self.in_file + '.gz'])

        self.bgzip()
        self.index_bgzipped_file()
示例#14
0
    def single_mix(self, rep):
        """
        Create mix number rep + 1 by merging the input pats, each
        sub-sampled at its own adjusted rate.

        Does nothing when delete_or_skip() returns a falsy value for
        '<prefix>_<rep + 1>.pat.gz'.
        """
        prefix_i = self.prefix + '_{}.pat'.format(rep + 1)
        if not delete_or_skip(prefix_i + '.gz', self.args.force):
            return

        # one flag string per input pat
        view_flags = []
        for i in range(self.nr_pats):
            parts = [' --awk ']
            if self.args.strict:
                parts.append(' --strict')
            # a bed file takes precedence over a site range
            if self.args.bed_file is not None:
                parts.append(' -L {}'.format(self.args.bed_file))
            elif self.gr.sites is not None:
                parts.append(' -s {}-{}'.format(*self.gr.sites))
            parts.append(' --sub_sample {}'.format(self.adj_rates[i]))
            view_flags.append(''.join(parts))

        print('prefix:', prefix_i)
        merger = MergePats(self.pats, prefix_i, self.labels, args=self.args)
        merger.fast_merge_pats(view_flags=view_flags)
示例#15
0
def main():
    """
    Run the WGBS pipeline to generate pat & beta files out of an input bam file
    """
    args = parse_args_snp_split(add_args_snp_splitt())

    # the output directory must already exist
    if not op.isdir(args.out_dir):
        raise IllegalArgumentError(f'Invalid output dir: {args.out_dir}')

    validate_local_exe(allele_split_tool)

    # a single bam is given; it is wrapped in a list to reuse the loop shape
    for bam in [args.bam]:
        if not validate_bam(bam):
            eprint(f'[wt bam2pat] Skipping {bam}')
            continue

        # pat path: bam basename minus its last 4 characters, plus PAT_SUFF
        bam_stem = op.basename(bam)[:-4]
        pat = op.join(args.out_dir, bam_stem + PAT_SUFF)
        if delete_or_skip(pat, args.force):
            SNPSplit(args, bam)
示例#16
0
def pat2beta(pat_path, out_dir, args, force=True):
    """
    Build a '.beta' file from a single pat file.

    pat_path: input file; must end with '.pat' or '.pat.gz'
    out_dir:  directory the beta file is written to
    args:     namespace; genome and threads are read from it
    force:    forwarded to delete_or_skip() for the output path

    Returns None when delete_or_skip() returns a falsy value, the result
    of mult_pat2beta() on the multi-threaded path, and the beta path
    otherwise. Raises IllegalArgumentError for any other suffix.
    """
    validate_single_file(pat_path)

    # choose the command that streams the pat content to stdout
    gzipped = pat_path.endswith('.pat.gz')
    if gzipped:
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError('Invalid pat suffix: {}'.format(pat_path))

    base_name = splitextgz(op.basename(pat_path))[0]
    out_beta = op.join(out_dir, base_name + '.beta')
    if not delete_or_skip(out_beta, force):
        return
    nr_sites = GenomeRefPaths(args.genome).nr_sites

    # the multi-threaded path needs a gzipped pat with a '.csi' file
    # next to it
    use_mult = args.threads > 1 and gzipped and op.isfile(pat_path + '.csi')
    if use_mult:
        return mult_pat2beta(pat_path, out_beta, nr_sites, args)

    # single process: pipe the pat content into the external tool
    cmd += ' {} | {} {} {}'.format(pat_path, PAT2BETA_TOOL, out_beta, nr_sites)
    subprocess.check_call(cmd, shell=True)
    return out_beta
示例#17
0
def main():
    """
    Run the WGBS pipeline to generate pat & beta files out of an input bam file
    """
    # the docstring above doubles as the parser description
    parser = add_args(argparse.ArgumentParser(description=main.__doc__))
    args = parse_args(parser)

    # the output directory must already exist
    if not op.isdir(args.out_dir):
        raise IllegalArgumentError(f'Invalid output dir: {args.out_dir}')

    for tool in (match_maker_tool, patter_tool):
        validate_local_exe(tool)

    for bam in args.bam:
        if not validate_bam(bam):
            eprint(f'[wt bam2pat] Skipping {bam}')
            continue

        # pat path: bam basename minus its last 4 characters, plus PAT_SUFF
        bam_stem = op.basename(bam)[:-4]
        pat = op.join(args.out_dir, bam_stem + PAT_SUFF)
        if delete_or_skip(pat, args.force):
            Bam2Pat(args, bam)
示例#18
0
    def single_mix(self, rep):
        """
        Create mix number rep + 1 by merging the input pats, each
        sub-sampled at its own adjusted rate.

        Does nothing when delete_or_skip() returns a falsy value for
        '<prefix>_<rep + 1>.pat.gz'.
        """
        mix_i = self.prefix + f'_{rep + 1}.pat.gz'
        if not delete_or_skip(mix_i, self.args.force):
            return

        # one flag string per input pat
        view_flags = []
        for i in range(self.nr_pats):
            parts = [' ']
            if self.args.strict:
                parts.append(' --strict')
            if self.args.strip:
                parts.append(' --strip')
            if self.args.min_len:
                parts.append(f' --min_len {self.args.min_len}')
            # a bed file takes precedence over a site range
            if self.args.bed_file is not None:
                parts.append(' -L {}'.format(self.args.bed_file))
            elif not self.gr.is_whole():
                parts.append(' -s {}-{}'.format(*self.gr.sites))
            parts.append(' --sub_sample {}'.format(self.adj_rates[i]))
            view_flags.append(''.join(parts))

        eprint('mix:', mix_i)
        merger = MergePats(self.pats, mix_i, self.labels, args=self.args)
        merger.fast_merge_pats(view_flags=view_flags)
示例#19
0
    def run_beta_to_bw(self, beta_path):
        """
        Convert one beta file to a bigWig of beta values and, when
        args.dump_cov is set, to a second bigWig of coverage.

        Temporary columns ('cov', 'beta') are attached to self.ref_dict
        while dumping and removed afterwards, so the reference table is
        left unchanged for the next file.

        beta_path: path of the beta file to convert.
        Does nothing when delete_or_skip() returns a falsy value for the
        output bigWig path.
        """
        eprint('{}'.format(op.basename(beta_path)))

        prefix = self.set_prefix(beta_path)
        out_bigwig = prefix + BW_EXT
        out_bed_graph = prefix + BG_EXT
        cov_bigwig = prefix + COV_BW_EXT
        cov_bed_graph = prefix + COV_BG_EXT

        # Check if the current file should be skipped:
        if not delete_or_skip(out_bigwig, self.args.force):
            return

        # load beta file
        barr = self.load_beta(beta_path)

        # dump coverage:
        if self.args.dump_cov:
            eprint('Dumping cov...')
            self.ref_dict['cov'] = barr[:, 1]
            sort_and_dump_df(
                self.ref_dict[self.ref_dict['cov'] >= self.args.min_cov],
                cov_bed_graph)
            del self.ref_dict['cov']
            # convert bedGraph to bigWig:
            self.bed_graph_to_bigwig(cov_bed_graph, cov_bigwig)

        # dump beta values to bedGraph
        eprint('Dumping beta vals...')
        self.ref_dict['beta'] = np.round(beta2vec(barr, na=-1), 3)
        # Filter into a local frame. Rebinding self.ref_dict to the
        # filtered subset would shrink the shared reference table for
        # good, and the next beta file's columns would no longer match
        # its length.
        to_dump = self.ref_dict
        if self.args.remove_nan:
            to_dump = to_dump[to_dump['beta'] != -1]
        sort_and_dump_df(to_dump, out_bed_graph)
        del self.ref_dict['beta']

        # convert bedGraphs to bigWigs:
        self.bed_graph_to_bigwig(out_bed_graph, out_bigwig)
示例#20
0
    def run_beta_to_bw(self, beta_path):
        """
        Convert one beta file to a bigWig: dump it as a bedGraph with
        beta_to_bed(), then convert the bedGraph to a bigWig.

        Sets self.name to the beta file's basename. No conversion is
        done when delete_or_skip() returns a falsy value for the output
        bigWig path.
        """
        self.name = op.basename(beta_path)

        stem = op.splitext(self.name)[0]
        prefix = op.join(self.outdir, stem)
        out_bed_graph = prefix + BG_EXT
        out_bigwig = prefix + BW_EXT

        # decide whether this file is processed at all
        if not delete_or_skip(out_bigwig, self.args.force):
            return

        # beta -> bedGraph; the intermediate file is always forced
        b2bw_log(f'[{self.name}] Dumping bed...')
        bed_kwargs = dict(beta_path=beta_path,
                          gr=self.gr,
                          bed_file=self.args.bed_file,
                          min_cov=self.args.min_cov,
                          mean=True,
                          keep_na=self.args.keep_na,
                          force=True,
                          opath=out_bed_graph)
        beta_to_bed(**bed_kwargs)

        # bedGraph -> bigWig
        self.bed_graph_to_bigwig(out_bed_graph, out_bigwig)