Example #1
    def get_fasta(self):
        # download fasta from UCSC, unless the fasta file is provided
        if self.ref_path is not None:
            validate_single_file(self.ref_path)
            return

        # no FASTA path provided. Attempt to download one
        ref_path = op.join(self.out_dir, f'{self.name}.fa.gz')
        url = f'https://hgdownload.soe.ucsc.edu/goldenPath/{self.name}/bigZips/{self.name}.fa.gz'
        cmd = f'curl {url} -o {ref_path}'
        eprint(
            f'[wt init] No reference FASTA provided. Attempting to download from\n\t{url}'
        )
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, error = p.communicate()
        if p.returncode:
            eprint(
                f'[wt init] Failed downloading reference for genome {self.name}: '
                f'{p.returncode}\n{output.decode()}\n{error.decode()}')
            eprint(
                '[wt init] Try downloading yourself and use the --fasta_name flag, '
                'or check the "name" parameter'
            )
            raise IllegalArgumentError('[wt init] No reference FASTA found')
        eprint(
            '[wt init] Successfully downloaded FASTA. Now gunzip and bgzip it...'
        )
        cmd = f'gunzip {ref_path} && bgzip -@ {self.args.threads} {ref_path[:-3]}'
        subprocess.check_call(cmd, shell=True)
        self.ref_path = ref_path
Example #2
def main():
    """
    View the content of input file (pat/beta) as plain text.
    Possible filter by genomic region or sites range
    Output to stdout as default
    """
    parser = parse_args()
    args = parser.parse_args()

    if args.sub_sample is not None and not 1 >= args.sub_sample >= 0:
        parser.error('[wt view] sub-sampling rate must be within [0.0, 1.0]')

    # validate input file
    input_file = args.input_file
    validate_single_file(input_file)

    try:
        if input_file.endswith('.beta'):
            gr = GenomicRegion(args)
            view_beta(input_file, gr, args.out_path, args.bed_file)
        elif op.splitext(input_file)[1] in ('.lbeta', '.bin'):
            view_other_bin(input_file, args)
        elif input_file.endswith('.pat.gz'):
            cview(input_file, args)
        else:
            raise IllegalArgumentError('Unknown input format:', input_file)

    except BrokenPipeError:
        catch_BrokenPipeError()
Example #3
def pat2beta(pat_path, out_dir, args, force=True):
    validate_single_file(pat_path)

    if pat_path.endswith('.pat.gz'):
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError(f'Invalid pat suffix: {pat_path}')

    suff = '.lbeta' if args.lbeta else '.beta'
    out_beta = op.join(out_dir, splitextgz(op.basename(pat_path))[0] + suff)
    if not delete_or_skip(out_beta, force):
        return

    if args.threads > 1 and pat_path.endswith('.pat.gz') and op.isfile(
            pat_path + '.csi'):
        arr = mult_pat2beta(pat_path, args)
    else:
        nr_sites = GenomeRefPaths(args.genome).get_nr_sites()
        cmd += f' {pat_path} | {pat2beta_tool} 1 {nr_sites + 1}'
        x = subprocess.check_output(cmd, shell=True).decode()
        arr = np.fromstring(x, dtype=int, sep=' ').reshape((-1, 2))

    trim_to_uint8(arr, args.lbeta).tofile(out_beta)
    return out_beta
Example #4
def view_gr(pat, args, get_cmd=False):
    validate_single_file(pat, '.pat.gz')
    gr = GenomicRegion(args)
    if gr.is_whole():
        s = 1
        e = gr.genome.get_nr_sites() + 1
        cmd = f'gunzip -c {pat} '
    else:
        s, e = gr.sites
        ms = max(1, s - MAX_PAT_LEN)
        cmd = f'tabix {pat} {gr.chrom}:{ms}-{e - 1} '

    view_flags = set_view_flags(args)
    cmd += f' | {cview_tool} --sites "{s}\t{e}" ' + view_flags
    if hasattr(args, 'sub_sample') and args.sub_sample is not None:  # sub-sample reads
        validate_local_exe(pat_sampler)
        cmd += f' | {pat_sampler} {args.sub_sample} '
    if not gr.is_whole():
        cmd += f' | sort -k2,2n -k3,3 '
    cmd += f' | {collapse_pat_script} - '
    if get_cmd:
        return cmd
    if args.out_path is not None:
        cmd += f' > {args.out_path}'
    subprocess_wrap_sigpipe(cmd)
Example #5
def main():
    """
    Convert beta file to bed file.
    """
    args = parse_args()
    validate_single_file(args.beta_path, '.beta')
    gr = GenomicRegion(args)
    beta_to_bed(args.beta_path, gr, args.bed_file, args.min_cov, args.mean, args.keep_na, args.force, args.outpath)
Example #6
def bview_build_cmd(beta_path, gr, bed_path):
    # compose a shell command to output a beta file to stdout
    cmd = f'{view_beta_script} {gr.genome.revdict_path} {beta_path} '
    if not gr.is_whole():
        cmd += f' {gr.chrom} {gr.sites[0]} {gr.nr_sites}'
    if bed_path:
        validate_single_file(bed_path)
        cmd += f' | bedtools intersect -b {bed_path} -a stdin -wa '
    return cmd
Example #7
def main():
    """ view pat file with the c++ engine """
    parser = parse_args()
    args = parser.parse_args()
    # validate input file
    pat = args.pat
    validate_single_file(pat)
    if (args.sub_sample is not None) and (args.sub_sample < 0):
        parser.error('[wt view] sub-sampling rate must be >= 0')
    validate_local_exe(cview_tool)
    cview(pat, args)
Example #8
def load_from_file(param_file):
    if not param_file:
        return
    validate_single_file(param_file)
    d = pd.read_csv(param_file,
                    sep=':',
                    comment='#',
                    header=None,
                    names=['val'],
                    index_col=0,
                    skipinitialspace=True).to_dict()['val']
    return MFParams.set_param_type(d)
Example #9
    def validate_args(self):

        # validate integers
        if self.min_cpg < 0:
            raise IllegalArgumentError('min_cpg must be non-negative')
        if self.max_cpg < 1:
            raise IllegalArgumentError('max_cpg must be larger than 0')
        if self.min_bp < 0:
            raise IllegalArgumentError('min_bp must be non-negative')
        if self.max_bp < 2:
            raise IllegalArgumentError('max_bp must be larger than 1')
        if self.chunk_size < 1:
            raise IllegalArgumentError('chunk_size must be at least 1')

        # validate the [0.0, 1.0] fractions
        for key in ('na_rate_tg', 'na_rate_bg', 'delta', 'tg_quant',
                    'bg_quant', 'unmeth_thresh', 'meth_thresh',
                    'unmeth_mean_thresh', 'meth_mean_thresh'):
            val = getattr(self, key)
            if not (0.0 <= val <= 1.0):
                eprint(
                    f'Invalid value for {key} ({val}): must be in [0.0, 1.0]'
                )
                raise IllegalArgumentError()

        # validate hyper hypo:
        if self.only_hyper and self.only_hypo:
            eprint(f'at most one of (only_hyper, only_hypo) can be specified')
            raise IllegalArgumentError()

        # validate input files
        for key in ('blocks_path', 'groups_file'):
            val = getattr(self, key)
            if val is None:
                eprint(f'[wt fm] missing required parameter: {key}')
                raise IllegalArgumentError()
            validate_single_file(val)
            # change path to absolute path
            setattr(self, key, op.abspath(val))

        # validate betas
        if (self.betas is None and self.beta_list_file is None) or \
           (self.betas is not None and self.beta_list_file is not None):
            eprint(
                f'[wt fm] Exactly one of the following must be specified: betas, beta_list_file'
            )
            raise IllegalArgumentError()

        if self.beta_list_file:
            validate_single_file(self.beta_list_file)
            with open(self.beta_list_file, 'r') as f:
                self.betas = [l.strip() for l in f.readlines()]
        validate_file_list(self.betas)
Example #10
def groups_load_wrap(groups_file, betas):
    if groups_file is not None:
        validate_single_file(groups_file)
        validate_file_list(betas)
        gf = load_gfile_helper(groups_file)
    else:
        # otherwise, generate dummy group file for all binary files in input_dir
        # first drop duplicated files, while keeping original order
        betas = drop_dup_keep_order(betas.copy())
        fnames = [op.splitext(op.basename(b))[0] for b in betas]
        gf = pd.DataFrame(columns=['fname'], data=fnames)
        gf['group'] = gf['fname']
    gf['full_path'] = match_prefix_to_bin(gf['fname'], betas, '.beta')
    return gf
Example #11
def load_blocks_file(blocks_path, nrows=None):
    # validate blocks_path
    validate_single_file(blocks_path)

    try:
        # see if blocks_path has a header:
        peek_df = pd.read_csv(blocks_path,
                              sep='\t',
                              nrows=1,
                              header=None,
                              comment='#')
        header = None if str(peek_df.iloc[0, 1]).isdigit() else 0

        names = COORDS_COLS5
        if len(peek_df.columns) < len(names):
            msg = f'Invalid blocks file: {blocks_path}. Fewer than {len(names)} columns.\n'
            msg += f'Run wgbstools convert -L {blocks_path} -o OUTPUT_REGION_FILE to add the CpG columns'
            raise IllegalArgumentError(msg)

        # load
        # dtypes = {'chr':str, 'start', 'end', 'startCpG', 'endCpG'}
        dtypes = {'startCpG': 'Int64', 'endCpG': 'Int64'}
        df = pd.read_csv(blocks_path,
                         sep='\t',
                         usecols=range(len(names)),
                         dtype=dtypes,
                         header=header,
                         names=names,
                         nrows=nrows,
                         comment='#')

        # blocks start before they end - invalid file
        dfnona = df.dropna()  # allow blocks with missing values
        if not ((dfnona['endCpG'] - dfnona['startCpG']) >= 0).all():
            raise IllegalArgumentError(
                f'Invalid CpG columns in blocks file {blocks_path}')

        if dfnona.shape[0] == df.shape[0]:
            df['startCpG'] = df['startCpG'].astype(int)
            df['endCpG'] = df['endCpG'].astype(int)

    except pd.errors.ParserError as e:
        eprint(f'Invalid input file.\n{e}')
        return pd.DataFrame()
    except pd.errors.EmptyDataError as e:
        eprint(f'Empty blocks file.\n{e}')
        return pd.DataFrame()

    return df
Example #12
def beta2table_generator(betas,
                         blocks,
                         groups_file,
                         min_cov,
                         threads,
                         chunk_size=None,
                         verbose=False):
    validate_single_file(blocks)
    gf = groups_load_wrap(groups_file, betas)
    blocks_df = load_blocks_file(blocks)
    if chunk_size is None:
        chunk_size = blocks_df.shape[0]
    for start in range(0, blocks_df.shape[0], chunk_size):
        subset_blocks = blocks_df.iloc[start:start + chunk_size].copy()
        yield get_table(subset_blocks, gf, min_cov, threads, verbose)
Example #13
    def __init__(self, args):
        self.args = args
        self.ref_path = args.genome_ref
        self.force = args.force
        self.name = args.name
        self.out_dir = self.make_output_dir()

        # validate input files
        validate_single_file(self.ref_path, '.fa')

        # abort if files exists and --force was not specified
        eprint('Setting up genome reference files in {}'.format(self.out_dir))
        if not delete_or_skip(op.join(self.out_dir, 'CpG.bed.gz'), self.force):
            return

        self.fai_df = self.load_fai()
Example #14
    def set_lists(self):
        # black/white lists:
        blacklist = self.args.blacklist
        whitelist = self.args.whitelist
        if blacklist is True:
            blacklist = GenomeRefPaths(self.args.genome).blacklist
        elif whitelist is True:
            whitelist = GenomeRefPaths(self.args.genome).whitelist
        if blacklist:
            validate_single_file(blacklist)
        elif whitelist:
            validate_single_file(whitelist)
        if self.verbose:
            eprint(f'[wt bam2pat] blacklist: {blacklist}')
            eprint(f'[wt bam2pat] whitelist: {whitelist}')
        return blacklist, whitelist
Example #15
def main():
    """
    View the content of input file (pat/unq/beta) as plain text.
    Possible filter by genomic region or sites range
    Output to stdout as default
    """
    args = parse_args()
    # validate input file
    input_file = args.input_file
    validate_single_file(input_file)

    if args.sub_sample is not None and not 1 > args.sub_sample > 0:
        eprint('sub-sampling rate must be within (0.0, 1.0)')
        return

    if args.bed_file and (args.region or args.sites):
        eprint('-L, -s and -r are mutually exclusive')
        return

    bed_wrapper = BedFileWrap(args.bed_file) if args.bed_file else None
    gr = GenomicRegion(args)

    try:
        if input_file.endswith('.beta') or input_file.endswith('.bin'):
            view_beta(input_file, gr, args.out_path)
        elif input_file.endswith('.pat.gz'):
            if bed_wrapper:
                view_pat_bed_multiprocess(args, bed_wrapper)
            else:
                vp = ViewPat(input_file, args.out_path, gr, args.strict,
                             args.sub_sample, bed_wrapper, args.min_len)
                vp.view_pat(args.awk_engine)
        elif input_file.endswith('.unq.gz'):
            grs = bed_wrapper.iter_grs() if bed_wrapper else [gr]
            for gr in grs:
                ViewUnq(input_file, args.out_path, gr, args.inflate).view()
        else:
            raise IllegalArgumentError('Unknown input format:', input_file)

    except BrokenPipeError:
        # Python flushes standard streams on exit; redirect remaining output
        # to devnull to avoid another BrokenPipeError at shutdown
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)  # Python exits with error code 1 on EPIPE
Example #16
def parse_betas_input(args):
    """
    parse user input to get the list of beta files to segment
    Either args.betas is a list of beta files,
    or args.beta_file is a text file in which each line is a beta file
    return: list of beta files
    """
    if args.betas:
        betas = args.betas
    elif args.beta_file:
        validate_single_file(args.beta_file)
        with open(args.beta_file, 'r') as f:
            betas = [
                b.strip() for b in f.readlines()
                if b.strip() and not b.startswith('#')
            ]
        if not betas:
            raise IllegalArgumentError(
                f'no beta files found in file {args.beta_file}')
    else:
        raise IllegalArgumentError(
            'either betas or beta_file must be specified')
    validate_file_list(betas)
    return betas
Example #17
def pat2beta(pat_path, out_dir, args, force=True):
    validate_single_file(pat_path)

    if pat_path.endswith('.pat.gz'):
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError('Invalid pat suffix: {}'.format(pat_path))

    out_beta = op.join(out_dir, splitextgz(op.basename(pat_path))[0] + '.beta')
    if not delete_or_skip(out_beta, force):
        return
    nr_sites = GenomeRefPaths(args.genome).nr_sites

    if args.threads > 1 and pat_path.endswith('.pat.gz') and op.isfile(
            pat_path + '.csi'):
        return mult_pat2beta(pat_path, out_beta, nr_sites, args)

    cmd += ' {} | {} {} {}'.format(pat_path, PAT2BETA_TOOL, out_beta, nr_sites)
    subprocess.check_call(cmd, shell=True)
    return out_beta
Example #18
def read_reference(ref):
    # read Illumina-to-CpG_Index table:
    validate_single_file(ilmn2cpg_dict)
    df = pd.read_csv(ilmn2cpg_dict,
                     sep='\t',
                     header=None,
                     names=['ilmn', 'cpg'])
    if ref is None:
        return df

    # validate and read reference file
    validate_single_file(ref)
    rf = pd.read_csv(ref, header=None, usecols=[0], names=['ilmn'])
    # remove first row if it's not a cg entry:
    if pd.isna(rf['ilmn'][0]) or not rf['ilmn'][0].startswith('cg'):
        rf = rf.iloc[1:, :]

    # merge reference file with map table
    mf = df.merge(rf, how='right', on='ilmn')

    # if there are sites that appear in the reference but not in the map table,
    # remove them and print a warning
    missing_sites = mf[mf['cpg'].isna()]
    if not missing_sites.empty:
        msg = 'WARNING: Skipping some unrecognized Illumina IDs \n'
        msg += f'(not found in the map table {ilmn2cpg_dict})\n'
        if not missing_sites['ilmn'].empty:
            eprint(missing_sites['ilmn'])
            eprint(list(missing_sites['ilmn']))
            msg += 'The missing sites: {}'.format(','.join(
                map(str, missing_sites['ilmn'])))
        eprint(msg)
    mf = mf[~mf['cpg'].isna()]

    mf['cpg'] = mf['cpg'].astype(int)
    return mf
Example #19
    def validate_args(self):
        if self.args.min_cpg < 1:
            raise IllegalArgumentError('min_cpg must be a positive integer')
        validate_single_file(self.args.blocks_path)
        validate_single_file(self.args.groups_file)
Example #20
def betas2table(betas, blocks, groups_file, min_cov, threads=8, verbose=False):
    validate_single_file(blocks)
    gf = groups_load_wrap(groups_file, betas)
    blocks_df = load_blocks_file(blocks)
    return get_table(blocks_df, gf, min_cov, threads, verbose)
Example #21
def load_group_file(groups_file, betas):
    validate_single_file(groups_file)
    validate_file_list(betas)
    gf = load_gfile_helper(groups_file)
    gf['full_path'] = match_prefix_to_bin(gf['fname'], betas, '.beta')
    return gf
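
All of the snippets above call validate_single_file without showing its definition. As a point of reference, here is a minimal sketch of what those calls appear to assume the helper does; the signature and behavior are inferred from the examples above, not taken from the wgbstools source.

import os.path as op


class IllegalArgumentError(ValueError):
    # stand-in for the project's exception type used throughout the snippets
    pass


def validate_single_file(file_path, suff=None):
    # Hedged sketch (assumption, not the library implementation): fail fast
    # when the path is missing, is not an existing regular file, or does not
    # end with the expected suffix (e.g. '.pat.gz' or '.beta').
    if file_path is None:
        raise IllegalArgumentError('no input file provided')
    if not op.isfile(file_path):
        raise IllegalArgumentError(f'no such file: {file_path}')
    if suff is not None and not file_path.endswith(suff):
        raise IllegalArgumentError(f'file {file_path} must end with "{suff}"')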