Example #1
def check_config_reseq2(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']
    y, refs = dict(), set()
    num_run = 0
    for yid, ydic in c['y'].items():
        if not ydic['run']: continue
        num_run += 1

        for rsubdir in [c['dirl'], c['dirj']]:
            subdir = op.join(c['dirw'], yid, rsubdir)
            if not op.isdir(subdir):
                makedirs(subdir)

        ydic['t'] = dict()
        fs = "%s/%s/%s.tsv" % (c['dirh'], c['callvnt2']['idir'], yid)
        sl = pd.read_csv(fs, sep="\t", header=0)
        cols = sl.columns.values.tolist()
        for i in range(len(sl)):
            sid = sl['sid'][i]
            sdic = {x: sl[x][i] for x in cols if x != 'sid'}
            ydic['t'][sid] = sdic

        refs.add(ydic['ref'])
        y[yid] = ydic

    for ref in refs:
        check_genome(ref, c)

    c['y'] = y
    print('working on %s datasets' % num_run)

    return c
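
For reference, a minimal sketch of the nested dict this builds, with a hypothetical yid and per-sample columns (only 'sid' is fixed; the other column names vary by samplelist):

# hypothetical shape of c['y'] after check_config_reseq2
c['y'] = {
    'yid1': {
        'run': True,
        'ref': 'Zmays_B73',
        't': {
            's01': {'Genotype': 'B73', 'Tissue': 'leaf'},
            's02': {'Genotype': 'Mo17', 'Tissue': 'leaf'},
        },
    },
}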
Example #2
def check_config_bsseq(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']

    df = read_study_list(c)
    y, refs = dict(), set()
    num_run = 0
    for i in range(len(df)):
        if not df['runB'][i]: continue
        num_run += 1
        yid = df['yid'][i]
        y1 = {x: df[x][i] for x in list(df) if x != 'yid'}

        for rsubdir in [c['dirl'], c['dirj']]:
            subdir = op.join(c['dirw'], yid, rsubdir)
            if not op.isdir(subdir):
                makedirs(subdir)

        diri = op.join(c['barn']['home'], c['barn']['odir'])
        y1.update(
            read_samplelist(diri=diri,
                            yid=yid,
                            part_size=c['trimming']['part_size']))
        y[yid] = y1

        refs.add(y1['ref'])

    c['y'] = y
    print('working on %s datasets' % num_run)

    for ref in refs:
        check_genome(ref, c)
    return c
Example #3
def _download_file(fn, dest=None):
    url = URL.format(fn)
    if dest is None:
        dest = fn
    makedirs(os.path.dirname(dest))
    shell('wget -q -O- {url} > {dest}')
    return dest
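
A minimal usage sketch, assuming URL is a module-level template that the file name fills in (both the template and the paths below are hypothetical):

URL = 'https://example.com/raw/{}'  # hypothetical template

path = _download_file('data/sample.bed', dest='/tmp/demo/sample.bed')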
Example #4
def main(args: argparse.Namespace) -> None:
    """
    Perform the whole design update and rebuild the configuration.
    """
    logging.debug("Loading rna-count-salmon configuration, building new one")
    old_config = load_old_config(args.results_config)
    new_config = args_to_dict(args, old_config)
    design = update_design(old_config['design'])

    logging.debug("Saving output files")
    makedirs(new_config["workdir"])
    write_yaml(Path(new_config["config"]), new_config)
    design.to_csv(new_config["design"], sep="\t")
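
write_yaml is not shown in this snippet; a minimal sketch of a compatible helper, with the name and signature taken from the call above and the body assumed:

from pathlib import Path
import yaml

def write_yaml(path: Path, data: dict) -> None:
    # dump the configuration mapping as block-style YAML
    path.write_text(yaml.dump(data, default_flow_style=False))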
Example #5
def check_config_brbseq(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']

    for rsubdir in [c['dirl'], c['dirj']]:
        subdir = op.join(c['dirw'], rsubdir)
        if not op.isdir(subdir):
            makedirs(subdir)

    refs = ['Zmays_B73']
    for ref in refs:
        check_genome(ref, c, 'B73_vt01')

    return c
Example #6
def _download_file(fn, d):
    """
    Intended to be called from a pytest.fixture function.

    `fn` is a path to a file that is used to fill in `URL`. `d` is a tempdir
    likely created by the calling function to which the file will be
    downloaded.

    The path to the downloaded file is returned.
    """
    url = URL.format(fn)
    dest = os.path.join(d, fn)
    makedirs(os.path.dirname(dest))
    shell('wget -q -O- {url} > {dest}')
    return dest
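
As the docstring suggests, a pytest fixture would supply the tempdir; a minimal sketch, with the fixture name and file path being hypothetical:

import pytest

@pytest.fixture
def sample_bed(tmp_path):
    # download one test file into a per-test tempdir
    return _download_file('data/sample.bed', str(tmp_path))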
Example #7
def check_config_reseq3(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']
    y, refs = dict(), set()
    num_run = 0
    for yid, ydic in c['y'].items():
        if not ydic['run']: continue
        num_run += 1

        for rsubdir in [c['dirl'], c['dirj']]:
            subdir = op.join(c['dirw'], yid, rsubdir)
            if not op.isdir(subdir):
                makedirs(subdir)

        fs = "%s/31_vnt_list/%s.txt" % (c['dirh'], yid)
        assert op.isfile(fs), "samplelist not found: %s" % fs
        sl = pd.read_csv(fs, sep="\t", names=['sid'], header=None)
        ydic['samples'] = sl['sid'].tolist()

        fs = "%s/35_vnt_ase/%s.tsv" % (c['dirh'], yid)
        sl = pd.read_csv(fs, sep="\t", header=0)
        ydic['ase_genotypes'] = sl['Genotype'].tolist()
        cols = sl.columns.values.tolist()
        ydic['ase'] = dict()
        for i in range(len(sl)):
            gt = sl['Genotype'][i]
            gdic = {x: sl[x][i] for x in cols if x != 'Genotype'}
            ydic['ase'][gt] = gdic

        vid = ydic['vid']
        fv = "/home/springer/zhoux379/projects/reseq/data/cache/vcf/%s.vcf.gz" % vid
        assert op.isfile(fv), "vcf not found: %s" % fv
        ydic['vcf'] = fv

        refs.add(ydic['ref'])
        y[yid] = ydic

    for ref in refs:
        check_genome(ref, c)

    c['y'] = y
    print('working on %s datasets' % num_run)

    return c
Example #8
def check_config_phylo(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']
    for fn in [c['studylist']]:
        assert op.isfile(fn), "cannot read %s" % fn

    df = pd.read_excel(c['studylist'],
                       sheet_name=0,
                       header=0,
                       converters={"run": bool})

    y, refs = dict(), set()
    num_run = 0
    for yid, ydic in c['y'].items():
        if not ydic['run']: continue
        num_run += 1

        for rsubdir in [c['dirl'], c['dirj']]:
            subdir = op.join(c['dirw'], yid, rsubdir)
            if not op.isdir(subdir):
                makedirs(subdir)

        fs = "%s/05_sample_list/%s.txt" % (c['dirh'], yid)
        assert op.isfile(fs), "samplelist not found: %s" % fs
        sl = pd.read_csv(fs, sep="\t", names=['sid'], header=None)
        ydic['samples'] = sl['sid'].tolist()

        vid = ydic['vid']
        fv = "/home/springer/zhoux379/projects/reseq/data/cache/vcf/%s.vcf.gz" % vid
        assert op.isfile(fv), "vcf not found: %s" % fv
        ydic['vcf'] = fv

        refs.add(ydic['ref'])
        y[yid] = ydic

    for ref in refs:
        check_genome(ref, c)

    c['y'] = y
    print('working on %s datasets' % num_run)

    return c
Example #9
def main(args: argparse.Namespace) -> None:
    """
    Build the config and save it, then build the design and save it.

    Parameters:
        args    argparse.Namespace  The parsed command line

    Example:
    >>> main(args)
    """
    logging.debug("Building output directory")
    makedirs(args.workdir)

    logging.debug("Building configuration")
    config_dict = args_to_dict(args)
    with open(config_dict["config"], "w") as config_out:
        config_out.write(yaml.dump(config_dict, default_flow_style=False))

    logging.debug("Building design")
    design_frame = build_design(search_bam(args.bam_dir),
                                previous_design=args.previous_design)
    design_frame.to_csv(config_dict["design"], sep="\t")
Example #10
def check_config_barn(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']

    df = read_study_list(c)
    y, num_run = dict(), 0
    for i in range(len(df)):
        if not df['run'][i]: continue
        num_run += 1
        y1 = {x: df[x][i] for x in list(df) if x != 'yid'}
        yid = df['yid'][i]

        for rsubdir in [c['dirl'], c['dirj']]:
            subdir = op.join(c['dirw'], yid, rsubdir)
            if not op.isdir(subdir):
                makedirs(subdir)

        idir = c['barn']['idir_sra']
        if y1['source'] == 'local':
            idir = c['barn']['idir_local']
        fs = "%s/%s/%s.tsv" % (c['dirh'], idir, yid)
        assert op.isfile(fs), "samplelist not found: %s" % fs
        y1['samplelist'] = fs

        sl = pd.read_csv(fs, sep="\t", header=0)
        y1['SampleID'] = sl['SampleID'].tolist()
        y1['t'] = dict()
        cols = sl.columns.values.tolist()
        for j in range(len(sl)):
            sid = sl['SampleID'][j]
            sdic = {x: sl[x][j] for x in cols}
            y1['t'][sid] = sdic

        y[yid] = y1

    c['y'] = y
    print('working on %s datasets' % num_run)

    return c
Example #11
def check_config_default(c):
    for fn in [c['config_default']]:
        assert op.isfile(fn), "cannot read %s" % fn

    with open(c['config_default'], 'r') as fh:
        cfg_default = yaml.safe_load(fh)
    update_config(cfg_default, c)
    c = cfg_default

    for fn in [c['config_job_default']]:
        assert op.isfile(fn), "cannot read %s" % fn

    cfg_job = read_job_config(c)
    update_config(cfg_job, c)
    c = cfg_job

    dirh0, dirc0 = c['dir_project'], c['dir_cache']
    pid, wid, oid = c['pid'], c['wid'], c['oid']
    c['dirh'] = op.join(dirh0, pid, wid)
    c['dirc'] = op.join(dirc0, pid, wid)
    c['dirr'] = op.join(dirh0, pid, wid, oid)
    dirh, dirc, dirr = c['dirh'], c['dirc'], c['dirr']
    dirr_l = op.join(dirc, oid)
    for subdir in [dirh, dirc, dirr, c['tmpdir']]:
        if not op.isdir(subdir):
            makedirs(subdir)
    make_symlink(dirr, dirr_l)

    dirh_l = op.join(dirc, 'primary')
    dirc_l = op.join(dirh, 'cache')
    make_symlink(dirc, dirc_l)
    make_symlink(dirh, dirh_l)

    xdic = read_genome_config(c)
    gdic = {g: dict() for g in xdic.keys()}
    c['x'] = xdic
    c['g'] = gdic

    return c
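
check_config_default layers the user config over the defaults with snakemake.utils.update_config, which merges nested dicts recursively; a minimal sketch of that behavior (the keys here are illustrative):

from snakemake.utils import update_config

cfg_default = {'pid': 'proj0', 'trimming': {'part_size': 100, 'adapter': 'auto'}}
c = {'pid': 'maize', 'trimming': {'part_size': 50}}
update_config(cfg_default, c)  # values from c win; nested keys are merged
assert cfg_default == {'pid': 'maize', 'trimming': {'part_size': 50, 'adapter': 'auto'}}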
Example #12
def check_config_rnaseq(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']

    df = read_study_list(c)
    y, refs = dict(), set()
    num_run = 0
    for i in range(len(df)):
        if not df['runR'][i]: continue
        num_run += 1
        yid = df['yid'][i]
        y1 = {x: df[x][i] for x in list(df) if x != 'yid'}

        for rsubdir in [c['dirl'], c['dirj']]:
            subdir = op.join(c['dirw'], yid, rsubdir)
            if not op.isdir(subdir):
                makedirs(subdir)

        diri = op.join(c['barn']['home'], c['barn']['odir'])
        y1.update(read_samplelist(diri=diri, yid=yid, cap_gt=True))
        y[yid] = y1

        if y[yid]['ril']:
            if yid not in c['ril_variant'] or not op.isfile(
                    c['ril_variant'][yid]):
                print("no variant file to do RIL genotyping: %s" % yid)
                sys.exit(1)

        ref, tag_hisat2 = y1['ref'], y1['hisat2']
        if ref not in refs:
            check_genome(ref, c, tag_hisat2)
        refs.add(y1['ref'])

    c['y'] = y
    print('working on %s datasets' % num_run)

    return c
Example #13
    logging.debug("Loading rna-count-salmon configuration, building new one")
    old_config = load_old_config(args.results_config)
    new_config = args_to_dict(args, old_config)
    design = update_design(old_config['design'])

    logging.debug("Saving output files")
    makedirs(new_config["workdir"])
    write_yaml(Path(new_config["config"]), new_config)
    design.to_csv(new_config["design"], sep="\t")


# Running program if not imported
if __name__ == '__main__':
    # Parsing command line
    args = parse(sys.argv[1:])
    makedirs("logs/prepare")

    # Build logging object and behaviour
    logging.basicConfig(
        filename="logs/prepare/config.log",
        filemode="w",
        level=logging.DEBUG
    )

    try:
        main(args)
    except Exception as e:
        logging.exception("%s", e)
        sys.exit(1)

    logging.info("Process over")
Example #14
__license__ = "MIT"

import os.path as op
from snakemake.shell import shell
from snakemake.utils import makedirs
from snakemake_wrapper_utils.java import get_java_opts

# Gathering extra parameters and logging behaviour
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
extra = snakemake.params.get("extra", "")
java_opts = get_java_opts(snakemake)

# If the input mpileup file is gzipped, it is unzipped and piped.
# In that case, it is recommended to use at least 2 threads:
# - One for unzipping with zcat
# - One for running varscan
pileup = (" cat {} ".format(snakemake.input[0])
          if not snakemake.input[0].endswith("gz") else " zcat {} ".format(
              snakemake.input[0]))

# Building output directories
makedirs(op.dirname(snakemake.output[0]))

shell("varscan mpileup2indel "  # Tool and its subprocess
      "<( {pileup} ) "
      "{java_opts} {extra} "  # Extra parameters
      "> {snakemake.output[0]} "  # Path to vcf file
      "{log}"  # Logging behaviour
      )
Example #15
from os import path
from snakemake.shell import shell
from snakemake.utils import makedirs

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

read_length = snakemake.config["parameters"]["general"]["read_length"]
genome_dir = snakemake.output[0] + "/"
annotation = snakemake.config["locations"]["annotation"]
extra = ""

if snakemake.config["parameters"]["star"]["index"]["sjdb_overhang"]:
    extra += "--sjdbOverhang {} ".format(str(read_length - 1))

makedirs(genome_dir)
shell("STAR "
      "--runMode genomeGenerate "
      "--runThreadN {snakemake.params.ntasks} "
      "--genomeDir {genome_dir} "
      "--outFileNamePrefix {genome_dir} "
      "--genomeFastaFiles {snakemake.input} "
      "--sjdbGTFfile {annotation} "
      "{extra}"
      "{log}")
Example #16
        data = data[[column]]
        data.columns = [sample_id]

        try:
            merged_frame = pd.merge(
                merged_frame,
                data,
                left_index=True,
                right_index=True
            )
        except TypeError:
            merged_frame = data

    merged_frame = merged_frame.fillna(0)
    return merged_frame


if __name__ == '__main__':
    makedirs(op.dirname(snakemake.output["est_counts"]))

    for column in ["est_counts", "tpm"]:
        data = extract_field(
            *snakemake.input["quants"],
            prefix="pseudo_mapping/",
            column=column
        )

        print(data.head(), file=sys.stderr)

        data.to_csv(snakemake.output[column], sep="\t")
Example #17
        sy = x/math.ceil(x*py/y)
    else:
        sy = y/py
    return math.floor(max(sx, sy))


rule extract_contributor_avatar:
    '''
    Create an image composed of the avatar of all the contributors
    '''
    output:
        contributors="images/contributors.png"
    run:
        avatar_paths = []
        avatar_dir = os.path.join("images", "avatars")
        makedirs(avatar_dir)
        # parse the contributors
        for contri in repo.get_contributors():
            # get the url to the avatar
            avatar_url = contri.avatar_url
            # download the avatar with requests
            avatar_path = os.path.join(avatar_dir, "%s.png" % contri.login)
            if not os.path.exists(avatar_path):
                r = requests.get(avatar_url, stream=True)
                r.raise_for_status()
                with open(avatar_path, "ab") as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
            # add the path to the list of image paths
            avatar_paths.append(avatar_path)
        # create image to combine the avatars
Example #18
    exe = 'jams'
    logger.warning(f'WARNING: searching PATH for "{exe}"')
    exe = which(exe)

# all our attempts to find the file have failed
if exe is None:
    raise Exception(f"ERROR: jams executable {exe} not found")

command.append(f" {exe} ")

# if we have specified output files then set output to their
# intended location
if len(snakemake.output) > 0:
    output_path = os.path.dirname(snakemake.output[0])
    if output_path:
        makedirs(output_path)
        command.append(f" --output=\"{output_path}\" ")

if name is not None:
    command.append(f' --name=\"{name}\" ')

# Look through the input files for "*.cfg" files and append them.
# Note: we don't necessarily have to have a config file; the whole config
# could be given as strings
for file in snakemake.input:
    if file.endswith("cfg"):
        command.append(f" \"{file}\" ")

# If a h5 file is given as input, use the data for the initial 
# spins, only use the first h5 file specified. The painful series
# of escape characters is because we have to escape in both python
Example #19
if 'M' in params: mailuser = "******" % str(params['M'])
if "nodes" in params: nodes = "nodes=%d" % params["nodes"]
if 'ppn' in params: ppn = "ppn=%d" % (params["ppn"] + retry * params['appn'])
if ppn and not nodes: nodes = "nodes=1"
if "mem" in params: mem = "mem=%dgb" % (params["mem"] + retry * params['amem'])
if "runtime" in params:
    walltime = "walltime=%d:00:00" % (params["runtime"] +
                                      retry * params['aruntime'])

print('  '.join((jname, jobo, jobe)), file=sys.stderr)
print("  ".join((q, ppn, mem, walltime)), file=sys.stderr)
#sys.exit(2)

for jdir in set([os.path.dirname(p) for p in [jobe, jobo]]):
    if not os.path.isdir(jdir):
        makedirs(jdir)

if nodes or ppn or mem or walltime: resourceparams = " -l \""
if nodes: resourceparams = resourceparams + nodes
if nodes and ppn: resourceparams = resourceparams + ":" + ppn
if nodes and mem: resourceparams = resourceparams + ","
if mem: resourceparams = resourceparams + mem
if walltime and (nodes or mem): resourceparams = resourceparams + ","
if walltime: resourceparams = resourceparams + walltime
if nodes or mem or walltime: resourceparams = resourceparams + "\""

cmd = "qsub {a}{A}{b}{c}{C}{d}{D}{e}{f}{h}{j}{l}{m}{M}{N}{o}{p}{P}{q}{t}{u}{v}{V}{w}{W}{rp}{dep}{ex}".format(\
    a=atime,A=acc_string,b=pbs_time,c=chkpt,C=pref,d=dd,D=rd,e=se,f=ft,h=hold,j=j,l=resource,m=mail,M=mailuser,\
    N=jname,o=so,p=priority,P=proxy,q=q,t=ar,u=user,v=ev,V=eall,w=wd,W=add,rp=resourceparams,dep=depend,ex=extras)

try:
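
For reference, with nodes=1, ppn=4, mem=16, and runtime=12 (retry=0), the resource block above assembles:

# resourceparams == ' -l "nodes=1:ppn=4,mem=16gb,walltime=12:00:00"'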
Example #20
from os import path
from snakemake.shell import shell
from snakemake.utils import makedirs

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

threads = snakemake.params.ntasks
accession = snakemake.params.accession
fastq_dir = snakemake.params.fastq_dir
tmp_dir = path.join(fastq_dir, "tmp")
makedirs(tmp_dir)

extra = ""

shell("parallel-fastq-dump "
      "-s {accession} "
      "-t {threads} "
      "-O {fastq_dir} "
      "--tmpdir {tmp_dir} "
      "--split-3 "
      "-I "
      "--gzip "
      "{log}")
Example #21
__email__ = "*****@*****.**"
__license__ = "MIT"

import os.path as op

from snakemake.shell import shell
from snakemake.utils import makedirs
from snakemake_wrapper_utils.java import get_java_opts

# Defining logging and gathering extra parameters
log = snakemake.log_fmt_shell(stdout=True, stderr=True)
extra = snakemake.params.get("extra", "")
java_opts = get_java_opts(snakemake)

# Building output dirs
makedirs(op.dirname(snakemake.output.snp))
makedirs(op.dirname(snakemake.output.indel))

# Output prefix
prefix = op.splitext(snakemake.output.snp)[0]

# Searching for input files
pileup_pair = ["normal_pileup", "tumor_pileup"]

in_pileup = ""
mpileup = ""
if "mpileup" in snakemake.input.keys():
    # Case there is a mpileup with both normal and tumor
    in_pileup = snakemake.input.mpileup
    mpileup = "--mpileup 1"
elif all(pileup in snakemake.input.keys() for pileup in pileup_pair):
Example #22
import tempfile
from snakemake.shell import shell
from snakemake.utils import makedirs

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

extra = snakemake.params.get("extra", "")
sjdb_overhang = snakemake.params.get("sjdbOverhang", "100")

gtf = snakemake.input.get("gtf")
if gtf is not None:
    gtf = f"--sjdbGTFfile {gtf}"
    sjdb_overhang = f"--sjdbOverhang {sjdb_overhang}"
else:
    gtf = sjdb_overhang = ""

makedirs(snakemake.output)

with tempfile.TemporaryDirectory() as tmpdir:
    shell("STAR"
          " --runThreadN {snakemake.threads}"  # Number of threads
          " --runMode genomeGenerate"  # Indexation mode
          " --genomeFastaFiles {snakemake.input.fasta}"  # Path to fasta files
          " {sjdb_overhang}"  # Read-len - 1
          " {gtf}"  # Highly recommended GTF
          " {extra}"  # Optional parameters
          " --outTmpDir {tmpdir}/STARtmp"  # Temp dir
          " --genomeDir {snakemake.output}"  # Path to output
          " {log}"  # Logging
          )
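
This matches the shape of a snakemake-wrappers script (fasta/gtf inputs, extra and sjdbOverhang params); a minimal sketch of a rule that could drive it through the wrapper directive, with the version tag and paths being illustrative:

rule star_index:
    input:
        fasta='genome.fasta',
        gtf='annotation.gtf',
    output:
        directory('star_index'),
    threads: 8
    params:
        sjdbOverhang='100',
        extra='',
    log:
        'logs/star_index.log'
    wrapper:
        'v1.21.4/bio/star/index'  # illustrative version tag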