def check_config_reseq2(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']
    y, refs = dict(), set()
    num_run = 0
    for yid, ydic in c['y'].items():
        if not ydic['run']:
            continue
        num_run += 1
        for rsubdir in [c['dirl'], c['dirj']]:
            subdir = op.join(c['dirw'], yid, rsubdir)
            if not op.isdir(subdir):
                makedirs(subdir)
        # read the per-study sample table and index it by sample ID
        ydic['t'] = dict()
        fs = "%s/%s/%s.tsv" % (c['dirh'], c['callvnt2']['idir'], yid)
        sl = pd.read_csv(fs, sep="\t", header=0)
        cols = sl.columns.values.tolist()
        for i in range(len(sl)):
            sid = sl['sid'][i]
            sdic = {x: sl[x][i] for x in cols if x != 'sid'}
            ydic['t'][sid] = sdic
        refs.add(ydic['ref'])
        y[yid] = ydic
    for ref in refs:
        check_genome(ref, c)
    c['y'] = y
    print('working on %s datasets' % num_run)
    return c
def check_config_bsseq(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']
    df = read_study_list(c)
    y, refs = dict(), set()
    num_run = 0
    for i in range(len(df)):
        if not df['runB'][i]:
            continue
        num_run += 1
        yid = df['yid'][i]
        y1 = {x: df[x][i] for x in list(df) if x != 'yid'}
        for rsubdir in [c['dirl'], c['dirj']]:
            subdir = op.join(c['dirw'], yid, rsubdir)
            if not op.isdir(subdir):
                makedirs(subdir)
        diri = op.join(c['barn']['home'], c['barn']['odir'])
        y1.update(read_samplelist(diri=diri, yid=yid,
                                  part_size=c['trimming']['part_size']))
        y[yid] = y1
        refs.add(y1['ref'])
    c['y'] = y
    print('working on %s datasets' % num_run)
    for ref in refs:
        check_genome(ref, c)
    return c
import os

from snakemake.shell import shell
from snakemake.utils import makedirs


def _download_file(fn, dest=None):
    """Download `fn` (formatted into the module-level `URL` template, defined
    elsewhere) to `dest` (defaults to `fn`) and return the destination path."""
    url = URL.format(fn)
    if dest is None:
        dest = fn
    # guard against a bare filename, whose dirname is the empty string
    dirname = os.path.dirname(dest)
    if dirname:
        makedirs(dirname)
    shell('wget -q -O- {url} > {dest}')
    return dest
def main(args: argparse.Namespace) -> None:
    """
    Perform the whole design update and configuration calls.
    """
    logging.debug("Loading rna-count-salmon configuration, building new one")
    old_config = load_old_config(args.results_config)
    new_config = args_to_dict(args, old_config)
    design = update_design(old_config['design'])

    logging.debug("Saving output files")
    makedirs(new_config["workdir"])
    write_yaml(Path(new_config["config"]), new_config)
    design.to_csv(new_config["design"], sep="\t")
def check_config_brbseq(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']
    for rsubdir in [c['dirl'], c['dirj']]:
        subdir = op.join(c['dirw'], rsubdir)
        if not op.isdir(subdir):
            makedirs(subdir)
    refs = ['Zmays_B73']
    for ref in refs:
        check_genome(ref, c, 'B73_vt01')
    return c
def _download_file(fn, d):
    """
    Intended to be called from a pytest.fixture function.

    `fn` is a path to a file that is used to fill in `URL`. `d` is a tempdir,
    likely created by the calling function, to which the file will be
    downloaded. The path to the downloaded file is returned.
    """
    url = URL.format(fn)
    dest = os.path.join(d, fn)
    makedirs(os.path.dirname(dest))
    shell('wget -q -O- {url} > {dest}')
    return dest
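# The docstring above says this helper is meant to be driven from a pytest
# fixture. A minimal sketch of such a fixture, assuming `_download_file` is
# in scope; the file name is hypothetical, and tmpdir is pytest's built-in
# per-test temporary directory fixture:
import pytest


@pytest.fixture
def example_data_file(tmpdir):
    # downloads once into the per-test tempdir and hands the path to the test
    return _download_file("data/example.txt", str(tmpdir))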
def check_config_reseq3(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']
    y, refs = dict(), set()
    num_run = 0
    for yid, ydic in c['y'].items():
        if not ydic['run']:
            continue
        num_run += 1
        for rsubdir in [c['dirl'], c['dirj']]:
            subdir = op.join(c['dirw'], yid, rsubdir)
            if not op.isdir(subdir):
                makedirs(subdir)
        fs = "%s/31_vnt_list/%s.txt" % (c['dirh'], yid)
        assert op.isfile(fs), "samplelist not found: %s" % fs
        sl = pd.read_csv(fs, sep="\t", names=['sid'], header=None)
        ydic['samples'] = sl['sid'].tolist()
        fs = "%s/35_vnt_ase/%s.tsv" % (c['dirh'], yid)
        sl = pd.read_csv(fs, sep="\t", header=0)
        ydic['ase_genotypes'] = sl['Genotype'].tolist()
        cols = sl.columns.values.tolist()
        ydic['ase'] = dict()
        for i in range(len(sl)):
            gt = sl['Genotype'][i]
            gdic = {x: sl[x][i] for x in cols if x != 'Genotype'}
            ydic['ase'][gt] = gdic
        vid = ydic['vid']
        fv = "/home/springer/zhoux379/projects/reseq/data/cache/vcf/%s.vcf.gz" % vid
        assert op.isfile(fv), "vcf not found: %s" % fv
        ydic['vcf'] = fv
        refs.add(ydic['ref'])
        y[yid] = ydic
    for ref in refs:
        check_genome(ref, c)
    c['y'] = y
    print('working on %s datasets' % num_run)
    return c
def check_config_phylo(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']
    for fn in [c['studylist']]:
        assert op.isfile(fn), "cannot read %s" % fn
    df = pd.read_excel(c['studylist'], sheet_name=0, header=0,
                       converters={"run": bool})
    y, refs = dict(), set()
    num_run = 0
    for yid, ydic in c['y'].items():
        if not ydic['run']:
            continue
        num_run += 1
        for rsubdir in [c['dirl'], c['dirj']]:
            subdir = op.join(c['dirw'], yid, rsubdir)
            if not op.isdir(subdir):
                makedirs(subdir)
        fs = "%s/05_sample_list/%s.txt" % (c['dirh'], yid)
        assert op.isfile(fs), "samplelist not found: %s" % fs
        sl = pd.read_csv(fs, sep="\t", names=['sid'], header=None)
        ydic['samples'] = sl['sid'].tolist()
        vid = ydic['vid']
        fv = "/home/springer/zhoux379/projects/reseq/data/cache/vcf/%s.vcf.gz" % vid
        assert op.isfile(fv), "vcf not found: %s" % fv
        ydic['vcf'] = fv
        refs.add(ydic['ref'])
        y[yid] = ydic
    for ref in refs:
        check_genome(ref, c)
    c['y'] = y
    print('working on %s datasets' % num_run)
    return c
def main(args: argparse.Namespace) -> None:
    """
    Build config and save it, then build design and save it.

    Parameters:
        args    The parsed command line arguments

    Example:
    >>> main(args)
    """
    logging.debug("Building output directory")
    makedirs(args.workdir)

    logging.debug("Building configuration")
    config_dict = args_to_dict(args)
    with open(config_dict["config"], "w") as config_out:
        config_out.write(yaml.dump(config_dict, default_flow_style=False))

    logging.debug("Building design")
    design_frame = build_design(search_bam(args.bam_dir),
                                previous_design=args.previous_design)
    design_frame.to_csv(config_dict["design"], sep="\t")
def check_config_barn(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']
    df = read_study_list(c)
    y, num_run = dict(), 0
    for i in range(len(df)):
        if not df['run'][i]:
            continue
        num_run += 1
        y1 = {x: df[x][i] for x in list(df) if x != 'yid'}
        yid = df['yid'][i]
        for rsubdir in [c['dirl'], c['dirj']]:
            subdir = op.join(c['dirw'], yid, rsubdir)
            if not op.isdir(subdir):
                makedirs(subdir)
        idir = c['barn']['idir_sra']
        if y1['source'] == 'local':
            idir = c['barn']['idir_local']
        fs = "%s/%s/%s.tsv" % (c['dirh'], idir, yid)
        assert op.isfile(fs), "samplelist not found: %s" % fs
        y1['samplelist'] = fs
        sl = pd.read_csv(fs, sep="\t", header=0)
        y1['SampleID'] = sl['SampleID'].tolist()
        y1['t'] = dict()
        cols = sl.columns.values.tolist()
        # use a separate index so the outer loop variable is not shadowed
        for j in range(len(sl)):
            sid = sl['SampleID'][j]
            sdic = {x: sl[x][j] for x in cols}
            y1['t'][sid] = sdic
        y[yid] = y1
    c['y'] = y
    print('working on %s datasets' % num_run)
    return c
def check_config_default(c):
    for fn in [c['config_default']]:
        assert op.isfile(fn), "cannot read %s" % fn
    cfg_default = yaml.safe_load(open(c['config_default'], 'r'))
    update_config(cfg_default, c)
    c = cfg_default

    for fn in [c['config_job_default']]:
        assert op.isfile(fn), "cannot read %s" % fn
    cfg_job = read_job_config(c)
    update_config(cfg_job, c)
    c = cfg_job

    dirh0, dirc0 = c['dir_project'], c['dir_cache']
    pid, wid, oid = c['pid'], c['wid'], c['oid']
    c['dirh'] = op.join(dirh0, pid, wid)
    c['dirc'] = op.join(dirc0, pid, wid)
    c['dirr'] = op.join(dirh0, pid, wid, oid)
    dirh, dirc, dirr = c['dirh'], c['dirc'], c['dirr']
    dirr_l = op.join(dirc, oid)
    for subdir in [dirh, dirc, dirr, c['tmpdir']]:
        if not op.isdir(subdir):
            makedirs(subdir)
    make_symlink(dirr, dirr_l)
    dirh_l = op.join(dirc, 'primary')
    dirc_l = op.join(dirh, 'cache')
    make_symlink(dirc, dirc_l)
    make_symlink(dirh, dirh_l)

    xdic = read_genome_config(c)
    gdic = {g: dict() for g in xdic.keys()}
    c['x'] = xdic
    c['g'] = gdic
    return c
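# The layering above relies on snakemake.utils.update_config, which
# recursively merges its second argument into its first, in place, so the
# user-supplied config wins over the defaults file. A minimal standalone
# sketch of that behavior with toy dicts (values are illustrative):
from snakemake.utils import update_config

defaults = {'tmpdir': '/tmp', 'trimming': {'part_size': 1000000}}
user = {'trimming': {'part_size': 500000}}
update_config(defaults, user)
# only the overridden leaf changes; sibling keys survive the merge
assert defaults == {'tmpdir': '/tmp', 'trimming': {'part_size': 500000}}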
def check_config_rnaseq(c):
    c = check_config_default(c)
    c['dirw'] = c['dirc']
    df = read_study_list(c)
    y, refs = dict(), set()
    num_run = 0
    for i in range(len(df)):
        if not df['runR'][i]:
            continue
        num_run += 1
        yid = df['yid'][i]
        y1 = {x: df[x][i] for x in list(df) if x != 'yid'}
        for rsubdir in [c['dirl'], c['dirj']]:
            subdir = op.join(c['dirw'], yid, rsubdir)
            if not op.isdir(subdir):
                makedirs(subdir)
        diri = op.join(c['barn']['home'], c['barn']['odir'])
        y1.update(read_samplelist(diri=diri, yid=yid, cap_gt=True))
        y[yid] = y1
        if y[yid]['ril']:
            if yid not in c['ril_variant'] or not op.isfile(c['ril_variant'][yid]):
                print("no variant file to do RIL genotyping: %s" % yid)
                sys.exit(1)
        ref, tag_hisat2 = y1['ref'], y1['hisat2']
        if ref not in refs:
            check_genome(ref, c, tag_hisat2)
        refs.add(y1['ref'])
    c['y'] = y
    print('working on %s datasets' % num_run)
    return c
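# All of these check_config_* hooks share one contract: take the raw
# Snakemake config dict, layer in defaults, validate sample lists and
# genomes, and return the enriched dict. A hedged sketch of how such a hook
# might be wired into a Snakefile (the module name, config file, and target
# pattern below are assumptions, not taken from the source):
configfile: 'config.yml'

from functions import check_config_rnaseq
config = check_config_rnaseq(config)

rule all:
    input:
        expand("%s/{yid}/done" % config['dirw'], yid=list(config['y']))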
logging.debug("Loading rna-count-salmon configuration, building new one") old_config = load_old_config(args.results_config) new_config = args_to_dict(args, old_config) design = update_design(old_config['design']) logging.debug("Saving output files") makedirs(new_config["workdir"]) write_yaml(Path(new_config["config"]), new_config) design.to_csv(new_config["design"], sep="\t") # Running programm if not imported if __name__ == '__main__': # Parsing command line args = parse(sys.argv[1:]) makedirs("logs/prepare") # Build logging object and behaviour logging.basicConfig( filename="logs/prepare/config.log", filemode="w", level=10 ) try: main(args) except Exception as e: logging.exception("%s", e) sys.exit(1) logging.info("Process over")
__license__ = "MIT" import os.path as op from snakemake.shell import shell from snakemake.utils import makedirs from snakemake_wrapper_utils.java import get_java_opts # Gathering extra parameters and logging behaviour log = snakemake.log_fmt_shell(stdout=False, stderr=True) extra = snakemake.params.get("extra", "") java_opts = get_java_opts(snakemake) # In case input files are gzipped mpileup files, # they are being unzipped and piped # In that case, it is recommended to use at least 2 threads: # - One for unzipping with zcat # - One for running varscan pileup = (" cat {} ".format(snakemake.input[0]) if not snakemake.input[0].endswith("gz") else " zcat {} ".format( snakemake.input[0])) # Building output directories makedirs(op.dirname(snakemake.output[0])) shell("varscan mpileup2indel " # Tool and its subprocess "<( {pileup} ) " "{java_opts} {extra} " # Extra parameters "> {snakemake.output[0]} " # Path to vcf file "{log}" # Logging behaviour )
from os import path

from snakemake.shell import shell
from snakemake.utils import makedirs

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

read_length = snakemake.config["parameters"]["general"]["read_length"]
genome_dir = snakemake.output[0] + "/"
annotation = snakemake.config["locations"]["annotation"]

extra = ""
if snakemake.config["parameters"]["star"]["index"]["sjdb_overhang"]:
    # STAR recommends sjdbOverhang = read length - 1
    extra += "--sjdbOverhang {} ".format(str(read_length - 1))

makedirs(genome_dir)

shell(
    "STAR "
    "--runMode genomeGenerate "
    "--runThreadN {snakemake.params.ntasks} "
    "--genomeDir {genome_dir} "
    "--outFileNamePrefix {genome_dir} "
    "--genomeFastaFiles {snakemake.input} "
    "--sjdbGTFfile {annotation} "
    "{extra}"
    "{log}"
)
        data = data[[column]]
        data.columns = [sample_id]
        try:
            merged_frame = pd.merge(
                merged_frame, data, left_index=True, right_index=True
            )
        except TypeError:
            # merged_frame starts as None, so the first file initialises it
            merged_frame = data
    # fillna returns a new frame and must be re-assigned to take effect
    merged_frame = merged_frame.fillna(0)
    return merged_frame


if __name__ == '__main__':
    makedirs(op.dirname(snakemake.output["est_counts"]))
    for column in ["est_counts", "tpm"]:
        data = extract_field(
            *snakemake.input["quants"],
            prefix="pseudo_mapping/",
            column=column
        )
        print(data.head(), file=sys.stderr)
        data.to_csv(snakemake.output[column], sep="\t")
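# Note that pd.merge defaults to an inner join, so after repeated merges the
# result only keeps transcripts present in every file and fillna(0) is then
# a no-op; the call reads as if an outer join were intended. A hedged sketch
# of the outer-join variant using pd.concat, on toy frames (sample and
# transcript names are illustrative, not from the source):
import pandas as pd

frames = [
    pd.DataFrame({"s1": [10, 0]}, index=["tx1", "tx2"]),
    pd.DataFrame({"s2": [5, 7]}, index=["tx1", "tx3"]),
]
# axis=1 aligns on the index (transcript IDs) with an outer join, so
# transcripts missing from a sample become NaN and fillna(0) matters
merged = pd.concat(frames, axis=1).fillna(0)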
        sy = x / math.ceil(x * py / y)
    else:
        sy = y / py
    return math.floor(max(sx, sy))


rule extract_contributor_avatar:
    '''
    Create an image composed of the avatars of all the contributors
    '''
    output:
        contributors="images/contributors.png"
    run:
        avatar_paths = []
        avatar_dir = os.path.join("images", "avatars")
        makedirs(avatar_dir)
        # parse the contributors
        for contri in repo.get_contributors():
            # get the url to the avatar
            avatar_url = contri.avatar_url
            # download the avatar with requests
            avatar_path = os.path.join(avatar_dir, "%s.png" % contri.login)
            if not os.path.exists(avatar_path):
                r = requests.get(avatar_url, stream=True)
                r.raise_for_status()
                with open(avatar_path, "ab") as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
            # add the path to the list of image paths
            avatar_paths.append(avatar_path)
        # create image to combine the avatars
    # fall back to searching PATH for the executable
    exe = 'jams'
    logger.warning(f'WARNING: searching PATH for "{exe}"')
    exe = which(exe)

# all our attempts to find the file have failed
if exe is None:
    raise Exception("ERROR: jams executable not found")
command.append(f" {exe} ")

# if we have specified output files then set output to their
# intended location
if len(snakemake.output) > 0:
    output_path = os.path.dirname(snakemake.output[0])
    if output_path:
        makedirs(output_path)
        command.append(f" --output=\"{output_path}\" ")

if name is not None:
    command.append(f' --name=\"{name}\" ')

# Look through the input files for "*.cfg" files and append them.
# Note: We don't necessarily have to have a config file; the whole config
# could be given as strings.
for file in snakemake.input:
    if file.endswith("cfg"):
        command.append(f" \"{file}\" ")

# If a h5 file is given as input, use the data for the initial
# spins, only use the first h5 file specified. The painful series
# of escape characters is because we have to escape in both python
if 'M' in params:
    mailuser = "******" % str(params['M'])
if "nodes" in params:
    nodes = "nodes=%d" % params["nodes"]
if 'ppn' in params:
    ppn = "ppn=%d" % (params["ppn"] + retry * params['appn'])
if ppn and not nodes:
    nodes = "nodes=1"
if "mem" in params:
    mem = "mem=%dgb" % (params["mem"] + retry * params['amem'])
if "runtime" in params:
    walltime = "walltime=%d:00:00" % (params["runtime"] + retry * params['aruntime'])

print(' '.join((jname, jobo, jobe)), file=sys.stderr)
print(" ".join((q, ppn, mem, walltime)), file=sys.stderr)
#sys.exit(2)

# make sure the job log directories exist
for jdir in set([os.path.dirname(p) for p in [jobe, jobo]]):
    if not os.path.isdir(jdir):
        makedirs(jdir)

# assemble the "-l" resource request string
if nodes or ppn or mem or walltime:
    resourceparams = " -l \""
    if nodes:
        resourceparams += nodes
    if nodes and ppn:
        resourceparams += ":" + ppn
    if nodes and mem:
        resourceparams += ","
    if mem:
        resourceparams += mem
    if walltime and (nodes or mem):
        resourceparams += ","
    if walltime:
        resourceparams += walltime
    if nodes or mem or walltime:
        resourceparams += "\""

cmd = "qsub {a}{A}{b}{c}{C}{d}{D}{e}{f}{h}{j}{l}{m}{M}{N}{o}{p}{P}{q}{t}{u}{v}{V}{w}{W}{rp}{dep}{ex}".format(
    a=atime, A=acc_string, b=pbs_time, c=chkpt, C=pref, d=dd, D=rd, e=se,
    f=ft, h=hold, j=j, l=resource, m=mail, M=mailuser, N=jname, o=so,
    p=priority, P=proxy, q=q, t=ar, u=user, v=ev, V=eall, w=wd, W=add,
    rp=resourceparams, dep=depend, ex=extras)
try:
from os import path

from snakemake.shell import shell
from snakemake.utils import makedirs

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

threads = snakemake.params.ntasks
memory = snakemake.params.mem  # currently unused by the command below
accession = snakemake.params.accession
fastq_dir = snakemake.params.fastq_dir
tmp_dir = path.join(fastq_dir, "tmp")
makedirs(tmp_dir)

shell(
    "parallel-fastq-dump "
    "-s {accession} "
    "-t {threads} "
    "-O {fastq_dir} "
    "--tmpdir {tmp_dir} "
    "--split-3 "
    "-I "
    "--gzip "
    "{log}"
)
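# A hedged sketch of a rule that could feed this script (the directory
# layout, resource values, and script path are assumptions):
rule parallel_fastq_dump:
    output:
        "fastq/{accession}_1.fastq.gz",
        "fastq/{accession}_2.fastq.gz"
    params:
        ntasks=8,
        mem="16G",
        accession="{accession}",
        fastq_dir="fastq"
    log:
        "logs/parallel_fastq_dump/{accession}.log"
    script:
        "scripts/parallel_fastq_dump.py"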
__email__ = "*****@*****.**" __license__ = "MIT" import os.path as op from snakemake.shell import shell from snakemake.utils import makedirs from snakemake_wrapper_utils.java import get_java_opts # Defining logging and gathering extra parameters log = snakemake.log_fmt_shell(stdout=True, stderr=True) extra = snakemake.params.get("extra", "") java_opts = get_java_opts(snakemake) # Building output dirs makedirs(op.dirname(snakemake.output.snp)) makedirs(op.dirname(snakemake.output.indel)) # Output prefix prefix = op.splitext(snakemake.output.snp)[0] # Searching for input files pileup_pair = ["normal_pileup", "tumor_pileup"] in_pileup = "" mpileup = "" if "mpileup" in snakemake.input.keys(): # Case there is a mpileup with both normal and tumor in_pileup = snakemake.input.mpileup mpileup = "--mpileup 1" elif all(pileup in snakemake.input.keys() for pileup in pileup_pair):
import tempfile

from snakemake.shell import shell
from snakemake.utils import makedirs

log = snakemake.log_fmt_shell(stdout=True, stderr=True)
extra = snakemake.params.get("extra", "")
sjdb_overhang = snakemake.params.get("sjdbOverhang", "100")

gtf = snakemake.input.get("gtf")
if gtf is not None:
    gtf = f"--sjdbGTFfile {gtf}"
    sjdb_overhang = f"--sjdbOverhang {sjdb_overhang}"
else:
    gtf = sjdb_overhang = ""

makedirs(snakemake.output)

with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "STAR"
        " --runThreadN {snakemake.threads}"            # Number of threads
        " --runMode genomeGenerate"                    # Indexation mode
        " --genomeFastaFiles {snakemake.input.fasta}"  # Path to fasta files
        " {sjdb_overhang}"                             # Read length - 1
        " {gtf}"                                       # Highly recommended GTF
        " {extra}"                                     # Optional parameters
        " --outTmpDir {tmpdir}/STARtmp"                # Temp dir
        " --genomeDir {snakemake.output}"              # Path to output
        " {log}"                                       # Logging
    )
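# A hedged sketch of a rule invoking this indexing script (reference paths,
# thread count, and script location are assumptions, not from the source):
rule star_index:
    input:
        fasta="refs/genome.fa",
        gtf="refs/annotation.gtf"
    output:
        directory("refs/star_index")
    params:
        sjdbOverhang="100",
        extra=""
    threads: 16
    log:
        "logs/star/index.log"
    script:
        "scripts/star_index.py"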