def readSeqs(spec, tid_list, utr=False):
    """Read the exon FASTA (and optionally the UTR FASTAs) for one species.

    Args:
        spec: Species to read, either "mouse" or "rat".
        tid_list: Transcript IDs; passed straight through to parseHeaderIds().
        utr: When True, the 5' and 3' UTR files are read as well.

    Returns:
        A tuple (exons, utr5, utr3). Each entry is the result of
        parseHeaderIds() on the corresponding FASTA dictionary. When utr is
        False, utr5 and utr3 are both empty strings.

    Raises:
        ValueError: If spec is neither "mouse" nor "rat". Without this check
            the file-name variables would be unbound and the failure would
            surface later as an UnboundLocalError.
    """
    if spec == "mouse":
        exons_file = "../Reference-genomes/mm10/Mus_musculus.GRCm38.exon.all.200flank.fa"
        utr5_file = "../Reference-genomes/mm10/Mus_musculus.GRCm38.5utr.fa"
        utr3_file = "../Reference-genomes/mm10/Mus_musculus.GRCm38.3utr.fa"
    elif spec == "rat":
        exons_file = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.exon.all.200flank.fa"
        utr5_file = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.5utr.fa"
        utr3_file = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.3utr.fa"
    else:
        raise ValueError(
            "Unsupported species (expected \"mouse\" or \"rat\"): {!r}".format(spec))
    # Sequences downloaded from Ensembl Biomart 99

    mcore.PWS("# " + mcore.getDateTime() + " Reading " + spec +
              " exon sequences: " + exons_file)
    exons = mseq.fastaGetDict(exons_file)
    # Read the sequences
    exons = parseHeaderIds(exons, "exon", tid_list)
    # Parse the header IDs so they only contain the exon ID.
    mcore.PWS("# Total sequences read: " + str(len(exons)))
    mcore.PWS("# ----------------")
    # This block reads the exon sequences plus their flanking sequence
    # (the input file names indicate a 200bp flank).

    # Placeholders returned unchanged when the UTRs are not requested.
    utr5, utr3 = "", ""
    if utr:
        mcore.PWS("# " + mcore.getDateTime() + " Reading " + spec +
                  " UTR sequences: " + utr5_file + " " + utr3_file)
        utr5 = mseq.fastaGetDict(utr5_file)
        utr5 = parseHeaderIds(utr5, "utr", tid_list)
        utr3 = mseq.fastaGetDict(utr3_file)
        utr3 = parseHeaderIds(utr3, "utr", tid_list)
        # Read the sequences and parse the header IDs so each exon coincides
        # with the UTR for the transcript
        mcore.PWS("# Total 5' UTRs read: " + str(len(utr5)))
        mcore.PWS("# Total 3' UTRs read: " + str(len(utr3)))
        mcore.PWS("# ----------------")
    # This block reads the UTR sequences

    return exons, utr5, utr3
# Resolve the species to process: either every known species or the
# comma-separated list given on the command line.
if args.spec != "all":
    spec = args.spec.replace(", ", ",").split(",")
    for s in spec:
        if s in spec_ids:
            continue
        sys.exit(" * ERROR SF2: Cannot find specified species: " + s)
else:
    spec = specs_ordered
# Parse the input species.

##########################
# Reporting run-time info for records.

with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent post-dedup reformat commands",
                  jobfile)

    # Step section of the job-file header.
    mcore.PWS("# STEP INFO", jobfile)
    for _label, _value in (
        ("# Current step:", step),
        ("# Previous step:", prev_step),
    ):
        mcore.PWS(mcore.spacedOut(_label, pad) + _value, jobfile)
    mcore.PWS("# ----------", jobfile)

    # I/O section of the job-file header, in the original output order.
    mcore.PWS("# I/O INFO", jobfile)
    for _label, _value in (
        ("# Input directory:", prev_step_dir),
        ("# Output directory:", dedup_dir),
        ("# Intermediate reformat directory:", reformat_dir),
        ("# reformat.sh path:", args.path),
        ("# Species:", args.spec),
        ("# Seq runs:", args.runtype),
    ):
        mcore.PWS(mcore.spacedOut(_label, pad) + _value, jobfile)
import gzip
import os
import re
from collections import defaultdict

import mcore
import mseq

############################################################

ref = "../Reference-genomes/mm10/mm10.fa"
gtffile_mouse = "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz"
transcript_file = "../02-Annotation-data/selected-transcripts.txt"
outdir = "../02-Annotation-data/transcript-seq/"
#outdir = "../02-Annotation-data/ts2/";
logfilename = "get_selected_seqs.log"
# Hardcoded file names

with open(logfilename, "w") as logfile:
    # Record the run configuration at the top of the log.
    mcore.runTime("# Rodent exomes -- get mouse CDS", logfile)
    for _label, _value in (
        ("# Mouse reference FASTA: ", ref),
        ("# Mouse GTF file: ", gtffile_mouse),
        ("# Transcripts file: ", transcript_file),
        ("# Sequence output dir: ", outdir),
        ("# Log file: ", logfilename),
    ):
        mcore.PWS(_label + _value, logfile)
    mcore.PWS("# ----------------", logfile)

    mcore.PWS("# " + mcore.getDateTime() + " Reading mouse transcripts...",
              logfile)
    mouse_transcripts = dict()
    transcript_len_sum = 0
    first = True
    for line in open(transcript_file):
        # Skip '#' comment lines and the first line seen; either case
        # clears the `first` flag.
        if first or line.startswith("#"):
            first = False
            continue
        # Tab-delimited fields of the current row.
        line = line.strip().split("\t")
target_file = "../Targets/targets-mm10-coords.bed"
tile_file = "../Targets/tiles-mm10-coords.bed"
# Reference options

# Resolve the species to process: either every known species or the
# comma-separated list given on the command line.
if args.spec == "all":
    spec = specs_ordered
else:
    spec = args.spec.replace(", ", ",").split(",")
    for s in spec:
        if s not in spec_ids:
            # sys.exit() takes a single argument; passing the error code and
            # the message separately raised a TypeError instead of exiting
            # with the message.
            sys.exit(" * ERROR SF2: Cannot find specified species: " + s)
# Parse the input species.

with open(output_file, "w") as jobfile:
    # Write the run-time header of the generated job script.
    mcore.runTime("#!/bin/bash\n# Rodent BAM commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(
        mcore.spacedOut("# Current step:", pad) + "BAM merging", jobfile)
    mcore.PWS(mcore.spacedOut("# Input directory:", pad) + indir, jobfile)
    mcore.PWS(
        mcore.spacedOut("# Species job directory:", pad) + jobs_dir, jobfile)
    if not os.path.isdir(jobs_dir):
        mcore.PWS("# Creating jobs directory.", jobfile)
        # Create the directory without going through a shell, so paths with
        # spaces or shell metacharacters work and failures are not silently
        # ignored.
        os.makedirs(jobs_dir, exist_ok=True)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, jobfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
    mcore.PWS(mcore.spacedOut("# Logfile directory:", pad) + logdir, jobfile)
base_logdir = os.path.abspath("logs/") step = "05-Index" logdir = os.path.join(base_logdir, step + "-logs") # Step I/O info. runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids) print(runtype, runstrs) # Parse the input run types. spec = mfiles.parseSpecs(args.spec, specs_ordered) # Parse the input species. with open(output_file, "w") as jobfile: mcore.runTime("#!/bin/bash\n# Rodent BWA commands", jobfile) mcore.PWS("# STEP INFO", jobfile) #mcore.PWS(mcore.spacedOut("# Reads directory:", pad) + read_dir, jobfile); mcore.PWS( mcore.spacedOut("# Assembly input directory:", pad) + assembly_indir, jobfile) mcore.PWS( mcore.spacedOut("# Assembly output directory:", pad) + assembly_outdir, jobfile) #mcore.PWS(mcore.spacedOut("# Output directory:", pad) + bam_dir, jobfile); #mcore.PWS(mcore.spacedOut("# Pseudo-it path:", pad) + args.path, jobfile); mcore.PWS(mcore.spacedOut("# Species:", pad) + str(args.spec), jobfile) mcore.PWS( mcore.spacedOut("# Run types:", pad) + str(args.runtype), jobfile) if not args.name: mcore.PWS( "# -n not specified --> Generating random string for job name",
total_target_len = 0.0 for line in open(target_file): line = line.strip().split("\t") total_target_len += (float(line[2]) - float(line[1])) runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids) # Parse the input run types. specs = mfiles.parseSpecs(args.spec, specs_ordered) specs = [s for s in specs if "(no WGA)" not in s and "pos_ctrl" not in s] specs = [s for s in specs if any(r in spec_ids[s] for r in runtype)] # Parse the input species. with open(outfilename, "w") as outfile, mp.Pool(processes=args.procs) as pool: mcore.runTime("# Rodent assembly and mapping stats", outfile) mcore.PWS( mcore.spacedOut("# Total species:", pad) + str(len(specs)), outfile) mcore.PWS( mcore.spacedOut("# Total target length:", pad) + str(total_target_len), outfile) mcore.PWS(mcore.spacedOut("# Mapping directory:", pad) + map_dir, outfile) mcore.PWS( mcore.spacedOut("# Assembly directory:", pad) + assembly_dir, outfile) mcore.PWS(mcore.spacedOut("# Output file:", pad) + outfilename, outfile) mcore.PWS("# ----------", outfile) cols = [ 'num-scaffs', 'avg-scaff-len', 'asm-len', 'asm-n50', 'asm-l50', 'asm-reads-mapped', 'asm-perc-reads-mapped', 'asm-paired-mapped', 'asm-perc-paired-mapped', 'asm-pair-mapped-diff-chr', 'asm-single-mapped', 'asm-perc-single-mapped', 'asm-duplicate-reads', 'asm-avg-depth', 'asm-avg-start-depth', 'asm-avg-mid-depth',
output_file = "count-reads.csv"
# Job files

runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)
# Parse the input run types.

# Keep only species that are neither "(no WGA)" nor "pos_ctrl" entries and
# that have at least one of the requested run types in spec_ids.
specs = [
    s for s in mfiles.parseSpecs(args.spec, specs_ordered)
    if not ("(no WGA)" in s or "pos_ctrl" in s)
    and any(r in spec_ids[s] for r in runtype)
]
# Parse the input species.

##########################
# Reporting run-time info for records.

with open(output_file, "w") as outfile, mp.Pool(
        processes=args.procs) as pool:
    mcore.runTime("# Rodent read counting", outfile)
    for _label, _value in (
        ("# Input directory:", args.indir),
        ("# Output file:", output_file),
        ("# Species:", args.spec),
        ("# Seq runs:", args.runtype),
        ("# Job file:", output_file),
    ):
        mcore.PWS(mcore.spacedOut(_label, pad) + _value, outfile)
    mcore.PWS("# ----------", outfile)
    mcore.PWS("# BEGIN OUTPUT", outfile)
    ##########################

    # Column header row; note it is passed to PWS without the output file.
    headers = ["Total bases", "Total reads"]
    mcore.PWS(",".join(["Species"] + headers))

    # Initial state for the code that follows.
    outlines = {}
    chunk_num = 1
    spec_num = 1
    cur_specs = []
base_logdir = os.path.abspath("logs/")
logdir = os.path.join(base_logdir, "{}-logs".format(step))
# Step I/O info.

runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)
#print(runtype, runstrs);
# Parse the input run types.

# Keep only species that are neither "(no WGA)" nor "pos_ctrl" entries and
# that have at least one of the requested run types in spec_ids.
specs = [
    s for s in mfiles.parseSpecs(args.spec, specs_ordered)
    if not ("(no WGA)" in s or "pos_ctrl" in s)
    and any(r in spec_ids[s] for r in runtype)
]
# Parse the input species.

with open(output_file, "w") as jobfile:
    # Write the run-time header of the generated job script.
    mcore.runTime("#!/bin/bash\n# Rodent pileup commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    for _label, _value in (
        ("# BAM directory:", bam_dir),
        ("# Assembly directory:", assembly_dir),
        ("# Output directory:", pileup_dir),
        ("# Pseudo-it path:", args.path),
        ("# Species:", str(args.spec)),
        ("# Run types:", str(args.runtype)),
    ):
        mcore.PWS(mcore.spacedOut(_label, pad) + _value, jobfile)

    # Note in the header when no job name was given on the command line.
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    for _label, _value in (
        ("# Job name:", name),
        ("# Logfile directory:", logdir),
    ):
        mcore.PWS(mcore.spacedOut(_label, pad) + _value, jobfile)
base_logdir = os.path.abspath("logs/")
logdir = os.path.join(base_logdir, step + "-logs")
# Step I/O info.

runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)
#print(runtype, runstrs);
# Parse the input run types.

# Keep only species that are neither "(no WGA)" nor "pos_ctrl" entries and
# that have at least one of the requested run types in spec_ids.
specs = mfiles.parseSpecs(args.spec, specs_ordered)
specs = [s for s in specs if "(no WGA)" not in s and "pos_ctrl" not in s]
specs = [s for s in specs if any(r in spec_ids[s] for r in runtype)]
# Parse the input species.

with open(output_file, "w") as jobfile:
    # Write the run-time header of the generated job script.
    mcore.runTime("#!/bin/bash\n# Rodent BWA re-map commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(mcore.spacedOut("# Current step:", pad) + step, jobfile)
    mcore.PWS(mcore.spacedOut("# Previous step:", pad) + prev_step, jobfile)
    mcore.PWS(
        mcore.spacedOut("# Input directory:", pad) + prev_step_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Assembly directory:", pad) + ref_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Output directory:", pad) + step_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# BWA path:", pad) + args.path, jobfile)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, jobfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
    mcore.PWS(mcore.spacedOut("# Logfile directory:", pad) + logdir, jobfile)
    if not os.path.isdir(logdir):
        mcore.PWS("# Creating logfile directory.", jobfile)
        # logdir sits below the "logs/" directory, which may not exist yet;
        # makedirs creates the missing parents and, unlike a shell "mkdir",
        # handles paths with spaces and reports failures.
        os.makedirs(logdir, exist_ok=True)
    mcore.PWS(mcore.spacedOut("# Job file:", pad) + output_file, jobfile)
datasets = [ "australian-full-all", "australian-full-coding", "australian-reduced-all", "australian-reduced-coding", "reproductive-all", "reproductive-coding", "reproductive-mclennan-all", "reproductive-mclennan-coding", "reproductive-pahl-all", "reproductive-pahl-coding", "reproductive-testes-mass-all", "reproductive-testes-mass-coding", "reproductive-sperm-img-all", "reproductive-sperm-img-coding", "reproductive-sperm-morpho-all", "reproductive-sperm-morpho-coding", "full-all", "full-coding" ] dataset = "reproductive-all" if dataset not in datasets: sys.exit(" * ERROR: check dataset.") mcore.PWS("# " + mcore.getDateTime() + " Separating sequences for dataset: " + dataset) #### exclude_samples = [] add_rat = False add_mouse = False rm_samples = False rmdir = "../03-Alignments/samples-to-rm/" # Job variables #### orthfile = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab" # The ortholog file between mouse and rat. core.PWS("# " + core.getDateTime() + " Reading selected transcript IDs: " +
base_logdir = os.path.abspath("logs/")
logdir = os.path.join(base_logdir, "{}-logs".format(step))
# Step I/O info.

runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)
# Parse the input run types.

spec = mfiles.parseSpecs(args.spec, specs_ordered)
# Parse the input species.

##########################
# Reporting run-time info for records.

with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent Spades commands", jobfile)

    # Step section of the job-file header.
    mcore.PWS("# STEP INFO", jobfile)
    for _label, _value in (
        ("# Current step:", step),
        ("# Previous step:", prev_step),
    ):
        mcore.PWS(mcore.spacedOut(_label, pad) + _value, jobfile)
    mcore.PWS("# ----------", jobfile)

    # I/O section of the job-file header, in the original output order.
    mcore.PWS("# I/O INFO", jobfile)
    for _label, _value in (
        ("# Input directory:", prev_step_dir),
        ("# Output directory:", step_dir),
        ("# Spades path:", args.path),
        ("# Species:", args.spec),
        ("# Seq runs:", args.runtype),
    ):
        mcore.PWS(mcore.spacedOut(_label, pad) + _value, jobfile)

    # Note in the header when no job name was given on the command line.
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
if len(sys.argv) < 2: sys.exit(" * ERROR: Species must be provided: macaque or human") species = sys.argv[1] if sys.argv[1] not in ["mouse", "rat"]: sys.exit(" * ERROR: Species must be provided: macaque or human") if species == 'mouse': gtffile = "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz" regstr = "MUS" elif species == 'rat': gtffile = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.99.gtf.gz" regstr = "RNO" outfilename = "intron-sizes-" + species + ".csv" mcore.runTime("# Rodent exomes -- get intron lengths") mcore.PWS("# GTF file: " + gtffile) mcore.PWS("# Output file: " + outfilename) mcore.PWS("# ----------------") mcore.PWS("# " + mcore.getDateTime() + " Reading transcripts...") transcripts = {} transcript_len_sum, first = 0, True for line in gzip.open(gtffile): line = line.decode() if line[0] == "#": continue line = line.strip().split("\t") feature_type, chrome, start, end, strand, feature_info = line[2], line[ 0], int(line[3]), int(line[4]), line[6], line[8] if feature_type == "transcript" and "protein_coding" in feature_info:
infile = "../02-Annotation-data/mouse-rat-orths-ens99.txt" gtffile_mouse = "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz" gtffile_rat = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.99.gtf.gz" if mode == "targets": target_overlaps = "../Targets/bed/mm10-targets-to-exons-0.9.bed" outfilename = "../02-Annotation-data/selected-transcripts-targets.txt" elif mode == "length": outfilename = "../02-Annotation-data/selected-transcripts-length.txt" ds_thresh = 0.5 with open(outfilename, "w") as outfile: mcore.runTime("# Rodent exomes -- select mouse trancsripts", outfile) mcore.PWS("# Mouse GTF file: " + gtffile_mouse, outfile) mcore.PWS("# Rat GTF file: " + gtffile_rat, outfile) mcore.PWS("# Ensembl ortholog file: " + infile, outfile) if mode == "targets": mcore.PWS("# Target overlaps file: " + target_overlaps, outfile) mcore.PWS("# Output file: " + outfilename, outfile) mcore.PWS("# --------------", outfile) mcore.PWS("# dS threshold: " + str(ds_thresh), outfile) mcore.PWS("# --------------", outfile) if mode == "targets": mcore.PWS("# " + mcore.getDateTime() + " Reading target overlaps...", outfile) mouse_transcript_overlaps = {} for line in open(target_overlaps):
pad = 26 cwd = os.getcwd() # Job vars output_file = os.path.join(cwd, "jobs", name + ".sh") submit_file = os.path.join(cwd, "submit", name + ".sh") logdir = os.path.join(args.output, "logs") # Job files ########################## # Reporting run-time info for records. with open(output_file, "w") as outfile: mcore.runTime("#!/bin/bash\n# Exonerate command generator", outfile) mcore.PWS("# IO OPTIONS", outfile) mcore.PWS( mcore.spacedOut("# Input directory:", pad) + args.input, outfile) if args.outname: mcore.PWS( mcore.spacedOut("# --outname:", pad) + "Using end of output directory path as job name.", outfile) if not args.name: mcore.PWS( "# -n not specified --> Generating random string for job name", outfile) mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, outfile) mcore.PWS( mcore.spacedOut("# Output directory:", pad) + args.output, outfile) if args.overwrite: mcore.PWS(