Exemplo n.º 1
0
def readSeqs(spec, tid_list, utr=False):
    """Read the exon (and optionally UTR) FASTA sequences for a reference species.

    Args:
        spec: Reference species to read; must be "mouse" or "rat".
        tid_list: Forwarded unchanged to parseHeaderIds() for every set of
            sequences (presumably the transcript IDs of interest -- confirm
            against parseHeaderIds).
        utr: When True, the 5' and 3' UTR files are read as well.

    Returns:
        A tuple (exons, utr5, utr3) holding the values returned by
        parseHeaderIds(). utr5 and utr3 are empty strings when utr is False.

    Raises:
        ValueError: If spec is not one of the supported species.
    """
    seq_files = {
        "mouse": (
            "../Reference-genomes/mm10/Mus_musculus.GRCm38.exon.all.200flank.fa",
            "../Reference-genomes/mm10/Mus_musculus.GRCm38.5utr.fa",
            "../Reference-genomes/mm10/Mus_musculus.GRCm38.3utr.fa",
        ),
        "rat": (
            "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.exon.all.200flank.fa",
            "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.5utr.fa",
            "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.3utr.fa",
        ),
    }
    # Sequences downloaded from Ensembl Biomart 99

    if spec not in seq_files:
        # Fail early with a clear message instead of an UnboundLocalError on
        # the first use of a file name further down.
        raise ValueError("Unsupported species: " + str(spec) +
                         " (expected one of: " + ", ".join(seq_files) + ")")
    exons_file, utr5_file, utr3_file = seq_files[spec]

    mcore.PWS("# " + mcore.getDateTime() + " Reading " + spec +
              " exon sequences: " + exons_file)
    exons = mseq.fastaGetDict(exons_file)
    # Read the sequences
    exons = parseHeaderIds(exons, "exon", tid_list)
    # Parse the header IDs so they only contain the exon ID.
    mcore.PWS("# Total sequences read: " + str(len(exons)))
    mcore.PWS("# ----------------")
    # This block reads the exon sequences plus their flanking sequence
    # (the input file names indicate 200bp flanks).

    utr5, utr3 = "", ""
    # Placeholders returned as-is when UTRs are not requested.
    if utr:
        mcore.PWS("# " + mcore.getDateTime() + " Reading " + spec +
                  " UTR sequences: " + utr5_file + " " + utr3_file)
        utr5 = mseq.fastaGetDict(utr5_file)
        utr5 = parseHeaderIds(utr5, "utr", tid_list)
        utr3 = mseq.fastaGetDict(utr3_file)
        utr3 = parseHeaderIds(utr3, "utr", tid_list)
        # Read the sequences and parse the header IDs so each exon coincides with the UTR for the transcript
        mcore.PWS("# Total 5' UTRs read: " + str(len(utr5)))
        mcore.PWS("# Total 3' UTRs read: " + str(len(utr3)))
        mcore.PWS("# ----------------")
    # This block reads the UTR sequences

    return exons, utr5, utr3
Exemplo n.º 2
0
# Parse the input species: "all" selects every species in specs_ordered,
# otherwise args.spec is a comma-separated list whose entries must each be
# present in spec_ids.
if args.spec != "all":
    spec = args.spec.replace(", ", ",").split(",")
    for s in spec:
        if s in spec_ids:
            continue
        sys.exit(" * ERROR SF2: Cannot find specified species: " + s)
else:
    spec = specs_ordered

##########################
# Reporting run-time info for records.

with open(output_file, "w") as jobfile:
    # Header lines recording how this job file was generated.
    mcore.runTime("#!/bin/bash\n# Rodent post-dedup reformat commands",
                  jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    for label, value in (
        ("# Current step:", step),
        ("# Previous step:", prev_step),
    ):
        mcore.PWS(mcore.spacedOut(label, pad) + value, jobfile)
    mcore.PWS("# ----------", jobfile)
    mcore.PWS("# I/O INFO", jobfile)
    # Each label is padded to a common width before its value is appended.
    for label, value in (
        ("# Input directory:", prev_step_dir),
        ("# Output directory:", dedup_dir),
        ("# Intermediate reformat directory:", reformat_dir),
        ("# reformat.sh path:", args.path),
        ("# Species:", args.spec),
        ("# Seq runs:", args.runtype),
    ):
        mcore.PWS(mcore.spacedOut(label, pad) + value, jobfile)
import os, mcore, mseq, gzip, re
from collections import defaultdict

############################################################

# Hardcoded file names; all paths are relative to the working directory.
ref = "../Reference-genomes/mm10/mm10.fa"
# Mouse reference FASTA.
gtffile_mouse = "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz"
# Mouse GTF file (gzip-compressed, per the file extension).
transcript_file = "../02-Annotation-data/selected-transcripts.txt"
# Transcripts file.
outdir = "../02-Annotation-data/transcript-seq/"
# Sequence output directory.
# Alternate output directory kept for reference: "../02-Annotation-data/ts2/"
logfilename = "get_selected_seqs.log"
# Log file.

with open(logfilename, "w") as logfile:
    mcore.runTime("# Rodent exomes -- get mouse CDS", logfile)
    mcore.PWS("# Mouse reference FASTA: " + ref, logfile)
    mcore.PWS("# Mouse GTF file:        " + gtffile_mouse, logfile)
    mcore.PWS("# Transcripts file:      " + transcript_file, logfile)
    mcore.PWS("# Sequence output dir:   " + outdir, logfile)
    mcore.PWS("# Log file:              " + logfilename, logfile)
    mcore.PWS("# ----------------", logfile)

    mcore.PWS("# " + mcore.getDateTime() + " Reading mouse transcripts...",
              logfile)
    mouse_transcripts = {}
    transcript_len_sum, first = 0, True
    for line in open(transcript_file):
        if line[0] == "#" or first:
            first = False
            continue
        line = line.strip().split("\t")
Exemplo n.º 4
0
target_file = "../Targets/targets-mm10-coords.bed"
tile_file = "../Targets/tiles-mm10-coords.bed"
# Reference options

# Parse the input species: "all" selects every species in specs_ordered,
# otherwise args.spec is a comma-separated list whose entries must each be
# present in spec_ids.
if args.spec == "all":
    spec = specs_ordered
else:
    spec = args.spec.replace(", ", ",").split(",")
    for s in spec:
        if s not in spec_ids:
            # sys.exit() accepts a single argument, so the error code and the
            # message are combined into one string (two arguments raise a
            # TypeError instead of exiting with the message).
            sys.exit(" * ERROR SF2: Cannot find specified species: " + s)

with open(output_file, "w") as jobfile:
    # Header lines recording how this job file was generated.
    mcore.runTime("#!/bin/bash\n# Rodent BAM commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(
        mcore.spacedOut("# Current step:", pad) + "BAM merging", jobfile)
    mcore.PWS(mcore.spacedOut("# Input directory:", pad) + indir, jobfile)
    mcore.PWS(
        mcore.spacedOut("# Species job directory:", pad) + jobs_dir, jobfile)
    if not os.path.isdir(jobs_dir):
        mcore.PWS("# Creating jobs directory.", jobfile)
        # Create the directory directly rather than through a shell string:
        # this handles paths with spaces, creates missing parents, and raises
        # on failure instead of silently continuing.
        os.makedirs(jobs_dir, exist_ok=True)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, jobfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
    mcore.PWS(mcore.spacedOut("# Logfile directory:", pad) + logdir, jobfile)
# Step I/O info: log files for this step go in <cwd>/logs/05-Index-logs.
base_logdir = os.path.abspath("logs/")
step = "05-Index"
logdir = os.path.join(base_logdir, step + "-logs")
# Step I/O info.

runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)
# NOTE(review): this print looks like leftover debugging output -- confirm
# whether it should be kept.
print(runtype, runstrs)
# Parse the input run types.

spec = mfiles.parseSpecs(args.spec, specs_ordered)
# Parse the input species.

with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent BWA commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    #mcore.PWS(mcore.spacedOut("# Reads directory:", pad) + read_dir, jobfile);
    mcore.PWS(
        mcore.spacedOut("# Assembly input directory:", pad) + assembly_indir,
        jobfile)
    mcore.PWS(
        mcore.spacedOut("# Assembly output directory:", pad) + assembly_outdir,
        jobfile)
    #mcore.PWS(mcore.spacedOut("# Output directory:", pad) + bam_dir, jobfile);
    #mcore.PWS(mcore.spacedOut("# Pseudo-it path:", pad) + args.path, jobfile);
    mcore.PWS(mcore.spacedOut("# Species:", pad) + str(args.spec), jobfile)
    mcore.PWS(
        mcore.spacedOut("# Run types:", pad) + str(args.runtype), jobfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
Exemplo n.º 6
0
# Sum the length (end - start) of every interval in the tab-delimited
# target file; columns 2 and 3 hold the start and end coordinates.
total_target_len = 0.0
with open(target_file) as target_stream:
    # The context manager closes the file handle once the sum is done.
    for line in target_stream:
        line = line.strip().split("\t")
        total_target_len += (float(line[2]) - float(line[1]))

# Parse the input run types.
runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)

# Parse the input species, dropping "(no WGA)" and "pos_ctrl" samples and any
# species whose spec_ids entry contains none of the requested run types.
specs = [
    name for name in mfiles.parseSpecs(args.spec, specs_ordered)
    if "(no WGA)" not in name and "pos_ctrl" not in name
    and any(run in spec_ids[name] for run in runtype)
]

with open(outfilename, "w") as outfile, mp.Pool(processes=args.procs) as pool:
    mcore.runTime("# Rodent assembly and mapping stats", outfile)
    mcore.PWS(
        mcore.spacedOut("# Total species:", pad) + str(len(specs)), outfile)
    mcore.PWS(
        mcore.spacedOut("# Total target length:", pad) + str(total_target_len),
        outfile)
    mcore.PWS(mcore.spacedOut("# Mapping directory:", pad) + map_dir, outfile)
    mcore.PWS(
        mcore.spacedOut("# Assembly directory:", pad) + assembly_dir, outfile)
    mcore.PWS(mcore.spacedOut("# Output file:", pad) + outfilename, outfile)
    mcore.PWS("# ----------", outfile)

    cols = [
        'num-scaffs', 'avg-scaff-len', 'asm-len', 'asm-n50', 'asm-l50',
        'asm-reads-mapped', 'asm-perc-reads-mapped', 'asm-paired-mapped',
        'asm-perc-paired-mapped', 'asm-pair-mapped-diff-chr',
        'asm-single-mapped', 'asm-perc-single-mapped', 'asm-duplicate-reads',
        'asm-avg-depth', 'asm-avg-start-depth', 'asm-avg-mid-depth',
Exemplo n.º 7
0
# Job files: the read counts are written to this CSV in the working directory.
output_file = "count-reads.csv"

# Parse the input run types.
runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)

# Parse the input species, then drop "(no WGA)" and "pos_ctrl" samples and
# any species whose spec_ids entry contains none of the requested run types.
specs = mfiles.parseSpecs(args.spec, specs_ordered)
specs = [s for s in specs if not ("(no WGA)" in s or "pos_ctrl" in s)]
specs = [s for s in specs if any(r in spec_ids[s] for r in runtype)]

##########################
# Reporting run-time info for records.
# Open the output CSV and a worker pool of args.procs processes; both are
# released when the block exits.
with open(output_file, "w") as outfile, mp.Pool(processes=args.procs) as pool:
    # Header lines recording the run-time options.
    mcore.runTime("# Rodent read counting", outfile)
    mcore.PWS(
        mcore.spacedOut("# Input directory:", pad) + args.indir, outfile)
    mcore.PWS(mcore.spacedOut("# Output file:", pad) + output_file, outfile)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, outfile)
    mcore.PWS(mcore.spacedOut("# Seq runs:", pad) + args.runtype, outfile)
    # NOTE(review): "# Job file:" reports the same path as "# Output file:"
    # above -- confirm this duplication is intended.
    mcore.PWS(mcore.spacedOut("# Job file:", pad) + output_file, outfile)
    mcore.PWS("# ----------", outfile)
    mcore.PWS("# BEGIN OUTPUT", outfile)

    ##########################
    # Column names for the per-species output rows.
    headers = ["Total bases", "Total reads"]
    # NOTE(review): unlike every other PWS call in this block, outfile is not
    # passed here -- confirm whether the CSV header should also reach the file.
    mcore.PWS("Species" + "," + ",".join(headers))

    # Initial state for the processing that follows this excerpt.
    outlines = {}
    chunk_num, spec_num = 1, 1
    cur_specs = []
# Step I/O info: log files for this step go in <cwd>/logs/<step>-logs.
base_logdir = os.path.abspath("logs/")
logdir = os.path.join(base_logdir, step + "-logs")

# Parse the input run types.
runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)

# Parse the input species, dropping "(no WGA)" and "pos_ctrl" samples and any
# species whose spec_ids entry contains none of the requested run types.
specs = [
    name for name in mfiles.parseSpecs(args.spec, specs_ordered)
    if "(no WGA)" not in name and "pos_ctrl" not in name
    and any(run in spec_ids[name] for run in runtype)
]

with open(output_file, "w") as jobfile:
    # Header lines recording how this job file was generated.
    mcore.runTime("#!/bin/bash\n# Rodent pileup commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    for label, value in (
        ("# BAM directory:", bam_dir),
        ("# Assembly directory:", assembly_dir),
        ("# Output directory:", pileup_dir),
        ("# Pseudo-it path:", args.path),
        ("# Species:", str(args.spec)),
        ("# Run types:", str(args.runtype)),
    ):
        mcore.PWS(mcore.spacedOut(label, pad) + value, jobfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    for label, value in (
        ("# Job name:", name),
        ("# Logfile directory:", logdir),
    ):
        mcore.PWS(mcore.spacedOut(label, pad) + value, jobfile)
# Step I/O info: log files for this step go in <cwd>/logs/<step>-logs.
base_logdir = os.path.abspath("logs/")
logdir = os.path.join(base_logdir, step + "-logs")

# Parse the input run types.
runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)

# Parse the input species, then drop "(no WGA)" and "pos_ctrl" samples and
# any species whose spec_ids entry contains none of the requested run types.
specs = mfiles.parseSpecs(args.spec, specs_ordered)
specs = [s for s in specs if not ("(no WGA)" in s or "pos_ctrl" in s)]
specs = [s for s in specs if any(r in spec_ids[s] for r in runtype)]

with open(output_file, "w") as jobfile:
    # Header lines recording how this job file was generated.
    mcore.runTime("#!/bin/bash\n# Rodent BWA re-map commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(mcore.spacedOut("# Current step:", pad) + step, jobfile)
    mcore.PWS(mcore.spacedOut("# Previous step:", pad) + prev_step, jobfile)
    mcore.PWS(mcore.spacedOut("# Input directory:", pad) + prev_step_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Assembly directory:", pad) + ref_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Output directory:", pad) + step_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# BWA path:", pad) + args.path, jobfile)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, jobfile)
    if not args.name:
        mcore.PWS("# -n not specified --> Generating random string for job name", jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
    mcore.PWS(mcore.spacedOut("# Logfile directory:", pad) + logdir, jobfile)
    if not os.path.isdir(logdir):
        mcore.PWS("# Creating logfile directory.", jobfile)
        # Create the directory directly rather than through a shell string:
        # this handles paths with spaces, creates the missing "logs" parent,
        # and raises on failure instead of silently continuing.
        os.makedirs(logdir, exist_ok=True)
    mcore.PWS(mcore.spacedOut("# Job file:", pad) + output_file, jobfile)
Exemplo n.º 10
0
# Names of the datasets this script accepts.
datasets = [
    "australian-full-all", "australian-full-coding", "australian-reduced-all",
    "australian-reduced-coding", "reproductive-all", "reproductive-coding",
    "reproductive-mclennan-all", "reproductive-mclennan-coding",
    "reproductive-pahl-all", "reproductive-pahl-coding",
    "reproductive-testes-mass-all", "reproductive-testes-mass-coding",
    "reproductive-sperm-img-all", "reproductive-sperm-img-coding",
    "reproductive-sperm-morpho-all", "reproductive-sperm-morpho-coding",
    "full-all", "full-coding"
]

# The dataset to process is hard-coded here; edit this line to switch. The
# check below guards against a name that is not in the list above.
dataset = "reproductive-all"
if dataset not in datasets:
    sys.exit(" * ERROR: check dataset.")

mcore.PWS("# " + mcore.getDateTime() + " Separating sequences for dataset: " +
          dataset)

####

# Defaults for the job options; how they are used is outside this excerpt.
exclude_samples = []
add_rat = False
add_mouse = False
rm_samples = False
rmdir = "../03-Alignments/samples-to-rm/"
# Job variables

####

orthfile = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab"
# The ortholog file between mouse and rat.
# The ortholog file between mouse and rat.
core.PWS("# " + core.getDateTime() + " Reading selected transcript IDs: " +
Exemplo n.º 11
0
# Step I/O info: log files for this step go in <cwd>/logs/<step>-logs.
base_logdir = os.path.abspath("logs/")
logdir = os.path.join(base_logdir, step + "-logs")

# Parse the input run types.
runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)

# Parse the input species.
spec = mfiles.parseSpecs(args.spec, specs_ordered)

##########################
# Reporting run-time info for records.

with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent Spades commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    for label, value in (
        ("# Current step:", step),
        ("# Previous step:", prev_step),
    ):
        mcore.PWS(mcore.spacedOut(label, pad) + value, jobfile)
    mcore.PWS("# ----------", jobfile)
    mcore.PWS("# I/O INFO", jobfile)
    # Each label is padded to a common width before its value is appended.
    for label, value in (
        ("# Input directory:", prev_step_dir),
        ("# Output directory:", step_dir),
        ("# Spades path:", args.path),
        ("# Species:", args.spec),
        ("# Seq runs:", args.runtype),
    ):
        mcore.PWS(mcore.spacedOut(label, pad) + value, jobfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
Exemplo n.º 12
0
# The species to process is the first command-line argument and must be
# "mouse" or "rat"; the error messages name exactly those accepted values.
if len(sys.argv) < 2:
    sys.exit(" * ERROR: Species must be provided: mouse or rat")
species = sys.argv[1]
if species not in ["mouse", "rat"]:
    sys.exit(" * ERROR: Species must be provided: mouse or rat")

# Select the GTF annotation for the species. regstr is a species-specific tag
# used later in the script (presumably to match Ensembl IDs -- confirm).
if species == 'mouse':
    gtffile = "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz"
    regstr = "MUS"
elif species == 'rat':
    gtffile = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.99.gtf.gz"
    regstr = "RNO"
outfilename = "intron-sizes-" + species + ".csv"

# Report the run-time info for records.
mcore.runTime("# Rodent exomes -- get intron lengths")
mcore.PWS("# GTF file:              " + gtffile)
mcore.PWS("# Output file:           " + outfilename)
mcore.PWS("# ----------------")

mcore.PWS("# " + mcore.getDateTime() + " Reading transcripts...")
transcripts = {}
transcript_len_sum, first = 0, True
for line in gzip.open(gtffile):
    line = line.decode()
    if line[0] == "#":
        continue
    line = line.strip().split("\t")
    feature_type, chrome, start, end, strand, feature_info = line[2], line[
        0], int(line[3]), int(line[4]), line[6], line[8]

    if feature_type == "transcript" and "protein_coding" in feature_info:
# Hard-coded input files: the Ensembl ortholog file and the mouse and rat GTF
# annotations.
infile = "../02-Annotation-data/mouse-rat-orths-ens99.txt"
gtffile_mouse = "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz"
gtffile_rat = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.99.gtf.gz"

# The output file (and, for "targets", the target overlaps input) depends on
# the selection mode.
# NOTE(review): outfilename is left undefined for any other mode value --
# confirm mode is validated before this point.
if mode == "targets":
    target_overlaps = "../Targets/bed/mm10-targets-to-exons-0.9.bed"
    outfilename = "../02-Annotation-data/selected-transcripts-targets.txt"
elif mode == "length":
    outfilename = "../02-Annotation-data/selected-transcripts-length.txt"

# dS threshold reported in the output header below.
ds_thresh = 0.5

with open(outfilename, "w") as outfile:
    mcore.runTime("# Rodent exomes -- select mouse trancsripts", outfile)
    mcore.PWS("# Mouse GTF file:        " + gtffile_mouse, outfile)
    mcore.PWS("# Rat GTF file:          " + gtffile_rat, outfile)
    mcore.PWS("# Ensembl ortholog file: " + infile, outfile)
    if mode == "targets":
        mcore.PWS("# Target overlaps file:  " + target_overlaps, outfile)
    mcore.PWS("# Output file:           " + outfilename, outfile)
    mcore.PWS("# --------------", outfile)
    mcore.PWS("# dS threshold:          " + str(ds_thresh), outfile)

    mcore.PWS("# --------------", outfile)

    if mode == "targets":
        mcore.PWS("# " + mcore.getDateTime() + " Reading target overlaps...",
                  outfile)
        mouse_transcript_overlaps = {}
        for line in open(target_overlaps):
# Width passed to mcore.spacedOut() for the labels in the job file header.
pad = 26
cwd = os.getcwd()
# Job vars

# The job script and its submit script share the job name and live under
# "jobs" and "submit" in the working directory; logs go under the output dir.
output_file = os.path.join(cwd, "jobs", name + ".sh")
submit_file = os.path.join(cwd, "submit", name + ".sh")
logdir = os.path.join(args.output, "logs")
# Job files

##########################
# Reporting run-time info for records.

with open(output_file, "w") as outfile:
    mcore.runTime("#!/bin/bash\n# Exonerate command generator", outfile)
    mcore.PWS("# IO OPTIONS", outfile)
    mcore.PWS(
        mcore.spacedOut("# Input directory:", pad) + args.input, outfile)
    if args.outname:
        mcore.PWS(
            mcore.spacedOut("# --outname:", pad) +
            "Using end of output directory path as job name.", outfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            outfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, outfile)
    mcore.PWS(
        mcore.spacedOut("# Output directory:", pad) + args.output, outfile)
    if args.overwrite:
        mcore.PWS(