def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=98)
    p.add_option("--fast", default=False, action="store_true",
                 help="Place sequence in the first cluster")
    p.add_option("--consensus", default=False, action="store_true",
                 help="Compute consensus sequences")
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    # cd-hit-454 targets sequencing reads; cd-hit-est is the generic tool
    program = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, program)
    cmd += " -c {0}".format(identity)
    if program == "cd-hit-est":
        cmd += " -d 0"  # include complete defline
        if opts.samestrand:
            cmd += " -r 0"
    if not opts.fast:
        cmd += " -g 1"

    # Output names encode the identity threshold used
    dd = fastafile + ".P{0}.cdhit".format(opts.pctid)
    clstr = dd + ".clstr"

    cmd += " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd)
    if need_update(fastafile, (dd, clstr)):
        sh(cmd)

    if opts.consensus:
        # Derive one consensus sequence per cluster
        cons = dd + ".consensus"
        cmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus")
        cmd += " clustfile={0} fastafile={1} output={2} maxlen=1".format(
            clstr, fastafile, cons)
        if need_update((clstr, fastafile), cons):
            sh(cmd)
    return dd
def htt(args):
    """
    %prog htt bamfile chr4:3070000-3080000

    Extract HTT region and run lobSTR.
    """
    p = OptionParser(htt.__doc__)
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, region = args
    lhome = opts.lobstr_home

    # Work on a local mini-BAM named after the input's basename
    minibamfile = bamfile.split("/")[-1]
    baifile = minibamfile + ".bai"
    # Remove any stale index before regenerating it
    if op.exists(baifile):
        sh("rm {}".format(baifile))

    view = "samtools view {} {} -b".format(bamfile, region)
    view += " -o {0}".format(minibamfile)
    sh(view)
    sh("samtools index {0}".format(minibamfile))

    # lobSTR expects the bare chromosome number, without the "chr" prefix
    seqid = region.split(":")[0].replace("chr", "")
    cmd, vcf = allelotype_on_chr(minibamfile, seqid, lhome, "hg38")
    sh(cmd)
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=98)
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    # Fixed help-string typo: was "[%default: %default]"
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment [default: %default]")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    # cd-hit-454 targets raw sequencing reads; cd-hit-est is the generic tool
    cmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, cmd)
    cmd += " -c {0}".format(identity)
    cmd += " -d 0"  # include complete defline
    if opts.samestrand:
        cmd += " -r 0"
    # -M 0: no memory limit; -T: number of threads
    cmd += " -M 0 -T {0} -i {1} -o {1}.cdhit".format(opts.cpus, fastafile)
    sh(cmd)

    dd = fastafile + ".cdhit"
    return dd
def filtervcf(args):
    """
    %prog filtervcf NA12878.hg38.vcf.gz

    Filter lobSTR VCF using script shipped in lobSTR. Input file can be a list
    of vcf files.
    """
    p = OptionParser(filtervcf.__doc__)
    p.set_home("lobstr", default="/mnt/software/lobSTR")
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    lhome = opts.lobstr_home
    store = opts.output_path

    # Accept either one vcf, or a text file that lists many vcfs
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]
    # Do not refilter outputs of a previous run
    vcffiles = [x for x in vcffiles if ".filtered." not in x]

    # Only push results back to the store for inputs that came from S3
    jobs = [(x, lhome, x.startswith("s3://") and store) for x in vcffiles]
    workers = min(opts.cpus, len(jobs))
    pool = Pool(processes=workers)
    for _ in pool.map_async(run_filter, jobs).get():
        continue
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option(
        "--notreds",
        default=False,
        action="store_true",
        help="Remove TREDs from the bed file",
    )
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        # Rewrite the bed file, keeping only the first STR per long name
        newbedfile = trfbed + ".new"
        newbed = open(newbedfile, "w")
        bedin = open(trfbed)
        retained = total = 0
        seen_names = set()
        for row in bedin:
            strline = STRLine(row)
            total += 1
            if strline.longname in seen_names:
                continue
            seen_names.add(strline.longname)
            print(strline, file=newbed)
            retained += 1
        newbed.close()
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Three make targets: the lobSTR reference, the info tab, and a bed copy
    mm = MakeManager()
    index_cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    index_cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), index_cmd)

    tabfile = "{0}/index.tab".format(pf)
    info_cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    info_cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, info_cmd)

    infofile = "{0}/index.info".format(pf)
    mm.add(trfbed, infofile, "cp {0} {1}".format(newbedfile, infofile))
    mm.write()
def locus(args):
    """
    %prog locus bamfile

    Extract selected locus from a list of TREDs for validation, and run lobSTR.
    """
    from jcvi.formats.sam import get_minibam

    # See `Format-lobSTR-database.ipynb` for a list of TREDs for validation
    INCLUDE = [
        "HD", "SBMA", "SCA1", "SCA2", "SCA8", "SCA17", "DM1", "DM2", "FXTAS"
    ]
    db_choices = ("hg38", "hg19")

    p = OptionParser(locus.__doc__)
    p.add_option("--tred", choices=INCLUDE, help="TRED name")
    p.add_option("--ref", choices=db_choices, default="hg38",
                 help="Reference genome")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bamfile, ) = args
    ref = opts.ref
    lhome = opts.lobstr_home
    tred = opts.tred

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile, index_col=0)
    # .loc replaces the long-removed DataFrame.ix indexer (label-based lookup)
    row = tf.loc[tred]
    tag = "repeat_location"
    ldb = "TREDs"
    if ref == "hg19":
        tag += "." + ref
        ldb += "-" + ref
    seqid, start_end = row[tag].split(":")

    # Pad the locus on both sides before extracting the mini-BAM
    PAD = 1000
    start, end = start_end.split("-")
    start, end = int(start) - PAD, int(end) + PAD
    region = "{}:{}-{}".format(seqid, start, end)

    minibamfile = get_minibam(bamfile, region)
    c = seqid.replace("chr", "")
    cmd, vcf = allelotype_on_chr(minibamfile, c, lhome, ldb)
    sh(cmd)

    parser = LobSTRvcf(columnidsfile=None)
    parser.parse(vcf, filtered=False)
    items = parser.items()
    if not items:
        print("No entry found!", file=sys.stderr)
        return

    # items() may be a view on Py3; take the first entry without indexing
    k, v = next(iter(items))
    print("{} => {}".format(tred, v.replace(",", "/")), file=sys.stderr)
def beagle(args):
    """
    %prog beagle input.vcf 1

    Use BEAGLE4.1 to impute vcf on chromosome 1.
    """
    p = OptionParser(beagle.__doc__)
    p.set_home("beagle")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, chr = args
    pf = vcffile.rsplit(".", 1)[0]
    outpf = pf + ".beagle"
    outfile = outpf + ".vcf.gz"

    mm = MakeManager()
    beagle_cmd = opts.beagle_home
    kg = op.join(opts.ref, "1000GP_Phase3")
    cmd = beagle_cmd + " gt={0}".format(vcffile)
    cmd += " ref={0}/chr{1}.1kg.phase3.v5a.bref".format(kg, chr)
    cmd += " map={0}/plink.chr{1}.GRCh37.map".format(kg, chr)
    cmd += " out={0}".format(outpf)
    # Honor --cpus (set_cpus above) instead of a hard-coded 16 threads
    cmd += " nthreads={0} gprobs=true".format(opts.cpus)
    mm.add(vcffile, outfile, cmd)

    mm.write()
def bam(args):
    """
    %prog snp input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    uniqsam = pf + ".unique.sam"

    # Convert to SAM (unique hits only), then index the result as BAM
    if need_update((gsnapfile, fastafile), uniqsam):
        sizesfile = Sizes(fastafile).filename
        cmd = " ".join((
            op.join(EYHOME, "gsnap2gff3.pl"),
            "--format sam -i {0} -o {1}".format(gsnapfile, uniqsam),
            "-u -l {0} -p {1}".format(sizesfile, opts.cpus),
        ))
        sh(cmd)

    index([uniqsam])
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no trainig
    model gff file is needed.
    """
    p = OptionParser(genemark.__doc__)
    p.set_home("gmes")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species, fastafile = args
    mhome = opts.gmes_home

    # All training happens inside a dedicated subdirectory
    gmdir = "genemark"
    mkdir(gmdir)
    cwd = os.getcwd()
    os.chdir(gmdir)

    # Symlink the input so gm_es.pl can find it from within gmdir
    sh("ln -sf ../{0}".format(fastafile))

    # GeneMark-ES requires a license key in the home directory
    license = op.expanduser("~/.gm_key")
    assert op.exists(license), "License key ({0}) not found!".format(license)

    sh("{0}/gm_es.pl {1}".format(mhome, fastafile))

    os.chdir(cwd)
    logging.debug(
        "GENEMARK matrix written to `{0}/mod/{1}.mod`".format(gmdir, species))
def snp(args):
    """
    %prog snp input.gsnap

    Run SNP calling on GSNAP output after apps.gsnap.align().
    """
    p = OptionParser(snp.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gsnapfile, = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]

    # Step 1: convert GSNAP output to the pipeline's native format
    nativefile = pf + ".native"
    if need_update(gsnapfile, nativefile):
        convert = op.join(EYHOME, "convert2native.pl")
        convert += " --gsnap {0} -o {1}".format(gsnapfile, nativefile)
        convert += " -proc {0}".format(opts.cpus)
        sh(convert)

    # Step 2: discover SNPs from the native file
    snpfile = pf + ".snp"
    if need_update(nativefile, snpfile):
        discover = op.join(EYHOME, "SNPs/SNP_Discovery-short.pl")
        discover += " --native {0} -o {1}".format(nativefile, snpfile)
        discover += " -a 2 -ac 0.3 -c 0.8"
        sh(discover)
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    p = OptionParser(alignextend.__doc__)
    p.add_option("--nosuffix", default=False, action="store_true",
                 help="Do not add /1/2 suffix to the read [default: %default]")
    p.set_home("amos")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]

    cmd = op.join(opts.amos_home, "src/Experimental/alignextend.pl")
    if not opts.nosuffix:
        cmd += " -suffix"
    # Reuse an up-to-date BWA index instead of rebuilding it
    bwa_idx = "{0}.ref.fa.sa".format(pf)
    if not need_update(ref, bwa_idx):
        cmd += " -noindex"
    cmd += " -threads {0}".format(opts.cpus)
    # Phred+64 encoded reads require the -I switch
    if guessoffset([r1]) == 64:
        cmd += " -I"
    cmd += " " + " ".join((pf, ref, r1, r2))
    sh(cmd)
def htt(args):
    """
    %prog htt bamfile

    Extract HTT region and run lobSTR.
    """
    p = OptionParser(htt.__doc__)
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bamfile, = args
    lhome = opts.lobstr_home

    # Extract the HTT region into a local mini-BAM
    minibamfile = bamfile.split("/")[-1]
    cmd = "samtools view {0} chr4:3070000-3080000 -b".format(bamfile)
    cmd += " -o {0}".format(minibamfile)
    sh(cmd)
    # Remove a stale index only if present; unconditional rm errors otherwise
    baifile = minibamfile + ".bai"
    if op.exists(baifile):
        sh("rm {0}".format(baifile))
    sh("samtools index {0}".format(minibamfile))

    # allelotype_on_chr returns (command, vcf) — see htt/locus siblings;
    # the original discarded the vcf and passed the tuple to sh()
    cmd, vcf = allelotype_on_chr(minibamfile, 4, lhome, "hg38-named")
    sh(cmd)
def augustus(args):
    """
    %prog augustus species gffile fastafile

    Train AUGUSTUS model given gffile and fastafile. Whole procedure taken from:
    <http://www.molecularevolution.org/molevolfiles/exercises/augustus/training.html>
    """
    # Bug fix: was OptionParser(snap.__doc__), which printed the wrong usage
    p = OptionParser(augustus.__doc__)
    p.set_home("augustus")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    mhome = opts.augustus_home
    augdir = "augustus"

    cwd = os.getcwd()
    mkdir(augdir)
    os.chdir(augdir)
    # Create a fresh species profile, then build a GenBank training set
    sh("{0}/scripts/new_species.pl --species={1}".format(mhome, species))
    sh("{0}/scripts/gff2gbSmallDNA.pl ../{1} ../{2} 1000 raw.gb".format(
        mhome, gffile, fastafile))
    # Initial training; genes that fail are collected and filtered out
    sh("{0}/bin/etraining --species={1} raw.gb 2> train.err".format(
        mhome, species))
    sh(r"cat train.err | perl -pe 's/.*in sequence (\S+): .*/$1/' > badgenes.lst")
    sh("{0}/scripts/filterGenes.pl badgenes.lst raw.gb > training.gb".format(
        mhome))
    sh("grep -c LOCUS raw.gb training.gb")
    sh("{0}/scripts/autoAugTrain.pl --trainingset=training.gb --species={1}".format(
        mhome, species))
    os.chdir(cwd)
    sh("cp -r {0}/species/{1} augustus/".format(mhome, species))
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no trainig
    model gff file is needed.
    """
    p = OptionParser(genemark.__doc__)
    p.set_home("gmes")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species, fastafile = args
    mhome = opts.gmes_home
    # All training work happens inside this subdirectory
    gmdir = "genemark"
    mkdir(gmdir)

    cwd = os.getcwd()
    os.chdir(gmdir)
    # Symlink the input fasta so gm_es.pl can see it from within gmdir
    cmd = "ln -sf ../{0}".format(fastafile)
    sh(cmd)

    # GeneMark-ES requires a license key in the user's home directory
    license = op.expanduser("~/.gm_key")
    assert op.exists(license), "License key ({0}) not found!".format(license)
    cmd = "{0}/gm_es.pl {1}".format(mhome, fastafile)
    sh(cmd)

    os.chdir(cwd)
    logging.debug("GENEMARK matrix written to `{0}/mod/{1}.mod`".format(
        gmdir, species))
def impute(args):
    """
    %prog impute input.vcf hs37d5.fa 1

    Use IMPUTE2 to impute vcf on chromosome 1.
    """
    from pyfaidx import Fasta

    p = OptionParser(impute.__doc__)
    p.set_home("shapeit")
    p.set_home("impute")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    vcffile, fastafile, chr = args
    mm = MakeManager()
    pf = vcffile.rsplit(".", 1)[0]
    hapsfile = pf + ".haps"
    kg = op.join(opts.ref, "1000GP_Phase3")
    shapeit_phasing(mm, chr, vcffile, opts)

    fasta = Fasta(fastafile)
    size = len(fasta[chr])
    binsize = 5000000
    # Floor division: `/` yields a float on Py3 and breaks range() below
    bins = size // binsize  # 5Mb bins
    if size % binsize:
        bins += 1
    impute_cmd = op.join(opts.impute_home, "impute2")
    chunks = []
    # range (not Py2-only xrange); chunk windows past `size` are clamped below
    for x in range(bins + 1):
        chunk_start = x * binsize + 1
        chunk_end = min(chunk_start + binsize - 1, size)
        outfile = pf + ".chunk{0:02d}.impute2".format(x)
        mapfile = "{0}/genetic_map_chr{1}_combined_b37.txt".format(kg, chr)
        rpf = "{0}/1000GP_Phase3_chr{1}".format(kg, chr)
        cmd = impute_cmd + " -m {0}".format(mapfile)
        cmd += " -known_haps_g {0}".format(hapsfile)
        cmd += " -h {0}.hap.gz -l {0}.legend.gz".format(rpf)
        cmd += " -Ne 20000 -int {0} {1}".format(chunk_start, chunk_end)
        cmd += " -o {0} -allow_large_regions -seed 367946".format(outfile)
        cmd += " && touch {0}".format(outfile)
        mm.add(hapsfile, outfile, cmd)
        chunks.append(outfile)

    # Combine all the files
    imputefile = pf + ".impute2"
    cmd = "cat {0} > {1}".format(" ".join(chunks), imputefile)
    mm.add(chunks, imputefile, cmd)

    # Convert to vcf
    vcffile = pf + ".impute2.vcf"
    cmd = "python -m jcvi.formats.vcf fromimpute2 {0} {1} {2} > {3}".\
        format(imputefile, fastafile, chr, vcffile)
    mm.add(imputefile, vcffile, cmd)
    mm.write()
def mito(args):
    """
    %prog mito chrM.fa input.bam

    Identify mitochondrial deletions.
    """
    p = OptionParser(mito.__doc__)
    p.set_aws_opts(store="hli-mv-data-science/htang/mito-deletions")
    p.add_option("--realignonly", default=False, action="store_true",
                 help="Realign only")
    p.add_option("--svonly", default=False, action="store_true",
                 help="Run Realign => SV calls only")
    p.add_option("--support", default=1, type="int",
                 help="Minimum number of supporting reads")
    p.set_home("speedseq", default="/mnt/software/speedseq/bin")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    chrMfa, bamfile = args
    store = opts.output_path
    cleanup = not opts.nocleanup

    if not op.exists(chrMfa):
        logging.debug("File `{}` missing. Exiting.".format(chrMfa))
        return

    # Build the FASTA index (.fai) if absent.
    # Bug fix: `samtools index` only works on BAM/CRAM; a .fai is produced
    # by `samtools faidx`.
    chrMfai = chrMfa + ".fai"
    if not op.exists(chrMfai):
        cmd = "samtools faidx {}".format(chrMfa)
        sh(cmd)

    # Accept a single BAM, or a text file listing many BAMs
    if not bamfile.endswith(".bam"):
        bamfiles = [x.strip() for x in open(bamfile)]
    else:
        bamfiles = [bamfile]

    if store:
        # Skip samples whose .depth output already exists in the S3 store
        computed = ls_s3(store)
        computed = [op.basename(x).split('.')[0] for x in computed if
                    x.endswith(".depth")]
        remaining_samples = [x for x in bamfiles
                             if op.basename(x).split(".")[0] not in computed]

        logging.debug("Already computed on `{}`: {}".format(
            store, len(bamfiles) - len(remaining_samples)))
        bamfiles = remaining_samples

    logging.debug("Total samples: {}".format(len(bamfiles)))

    for bamfile in bamfiles:
        run_mito(chrMfa, bamfile, opts,
                 realignonly=opts.realignonly,
                 svonly=opts.svonly,
                 store=store, cleanup=cleanup)
def locus(args):
    """
    %prog locus bamfile

    Extract selected locus from a list of TREDs for validation, and run lobSTR.
    """
    from jcvi.formats.sam import get_minibam

    # See `Format-lobSTR-database.ipynb` for a list of TREDs for validation
    INCLUDE = ["HD", "SBMA", "SCA1", "SCA2", "SCA8", "SCA17", "DM1",
               "DM2", "FXTAS"]
    db_choices = ("hg38", "hg19")

    p = OptionParser(locus.__doc__)
    p.add_option("--tred", choices=INCLUDE, help="TRED name")
    p.add_option("--ref", choices=db_choices, default="hg38",
                 help="Reference genome")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bamfile, = args
    ref = opts.ref
    lhome = opts.lobstr_home
    tred = opts.tred

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile, index_col=0)
    # .loc replaces the long-removed DataFrame.ix indexer (label-based lookup)
    row = tf.loc[tred]
    tag = "repeat_location"
    ldb = "TREDs"
    if ref == "hg19":
        tag += "." + ref
        ldb += "-" + ref
    seqid, start_end = row[tag].split(":")

    # Pad the locus on both sides before extracting the mini-BAM
    PAD = 1000
    start, end = start_end.split('-')
    start, end = int(start) - PAD, int(end) + PAD
    region = "{}:{}-{}".format(seqid, start, end)

    minibamfile = get_minibam(bamfile, region)
    c = seqid.replace("chr", "")
    cmd, vcf = allelotype_on_chr(minibamfile, c, lhome, ldb)
    sh(cmd)

    parser = LobSTRvcf(columnidsfile=None)
    parser.parse(vcf, filtered=False)
    items = parser.items()
    if not items:
        print("No entry found!", file=sys.stderr)
        return

    # items() may be a view on Py3; take the first entry without indexing
    k, v = next(iter(items))
    print("{} => {}".format(tred, v.replace(',', '/')), file=sys.stderr)
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--notreds", default=False, action="store_true",
                 help="Remove TREDs from the bed file")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        # Rewrite the bed file, keeping only the first STR per long name
        newbedfile = trfbed + ".new"
        newbed = open(newbedfile, "w")
        fp = open(trfbed)
        retained = total = 0
        seen = set()
        for row in fp:
            r = STRLine(row)
            total += 1
            name = r.longname
            if name in seen:
                continue
            seen.add(name)
            # Py3 print function (was the Py2-only `print >> newbed, r`)
            print(r, file=newbed)
            retained += 1
        newbed.close()
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Three make targets: lobSTR reference, the info tab, and a bed copy
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
def augustus(args):
    """
    %prog augustus species gffile fastafile

    Train AUGUSTUS model given gffile and fastafile. Whole procedure taken from:
    <http://www.molecularevolution.org/molevolfiles/exercises/augustus/training.html>
    """
    p = OptionParser(augustus.__doc__)
    p.add_option(
        "--autotrain",
        default=False,
        action="store_true",
        help="Run autoAugTrain.pl to iteratively train AUGUSTUS",
    )
    p.set_home("augustus")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    # Absolute paths, since we chdir into `augdir` below
    gffile = os.path.abspath(gffile)
    fastafile = os.path.abspath(fastafile)
    mhome = opts.augustus_home
    augdir = "augustus"

    cwd = os.getcwd()
    mkdir(augdir)
    os.chdir(augdir)
    target = "{0}/config/species/{1}".format(mhome, species)

    # A pre-existing species profile would make new_species.pl fail
    if op.exists(target):
        logging.debug("Removing existing target `{0}`".format(target))
        sh("rm -rf {0}".format(target))

    # Create a fresh species profile, then build a GenBank training set
    config_path = "{0}/config".format(mhome)
    sh("{0}/scripts/new_species.pl --species={1} --AUGUSTUS_CONFIG_PATH={2}".
       format(mhome, species, config_path))
    sh("{0}/scripts/gff2gbSmallDNA.pl {1} {2} 1000 raw.gb".format(
        mhome, gffile, fastafile))
    # Initial training; genes that fail are collected into badgenes.lst
    sh("{0}/bin/etraining --species={1} raw.gb 2> train.err".format(
        mhome, species))
    sh(r"cat train.err | perl -pe 's/.*in sequence (\S+): .*/$1/' > badgenes.lst"
       )
    # Remove the bad genes, then report LOCUS counts before/after filtering
    sh("{0}/scripts/filterGenes.pl badgenes.lst raw.gb > training.gb".format(
        mhome))
    sh("grep -c LOCUS raw.gb training.gb")

    # autoAugTrain failed to execute, disable for now
    if opts.autotrain:
        sh("rm -rf {0}".format(target))
        sh("{0}/scripts/autoAugTrain.pl --trainingset=training.gb --species={1}"
           .format(mhome, species))

    os.chdir(cwd)
    # Keep a copy of the trained profile next to the training artifacts
    sh("cp -r {0} augustus/".format(target))
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq", action="store_true", default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        # Re-scan the genome, keeping only valid perfect STRs
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        newbed = open(newbedfile, "w")
        fp = open(trfbed)
        retained = total = 0
        for row in fp:
            s = STRLine(row)
            total += 1
            for ns in s.iter_exact_str(genome):
                if not ns.is_valid():
                    continue
                # Py3 print function (was the Py2-only `print >> newbed, ns`)
                print(ns, file=newbed)
                retained += 1
        newbed.close()
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Three make targets: lobSTR reference, the info tab, and a bed copy
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=96, pctcov=0)
    p.add_option("--fast", default=False, action="store_true",
                 help="Place sequence in the first cluster")
    p.add_option("--consensus", default=False, action="store_true",
                 help="Compute consensus sequences")
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.
    # Normalize the input via the sibling fasta() entry point (seqtk pass)
    fastafile, qualfile = fasta([fastafile, "--seqtk"])

    # cd-hit-454 handles raw sequencing reads; cd-hit-est is the generic tool
    ocmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, ocmd)
    cmd += " -c {0}".format(identity)
    if ocmd == "cd-hit-est":
        cmd += " -d 0"  # include complete defline
        if opts.samestrand:
            cmd += " -r 0"
    if not opts.fast:
        cmd += " -g 1"
    # Optional alignment-coverage constraint on both query and subject
    if opts.pctcov != 0:
        cmd += " -aL {0} -aS {0}".format(opts.pctcov / 100.)

    # Output names encode the identity threshold used
    dd = fastafile + ".P{0}.cdhit".format(opts.pctid)
    clstr = dd + ".clstr"

    cmd += " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd)
    if need_update(fastafile, (dd, clstr)):
        sh(cmd)

    if opts.consensus:
        # Derive one consensus sequence per cluster
        cons = dd + ".consensus"
        cmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus")
        cmd += " clustfile={0} fastafile={1} output={2} maxlen=1".\
            format(clstr, fastafile, cons)
        if need_update((clstr, fastafile), cons):
            sh(cmd)

    return dd
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option(
        "--nofilter",
        default=False,
        action="store_true",
        help="Do not filter the variants",
    )
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples, ) = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    # Resolve relative to the original cwd, since we just chdir'ed
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    # Accept either a single vcf, or a text file listing many vcfs
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        # Collect the union of STR ids across all requested dbs
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        print("\n".join(uids), file=fw)
        fw.close()

    run_args = [(x, filtered, cleanup, store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    # Distinct name for the worker pool; `p` is already the OptionParser
    pool = Pool(processes=cpus)
    for _ in pool.map_async(run_compile, run_args).get():
        continue
def batchlobstr(args):
    """
    %prog batchlobstr samples.csv

    Run lobSTR sequentially on list of samples. Each line contains:
    sample-name,s3-location
    """
    p = OptionParser(batchlobstr.__doc__)
    p.add_option("--sep", default=",", help="Separator for building commandline")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samplesfile,) = args
    store = opts.output_path
    computed = ls_s3(store)
    skipped = total = 0
    fp = open(samplesfile)
    for row in fp:
        total += 1
        sample, s3file = row.strip().split(",")[:2]
        exec_id, sample_id = sample.split("_")
        bamfile = s3file.replace(".gz", "").replace(".vcf", ".bam")
        gzfile = sample + ".{0}.vcf.gz".format("hg38")
        # Samples already processed have their vcf.gz present in the store
        if gzfile in computed:
            skipped += 1
            continue
        tokens = "python -m jcvi.variation.str lobstr".split() + [
            "hg38",
            "--input_bam_path", bamfile,
            "--output_path", store,
            "--sample_id", sample_id,
            "--workflow_execution_id", exec_id,
            "--lobstr_home", opts.lobstr_home,
            "--workdir", opts.workdir,
        ]
        print(opts.sep.join(tokens))
    fp.close()
    logging.debug("Total skipped: {0}".format(percentage(skipped, total)))
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    p = OptionParser(augustus.__doc__)
    p.add_option("--species", default="maize",
                 help="Use species model for prediction")
    p.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    p.add_option("--nogff3", default=False, action="store_true",
                 help="Turn --gff3=off")
    p.set_home("augustus")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    cpus = opts.cpus
    mhome = opts.augustus_home
    gff3 = not opts.nogff3
    # Per-chunk output extension depends on the gff3 switch
    suffix = ".gff3" if gff3 else ".out"
    cfgfile = op.join(mhome, "config/extrinsic/extrinsic.M.RM.E.W.cfg")

    # Split the fasta into one chunk per CPU inside a scratch directory
    outdir = mkdtemp(dir=".")
    fs = split([fastafile, outdir, str(cpus)])

    # One AUGUSTUS job per chunk, all sharing the same fixed options
    augustuswrap_params = partial(
        augustuswrap,
        species=opts.species,
        gff3=gff3,
        cfgfile=cfgfile,
        hintsfile=opts.hintsfile,
    )
    g = Jobs(augustuswrap_params, fs.names)
    g.run()

    # Merge per-chunk outputs into a single file, then drop the scratch dir
    gff3files = [x.rsplit(".", 1)[0] + suffix for x in fs.names]
    outfile = fastafile.rsplit(".", 1)[0] + suffix
    FileMerger(gff3files, outfile=outfile).merge()
    shutil.rmtree(outdir)

    if gff3:
        from jcvi.annotation.reformat import augustus as reformat_augustus
        reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
        reformat_augustus([outfile, "--outfile={0}".format(reformat_outfile)])
def pasa(args):
    """
    %prog ${pasadb}.assemblies.fasta ${pasadb}.pasa_assemblies.gff3

    Wraps `pasa_asmbls_to_training_set.dbi`.
    """
    from jcvi.formats.base import SetFile
    from jcvi.formats.gff import Gff

    p = OptionParser(pasa.__doc__)
    p.set_home("pasa")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, gffile = args
    transcodergff = fastafile + ".transdecoder.gff3"
    transcodergenomegff = fastafile + ".transdecoder.genome.gff3"
    if need_update((fastafile, gffile), (transcodergff, transcodergenomegff)):
        cmd = "{0}/scripts/pasa_asmbls_to_training_set.dbi".format(
            opts.pasa_home)
        cmd += " --pasa_transcripts_fasta {0} --pasa_transcripts_gff3 {1}".\
            format(fastafile, gffile)
        sh(cmd)

    # Ids of models TransDecoder marked "complete"
    completeids = fastafile.rsplit(".", 1)[0] + ".complete.ids"
    if need_update(transcodergff, completeids):
        cmd = "grep complete {0} | cut -f1 | sort -u".format(transcodergff)
        sh(cmd, outfile=completeids)

    complete = SetFile(completeids)
    seen = set()
    completegff = transcodergenomegff.rsplit(".", 1)[0] + ".complete.gff3"
    fw = open(completegff, "w")
    gff = Gff(transcodergenomegff)
    for g in gff:
        a = g.attributes
        # `gid` (was `id`, which shadowed the builtin)
        if "Parent" in a:
            gid = a["Parent"][0]
        else:
            gid = a["ID"][0]
        asmbl_id = gid.split("|")[0]
        if asmbl_id not in complete:
            continue
        # Py3 print function (was the Py2-only `print >> fw, g`)
        print(g, file=fw)
        if g.type == "gene":
            seen.add(gid)
    fw.close()
    logging.debug("A total of {0} complete models extracted to `{1}`.".
                  format(len(seen), completegff))
def batchlobstr(args):
    """
    %prog batchlobstr samples.csv

    Run lobSTR sequentially on list of samples. Each line contains:
    sample-name,s3-location
    """
    p = OptionParser(batchlobstr.__doc__)
    p.add_option("--sep", default=",", help="Separator for building commandline")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samplesfile, = args
    store = opts.output_path
    computed = ls_s3(store)
    fp = open(samplesfile)
    skipped = total = 0
    for row in fp:
        total += 1
        sample, s3file = row.strip().split(",")[:2]
        exec_id, sample_id = sample.split("_")
        bamfile = s3file.replace(".gz", "").replace(".vcf", ".bam")
        gzfile = sample + ".{0}.vcf.gz".format("hg38")
        # Samples already processed have their vcf.gz present in the store
        if gzfile in computed:
            skipped += 1
            continue
        # Py3 print function (was a Py2-only print statement)
        print(
            opts.sep.join(
                "python -m jcvi.variation.str lobstr".split()
                + [
                    "hg38",
                    "--input_bam_path",
                    bamfile,
                    "--output_path",
                    store,
                    "--sample_id",
                    sample_id,
                    "--workflow_execution_id",
                    exec_id,
                    "--lobstr_home",
                    opts.lobstr_home,
                    "--workdir",
                    opts.workdir,
                ]
            )
        )
    fp.close()
    logging.debug("Total skipped: {0}".format(percentage(skipped, total)))
def dn(args):
    """
    %prog dn folder

    Run Trinity-DN on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain "_1_" and "_2_".
    """
    p = OptionParser(dn.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.set_home("trinity")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    paired = opts.paired
    thome = opts.trinity_home
    tfolder = folder + "_DN"

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    # Group the input files, then merge each group into one fastq
    flist = glob("../" + folder + "/*")
    if paired:
        left = [x for x in flist if "_1_" in x or ".1." in x]
        right = [x for x in flist if "_2_" in x or ".2." in x]
        assert len(left) == len(right)
        reads = ((left, "left.fastq"), (right, "right.fastq"))
    else:
        reads = ((flist, "single.fastq"), )

    for sources, target in reads:
        FileMerger(sources, target).merge(checkexists=True)

    cmd = op.join(thome, "Trinity.pl")
    cmd += " --seqType fq --JM 100G --CPU {0}".format(opts.cpus)
    if paired:
        cmd += " --left {0} --right {1}".format(reads[0][1], reads[1][1])
    else:
        cmd += " --single {0}".format(reads[0][1])

    # Emit a run script rather than executing Trinity directly
    runfile = "run.sh"
    write_file(runfile, cmd, meta="run script")
    os.chdir(cwd)
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    from multiprocessing import Pool

    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38,hg38-named",
                 help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    dbs = opts.db.split(",")
    mkdir(workdir)
    os.chdir(workdir)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        # Collect the union of STR ids across all requested dbs
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        # Py3 print function (was the Py2-only `print >> fw, ...`)
        print("\n".join(uids), file=fw)
        fw.close()

        # Generate two alleles
        dipuids = []
        for uid in uids:
            dipuids.extend([uid + ".1", uid + ".2"])
        fw = open("header.ids", "w")
        print(",".join(dipuids), file=fw)
        fw.close()

    # Distinct name for the worker pool; `p` is already the OptionParser
    pool = Pool(processes=opts.cpus)
    run_args = [(x, opts.store, opts.cleanup) for x in vcffiles]
    #run(run_args[0])
    for res in pool.map_async(run, run_args).get():
        continue
def pasa(args):
    """
    %prog ${pasadb}.assemblies.fasta ${pasadb}.pasa_assemblies.gff3

    Wraps `pasa_asmbls_to_training_set.dbi`.
    """
    from jcvi.formats.base import SetFile
    from jcvi.formats.gff import Gff

    p = OptionParser(pasa.__doc__)
    p.set_home("pasa")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, gffile = args
    transcodergff = fastafile + ".transdecoder.gff3"
    transcodergenomegff = fastafile + ".transdecoder.genome.gff3"
    if need_update((fastafile, gffile), (transcodergff, transcodergenomegff)):
        cmd = "{0}/scripts/pasa_asmbls_to_training_set.dbi".format(opts.pasa_home)
        cmd += " --pasa_transcripts_fasta {0} --pasa_transcripts_gff3 {1}".\
                format(fastafile, gffile)
        sh(cmd)

    # Collect ids of models TransDecoder flagged as "complete"
    completeids = fastafile.rsplit(".", 1)[0] + ".complete.ids"
    if need_update(transcodergff, completeids):
        cmd = "grep complete {0} | cut -f1 | sort -u".format(transcodergff)
        sh(cmd, outfile=completeids)

    complete = SetFile(completeids)
    seen = set()
    completegff = transcodergenomegff.rsplit(".", 1)[0] + ".complete.gff3"
    fw = open(completegff, "w")
    gff = Gff(transcodergenomegff)
    for g in gff:
        a = g.attributes
        # Prefer Parent so child features map back to their gene/mRNA
        if "Parent" in a:
            gid = a["Parent"][0]
        else:
            gid = a["ID"][0]
        asmbl_id = gid.split("|")[0]
        if asmbl_id not in complete:
            continue
        # py3-style print (was py2-only `print >> fw, g`)
        print(g, file=fw)
        if g.type == "gene":
            seen.add(gid)

    fw.close()
    logging.debug("A total of {0} complete models extracted to `{1}`.".\
            format(len(seen), completegff))
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option("--nofilter", default=False, action="store_true",
                 help="Do not filter the variants")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    # Resolve relative to the original cwd since we just chdir'ed
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    # A single vcf may be passed directly instead of a manifest of vcf paths
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        # py3-style print (the py2-only `print >> fw` breaks under
        # print_function, which this file already uses elsewhere)
        print("\n".join(uids), file=fw)
        fw.close()

    run_args = [(x, filtered, cleanup, store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    # Use a distinct name for the worker pool: the original rebound `p`,
    # clobbering the OptionParser
    pool = Pool(processes=cpus)
    for res in pool.map_async(run_compile, run_args).get():
        continue
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    choices = "prepare,align,filter,rmdup,genreads".split(",")
    p = OptionParser(alignextend.__doc__)
    p.add_option("--nosuffix", default=False, action="store_true",
                 help="Do not add /1/2 suffix to the read [default: %default]")
    p.add_option("--rc", default=False, action="store_true",
                 help="Reverse complement the reads before alignment")
    p.add_option("--len", default=100, type="int",
                 help="Extend to this length")
    p.add_option("--stage", default="prepare", choices=choices,
                 help="Start from certain stage")
    p.add_option("--dup", default=10, type="int",
                 help="Filter duplicates with coordinates within this distance")
    p.add_option("--maxdiff", default=1, type="int",
                 help="Maximum number of differences")
    p.set_home("amos")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref, r1, r2 = args
    prefix = op.basename(r1).split(".")[0]

    # Assemble the alignextend.pl command line piece by piece
    parts = [op.join(opts.amos_home, "src/Experimental/alignextend.pl")]
    if not opts.nosuffix:
        parts.append("-suffix")
    # Skip re-indexing when the bwa index is already current
    bwa_idx = "{0}.ref.fa.sa".format(prefix)
    if not need_update(ref, bwa_idx):
        parts.append("-noindex")
    parts.append("-threads {0}".format(opts.cpus))
    # Phred+64 quality encoding needs the -I switch
    if guessoffset([r1]) == 64:
        parts.append("-I")
    if opts.rc:
        parts.append("-rc")
    parts.append("-allow -len {0} -dup {1}".format(opts.len, opts.dup))
    parts.append("-min {0} -max {1}".format(2 * opts.len, 20 * opts.len))
    parts.append("-maxdiff {0}".format(opts.maxdiff))
    parts.append("-stage {0}".format(opts.stage))
    parts.extend([prefix, ref, r1, r2])

    sh(" ".join(parts))
def augustus(args):
    """
    %prog augustus species gffile fastafile

    Train AUGUSTUS model given gffile and fastafile. Whole procedure taken from:
    <http://www.molecularevolution.org/molevolfiles/exercises/augustus/training.html>
    """
    # BUGFIX: the parser was built from snap.__doc__ (copy-paste error),
    # so `--help` showed SNAP's usage instead of this command's
    p = OptionParser(augustus.__doc__)
    p.add_option("--autotrain", default=False, action="store_true",
                 help="Run autoAugTrain.pl to iteratively train AUGUSTUS")
    p.set_home("augustus")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    mhome = opts.augustus_home
    augdir = "augustus"

    cwd = os.getcwd()
    mkdir(augdir)
    os.chdir(augdir)
    target = "{0}/config/species/{1}".format(mhome, species)

    # Start from a clean species profile
    if op.exists(target):
        logging.debug("Removing existing target `{0}`".format(target))
        sh("rm -rf {0}".format(target))

    sh("{0}/scripts/new_species.pl --species={1}".format(mhome, species))
    sh("{0}/scripts/gff2gbSmallDNA.pl ../{1} ../{2} 1000 raw.gb".\
            format(mhome, gffile, fastafile))
    sh("{0}/bin/etraining --species={1} raw.gb 2> train.err".\
            format(mhome, species))
    # Raw string keeps the backslash escapes intact for perl's regex
    sh(r"cat train.err | perl -pe 's/.*in sequence (\S+): .*/$1/' > badgenes.lst")
    sh("{0}/scripts/filterGenes.pl badgenes.lst raw.gb > training.gb".\
            format(mhome))
    sh("grep -c LOCUS raw.gb training.gb")

    # autoAugTrain failed to execute, disable for now
    if opts.autotrain:
        sh("rm -rf {0}".format(target))
        sh("{0}/scripts/autoAugTrain.pl --trainingset=training.gb --species={1}".\
                format(mhome, species))

    os.chdir(cwd)
    sh("cp -r {0} augustus/".format(target))
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    p = OptionParser(augustus.__doc__)
    p.add_option("--species", default="maize",
                 help="Use species model for prediction")
    p.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    p.add_option("--nogff3", default=False, action="store_true",
                 help="Turn --gff3=off")
    p.set_home("augustus")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    ncpus = opts.cpus
    augustus_home = opts.augustus_home
    gff3 = not opts.nogff3
    suffix = ".gff3" if gff3 else ".out"
    cfgfile = op.join(augustus_home, "config/extrinsic/extrinsic.M.RM.E.W.cfg")

    # Split the fasta into one chunk per CPU inside a scratch directory
    scratch = mkdtemp(dir=".")
    chunks = split([fastafile, scratch, str(ncpus)])

    # Run one AUGUSTUS job per chunk in parallel
    worker = partial(augustuswrap, species=opts.species, gff3=gff3,
                     cfgfile=cfgfile, hintsfile=opts.hintsfile)
    jobs = Jobs(worker, chunks.names)
    jobs.run()

    # Stitch the per-chunk outputs back into one file and clean up
    partfiles = [x.rsplit(".", 1)[0] + suffix for x in chunks.names]
    outfile = fastafile.rsplit(".", 1)[0] + suffix
    FileMerger(partfiles, outfile=outfile).merge()
    shutil.rmtree(scratch)

    if gff3:
        from jcvi.annotation.reformat import augustus as reformat_augustus
        reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
        reformat_augustus([outfile, "--outfile={0}".format(reformat_outfile)])
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    # Resolve relative to the original cwd since we just chdir'ed
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]
    # Build the combined STR id list only once; reuse on subsequent runs
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        # py3-style print (the py2-only `print >> fw` breaks under
        # print_function, which this file already uses elsewhere)
        print("\n".join(uids), file=fw)
        fw.close()

    # Use a distinct name for the worker pool: the original rebound `p`,
    # clobbering the OptionParser
    pool = Pool(processes=opts.cpus)
    run_args = [(x, store, cleanup) for x in vcffiles]
    for res in pool.map_async(run, run_args).get():
        continue
def merge(args):
    """
    %prog merge outdir output.gff

    Follow-up command after grid jobs are completed after parallel().
    """
    from jcvi.formats.gff import merge as gmerge

    p = OptionParser(merge.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, outputgff = args
    fsnames, suffix = get_fsnames(outdir)
    nfs = len(fsnames)
    gff3_merge = op.join(opts.maker_home, "bin/gff3_merge")

    mergescript = "merge.sh"
    write_file(mergescript, mergesh.format(suffix, gff3_merge))

    # Merge each split directory separately. gff3_merge writes to /tmp,
    # so cap concurrency to avoid filling up the disk.
    sh("parallel -j 8 merge.sh {} ::: " + " ".join(fsnames))

    # Every split must have produced its own *.all.gff
    gffnames = glob("*.all.gff")
    assert len(gffnames) == nfs

    # For the final combine, avoid gff3_merge entirely (smallish /tmp/);
    # hand the file list to our own gff merger instead
    gfflist = "gfflist"
    with open(gfflist, "w") as fw:
        print("\n".join(gffnames), file=fw)

    # Double-check the list file really covers all splits
    nlines = sum(1 for x in open(gfflist))
    assert nlines == nfs
    gmerge([gfflist, "-o", outputgff])
    logging.debug("Merged GFF file written to `{0}`".format(outputgff))
def locus(args):
    """
    %prog locus bamfile

    Extract selected locus from a list of TREDs for validation, and run lobSTR.
    """
    from jcvi.formats.sam import get_minibam

    # See `Format-lobSTR-database.ipynb` for a list of TREDs for validation
    INCLUDE = ["HD", "SBMA", "SCA1", "SCA2", "SCA8", "SCA17", "DM1", "DM2"]

    p = OptionParser(locus.__doc__)
    p.add_option("--tred", choices=INCLUDE, help="TRED name")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bamfile, = args
    lhome = opts.lobstr_home
    tred = opts.tred

    tredsfile = op.join(datadir, "TREDs.meta.csv")
    tf = pd.read_csv(tredsfile, index_col=0)
    # .ix was deprecated and then removed from pandas; .loc is the
    # label-based equivalent
    row = tf.loc[tred]
    # repeat_location is "chrN:start-end"; pad both ends for context
    seqid, start_end = row["repeat_location"].split(":")
    PAD = 1000
    start, end = start_end.split('-')
    start, end = int(start) - PAD, int(end) + PAD
    region = "{}:{}-{}".format(seqid, start, end)

    minibamfile = get_minibam(bamfile, region)
    c = seqid.replace("chr", "")
    cmd, vcf = allelotype_on_chr(minibamfile, c, lhome, "TREDs")
    sh(cmd)

    parser = LobSTRvcf(columnidsfile=None)
    parser.parse(vcf, filtered=False)
    # items() is a view in py3 (not indexable); take the first record.
    # Also replaces the py2-only `print >> sys.stderr`.
    k, v = next(iter(parser.items()))
    print("{} => {}".format(tred, v.replace(',', '/')), file=sys.stderr)
def snap(args):
    """
    %prog snap species gffile fastafile

    Train SNAP model given gffile and fastafile. Whole procedure taken from:
    <http://gmod.org/wiki/MAKER_Tutorial_2012>
    """
    p = OptionParser(snap.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    # Absolutize inputs first so they remain valid after the chdir below
    gffile = os.path.abspath(gffile)
    fastafile = os.path.abspath(fastafile)
    maker_home = opts.maker_home

    snapdir = "snap"
    mkdir(snapdir)
    cwd = os.getcwd()
    os.chdir(snapdir)

    # Build a single GFF3 that embeds the sequence after a ##FASTA marker
    newgffile = "training.gff3"
    logging.debug("Construct GFF file combined with sequence ...")
    sh("cat {0} > {1}".format(gffile, newgffile))
    sh('echo "##FASTA" >> {0}'.format(newgffile))
    sh("cat {0} >> {1}".format(fastafile, newgffile))

    # Standard MAKER/SNAP training pipeline: zff -> fathom -> forge -> hmm
    logging.debug("Make models ...")
    sh("{0}/src/bin/maker2zff training.gff3".format(maker_home))
    sh("{0}/exe/snap/fathom -categorize 1000 genome.ann genome.dna".format(
        maker_home))
    sh("{0}/exe/snap/fathom -export 1000 -plus uni.ann uni.dna".format(maker_home))
    sh("{0}/exe/snap/forge export.ann export.dna".format(maker_home))
    sh("{0}/exe/snap/hmm-assembler.pl {1} . > {1}.hmm".format(maker_home, species))

    os.chdir(cwd)
    logging.debug("SNAP matrix written to `{0}/{1}.hmm`".format(
        snapdir, species))
def close(args):
    """
    %prog close scaffolds.fasta PE*.fastq

    Run GapFiller to fill gaps.
    """
    p = OptionParser(close.__doc__)
    p.set_home("gapfiller")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # First argument is the scaffolds; the rest are paired-end fastq libraries
    scaffolds, libraries = args[0], args[1:]
    libtxt = write_libraries(libraries, aligner="bwa")

    cmd = "perl " + op.join(opts.gapfiller_home, "GapFiller.pl")
    cmd += " -l {0} -s {1} -T {2}".format(libtxt, scaffolds, opts.cpus)
    # Emit a run script rather than executing GapFiller directly
    write_file("run.sh", cmd)
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    p = OptionParser(scaffold.__doc__)
    p.set_home("sspace")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # First argument is the contigs; the rest are mate-pair fastq libraries
    contigs, libraries = args[0], args[1:]
    libtxt = write_libraries(libraries)

    cmd = "perl " + op.join(opts.sspace_home, "SSPACE_Basic_v2.0.pl")
    cmd += " -l {0} -s {1} -T {2}".format(libtxt, contigs, opts.cpus)
    # Emit a run script rather than executing SSPACE directly
    write_file("run.sh", cmd)
def prepare(args):
    """
    %prog prepare alignAssembly.config est.fasta ref.fasta

    Generate PASA run script.
    """
    p = OptionParser(prepare.__doc__)
    p.set_home("pasa")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    cfg, est, ref = args
    pasa_home = opts.pasa_home

    # -C: create database; -R: run alignment/assembly pipeline
    cmd = op.join(pasa_home, "scripts/Launch_PASA_pipeline.pl")
    cmd += " -c {0} --CPU {1}".format(cfg, opts.cpus)
    cmd += " -C -R --ALIGNERS blat,gmap"
    cmd += " -t {0} -g {1}".format(est, ref)

    # Emit a run script rather than executing PASA directly
    write_file("run.sh", cmd, meta="run script")
def htt(args):
    """
    %prog htt bamfile chr4:3070000-3080000

    Extract HTT region and run lobSTR.
    """
    from jcvi.formats.sam import get_minibam

    p = OptionParser(htt.__doc__)
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, region = args

    # Subset the bam to the requested region, then allelotype it
    minibamfile = get_minibam(bamfile, region)
    chromosome = region.split(":")[0].replace("chr", "")
    cmd, vcf = allelotype_on_chr(minibamfile, chromosome,
                                 opts.lobstr_home, "hg38")
    sh(cmd)
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no training
    model gff file is needed.
    """
    p = OptionParser(genemark.__doc__)
    p.add_option("--junctions", help="Path to `junctions.bed` from Tophat2")
    p.set_home("gmes")
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species, fastafile = args
    junctions = opts.junctions
    mhome = opts.gmes_home

    # GeneMark requires a valid license key in the user's home directory
    # (renamed from `license`, which shadowed the builtin)
    license_file = op.expanduser("~/.gm_key")
    assert op.exists(license_file), \
        "License key ({0}) not found!".format(license_file)

    cmd = "{0}/gmes_petap.pl --sequence {1}".format(mhome, fastafile)
    cmd += " --cores {0}".format(opts.cpus)
    if junctions:
        # ET mode: convert Tophat2 junctions into the intron GFF that
        # GeneMark-ET expects (the distribution's script is `bet_to_gff.pl`)
        intronsgff = "introns.gff"
        if need_update(junctions, intronsgff):
            jcmd = "{0}/bet_to_gff.pl".format(mhome)
            jcmd += " --bed {0} --gff {1} --label Tophat2".\
                    format(junctions, intronsgff)
            sh(jcmd)
        cmd += " --ET {0} --et_score 10".format(intronsgff)
    else:
        # ES mode: fully self-trained, no extrinsic evidence
        cmd += " --ES"
    sh(cmd)

    # Close the backtick that was missing in the original message
    logging.debug("GENEMARK matrix written to `output/gmhmm.mod`")
def snap(args):
    """
    %prog snap species gffile fastafile

    Train SNAP model given gffile and fastafile. Whole procedure taken from:
    <http://gmod.org/wiki/MAKER_Tutorial_2012>
    """
    p = OptionParser(snap.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    maker_home = opts.maker_home

    snapdir = "snap"
    mkdir(snapdir)
    cwd = os.getcwd()
    os.chdir(snapdir)

    # Build a single GFF3 that embeds the sequence after a ##FASTA marker;
    # inputs are addressed as ../ since we work inside snap/
    newgffile = "training.gff3"
    logging.debug("Construct GFF file combined with sequence ...")
    sh("cat ../{0} > {1}".format(gffile, newgffile))
    sh('echo "##FASTA" >> {0}'.format(newgffile))
    sh("cat ../{0} >> {1}".format(fastafile, newgffile))

    # Standard MAKER/SNAP training pipeline: zff -> fathom -> forge -> hmm
    logging.debug("Make models ...")
    sh("{0}/bin/maker2zff training.gff3".format(maker_home))
    sh("{0}/exe/snap/fathom -categorize 1000 genome.ann genome.dna".format(
        maker_home))
    sh("{0}/exe/snap/fathom -export 1000 -plus uni.ann uni.dna".format(
        maker_home))
    sh("{0}/exe/snap/forge export.ann export.dna".format(maker_home))
    sh("{0}/exe/snap/hmm-assembler.pl {1} . > {1}.hmm".format(
        maker_home, species))

    os.chdir(cwd)
    logging.debug("SNAP matrix written to `{0}/{1}.hmm`".format(
        snapdir, species))