def bam(args):
    """
    %prog bam input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    # NOTE: the docstring above is handed verbatim to OptionParser, so it is
    # kept in the terse "%prog <action> <args>" form instead of being expanded
    # into a parameter-by-parameter description.
    #
    # args: list of command-line tokens; after option parsing exactly two
    #       positionals must remain (GSNAP output file, reference FASTA).
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() presumably returns None, so `not None` -> True and the
        # process exits with a non-zero status -- TODO confirm on OptionParser.
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home

    # Output name: the input with its last extension (if any) dropped and
    # ".unique.sam" appended.
    pf = gsnapfile.rsplit(".", 1)[0]
    uniqsam = pf + ".unique.sam"

    # Re-run the converter only when need_update() reports that the SAM file
    # has to be (re)built from the two inputs.
    if need_update((gsnapfile, fastafile), uniqsam):
        cmd = op.join(EYHOME, "gsnap2gff3.pl")
        sizesfile = Sizes(fastafile).filename
        cmd += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        cmd += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(cmd)

    # Hand the SAM file to jcvi.formats.sam.index (presumably the SAM -> BAM
    # conversion and indexing step; see that module for details).
    index([uniqsam])
def simulate(args):
    """
    %prog simulate run_dir 1 300

    Simulate BAMs with varying inserts with dwgsim. The above command will
    simulate between 1 to 300 CAGs in the HD region, in a directory called
    `run_dir`.
    """
    # NOTE: the docstring above is passed to OptionParser, so it stays in
    # "%prog ..." usage form.
    #
    # args: list of command-line tokens; after option parsing exactly three
    #       positionals must remain (run directory, first and last repeat
    #       count, both inclusive).
    p = OptionParser(simulate.__doc__)
    p.add_option(
        "--ref",
        default="/Users/htang/projects/ref/hg38.upper.fa",
        help="Reference genome sequence",
    )
    add_simulate_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    rundir, startunits, endunits = args
    startunits, endunits = int(startunits), int(endunits)

    # Huntington region
    pad_left, pad_right = 1000, 10000
    # `seqid` rather than `chr` so the builtin chr() is not shadowed.
    seqid, start, end = "chr4", 3074877, 3074933
    motif = "CAG"
    reffastafile = "ref.fasta"

    basecwd = os.getcwd()
    mkdir(rundir)
    os.chdir(rundir)
    try:
        # Absolute path of the run directory; each iteration returns here.
        cwd = os.getcwd()

        # Opened after the chdir, exactly as before, so a relative --ref path
        # is still resolved against the run directory.
        fasta = Fasta(opts.ref)
        seq_left = fasta[seqid][start - pad_left:start - 1]
        seq_right = fasta[seqid][end:end + pad_right]

        # Unmodified reference window that every simulated read set is
        # aligned against.
        seq = str(fasta[seqid][start - pad_left:end + pad_right])
        make_fasta(seq, reffastafile, id=seqid.upper())

        # Write fake sequence
        for units in range(startunits, endunits + 1):
            pf = str(units)
            mkdir(pf)
            os.chdir(pf)

            # Left flank + `units` copies of the motif + right flank.
            seq = str(seq_left) + motif * units + str(seq_right)
            fastafile = pf + ".fasta"
            make_fasta(seq, fastafile, id=seqid.upper())

            # Simulate reads on it
            wgsim([
                fastafile,
                "--depth={}".format(opts.depth),
                "--readlen={}".format(opts.readlen),
                "--distance={}".format(opts.distance),
                "--outfile={}".format(pf),
            ])

            read1 = pf + ".bwa.read1.fastq"
            read2 = pf + ".bwa.read2.fastq"
            samfile, _ = align(["../{}".format(reffastafile), read1, read2])
            indexed_samfile = index([samfile])

            # Keep only the BAM and its index, one level up in the run dir.
            sh("mv {} ../{}.bam".format(indexed_samfile, pf))
            sh("mv {}.bai ../{}.bam.bai".format(indexed_samfile, pf))

            # Leave the scratch directory before deleting it.
            os.chdir(cwd)
            shutil.rmtree(pf)
    finally:
        # Always restore the caller's working directory, even when a step
        # above raises; otherwise the process is left in the run directory
        # or in a per-unit scratch directory.
        os.chdir(basecwd)