import sys
import os
from os.path import *
import subprocess
import argparse
import json

import bqd, graph_qualdepth as qd
import samtools
from bam_to_qualdepth import set_unmapped_mapped_reads

import log
logc = log.get_config('graphsample.log')
logger = log.setup_logger('graphsample', logc)

def main():
    args = parse_args()
    args = handle_args(args)
    if not args.qualdepth:
        jfile = make_json(args.bamfile, args.outpath)
    else:
        jfile = args.qualdepth
    pngfile = make_image(jfile, args.outpath)

def make_json(bamfile, outpathprefix):
    pileup = samtools.nogap_mpileup(bamfile)
    stats = bqd.parse_pileup(pileup)
    set_unmapped_mapped_reads(bamfile, stats)
    outfile = outpathprefix + '.qualdepth.json'
    with open(outfile, 'w') as fh:
        json.dump(stats, fh)
    # main() uses the returned path, so return it here
    return outfile
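# Illustrative usage of the two entry points above (file names are
# hypothetical; assumes an indexed BAM that samtools can read):
#
#   jfile = make_json('sample.bam', 'out/sample')
#   # -> writes out/sample.qualdepth.json and returns its path
#   make_image(jfile, 'out/sample')
#   # -> renders the corresponding qualdepth image (exact name per
#   #    make_image, which is not shown in this excerpt)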
from glob import glob
from os.path import *
import os
import sys
import re
import gzip

from Bio import SeqIO

import log
logger = log.setup_logger(__name__, log.get_config())

ROCHE_FILE = r'\S+?(?:__[0-9]){0,1}__(?:TI|RL)\d+__\d{4}_\d{2}_\d{2}__\w+\.(sff|fastq)'
'''
Matches Roche .sff or .fastq files named
sample__region__barcode__year_month_day__type.filetype
'''

ROCHE_ID = '[A-Z0-9]{14}'
'''
Matches Roche accessions, which are 14 uppercase alphanumeric characters
@AAAAAAAAAAAAAA
'''

IONTORRENT_FILE = r'\S+?__[0-9]__IX\d{3}__\d{4}_\d{2}_\d{2}__\w+\.(sff|fastq)'
'''
Matches IonTorrent file names (essentially the same scheme as Roche)
sample__region__barcode__year_month_day__type.filetype
'''

IONTORRENT_ID = '[A-Z]{5}:[0-9]+:[0-9]+'
'''
Matches IonTorrent accessions: five uppercase letters followed by two
integer fields
'''
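# A quick, self-contained sanity check of the patterns above. The
# filenames and accessions are made-up examples that follow the
# documented naming scheme:
if __name__ == '__main__':
    assert re.match(ROCHE_FILE, 'sample1__1__RL1__2013_01_01__Unk.sff')
    assert re.match(IONTORRENT_FILE, 'sample1__2__IX001__2013_01_01__Unk.fastq')
    assert re.match(ROCHE_ID, 'ABCD0123EFGH45')
    assert re.match(IONTORRENT_ID, 'ABCDE:12:345')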
#!/usr/bin/env python

from subprocess import Popen, PIPE
import sys
import json
from collections import namedtuple
from itertools import izip

from matplotlib.lines import Line2D

import log
import samtools
logger = log.setup_logger(__name__, log.get_config())

# Alias our region strings
G = 'Gap'
N = 'Normal'
LC = 'LowCoverage'
LQ = 'LowQuality'
LCQ = 'LowCovQual'
# As a list
REGIONTYPES = [G, N, LC, LQ, LCQ]

def parse_pileup(pileup):
    '''
    Parses the raw pileup output from samtools mpileup and returns a
    dictionary with stats for every reference in the pileup
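# The body of parse_pileup is not shown in this excerpt. For reference,
# a samtools mpileup line has six standard columns (reference, 1-based
# position, reference base, depth, read bases, base qualities). A minimal
# per-line sketch -- helper name and return shape are illustrative, not
# necessarily what parse_pileup builds:
def _split_pileup_line(line):
    '''Split one mpileup line into (ref, pos, depth, avgqual).'''
    ref, pos, _refbase, depth, _bases, quals = line.rstrip('\n').split('\t')[:6]
    # Base qualities are Phred+33 encoded characters
    avgqual = sum(ord(q) - 33 for q in quals) / float(len(quals)) if quals else 0.0
    return ref, int(pos), int(depth), avgqual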
import argparse
import sys
import re
import shutil
import os.path

import samtools
from ngs_mapper.bam import sortbam, indexbam

import log
logger = log.setup_logger('tagreads', log.get_config())

# Exception for when headers exist
class HeaderExists(Exception):
    pass

# The next 3 tuples have to be the same length, and each index in one is
# related to the same index in the others,
# i.e. zip(IDS, PLATFORMS, ID_MAP) should work as expected
# Read group ID list
IDS = ('Roche454', 'IonTorrent', 'MiSeq', 'Sanger')
# Valid platforms for read groups
PLATFORMS = ('L454', 'IONTORRENT', 'ILLUMINA', 'CAPILLARY')
# Read name regexes mapped to each ID
ID_MAP = (
    re.compile(r'[0-9A-Z]{14}'),
    re.compile(r'[A-Z0-9]{5}:\d{1,}:\d{1,}'),
    re.compile(r'M[0-9]{5}:\d+:[\w\d-]+:\d:\d{4}:\d{4,5}:\d{4,5}'),
    re.compile(r'.*')
)
# Read Group Template
RG_TEMPLATE = {'SM': None, 'ID': None, 'PL': None, 'CN': None}
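# Illustrative: resolving a read name to its read group via the parallel
# tuples above. Order matters, since Sanger's catch-all '.*' matches
# anything. The read name below is a made-up MiSeq-style example.
def _rg_for_readname(name):
    for rgid, platform, pat in zip(IDS, PLATFORMS, ID_MAP):
        if pat.match(name):
            return rgid, platform

# _rg_for_readname('M00001:1:000000000-A1B2C:1:1101:12345:6789')
# -> ('MiSeq', 'ILLUMINA')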
def main():
    args, qsubargs = parse_args()
    # Qsub job?
    if qsubargs:
        runsampleargs, _ = split_args(' '.join(sys.argv[1:]))
        print pbs_job(runsampleargs, qsubargs)
        sys.exit(1)
    # So we can set the global logger
    global logger
    # Setup analysis directory
    if os.path.isdir(args.outdir):
        if os.listdir(args.outdir):
            raise AlreadyExists("{0} already exists and is not empty".format(args.outdir))
    else:
        os.makedirs(args.outdir)

    # tempdir root will be the TMPDIR environment variable if it exists,
    # unless outdir is set.
    # Allows the user to put TMPDIR somewhere else if they want, such as /dev/shm
    tmpdir = args.outdir
    # Directory the analysis is run in will be inside of tmpdir
    tdir = tempfile.mkdtemp('runsample', args.prefix, dir=tmpdir)
    os.environ['TMPDIR'] = tdir

    bamfile = os.path.join(tdir, args.prefix + '.bam')
    flagstats = os.path.join(tdir, 'flagstats.txt')
    consensus = bamfile + '.consensus.fasta'
    vcf = bamfile + '.vcf'
    bwalog = os.path.join(tdir, 'bwa.log')
    stdlog = os.path.join(tdir, args.prefix + '.std.log')
    logfile = os.path.join(tdir, args.prefix + '.log')
    CN = args.CN

    # Set the global logger
    config = log.get_config(logfile)
    logger = log.setup_logger('runsample', config)
    #make_project_repo( tdir )
    logger.info("--- Starting {0} ---".format(args.prefix))
    if args.config:
        logger.info("--- Using custom config from {0} ---".format(args.config))
    # Write all stdout/stderr from the various commands to a logfile
    with open(stdlog, 'wb') as lfile:
        cmd_args = {
            'samplename': args.prefix,
            'tdir': tdir,
            'readsdir': args.readsdir,
            'reference': os.path.join(tdir, os.path.basename(args.reference)),
            'bamfile': bamfile,
            'flagstats': flagstats,
            'consensus': consensus,
            'vcf': vcf,
            'CN': CN,
            'trim_qual': args.trim_qual,
            'trim_outdir': os.path.join(tdir, 'trimmed_reads'),
            'filtered_dir': os.path.join(tdir, 'filtered'),
            'head_crop': args.head_crop,
            'minth': args.minth,
            'config': args.config,
            'platforms': args.platforms,
            'drop_ns': args.drop_ns,
            'index_min': args.index_min,
            'primer_info': (args.primer_file, args.primer_seed, args.palindrom_clip, args.simple_clip)
        }

        # Best not to run any of the pipeline steps across multiple
        # cpus/cores/threads, as multiple samples may already be running concurrently
        logger.debug("Copying reference file {0} to {1}".format(args.reference, cmd_args['reference']))
        shutil.copy(args.reference, cmd_args['reference'])

        # Return code list
        rets = []
        logger.debug(cmd_args)

        # Filter
        def select_keys(d, keys):
            return dict((k, v) for k, v in d.items() if k in keys)

        # Convert sffs to fastq
        print sh.convert_formats(cmd_args['readsdir'], _out=sys.stdout, _err=sys.stderr)
        #print sh.sff_to_fastq(cmd_args['readsdir'], _out=sys.stdout, _err=sys.stderr)
        try:
            if cmd_args['config']:
                __result = sh.ngs_filter(cmd_args['readsdir'], config=cmd_args['config'],
                                         outdir=cmd_args['filtered_dir'])
            else:
                filter_args = select_keys(cmd_args, ["drop_ns", "platforms", "index_min"])
                __result = sh.ngs_filter(cmd_args['readsdir'], outdir=cmd_args['filtered_dir'],
                                         **filter_args)
            logger.debug('ngs_filter: %s' % __result)
        except sh.ErrorReturnCode, e:
            logger.error(e.stderr)
            sys.exit(1)

        # Trim reads
        cmd = 'trim_reads {filtered_dir} -q {trim_qual} -o {trim_outdir} --head-crop {head_crop}'
        if cmd_args['config']:
            cmd += ' -c {config}'
        primer_info = cmd_args['primer_info']
        if primer_info[0]:
            cmd += " --primer-file %s --primer-seed %s --palindrome-clip %s --simple-clip %s " % primer_info
        p = run_cmd(cmd.format(**cmd_args), stdout=lfile, stderr=subprocess.STDOUT)
        rets.append(p.wait())
        if rets[-1] != 0:
            logger.critical("{0} did not exit successfully".format(cmd.format(**cmd_args)))

        # Filter on index quality and Ns
        # Mapping
        with open(bwalog, 'wb') as blog:
            cmd = 'run_bwa_on_samplename {trim_outdir} {reference} -o {bamfile}'
            if cmd_args['config']:
                cmd += ' -c {config}'
            p = run_cmd(cmd.format(**cmd_args), stdout=blog, stderr=subprocess.STDOUT)
            # Wait for the sample to map
            rets.append(p.wait())
            # Everything else is dependent on bwa finishing, so might as well die here
            if rets[-1] != 0:
                cmd = cmd.format(**cmd_args)
                logger.critical(
                    "{0} failed to complete successfully. Please check the log file {1} for more details"
                    .format(cmd, bwalog))
                sys.exit(1)

        # Tag Reads
        cmd = 'tagreads {bamfile} -CN {CN}'
        if cmd_args['config']:
            cmd += ' -c {config}'
        p = run_cmd(cmd.format(**cmd_args), stdout=lfile, stderr=subprocess.STDOUT)
        r = p.wait()
        if r != 0:
            logger.critical("{0} did not exit successfully".format(cmd.format(**cmd_args)))
        rets.append(r)

        # Variant Calling
        cmd = 'base_caller {bamfile} {reference} {vcf} -minth {minth}'
        if cmd_args['config']:
            cmd += ' -c {config}'
        p = run_cmd(cmd.format(**cmd_args), stdout=lfile, stderr=subprocess.STDOUT)
        r = p.wait()
        if r != 0:
            logger.critical("{0} did not exit successfully".format(cmd.format(**cmd_args)))
        rets.append(r)
        if rets[-1] != 0:
            logger.critical('{0} failed to complete successfully'.format(cmd.format(**cmd_args)))

        # Flagstats
        # Use a separate name for the file handle so the 'flagstats' path
        # referenced by cmd_args is not shadowed by the open file object
        with open(flagstats, 'wb') as flagstats_fh:
            cmd = 'samtools flagstat {bamfile}'
            p = run_cmd(cmd.format(**cmd_args), stdout=flagstats_fh, stderr=lfile, script_dir='')
            r = p.wait()
            if r != 0:
                logger.critical("{0} did not exit successfully".format(cmd.format(**cmd_args)))
            rets.append(r)

        # Graphics
        cmd = 'graphsample {bamfile} -od {tdir}'
        p = run_cmd(cmd.format(**cmd_args), stdout=lfile, stderr=subprocess.STDOUT)
        r = p.wait()
        if r != 0:
            logger.critical("{0} did not exit successfully".format(cmd.format(**cmd_args)))
        rets.append(r)

        # Read Graphics
        fastqs = ' '.join(glob.glob(os.path.join(cmd_args['trim_outdir'], '*.fastq')))
        cmd = 'fqstats -o {0}.reads.png {1}'.format(cmd_args['bamfile'].replace('.bam', ''), fastqs)
        p = run_cmd(cmd, stdout=lfile, stderr=subprocess.STDOUT)
        r = p.wait()
        if r != 0:
            logger.critical("{0} did not exit successfully".format(cmd))
        rets.append(r)

        # Consensus
        cmd = 'vcf_consensus {vcf} -i {samplename} -o {consensus}'
        p = run_cmd(cmd.format(**cmd_args), stdout=lfile, stderr=subprocess.STDOUT)
        r = p.wait()
        if r != 0:
            logger.critical("{0} did not exit successfully".format(cmd.format(**cmd_args)))
        rets.append(r)

        # If the sum is > 0 then one of the commands failed
        if sum(rets) != 0:
            logger.critical("!!! There was an error running part of the pipeline !!!")
            logger.critical("Please check the logfile {0}".format(logfile))
            sys.exit(1)
        logger.info("--- Finished {0} ---".format(args.prefix))

        #subprocess.call( 'git add -A', cwd=tdir, shell=True, stdout=lfile, stderr=subprocess.STDOUT )
        #subprocess.call( 'git commit -am \'runsample\'', cwd=tdir, shell=True, stdout=lfile, stderr=subprocess.STDOUT )

        logger.debug("Moving {0} to {1}".format(tdir, args.outdir))
        # Cannot log anything below this line, as the log file will be moved
        # by the following code
        if not os.path.isdir(args.outdir):
            shutil.move(tdir, args.outdir)
        else:
            file_list = [os.path.join(tdir, m) for m in os.listdir(tdir)]
            for f in file_list:
                shutil.move(f, args.outdir)
""" import subprocess import os import argparse import sys from os.path import basename, join, isdir, dirname, expandvars from glob import glob import tempfile import reads import shlex import data from ngs_mapper import compat import log lconfig = log.get_config() logger = log.setup_logger( 'trim_reads', lconfig ) def main(): args = parse_args() trim_reads_in_dir( args.readsdir, args.q, args.outputdir, head_crop=args.headcrop, platforms=args.platforms, primer_info=[args.primer_file, args.primer_seed, args.palindrom_clip, args.simple_clip] ) def trim_reads_in_dir( *args, **kwargs ): '''
#. Symlink all original .ab1 files into this directory
#. Convert all .ab1 to .fastq
#. Parse the sanger filename and create the ReadsBySample/samplename directory
#. Symlink all .fastq and .ab1 files for that samplename from ReadData into the Samplename directory
"""
import shutil
from os.path import *
import os
from glob import glob
import re
import sys

from Bio import SeqIO

import log
logger = log.setup_logger(basename(__file__), log.get_config())

# For invalidly formatted filenames
class InvalidFormat(Exception):
    pass

def sync_sanger(runpath, ngsdata):
    rund = basename(runpath)
    rawd = join(ngsdata, 'RawData', 'Sanger', rund)
    readd = join(ngsdata, 'ReadData', 'Sanger', rund)

    sync_run(runpath, ngsdata)
    sync_readdata(rawd, ngsdata)
    link_reads(readd, ngsdata)
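# The resulting NGSData layout (run and sample names are placeholders),
# per the numbered steps in the module docstring above:
#
#   <ngsdata>/RawData/Sanger/<rund>/        original run files, synced
#   <ngsdata>/ReadData/Sanger/<rund>/       converted .fastq plus symlinked .ab1
#   <ngsdata>/ReadsBySample/<samplename>/   symlinks into ReadData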
def sync_run(runpath, ngsdata):
    '''
from glob import glob
from os.path import *
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
from nose.tools import ok_, eq_

import log
logc = log.get_config()
logger = log.setup_logger('graph_times', logc)

def main():
    ss = start_stop('Projects')
    logger.info("Plotting all projects inside of {0}".format('Projects'))
    x, y = [], []
    samplenames = sorted(ss.keys())
    for sn in samplenames:
        x.append(sn)
        y.append(ss[sn])
    fig = plt.figure()
    fig.set_size_inches(20.0, 8.0)
    fig.suptitle('Pipeline Time per Sample')
    ax = plt.gca()
    ax.plot(range(len(x)), y)
    ax.set_xlim([0, len(x) - 1])
    ax.set_ylim([0, max(y)])
    ax.set_xticks(range(0, len(x)))
    ax.set_xticklabels(x, rotation='vertical')
    ax.set_ylabel('Seconds')
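# start_stop is defined elsewhere in this module; from the usage above it
# evidently returns a mapping of sample name -> elapsed pipeline seconds,
# e.g. {'sample1': 3600.0, 'sample2': 5400.0} (illustrative values), which
# is what the x tick labels and the 'Seconds' y-axis assume.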