def rewrite_headers(self,outPath,lineLen=70,delim=' ',order=[],ow=False,chmod=755):
    """
    PURPOSE
        * reorganize the headers of a fasta file:
              >supercontig:CpipQ1:supercont3.1:1:3873040:1 supercontig supercont3.1
          to
              >supercont3.1 supercontig:CpipQ1:supercont3.1:1:3873040:1 supercontig

    ARGS
        outPath : path of the new file (ignored when ow is True).
        lineLen : sequence line length used in the new file (passed to fold_seq).
        delim   : string the original header is split on, and re-joined with.
        order   : list of index numbers into the split original header, in the
                  order they should appear in the new header.
                  Exp: delim=' ',order=[2,0,1] would produce what is seen above.
                  The default is never mutated here; note that an empty order
                  yields empty headers.
        ow      : if True, the file behind self._file is replaced by the
                  rewritten version.
        chmod   : mode handed to the external 'chmod' for the new file (exp: 755).

    NOTES
        1) Records are pulled with self.next() until StopIteration; each record
           is used as f[0] = header line, f[1] = sequence.
        2) self._file is closed once all records have been written.
        3) A failing chmod (ExternalError) is reported on stderr, not raised.
    """
    absPath = os.path.abspath
    if ow:
        targetPath = absPath(self._file.name)
        # Create the temp file next to the target: os.rename() cannot move a file
        # across filesystems, which is what happens when the system temp dir lives
        # on a different mount.  delete=False because the file is renamed, not
        # discarded, when we are done with it.
        outFile = tempfile.NamedTemporaryFile(suffix='.renamed.fas',
                                              dir=os.path.dirname(targetPath),
                                              delete=False)
    else:
        targetPath = absPath(outPath)
        outFile = open(outPath,'w')

    try:
        while 1:
            try:
                f = self.next()
            except StopIteration:
                break
            fSplit = f[0].lstrip('>').rstrip('\n').split(delim)
            newHead = delim.join([fSplit[x] for x in order])
            outFile.write('>%s\n%s\n' % (newHead,'\n'.join(fold_seq(f[1],lineLen))))
    except Exception:
        # Do not leak the handle, and do not leave a stray temp file behind.
        outFile.close()
        if ow:
            os.remove(outFile.name)
        raise

    outFile.close()
    self._file.close()

    if ow:
        os.rename(absPath(outFile.name),targetPath)

    try:
        runExternalApp('chmod', '%s %s' % (chmod,targetPath))
    except ExternalError as err:
        sys.stderr.write('%s\n' % (err))
def run_samtools(tool,argsList):
    """Core wrapper for calls to samtools and associated stderr/stdout reporting.

    See specific func defs for each tool's samtools help text.

    tool     : sort,view,index,etc...
    argsList : list obj with appropriately structured args for samtools "tool"
               (its items are joined with single spaces).

    Returns whatever runExternalApp returns; index 0 is reported as stdout and
    index 1 as stderr, one line at a time, each prefixed with '[<whoami()>] '.
    """
    # Construct cmdArgs
    cmdArgs = "%s %s" % (tool," ".join(argsList))

    # Run and capture output
    print("Setting up samtools call with the following cmd:\n\tsamtools %s" % (cmdArgs))
    results = runExternalApp('samtools',cmdArgs)

    # Report stdout and stderr
    if results[0]:
        for line in results[0].split('\n'):
            print('[%s] %s' % (whoami(),line))
    if results[1]:
        for line in results[1].split('\n'):
            # split() removed the newlines; put one back so the stderr lines do
            # not run together on a single line.
            sys.stderr.write('[%s] %s\n' % (whoami(),line))
    return results
def init_ensembl_sqlDB(sqlPath,user,password,host='localhost',tmp=None):
    """Given the path to a directory containing an ensembl db dump, this will
    use MySQLdb to create a new database using the 'species_version.sql' file in
    the directory and populate it with the tables and info.

    sqlPath  : directory holding the dump (*.sql schema plus *.txt table files,
               optionally .gz/.zip compressed).
    user     : MySQL user name.
    password : MySQL password.
    host     : MySQL server host.
    tmp      : NOTE(review): accepted but not used anywhere in this function;
               archives are always unpacked inside sqlPath.

    Raises UnexpectedValueError when no *.sql file is present after unpacking.

    NOTE(review): the password is placed on the mysql/mysqlimport command lines
    and the database name is interpolated into SQL unquoted; only use this with
    trusted dumps on a trusted machine.
    """
    import os
    import MySQLdb as sql

    sqlDir = sqlPath.rstrip('/')

    # unzip the sql and table files
    for fName in os.listdir(sqlPath):
        fPath = '%s/%s' % (sqlDir,fName)
        if fPath.endswith('.gz'):
            runExternalApp('gunzip',fPath)
        elif fPath.endswith('.zip'):
            # -d keeps the extracted files in sqlPath instead of the current dir
            runExternalApp('unzip','%s -d %s' % (fPath,sqlDir))

    # Get the *.sql file path.  The directory is listed again because the
    # decompression above changed the file names (x.sql.gz -> x.sql).
    schemaFile = None
    for fName in os.listdir(sqlPath):
        if fName.endswith('.sql'):
            schemaFile = '%s/%s' % (sqlDir,fName)
    if schemaFile is None:
        raise UnexpectedValueError("ERROR: no *.sql schema file found in %s" % (sqlPath))

    # database name = schema file name without the '.sql' extension
    dbName = schemaFile.split('/')[-1][:-4]

    db = sql.connect(user=user,passwd=password,host=host)
    try:
        c = db.cursor()
        c.execute('DROP DATABASE IF EXISTS %s' % (dbName))
        c.execute('CREATE DATABASE %s' % (dbName))
        c.close()
    finally:
        db.close()

    # MySQL clients only accept the short-form password when it is attached
    # directly to -p (no space); -h makes the CLI tools talk to the same server
    # the database was just created on.
    sqlArgs = '-h %s -u %s -p%s %s < %s' % (host,user,password,dbName,schemaFile)
    runExternalApp('mysql',sqlArgs)

    # The escape character is quoted so the backslash is not taken as escaping
    # the following space (assumes runExternalApp runs the command through a
    # shell, as the '<' redirect above requires -- TODO confirm).
    impArgs = "-h %s -u %s -p%s --fields_escaped_by='\\' %s -L %s/*.txt" % (host,user,password,dbName,sqlDir)
    runExternalApp('mysqlimport',impArgs)
def extractFromDoubleSidedBedtoolOut(filePath, cols, side="right", outDir="."):
    """Creates new file from filePath using only the bedInfo from the left/right
    (based on 'side') side of a BEDtools outFile with double-sided output.

    filePath : BEDtools output whose lines hold two tab-separated records side
               by side.
    cols     : two-item sequence; cols[0] is the number of columns of the left
               record and cols[1] the number of columns of the right record.
    side     : keep the 'right' or 'left' side of each line.
    outDir   : directory for the result (created if needed).

    Lines starting with "track" are skipped.  The result is written to
    <outDir>/<input name without ".bed">_<side>.bed, then sorted with duplicate
    lines removed (external 'sort -u').

    Returns the path of the new file.
    Raises InvalidOptionError for an unknown 'side' and InvalidFileFormatError
    when a line does not split into exactly cols[0] + cols[1] columns.
    """
    validSides = ["right", "left"]
    # Reject a bad option before any directory or file is created.
    if side not in validSides:
        raise InvalidOptionError('option "side" must be one of %s. Was: %s.' % (validSides, side))
    keepRight = side == "right"

    # Prepare outDir if it doesnt already exist
    mkdirp(outDir)

    outFilePath = "%s/%s_%s.bed" % (outDir, filePath.replace(".bed", "").split("/")[-1], side)

    with open(filePath, "rU") as inFile:
        with open(outFilePath, "w") as outFile:
            for lineNum, line in enumerate(inFile, 1):
                if line.startswith("track"):
                    continue
                fields = line.strip("\n").split("\t")

                # Divide the line into two based on cols.  Slicing from cols[0]
                # (rather than with a negative length) keeps the right half empty
                # when the line only holds a left record, so such a line fails
                # the check below instead of passing it by accident.
                leftPart = fields[: cols[0]]
                rightPart = fields[cols[0] :]

                # Ensure the length of each half is what we expect
                if len(leftPart) != cols[0] or len(rightPart) != cols[1]:
                    raise InvalidFileFormatError(
                        'line %s in file %s has unexpected number of columns or the values in "cols" is incorrect.'
                        % (lineNum, filePath)
                    )

                kept = rightPart if keepRight else leftPart
                outFile.write("%s\n" % ("\t".join(kept)))

    # Sort and remove redundancy from line in new file
    runExternalApp("sort", "-u %s > %s.tmp" % (outFilePath, outFilePath))
    runExternalApp("mv", "%s.tmp %s" % (outFilePath, outFilePath))
    return outFilePath
def bowtie_index(reference_in,ebwt_outfile_base,runDir,options=None):
    """Create bowtie indexes from new fasta set.

    reference_in      : comma-separated list of files with ref sequences
    ebwt_outfile_base : dir/basename the index files are written to
    runDir            : path to dir to place stdErr/stdOut logs
                        - all steps of pipeline scripts should share same runDir
                        (created here if missing)
    options           : quoted string representing valid cmd line bowtie-build
                        options; None/empty means no extra options

    Returns whatever runExternalApp returns; index 0 is reported as stdout and
    index 1 as stderr.

    ----------
    bowtie-build help text:

    Usage: bowtie-build [options]* <reference_in> <ebwt_outfile_base>
        reference_in            comma-separated list of files with ref sequences
        ebwt_outfile_base       write Ebwt data to files with this dir/basename
    Options:
        -f                      reference files are Fasta (default)
        -c                      reference sequences given on cmd line (as <seq_in>)
        -C/--color              build a colorspace index
        -a/--noauto             disable automatic -p/--bmax/--dcv memory-fitting
        -p/--packed             use packed strings internally; slower, uses less mem
        -B                      build both letter- and colorspace indexes
        --bmax <int>            max bucket sz for blockwise suffix-array builder
        --bmaxdivn <int>        max bucket sz as divisor of ref len (default: 4)
        --dcv <int>             diff-cover period for blockwise (default: 1024)
        --nodc                  disable diff-cover (algorithm becomes quadratic)
        -r/--noref              don't build .3/.4.ebwt (packed reference) portion
        -3/--justref            just build .3/.4.ebwt (packed reference) portion
        -o/--offrate <int>      SA is sampled every 2^offRate BWT chars (default: 5)
        -t/--ftabchars <int>    # of chars consumed in initial lookup (default: 10)
        --ntoa                  convert Ns in reference to As
        --seed <int>            seed for random number generator
        -q/--quiet              verbose output (for debugging)
        -h/--help               print detailed description of tool and its options
        --usage                 print this usage message
        --version               print version information and quit
    """
    # make runDir if it does not yet exist
    print("creating: %s" % (runDir))
    mkdirp(runDir)

    # Construct cmdArgs (must also be defined when no options are given)
    if options:
        cmdArgs = "%s %s %s" % (options,reference_in,ebwt_outfile_base)
    else:
        cmdArgs = "%s %s" % (reference_in,ebwt_outfile_base)

    # Run bowtie-build and capture output
    btBuildResults = runExternalApp('bowtie-build',cmdArgs)

    # Report bowtie-build stdout and stderr
    if btBuildResults[0]:
        print('[%s] %s' % (whoami(),btBuildResults[0]))
    if btBuildResults[1]:
        # sys.stderr is a file object, not a callable: it has to be written to
        sys.stderr.write('[%s] %s\n' % (whoami(),btBuildResults[1]))
    return btBuildResults
def runSCOPE(pLen,genes,jobName,scopeDir,outDir,paramName,jMem='2000',verbose=False):
    """Launch one SCOPE job through 'java' and hand back runExternalApp's result.

    pLen      : promoter length to use (also part of the output file names).
    genes     : 'gene;gene;gene;etc'
    jobName   : job label; output files are named <outDir>/<jobName>.<pLen>.*
    scopeDir  : full path of the SCOPE install; the process chdir()s into it
                and stays there after the call.
    outDir    : output directory; a relative path is anchored to the working
                directory in effect when the function is entered.
    paramName : value passed to SCOPE's -pf option.
    jMem      : java heap size in MB (-Xmx<jMem>m).
    verbose   : when true, result[0] / result[1] are saved to
                <outDir>/<jobName>.<pLen>.out / .err

    NOTE(review): error handling is whatever runExternalApp does; nothing is
    caught or checked here.
    """
    # Resolve outDir before the chdir below invalidates relative paths
    if not outDir.startswith('/'):
        outDir = os.getcwd()+'/'+outDir
    outDir = outDir.rstrip('/')

    # Common prefix of every file this run produces
    outPathBase = '%s/%s.%s' % (outDir,jobName,pLen)

    # java command line (adjacent literals form one single-line string)
    argTemplate = ('-Xmx%sm -cp dist/scope.jar edu.dartmouth.bglab.beam.CGIScope '
                   '-pf "%s" -ofx "%s.xml" -oft "%s.txt" -oje "%s" '
                   '-qg "%s" -sgl "%s" '
                   '-drb "true" -dra "true" -drbp "true"')
    argString = argTemplate % (jMem, paramName, outPathBase, outPathBase, jobName,genes,pLen)

    # The classpath above is relative, so SCOPE has to be started from scopeDir
    os.chdir(scopeDir)
    mkdirp(outDir)  # make outDir along with parent dirs as needed

    print('starting run...')
    resultSCOPE = runExternalApp('java',argString)

    # keep the captured stdOut/Err on disk if requested
    if verbose:
        with open(outPathBase+'.out','w') as stdOutFile:
            with open(outPathBase+'.err','w') as stdErrFile:
                stdOutFile.write(resultSCOPE[0])
                stdErrFile.write(resultSCOPE[1])

    return resultSCOPE
def tophat_align(bowtie_index,readsA,readsB=None,qualsA=None,qualsB=None,options=None,runDir=None):
    # TODO: eliminate "Reconstituting reference FASTA file from Bowtie index
    #       [FAILED]
    #       Error: bowtie-inspect returned an error."
    """
    Wrapper for calling tophat and dealing with output.

    **ARGS**
        _name_         _type_       _desc_
        bowtie_index : String       bowtie index base-name in $BOWTIE_INDEXES
                                    (a value not starting with '/' is prefixed
                                    with "$BOWTIE_INDEXES/")
        readsA       : List         List of FilePaths to fastQ files
        readsB       : List/None
        qualsA       : List/None
        qualsB       : List/None
        options      : String       quoted string of CLI tophat options, placed
                                    verbatim at the front of the command line
        runDir       : String/None  passed to tophat as "-o <runDir>"

    **RETURNS**
        Whatever runExternalApp returns; index 0 is reported as stdout and
        index 1 as stderr, one line at a time.

    **RAISES**
        TypeError if readsA, or any non-empty readsB/qualsA/qualsB, is not a list.

    ------------------
    **NOTES**
        If (readsA AND readsB) OR (qualsA AND qualsB), the order of filePaths
        corresponding to PE-mates must match:
            readsA,readsB = [readsA_1,readsA_2], [readsB_1,readsB_2]
            qualsA,qualsB = [qualsA_1,qualsA_2], [qualsB_1,qualsB_2]

    ------------------
    **TOPHAT HELP TEXT FOLLOWS**
    tophat:
    TopHat maps short sequences from spliced transcripts to whole genomes.

    Usage:
        tophat [options] <bowtie_index> <reads1[,reads2,...,readsN]> [reads1[,reads2,...,readsN]] [quals1,[quals2,...,qualsN]] [quals1[,quals2,...,qualsN]]

    Options:
        -v/--version
        -o/--output-dir                <string>    [ default: ./tophat_out ]
        -a/--min-anchor                <int>       [ default: 8            ]
        -m/--splice-mismatches         <0-2>       [ default: 0            ]
        -i/--min-intron-length         <int>       [ default: 50           ]
        -I/--max-intron-length         <int>       [ default: 500000       ]
        -g/--max-multihits             <int>       [ default: 40           ]
        -F/--min-isoform-fraction      <float>     [ default: 0.15         ]
        --max-insertion-length         <int>       [ default: 3            ]
        --max-deletion-length          <int>       [ default: 3            ]
        --solexa-quals
        --solexa1.3-quals                          (same as phred64-quals)
        --phred64-quals                            (same as solexa1.3-quals)
        -Q/--quals
        --integer-quals
        -C/--color                                 (Solid - color space)
        --color-out
        --library-type                             (--fr-unstranded, --fr-firststrand, --fr-secondstrand,
                                                    --ff-unstranded, --ff-firststrand, --ff-secondstrand)
        -p/--num-threads               <int>       [ default: 1            ]
        -G/--GTF                       <filename>
        -j/--raw-juncs                 <filename>
        --insertions                   <filename>
        --deletions                    <filename>
        -r/--mate-inner-dist           <int>
        --mate-std-dev                 <int>       [ default: 20           ]
        --no-novel-juncs
        --allow-indels
        --no-novel-indels
        --no-gtf-juncs
        --no-coverage-search
        --coverage-search
        --no-closure-search
        --closure-search
        --fill-gaps
        --microexon-search
        --butterfly-search
        --no-butterfly-search
        --keep-tmp
        --tmp-dir                      <dirname>

    Advanced Options:
        --segment-mismatches           <int>       [ default: 2            ]
        --segment-length               <int>       [ default: 25           ]
        --min-closure-exon             <int>       [ default: 100          ]
        --min-closure-intron           <int>       [ default: 50           ]
        --max-closure-intron           <int>       [ default: 5000         ]
        --min-coverage-intron          <int>       [ default: 50           ]
        --max-coverage-intron          <int>       [ default: 20000        ]
        --min-segment-intron           <int>       [ default: 50           ]
        --max-segment-intron           <int>       [ default: 500000       ]

    SAM Header Options (for embedding sequencing run metadata in output):
        --rg-id                        <string>    (read group ID)
        --rg-sample                    <string>    (sample ID)
        --rg-library                   <string>    (library ID)
        --rg-description               <string>    (descriptive string, no tabs allowed)
        --rg-platform-unit             <string>    (e.g Illumina lane ID)
        --rg-center                    <string>    (sequencing center name)
        --rg-date                      <string>    (ISO 8601 date of the sequencing run)
        --rg-platform                  <string>    (Sequencing platform descriptor)

    for detailed help see http://tophat.cbcb.umd.edu/manual.html
    """
    # make runDir if it does not yet exist
    #   => tophat takes care of this for us
    ##mkdirp(runDir)

    # Construct cmdArgs
    # readsA is mandatory and must be a list
    if not isinstance(readsA, list):
        raise TypeError('readsA type should be "[]".')
    readsA = ','.join(readsA)

    # readsB / qualsA / qualsB are optional: empty or None becomes ''
    joined = []
    for argName, paths in (('readsB',readsB),('qualsA',qualsA),('qualsB',qualsB)):
        if paths:
            if not isinstance(paths, list):
                raise TypeError('%s type should be "[]".' % (argName))
            joined.append(','.join(paths))
        else:
            joined.append('')
    readsB, qualsA, qualsB = joined

    # format runDir
    if runDir:
        runDir = ' -o %s' % (runDir)
    else:
        runDir = ''

    # format options
    if options is None:
        options = ''

    # format bowtie_index
    # This seems needed or tophat cant reconstitute the fasta files for some reason
    # and returns the following error:
    #   * Reconstituting reference FASTA file from Bowtie index
    #   * [FAILED]
    #   * Error: bowtie-inspect returned an error.
    if not bowtie_index.startswith('/'):
        bowtie_index = "$BOWTIE_INDEXES/%s" % (bowtie_index)

    cmdArgs = "%s %s %s %s %s %s" % (options+runDir,bowtie_index,readsA,readsB,qualsA,qualsB)

    # Run and capture output
    print("Setting up tophat call with the following cmd:\n\t\ttophat %s" % (cmdArgs))
    thResults = runExternalApp('tophat',cmdArgs)

    # Report stdout and stderr
    if thResults[0]:
        for line in thResults[0].split('\n'):
            print('[%s] %s' % (whoami(),line))
    if thResults[1]:
        for line in thResults[1].split('\n'):
            # split() removed the newlines; put one back so the stderr lines do
            # not run together on a single line.
            sys.stderr.write('[%s] %s\n' % (whoami(),line))
    return thResults
def bowtie_align(ebwt,readsString,hit,runDir,options=None):
    """Run alignment of fastQ to bowtie index.

    ebwt        : bowtie index base name (<ebwt> in the usage line below)
    readsString : appropriate quoted string representing which fastq files to
                  use (see bowtie -h).
    hit         : file to write hits to (<hit> in the usage line below)
    runDir      : path to dir to place stdErr/stdOut logs
                  - all steps of pipeline scripts should share same runDir
                  (created here if missing)
    options     : quoted string representing valid cmd line bowtie options;
                  None/empty means no extra options

    Returns whatever runExternalApp returns; index 0 is reported as stdout and
    index 1 as stderr, one line at a time.

    ----------
    bowtie help text:

    Usage:
    bowtie [options]* <ebwt> {-1 <m1> -2 <m2> | --12 <r> | <s>} [<hit>]

      <m1>    Comma-separated list of files containing upstream mates (or the
              sequences themselves, if -c is set) paired with mates in <m2>
      <m2>    Comma-separated list of files containing downstream mates (or the
              sequences themselves if -c is set) paired with mates in <m1>
      <r>     Comma-separated list of files containing Crossbow-style reads.  Can be
              a mixture of paired and unpaired.  Specify "-" for stdin.
      <s>     Comma-separated list of files containing unpaired reads, or the
              sequences themselves, if -c is set.  Specify "-" for stdin.
      <hit>   File to write hits to (default: stdout)
    Input:
      -q                 query input files are FASTQ .fq/.fastq (default)
      -f                 query input files are (multi-)FASTA .fa/.mfa
      -r                 query input files are raw one-sequence-per-line
      -c                 query sequences given on cmd line (as <mates>, <singles>)
      -C                 reads and index are in colorspace
      -Q/--quals <file>  QV file(s) corresponding to CSFASTA inputs; use with -f -C
      --Q1/--Q2 <file>   same as -Q, but for mate files 1 and 2 respectively
      -s/--skip <int>    skip the first <int> reads/pairs in the input
      -u/--qupto <int>   stop after first <int> reads/pairs (excl. skipped reads)
      -5/--trim5 <int>   trim <int> bases from 5' (left) end of reads
      -3/--trim3 <int>   trim <int> bases from 3' (right) end of reads
      --phred33-quals    input quals are Phred+33 (default)
      --phred64-quals    input quals are Phred+64 (same as --solexa1.3-quals)
      --solexa-quals     input quals are from GA Pipeline ver. < 1.3
      --solexa1.3-quals  input quals are from GA Pipeline ver. >= 1.3
      --integer-quals    qualities are given as space-separated integers (not ASCII)
    Alignment:
      -v <int>           report end-to-end hits w/ <=v mismatches; ignore qualities
        or
      -n/--seedmms <int> max mismatches in seed (can be 0-3, default: -n 2)
      -e/--maqerr <int>  max sum of mismatch quals across alignment for -n (def: 70)
      -l/--seedlen <int> seed length for -n (default: 28)
      --nomaqround       disable Maq-like quality rounding for -n (nearest 10 <= 30)
      -I/--minins <int>  minimum insert size for paired-end alignment (default: 0)
      -X/--maxins <int>  maximum insert size for paired-end alignment (default: 250)
      --fr/--rf/--ff     -1, -2 mates align fw/rev, rev/fw, fw/fw (default: --fr)
      --nofw/--norc      do not align to forward/reverse-complement reference strand
      --maxbts <int>     max # backtracks for -n 2/3 (default: 125, 800 for --best)
      --pairtries <int>  max # attempts to find mate for anchor hit (default: 100)
      -y/--tryhard       try hard to find valid alignments, at the expense of speed
      --chunkmbs <int>   max megabytes of RAM for best-first search frames (def: 64)
    Reporting:
      -k <int>           report up to <int> good alignments per read (default: 1)
      -a/--all           report all alignments per read (much slower than low -k)
      -m <int>           suppress all alignments if > <int> exist (def: no limit)
      -M <int>           like -m, but reports 1 random hit (MAPQ=0); requires --best
      --best             hits guaranteed best stratum; ties broken by quality
      --strata           hits in sub-optimal strata aren't reported (requires --best)
    Output:
      -t/--time          print wall-clock time taken by search phases
      -B/--offbase <int> leftmost ref offset = <int> in bowtie output (default: 0)
      --quiet            print nothing but the alignments
      --refout           write alignments to files refXXXXX.map, 1 map per reference
      --refidx           refer to ref. seqs by 0-based index rather than name
      --al <fname>       write aligned reads/pairs to file(s) <fname>
      --un <fname>       write unaligned reads/pairs to file(s) <fname>
      --max <fname>      write reads/pairs over -m limit to file(s) <fname>
      --suppress <cols>  suppresses given columns (comma-delim'ed) in default output
      --fullref          write entire ref name (default: only up to 1st space)
    Colorspace:
      --snpphred <int>   Phred penalty for SNP when decoding colorspace (def: 30)
         or
      --snpfrac <dec>    approx. fraction of SNP bases (e.g. 0.001); sets --snpphred
      --col-cseq         print aligned colorspace seqs as colors, not decoded bases
      --col-cqual        print original colorspace quals, not decoded quals
      --col-keepends     keep nucleotides at extreme ends of decoded alignment
    SAM:
      -S/--sam           write hits in SAM format
      --mapq <int>       default mapping quality (MAPQ) to print for SAM alignments
      --sam-nohead       supppress header lines (starting with @) for SAM output
      --sam-nosq         supppress @SQ header lines for SAM output
      --sam-RG <text>    add <text> (usually "lab=value") to @RG line of SAM header
    Performance:
      -o/--offrate <int> override offrate of index; must be >= index's offrate
      -p/--threads <int> number of alignment threads to launch (default: 1)
      --mm               use memory-mapped I/O for index; many 'bowtie's can share
      --shmem            use shared mem for index; many 'bowtie's can share
    Other:
      --seed <int>       seed for random number generator
      --verbose          verbose output (for debugging)
      --version          print version information and quit
      -h/--help          print this usage message
    """
    # make runDir if it does not yet exist
    mkdirp(runDir)

    # Construct cmdArgs
    if options:
        cmdArgs = "%s %s %s %s" % (options,ebwt,readsString,hit)
    else:
        cmdArgs = "%s %s %s" % (ebwt,readsString,hit)

    # Run and capture output
    print("Setting up bowtie call with the following cmd:\n\t\tbowtie %s" % (cmdArgs))
    btResults = runExternalApp('bowtie',cmdArgs)

    # Report stdout and stderr
    if btResults[0]:
        for line in btResults[0].split('\n'):
            print('[%s] %s' % (whoami(),line))
    if btResults[1]:
        for line in btResults[1].split('\n'):
            # split() removed the newlines; put one back so the stderr lines do
            # not run together on a single line.
            sys.stderr.write('[%s] %s\n' % (whoami(),line))
    return btResults
def divByWindow(bedA_Path, bedB_Path, win=[500, 500], cols=[6, 6], side="right", outDir="."):
    """Split the features of bedB into those falling inside a window around the
    features of bedA and those falling outside of it.

    The window reaches win[0] upstream and win[1] downstream.  When bedA looks
    stranded, windowBed is run with -sw so the window follows each FEATURE's
    strand; otherwise it is laid out along the CONTIG/CHROM's plus strand.
    All files are written to outDir (created if needed).

    'cols' and 'side' are forwarded unchanged to
    extractFromDoubleSidedBedtoolOut(); see its documentation.

    Only the SECOND line of each input is inspected to decide whether the file
    is BED formatted / stranded.

    Returns (path of bedB features inside the window,
             path of bedB features outside the window).
    Raises InvalidFileFormatError when either input fails the isBEDline() test.
    """
    # Make sure the destination exists before anything is written
    mkdirp(outDir)

    # Base names (directory and ".bed" removed) used to build the output name
    nameA, nameB = [p.split("/")[-1].replace(".bed", "") for p in (bedA_Path, bedB_Path)]
    comboPath = "%s/%s_featsIn_%s_Win%sl%sr_combo.bed" % (outDir, nameB, nameA, win[0], win[1])

    # Grab the first two lines of each input for the format checks
    with open(bedA_Path, "rU") as handleA:
        with open(bedB_Path, "rU") as handleB:
            headA = [handleA.readline() for _ in range(2)]
            headB = [handleB.readline() for _ in range(2)]
    probeA = headA[1]
    probeB = headB[1]

    for probe, path in ((probeA, bedA_Path), (probeB, bedB_Path)):
        if not isBEDline(probe):
            raise InvalidFileFormatError("%s does not seem to be in BED format." % (path))

    # bedB features INSIDE the window around bedA features; a stranded bedA
    # additionally gets the -sw option.
    if isStranded(probeA):
        winTemplate = "-a %s -b %s -l %s -r %s -sw > %s"
    else:
        winTemplate = "-a %s -b %s -l %s -r %s > %s"
    runExternalApp("windowBed", winTemplate % (bedA_Path, bedB_Path, win[0], win[1], comboPath))

    # Drop the matching bedA half of every line and any duplicated bedB entries
    insideTmpPath = extractFromDoubleSidedBedtoolOut(comboPath, cols=cols, side=side, outDir=outDir)

    # Rename the file: it no longer holds the combined records
    insidePath = insideTmpPath.replace("_combo_", "_cleaned_")
    runExternalApp("mv", "%s %s" % (insideTmpPath, insidePath))

    # bedB features OUTSIDE the window around bedA features
    outsidePath = insidePath.replace("_featsIn_", "_featsNotIn_")
    onlyInA(bedB_Path, insidePath, outsidePath)
    # Former external alternative, kept for reference:
    # resultIsectBed = runExternalApp('intersectBed','-a %s -b %s -v > %s' % \
    #                                 (bedB_Path,
    #                                  cleanedBsInWinNewPath,
    #                                  cleanedBsNotInWinPath))

    # Hand back the names of the two divided bed files
    return (insidePath, outsidePath)