def loadFilteredContigLengths(infile, outfile):
    '''load contig lengths.'''
    outname = P.snip(os.path.dirname(infile), ".dir") + \
        "_" + P.snip(os.path.basename(infile), ".tsv") + ".load"
    P.load(infile, outname)
    P.touch(outfile)
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                         --index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def buildUniformityOfCoverage(infiles, outfile):
    '''build matrix of coverage over contigs.'''
    bam = infiles[0]
    track = P.snip(os.path.basename(bam), ".bam")
    tmp_bed = P.getTempFilename(".") + ".bed"
    tmp_bam = P.getTempFilename(".") + ".bam"

    # filter for mapped reads
    statement = '''cat %(bam)s
                   | python %(scriptsdir)s/bam2bam.py
                         --filter=mapped --log=/dev/null
                   > %(tmp_bam)s;
                   samtools index %(tmp_bam)s'''
    P.run()

    # pick the contig lengths file matching this track
    for infs in infiles[1:]:
        for inf in infs:
            if P.snip(inf, ".lengths.tsv") == track:
                length_file = inf

    statement = '''cat %(length_file)s
                   | awk 'NR>1 {printf("%%s\\t0\\t%%s\\n", $1, $2)}'
                   > %(tmp_bed)s'''
    P.run()

    statement = '''python %(scriptsdir)s/bam2peakshape.py
                       --only-interval
                       %(tmp_bam)s %(tmp_bed)s
                       --log=%(outfile)s.log
                       --output-filename-pattern=%(track)s.%%s'''
    P.run()
    os.unlink(tmp_bed)
    os.unlink(tmp_bam)
def plotFalsePositiveRates(infile, outfile):
    '''barplot the false positive rates across taxonomic levels.'''
    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s", header=TRUE, stringsAsFactors=FALSE,
         sep="\t")''' % infile)
    for i in [0, 1]:
        # specificity
        # note: stat="identity" belongs in geom_bar, not in aes()
        outf = P.snip(outfile, ".pdf") + ".%i.specificity.pdf" % i
        R('''plot1 <- ggplot(dat[dat$cutoff == %i,],
             aes(x=reorder(level, fp_rate), y=fp_rate, fill=track))''' % i)
        R('''plot2 <- plot1 + geom_bar(position="dodge", stat="identity")''')
        R('''plot2 + scale_fill_manual(
             values=c("cadetblue", "slategray", "lightblue"))''')
        R('''ggsave("%s")''' % outf)

        # sensitivity
        outf = P.snip(outfile, ".pdf") + ".%i.sensitivity.pdf" % i
        R('''plot1 <- ggplot(dat[dat$cutoff == %i,],
             aes(x=reorder(level, fp_rate), y=tp_rate, fill=track))''' % i)
        R('''plot2 <- plot1 + geom_bar(position="dodge", stat="identity")''')
        R('''plot2 + scale_fill_manual(
             values=c("cadetblue", "slategray", "lightblue"))''')
        R('''ggsave("%s")''' % outf)
    P.touch(outfile)
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                         --index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def plotFalsePositiveRates(infile, outfile):
    '''barplot the false positive rates across taxonomic levels.'''
    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s", header=TRUE, stringsAsFactors=FALSE,
         sep="\t")''' % infile)
    for i in [0, 1]:
        # specificity
        # note: stat="identity" belongs in geom_bar, not in aes()
        outf = P.snip(outfile, ".pdf") + ".%i.specificity.pdf" % i
        R('''plot1 <- ggplot(dat[dat$cutoff == %i,],
             aes(x=reorder(level, fp_rate), y=fp_rate, fill=track))''' % i)
        R('''plot2 <- plot1 + geom_bar(position="dodge", stat="identity")''')
        R('''plot2 + scale_fill_manual(
             values=c("cadetblue", "slategray", "lightblue"))''')
        R('''ggsave("%s")''' % outf)

        # sensitivity
        outf = P.snip(outfile, ".pdf") + ".%i.sensitivity.pdf" % i
        R('''plot1 <- ggplot(dat[dat$cutoff == %i,],
             aes(x=reorder(level, fp_rate), y=tp_rate, fill=track))''' % i)
        R('''plot2 <- plot1 + geom_bar(position="dodge", stat="identity")''')
        R('''plot2 + scale_fill_manual(
             values=c("cadetblue", "slategray", "lightblue"))''')
        R('''ggsave("%s")''' % outf)
    P.touch(outfile)
def createConfigFiles(infile, outfile):
    '''create all of the relevant .ini files in each working
    directory in order to execute the transcript building.'''
    # test options for cufflinks
    cuff_opts = P.snip(infile, ".log").split("_")
    cuff_options = []
    for opt in cuff_opts:
        # crude heuristic: tokens longer than 6 characters are treated
        # as option names, shorter ones as their values
        if len(opt) > 6:
            cuff_options.append("--" + opt)
        else:
            cuff_options.append(opt)
    cuff_options = " ".join(cuff_options)
    options = PARAMS["cufflinks_options"]

    # directory for output config
    outdir = P.snip(infile, ".log")
    outf = open(os.path.join(outdir, "pipeline.ini"), "w")
    lines = []
    for line in open("pipeline.ini").readlines():
        lines.append(line)
        if line.find("[cufflinks]") != -1:
            outf.write("[cufflinks]\n\n# general cufflinks options\n\n"
                       "options=%s %s\n" % (options, cuff_options))
        # inside the [cufflinks] section (before [cuffdiff]), skip the
        # original options= line - it has been replaced above
        elif "[cufflinks]\n" in lines and "[cuffdiff]\n" not in lines:
            if line.find("options=") != -1:
                continue
            else:
                outf.write(line)
        else:
            outf.write(line)
    outf.close()
def runFeatureCounts(annotations_file, bamfile, outfile,
                     nthreads=4, strand=2, options=""):
    '''run feature counts on *annotations_file* with *bam_file*.

    If the bam-file is paired, paired-end counting is enabled
    and the bam file automatically sorted.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")
    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # remove .bam extension
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = \
            """samtools sort -@ %(nthreads)i -n %(bamfile)s %(bam_prefix)s;
            checkpoint; """ % locals()
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    job_threads = nthreads

    # AH: what is the -b option doing?
    statement = '''mkdir %(tmpdir)s;
                   zcat %(annotations_file)s > %(annotations_tmp)s;
                   checkpoint;
                   %(paired_processing)s
                   featureCounts %(options)s
                                 -T %(nthreads)i
                                 -s %(strand)s
                                 -b
                                 -a %(annotations_tmp)s
                                 %(paired_options)s
                                 -o %(outfile)s
                                 %(bamfile)s
                   >& %(outfile)s.log;
                   checkpoint;
                   gzip -f %(outfile)s;
                   checkpoint;
                   rm -rf %(tmpdir)s'''
    P.run()
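# Example (hedged): one way runFeatureCounts might be wired into a
# ruffus task. The glob pattern, geneset file name and strand setting
# are illustrative assumptions, not taken from the pipeline above.
@transform("*.bam",
           suffix(".bam"),
           add_inputs("geneset.gtf.gz"),
           ".featurecounts.tsv.gz")
def countReadsInGenes(infiles, outfile):
    bamfile, annotations = infiles
    runFeatureCounts(annotations, bamfile, outfile,
                     nthreads=4, strand=0)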
def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''pulls out the multi-exonic and the single exonic lincRNA
    transcripts from the lincrna.gtf.gz.'''
    inf = gzip.open(infile)
    multi = gzip.open(P.snip(infile, ".gtf.gz") + ".multi_exon.gtf.gz", "w")
    single = gzip.open(P.snip(infile, ".gtf.gz") + ".single_exon.gtf.gz", "w")

    for entry in GTF.transcript_iterator(GTF.iterator(inf)):
        if len(entry) > 1:
            for exon in entry:
                multi.write("\t".join(map(str, [exon.contig, exon.source,
                                                exon.feature, exon.start,
                                                exon.end, ".",
                                                exon.strand, "."])) +
                            "\t" + exon.attributes + "\n")
        elif len(entry) == 1:
            for exon in entry:
                single.write("\t".join(map(str, [exon.contig, exon.source,
                                                 exon.feature, exon.start,
                                                 exon.end, ".",
                                                 exon.strand, "."])) +
                             "\t" + exon.attributes + "\n")

    # close before compressing, otherwise buffered data may be lost
    multi.close()
    single.close()

    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            P.run()
def filterByCoverage(infiles, outfile):
    '''filter contigs by average coverage.'''
    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            # keep contigs whose average coverage exceeds the cutoff
            statement = """SELECT contig_id
                           FROM (SELECT contig_id, AVG(coverage) AS ave
                                 FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    E.info("%i contigs passed the coverage filter" % len(contigs))
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''pulls out the multi-exonic and the single exonic lincRNA
    transcripts from the lincrna.gtf.gz.'''
    inf = gzip.open(infile)
    multi = gzip.open(P.snip(infile, ".gtf.gz") + ".multi_exon.gtf.gz", "w")
    single = gzip.open(P.snip(infile, ".gtf.gz") + ".single_exon.gtf.gz", "w")

    for entry in GTF.transcript_iterator(GTF.iterator(inf)):
        if len(entry) > 1:
            for exon in entry:
                multi.write("\t".join(map(str, [exon.contig, exon.source,
                                                exon.feature, exon.start,
                                                exon.end, ".",
                                                exon.strand, "."])) +
                            "\t" + exon.attributes + "\n")
        elif len(entry) == 1:
            for exon in entry:
                single.write("\t".join(map(str, [exon.contig, exon.source,
                                                 exon.feature, exon.start,
                                                 exon.end, ".",
                                                 exon.strand, "."])) +
                             "\t" + exon.attributes + "\n")

    # close before compressing, otherwise buffered data may be lost
    multi.close()
    single.close()

    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            P.run()
def createMAFAlignment(infiles, outfile):
    """Takes all .axt files in the input directory, filters them to
    remove files based on supplied regular expressions, converts to a
    single maf file using axtToMaf, filters maf alignments under a
    specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))
    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate axt files, then remove headers
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 " -tPrefix=%(target_genome)s."
                 " -qPrefix=%(query_genome)s."
                 " %(tmpf1)s"
                 " %(target_contigs)s"
                 " %(query_contigs)s"
                 " %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)

    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats"):
    '''extract a histogram from a picard output file and load it
    into database.'''
    tablename = P.toTable(outfile)
    tname = "%s_%s" % (tablename, suffix)
    tname = P.snip(tname, "_metrics") + "_histogram"

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]
    if len(xfiles) == 0:
        E.warn("no files for %s" % tname)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                       for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    statement = """python %(scriptsdir)s/combine_tables.py
                       --regex-start="## HISTOGRAM"
                       --missing=0
                       --take=2
                       %(filenames)s
                   | python %(scriptsdir)s/csv2db.py
                       --header=%(column)s,%(header)s
                       --replace-header
                       --index=track
                       --table=%(tname)s
                   >> %(outfile)s"""
    P.run()
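# Example (hedged): a call that would load the histogram section of
# Picard insert-size metrics for two tracks. The file names, suffix
# and column label are illustrative assumptions.
loadPicardHistogram(
    ["sample1.bam.picard_stats", "sample2.bam.picard_stats"],
    "picard_stats.load",
    suffix="insert_size_metrics",
    column="insert_size")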
def postprocess(self, infiles, outfile):
    '''collect output data and postprocess.'''
    track = P.snip(os.path.basename(outfile), ".bam")
    outf = P.snip(outfile, ".bam")
    tmpdir = self.tmpdir_fastq

    strip_cmd, unique_cmd = "", ""

    if self.remove_non_unique:
        unique_cmd = '| python %%(scriptsdir)s/bam2bam.py --filter=unique --log=%(outfile)s.log' % locals()

    if self.strip_sequence:
        strip_cmd = '| python %%(scriptsdir)s/bam2bam.py --strip=sequence --log=%(outfile)s.log' % locals()

    statement = '''
        cp %(tmpdir)s/Log.std.out %(outfile)s.std.log;
        cp %(tmpdir)s/Log.final.out %(outfile)s.final.log;
        cp %(tmpdir)s/SJ.out.tab %(outfile)s.junctions;
        cat %(tmpdir)s/Log.out >> %(outfile)s.log;
        cp %(tmpdir)s/Log.progress.out %(outfile)s.progress;
        samtools view -uS %(tmpdir)s/%(track)s.sam
        %(unique_cmd)s
        %(strip_cmd)s
        | samtools sort - %(outf)s 2>>%(outfile)s.log;
        samtools index %(outfile)s;''' % locals()

    return statement
def build(self, infile):
    track = self.getTrack(infile)
    format = self.getFormat(infile)
    if format.endswith(".gz"):
        format = P.snip(format, ".gz")
    format = format.upper()

    # cortex_var only uses paired end information to
    # remove pcr duplicates
    if not self.checkPairs(infile):
        paired = "--se_list"
        reads = os.path.join(os.getcwd(), infile)
    elif len(self.checkPairs(infile)) > 1:
        paired = "--pe_list"
        read1 = infile
        format = P.snip(format, ".1")
        read2 = self.checkPairs(infile)[1]
    elif self.checkPairs(infile) == "interleaved":
        raise ValueError, "pipeline does not support file of type 'interleaved'"

    # NB: everything below assumes the --pe_list branch was taken;
    # read1/read2 are not defined for single-end input
    temp = P.getTempDir()
    read1_new = os.path.join(temp, P.snip(read1, ".1.gz"))
    read2_new = os.path.join(temp, P.snip(read2, ".2.gz"))

    # paired end list
    list1 = open("cortex_var.dir/read1.txt", "w")
    list2 = open("cortex_var.dir/read2.txt", "w")
    list1.write(read1_new + "\n")
    list2.write(read2_new + "\n")
    list1.close()
    list2.close()

    list1 = os.path.abspath("cortex_var.dir/read1.txt")
    list2 = os.path.abspath("cortex_var.dir/read2.txt")

    reads = ",".join([os.path.join(os.getcwd(), x)
                      for x in [read1_new, read2_new]])

    statement = '''gunzip -c %(read1)s > %(read1_new)s;
                   gunzip -c %(read2)s > %(read2_new)s;
                   cd cortex_var.dir;
                   %%(cortex_var_executable)s
                       %(paired)s %(list1)s,%(list2)s
                       --format %(format)s
                       --mem_height 15
                       --quality_score_threshold %%(cortex_var_qual_threshold)i
                       --remove_pcr_duplicates
                       --remove_low_coverage_supernodes %%(cortex_var_rm_low_coverage_supernodes)i
                       --sample_id %(track)s
                       --kmer_size %%(kmer)s
                       --dump_binary dump_binary.ctx;
                   rm -rf %(temp)s''' % locals()

    return statement
def loadContigLengths(infile, outfile):
    '''load contig lengths.'''
    outname = P.snip(os.path.dirname(infile), ".dir") + \
        "_" + P.snip(os.path.basename(infile), ".tsv") + ".load"
    P.load(infile, outname)
    P.touch(outfile)
def loadContigGCContent(infile, outfile):
    '''load contig GC content.'''
    outname = P.snip(os.path.dirname(infile), ".dir") + \
        "_" + P.snip(os.path.basename(infile), ".tsv") + ".load"
    P.load(infile, outname, "--index=id")
    P.touch(outfile)
def loadContigLengths(infile, outfile):
    '''load contig lengths.'''
    outname = P.snip(os.path.dirname(infile), ".dir") + \
        "_" + P.snip(os.path.basename(infile), ".tsv") + ".load"
    P.load(infile, outname, "--index=scaffold_name")
    P.touch(outfile)
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [x for x in open(fn, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                         --index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):
        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles
                  if os.path.exists("%s.%s" % (x, suffix))]
        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])
        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                           --missing=0
                           %(filenames)s
                       | python %(scriptsdir)s/csv2db.py
                           --header=%(column)s,%(header)s
                           --replace-header
                           --index=track
                           --table=%(tname)s
                       >> %(outfile)s"""
        P.run()

    os.unlink(tmpfilename)
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [x for x in open(fn, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                         --index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):
        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles
                  if os.path.exists("%s.%s" % (x, suffix))]
        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])
        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                           --missing=0
                           %(filenames)s
                       | python %(scriptsdir)s/csv2db.py
                           --header=%(column)s,%(header)s
                           --replace-header
                           --index=track
                           --table=%(tname)s
                       >> %(outfile)s"""
        P.run()

    os.unlink(tmpfilename)
def preprocess(self, infile):
    '''fastq files need to be converted to fasta
    and pairs need to be merged.'''
    mtype = None
    # ensure a value is returned even when no conversion is needed
    statement = None

    # check for paired end data either in the same file or in a
    # separate file for each read - will need to be gunzipped
    # check compression status
    if infile.endswith(".gz"):
        # check for paired data in separate files
        if len(self.checkPairs(infile)) > 1:
            read1 = infile
            read2 = self.checkPairs(infile)[1]
            temp = P.getTempDir()
        elif self.checkPairs(infile) == "interleaved":
            temp = P.getTempDir()
            infile_new = os.path.join(temp, P.snip(infile, ".gz"))
            zippy = """gunzip -c %(infile)s > %(infile_new)s; """ % locals()
    else:
        zippy = ""

    # only need to convert if the data are in fastq format
    # reads are fastq and paired in separate files
    if self.getFormat(infile).find("fastq") != -1 and \
            len(self.checkPairs(infile)) > 1:
        mtype = "--merge"  # argument for conversion tool
    # reads are fastq and in the same file
    elif self.getFormat(infile).find("fastq") != -1 and \
            self.checkPairs(infile) == "interleaved":
        mtype = "--paired"  # argument for conversion tool

    # requires a merge of the fastq files in to fasta format
    if mtype:  # the reads are paired end
        if mtype == "--merge":
            outf = P.snip(os.path.basename(read1), ".fastq.1.gz") + ".fa"

            # check if file exists - metaphlan also performs this
            # preprocessing step
            if not os.path.exists(outf):
                statement = '''python %%(scriptsdir)s/fastqs2fasta.py
                               -a %(read1)s
                               -b %(read2)s
                               --log=%(read1)s.log
                               > %(outf)s''' % locals()
                P.run()
            else:
                E.info("no need to create file %s - exists" % outf)
        elif mtype == "--paired":
            outf = P.snip(os.path.basename(infile_new), ".fastq") + ".fa"
            statement = '''%(zippy)s'''
            P.run()
            statement = '''fq2fa %(mtype)s %(infile_new)s %(outf)s;
                           rm -rf %(temp)s''' % locals()
            P.run()

    return statement
def reMergeBamfiles(infiles, sentinal):
    infiles = [P.snip(x, ".sentinal") + ".bam" for x in infiles]
    outfile = P.snip(sentinal, ".sentinal") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinal)
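# For reference, a minimal sketch of what IDR.filterBadLibraries is
# assumed to do (the real implementation lives in the IDR module):
# drop any bam whose filename contains one of the unwanted sample ids.
def filterBadLibraries(infiles, bad_samples):
    to_merge = []
    for infile in infiles:
        if any(bad in infile for bad in bad_samples):
            continue
        to_merge.append(infile)
    return to_merge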
def build(self, infile):
    track = self.getTrack(infile)
    format = self.getFormat(infile)
    if format.endswith(".gz"):
        format = P.snip(format, ".gz")
    format = format.upper()

    # cortex_var only uses paired end information to
    # remove pcr duplicates
    if not self.checkPairs(infile):
        paired = "--se_list"
        reads = os.path.join(os.getcwd(), infile)
    elif len(self.checkPairs(infile)) > 1:
        paired = "--pe_list"
        read1 = infile
        format = P.snip(format, ".1")
        read2 = self.checkPairs(infile)[1]
    elif self.checkPairs(infile) == "interleaved":
        raise ValueError, "pipeline does not support file of type 'interleaved'"

    # NB: everything below assumes the --pe_list branch was taken;
    # read1/read2 are not defined for single-end input
    temp = P.getTempDir()
    read1_new = os.path.join(temp, P.snip(read1, ".1.gz"))
    read2_new = os.path.join(temp, P.snip(read2, ".2.gz"))

    # paired end list
    list1 = open("cortex_var.dir/read1.txt", "w")
    list2 = open("cortex_var.dir/read2.txt", "w")
    list1.write(read1_new + "\n")
    list2.write(read2_new + "\n")
    list1.close()
    list2.close()

    list1 = os.path.abspath("cortex_var.dir/read1.txt")
    list2 = os.path.abspath("cortex_var.dir/read2.txt")

    reads = ",".join([os.path.join(os.getcwd(), x)
                      for x in [read1_new, read2_new]])

    statement = '''gunzip -c %(read1)s > %(read1_new)s;
                   gunzip -c %(read2)s > %(read2_new)s;
                   cd cortex_var.dir;
                   %%(cortex_var_executable)s
                       %(paired)s %(list1)s,%(list2)s
                       --format %(format)s
                       --mem_height 15
                       --quality_score_threshold %%(cortex_var_qual_threshold)i
                       --remove_pcr_duplicates
                       --remove_low_coverage_supernodes %%(cortex_var_rm_low_coverage_supernodes)i
                       --sample_id %(track)s
                       --kmer_size %%(kmer)s
                       --dump_binary dump_binary.ctx;
                   rm -rf %(temp)s''' % locals()

    return statement
def poolSampleBamfiles(infiles, sentinal):
    """Merge filtered sample files for each tissue."""
    infiles = [P.snip(x, ".sentinal") + ".bam" for x in infiles]
    outfile = P.snip(sentinal, ".sentinal") + ".bam"

    IDR.mergeBams(infiles, outfile)
    P.touch(sentinal)
def buildSpeciesMap(infiles, outfile):
    '''build species map file for input into
    contigs2random_samples.py.'''
    to_cluster = True
    bam = infiles[0]
    contig = [x for x in infiles[1]
              if P.snip(x, ".fa") == P.snip(bam, ".bam")][0]
    statement = '''cat %(contig)s
                   | python %(scriptsdir)s/bam2species_map.py
                         -b %(bam)s
                         --log=%(outfile)s.log
                   > %(outfile)s'''
    P.run()
def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.'''
    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "r").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                         --index=transcript_id
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):
        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = [x for x in gzip.open(statfile, "r").readlines()]
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()
        tmpfilename = outf.name

        statement = '''cat %(tmpfilename)s
                       | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                             --allow-empty
                             --index=transcript_id
                             --table=%(tablename)s_%(suffix)s
                             --ignore-column=seq_na
                             --ignore-column=seq_aa
                       >> %(outfile)s'''
        P.run()
def preprocess(self, infile):
    '''fastq files need to be converted to fasta
    and pairs need to be merged.'''
    mtype = None
    # ensure a value is returned even when no conversion is needed
    statement = None

    # check for paired end data either in the same file or in a
    # separate file for each read - will need to be gunzipped
    # check compression status
    if infile.endswith(".gz"):
        # check for paired data in separate files
        if len(self.checkPairs(infile)) > 1:
            read1 = infile
            read2 = self.checkPairs(infile)[1]
            temp = P.getTempDir()
        elif self.checkPairs(infile) == "interleaved":
            temp = P.getTempDir()
            infile_new = os.path.join(temp, P.snip(infile, ".gz"))
            zippy = """gunzip -c %(infile)s > %(infile_new)s; """ % locals()
    else:
        zippy = ""

    # only need to convert if the data are in fastq format
    # reads are fastq and paired in separate files
    if self.getFormat(infile).find("fastq") != -1 and \
            len(self.checkPairs(infile)) > 1:
        mtype = "--merge"  # argument for conversion tool
    # reads are fastq and in the same file
    elif self.getFormat(infile).find("fastq") != -1 and \
            self.checkPairs(infile) == "interleaved":
        mtype = "--paired"  # argument for conversion tool

    # requires a merge of the fastq files in to fasta format
    if mtype:  # the reads are paired end
        if mtype == "--merge":
            outf = P.snip(os.path.basename(read1), ".fastq.1.gz") + ".fa"

            # check if file exists - metaphlan also performs this
            # preprocessing step
            if not os.path.exists(outf):
                statement = '''python %%(scriptsdir)s/fastqs2fasta.py
                               -a %(read1)s
                               -b %(read2)s
                               --log=%(read1)s.log
                               > %(outf)s''' % locals()
                P.run()
            else:
                E.info("no need to create file %s - exists" % outf)
        elif mtype == "--paired":
            outf = P.snip(os.path.basename(infile_new), ".fastq") + ".fa"
            statement = '''%(zippy)s'''
            P.run()
            statement = '''fq2fa %(mtype)s %(infile_new)s %(outf)s;
                           rm -rf %(temp)s''' % locals()
            P.run()

    return statement
def preprocess(self, infile):
    '''fastq files need to be converted to fasta
    and pairs need to be merged.'''
    mtype = None

    # check for paired end data either in the same file or in a
    # separate file for each read - will need to be gunzipped
    # check compression status
    if infile.endswith(".gz"):
        # check for paired data in separate files
        if len(self.checkPairs(infile)) > 1:
            read1 = infile
            read2 = self.checkPairs(infile)[1]
            temp = P.getTempDir()
            read1_new = os.path.join(temp, P.snip(infile, ".gz"))
            read2_new = os.path.join(
                temp, P.snip(self.checkPairs(infile)[1], ".gz"))
            zippy = """gunzip -c %(read1)s > %(read1_new)s;
                       gunzip -c %(read2)s > %(read2_new)s; """ % locals()
        elif self.checkPairs(infile) == "interleaved":
            temp = P.getTempDir()
            infile_new = os.path.join(temp, P.snip(infile, ".gz"))
            zippy = """gunzip -c %(infile)s > %(infile_new)s; """ % locals()
    else:
        zippy = ""

    # only need to convert if the data are in fastq format
    # reads are fastq and paired in separate files
    if self.getFormat(infile).find("fastq") != -1 and \
            len(self.checkPairs(infile)) > 1:
        mtype = "--merge"  # argument for conversion tool
    # reads are fastq and in the same file
    elif self.getFormat(infile).find("fastq") != -1 and \
            self.checkPairs(infile) == "interleaved":
        mtype = "--paired"  # argument for conversion tool

    # build statement
    if mtype:  # the reads are paired end
        if mtype == "--merge":
            outf = P.snip(os.path.basename(read1_new), ".fastq.1") + ".fa"
            statement = '''%(zippy)s
                           fq2fa %(mtype)s %(read1_new)s %(read2_new)s %(outf)s
                           ''' % locals()
        elif mtype == "--paired":
            outf = P.snip(os.path.basename(infile_new), ".fastq") + ".fa"
            statement = '''%(zippy)s
                           fq2fa %(mtype)s %(infile_new)s %(outf)s;
                           rm -rf %(temp)s''' % locals()
    else:
        statement = None

    return statement
def plotCoverageHistogram(infile, outfile):
    '''plot the coverage over kmers.'''
    inf = P.snip(infile, ".contigs.fa") + ".stats.txt"
    outf = P.snip(inf, ".txt") + ".pdf"
    R('''library(plotrix)''')
    R('''data = read.table("%s", header=TRUE)''' % inf)
    R('''pdf("%s", height=7, width=7)''' % outf)
    R('''weighted.hist(data$short1_cov, data$lgth,
         breaks=seq(0, 200, by=1))''')
    R["dev.off"]()
def alignmentTargets(genome_files, contig_files):
    '''generator object to produce filenames for aligning contigs to
    known ncbi genomes.'''
    parameters = []
    for genome, contig in itertools.product(genome_files, contig_files):
        outfile = os.path.join(
            "alignment.dir",
            P.snip(contig, ".contigs.fa") + "_vs_" +
            P.snip(os.path.basename(genome), ".fna")) + ".delta"
        parameters.append([genome, outfile, contig])
    return parameters
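# Example (hedged): feeding the generated [genome, outfile, contig]
# triples to a ruffus @files task. The glob patterns and the nucmer
# call are illustrative assumptions; only the parameter layout comes
# from alignmentTargets itself.
GENOMES = glob.glob("genomes.dir/*.fna")
CONTIGS = glob.glob("*.contigs.fa")

@files(alignmentTargets(GENOMES, CONTIGS))
def alignContigsToGenome(genome, outfile, contig):
    # nucmer writes <prefix>.delta, matching the ".delta" outfile
    prefix = P.snip(outfile, ".delta")
    statement = '''nucmer -p %(prefix)s %(genome)s %(contig)s'''
    P.run()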
def summarizeProcessing(infile, outfile):
    '''build processing summary.'''

    def _parseLog(inf, step):
        inputs, outputs = [], []
        if step == "reconcile":
            for line in inf:
                x = re.search(
                    r"first pair: (\d+) reads, second pair: (\d+) reads, "
                    r"shared: (\d+) reads", line)
                if x:
                    i1, i2, o = map(int, x.groups())
                    inputs = [i1, i2]
                    outputs = [o, o]
                    break
        elif step == "contaminants":
            lines = inf.readlines()
            assert lines[0].startswith("cutadapt")
            lines = "@@@".join(lines)
            for part in lines.split("cutadapt")[1:]:
                results, adapters = parseCutadapt(
                    ("cutadapt" + part).split("@@@"))
                inputs.append(results["processed_reads"])
                outputs.append(results["unchanged_reads"])
        else:
            for line in inf:
                if line.startswith("Input:"):
                    inputs.append(
                        int(re.match(r"Input: (\d+) reads.",
                                     line).groups()[0]))
                elif line.startswith("Output:"):
                    outputs.append(
                        int(re.match(r"Output: (\d+) reads.",
                                     line).groups()[0]))
        return zip(inputs, outputs)

    infile2 = checkPairs(infile)
    if infile2:
        track = P.snip(infile, ".fastq.1.gz")
    else:
        track = P.snip(infile, ".fastq.gz")

    outf = IOTools.openFile(outfile, "w")
    outf.write("track\tstep\tpair\tinput\toutput\n")

    for step in "contaminants", "artifacts", "trim", "filter", "reconcile":
        fn = infile + "_%s.log" % step
        if not os.path.exists(fn):
            continue
        for x, v in enumerate(_parseLog(IOTools.openFile(fn), step)):
            outf.write("%s\t%s\t%i\t%i\t%i\n" %
                       (track, step, x, v[0], v[1]))

    outf.close()
def splitPooledBamfiles(infile, sentinel):
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)
    P.touch(sentinel)
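# The try/except above relies on P.snip raising ValueError when the
# suffix does not match (IDR.__file__ ends in ".pyc" for compiled
# modules). A minimal sketch of the assumed behaviour:
def snip(filename, suffix):
    if not filename.endswith(suffix):
        raise ValueError("'%s' does not end in '%s'" % (filename, suffix))
    return filename[:-len(suffix)]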
def findNPeaksForIndividualReplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_inter_replicate_threshold"]
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
def buildSpeciesMap(infiles, outfile):
    '''build species map file for input into
    contigs2random_samples.py.'''
    to_cluster = True
    bam = infiles[0]
    contig = [x for x in infiles[1]
              if P.snip(x, ".fa") == P.snip(bam, ".bam")][0]
    statement = '''cat %(contig)s
                   | python %(scriptsdir)s/bam2species_map.py
                         -b %(bam)s
                         --log=%(outfile)s.log
                   > %(outfile)s'''
    P.run()
def findNPeaksForPooledPseudoreplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_pooled_consistency_threshold"]
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
def summarizeProcessing(infile, outfile):
    '''build processing summary.'''

    def _parseLog(inf, step):
        inputs, outputs = [], []
        if step == "reconcile":
            for line in inf:
                x = re.search(
                    r"first pair: (\d+) reads, second pair: (\d+) reads, "
                    r"shared: (\d+) reads", line)
                if x:
                    i1, i2, o = map(int, x.groups())
                    inputs = [i1, i2]
                    outputs = [o, o]
                    break
        elif step == "contaminants":
            lines = inf.readlines()
            assert lines[0].startswith("cutadapt")
            lines = "@@@".join(lines)
            for part in lines.split("cutadapt")[1:]:
                results, adapters = parseCutadapt(
                    ("cutadapt" + part).split("@@@"))
                inputs.append(results["processed_reads"])
                outputs.append(results["unchanged_reads"])
        else:
            for line in inf:
                if line.startswith("Input:"):
                    inputs.append(
                        int(re.match(r"Input: (\d+) reads.",
                                     line).groups()[0]))
                elif line.startswith("Output:"):
                    outputs.append(
                        int(re.match(r"Output: (\d+) reads.",
                                     line).groups()[0]))
        return zip(inputs, outputs)

    infile2 = checkPairs(infile)
    if infile2:
        track = P.snip(infile, ".fastq.1.gz")
    else:
        track = P.snip(infile, ".fastq.gz")

    outf = IOTools.openFile(outfile, "w")
    outf.write("track\tstep\tpair\tinput\toutput\n")

    for step in "contaminants", "artifacts", "trim", "filter", "reconcile":
        fn = infile + "_%s.log" % step
        if not os.path.exists(fn):
            continue
        for x, v in enumerate(_parseLog(IOTools.openFile(fn), step)):
            outf.write("%s\t%s\t%i\t%i\t%i\n" %
                       (track, step, x, v[0], v[1]))

    outf.close()
def poolInputBamfiles(infiles, sentinal):
    """Merge filtered input files for each tissue, with the option
    of excluding undesirable libraries.
    """
    infiles = [P.snip(x, ".sentinal") + ".bam" for x in infiles]
    outfile = P.snip(sentinal, ".sentinal") + ".bam"
    bad_samples = PARAMS["filter_remove_inputs"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinal)
def loadLCA(infile, outfile):
    '''load LCA results.'''
    tablename = P.snip(os.path.dirname(infile), ".dir") + \
        "_" + os.path.basename(P.snip(infile, ".gz"))
    tablename = P.toTable(tablename + ".load")
    statement = '''zcat %(infile)s
                   | python %(scriptsdir)s/csv2db.py
                         -t %(tablename)s
                         --index=id
                         --log=%(outfile)s.log
                   > %(outfile)s'''
    P.run()
def alignmentTargets(genome_files, contig_files):
    '''generator object to produce filenames for aligning contigs to
    known ncbi genomes.'''
    parameters = []
    for genome, contig in itertools.product(genome_files, contig_files):
        outfile = os.path.join(
            "alignment.dir",
            P.snip(contig, ".contigs.fa") + "_vs_" +
            P.snip(os.path.basename(genome), ".fna")) + ".delta"
        parameters.append([genome, outfile, contig])
    return parameters
def splitPooledBamfiles(infile, sentinal):
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    params = '2'
    module = P.snip(IDR.__file__, ".py")

    P.submit(module, "splitBam", params, infile, outfile)
    P.touch(sentinal)
def splitPooledBamfiles(infile, sentinal):
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    params = '2'
    module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)
    P.touch(sentinal)
def findNPeaksForPooledPseudoreplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_pooled_consistency_threshold"]
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
def linkBamToWorkingDirs(infiles, outfile):
    '''symlink the bam file and index to the working directories
    for execution of the transcript building pipeline.'''
    bamfile = P.snip(infiles[0], ".bai")
    indexfile = infiles[0]
    directories = [P.snip(logfile, ".log") for logfile in infiles[1]]

    for directory in directories:
        os.symlink(os.path.abspath(bamfile),
                   os.path.join(directory, bamfile))
        os.symlink(os.path.abspath(indexfile),
                   os.path.join(directory, indexfile))
    updateFile(outfile)
def loadEdgeR(infile, outfile):
    '''load EdgeR per-chunk summary stats.'''
    for fn in glob.glob(infile + "*_summary.tsv"):
        prefix = P.snip(fn[len(infile) + 1:], "_summary.tsv")
        # note: the load target keeps the historical .deseq_summary suffix
        P.load(fn,
               prefix + ".deseq_summary.load",
               collapse=0,
               transpose="sample")
    P.touch(outfile)
def findNPeaksForIndividualReplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_inter_replicate_threshold"]
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
def loadCufflinks(infile, outfile):
    '''load expression level measurements.'''
    track = P.snip(outfile, ".load")
    P.load(infile + ".genes_tracking.gz",
           outfile=track + "_genefpkm.load",
           options="--index=gene_id "
                   "--ignore-column=tracking_id "
                   "--ignore-column=class_code "
                   "--ignore-column=nearest_ref_id")

    P.load(infile + ".fpkm_tracking.gz",
           outfile=track + "_fpkm.load",
           options="--index=tracking_id "
                   "--ignore-column=nearest_ref_id "
                   "--rename-column=tracking_id:transcript_id")

    P.touch(outfile)
def compareAbundanceOfFalsePositiveSpecies(infiles, outfile):
    '''boxplot the relative abundance of false positive species
    compared to true positives.'''
    tablename_estimate = P.toTable(infiles[0])
    track = P.snip(
        os.path.basename(infiles[0]).replace("metaphlan_", ""), ".load")
    tablename_true = [P.toTable(x) for x in infiles[1:]
                      if P.snip(os.path.basename(x), ".load") == track][0]

    dbh = sqlite3.connect("csvdb")
    cc = dbh.cursor()

    tmp = P.getTempFile(".")
    tmp.write("taxa\tabundance\tstatus\n")

    estimate = {}
    true = set()
    for data in cc.execute("""SELECT taxon, rel_abundance
                              FROM %s
                              WHERE taxon_level == 'species'
                           """ % tablename_estimate).fetchall():
        estimate[data[0]] = data[1]
    for data in cc.execute("""SELECT taxa
                              FROM %s
                              WHERE level == 'species'
                           """ % tablename_true).fetchall():
        true.add(data[0])

    for taxa, abundance in estimate.iteritems():
        if taxa in true:
            tmp.write("%s\t%f\ttp\n" % (taxa, abundance))
        else:
            tmp.write("%s\t%f\tfp\n" % (taxa, abundance))
    tmp.close()

    inf = tmp.name
    if track.find("15M") != -1:
        col = "cadetblue"
    elif track.find("30M") != -1:
        col = "lightblue"
    elif track.find("50M") != -1:
        col = "slategray"
    else:
        # fall back to a neutral colour rather than leave col undefined
        col = "grey"

    R('''dat <- read.csv("%s", header=TRUE, stringsAsFactors=FALSE,
         sep="\t")''' % inf)
    R('''library(ggplot2)''')
    # geom_hline takes "yintercept", not "yintersect"
    R('''ggplot(dat, aes(x=status, y=log2(abundance))) +
         geom_boxplot(colour="%s") +
         geom_hline(yintercept=0, linetype="dashed")''' % col)
    R('''ggsave("%s")''' % outfile)

    os.unlink(inf)
def splitBamfiles(infile, sentinel):
    """For all tracks, split the filtered bamfile in two using pysam."""
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)
    P.touch(sentinel)