def removeBamfiles(infiles, outfile):
    '''remove bamfiles, and their indices, once they are no longer needed.'''
    for bamfile in infiles:
        bam_index = bamfile + ".bai"
        os.unlink(bamfile)
        if os.path.exists(bam_index):
            os.unlink(bam_index)
    P.touch(outfile)
def splitFiles(infile, outfile):
    '''
    Arbitrarily split files into chunks for parallelisation
    '''
    Timeseries.splitFiles(infile=infile,
                          nchunks=PARAMS['resampling_chunks'],
                          out_dir="parallel_files.dir")
    P.touch(outfile)
def reMergeBamfiles(infiles, sentinel):
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinel)
def calculateM3DSpikeClustersPvalue(infiles, outfile):
    job_options = "-l mem_free=4G -pe dedicated 1"
    design = infiles[-1]
    infiles = infiles[:-1]
    RRBS.calculateM3DSpikepvalue(infiles, outfile, design,
                                 submit=True, job_options=job_options)
    P.touch(outfile)
def poolSampleBamfiles(infiles, sentinel):
    """
    Merge filtered sample files for each tissue
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    IDR.mergeBams(infiles, outfile)
    P.touch(sentinel)
def callPeaksOnPooledReplicates(infile, outfile):
    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks on pooled replicates
    IDR.callIDRPeaks(infile,
                     outfile,
                     PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"],
                     PARAMS_PEAKCALLER,
                     pseudoreplicate=False)

    P.touch(outfile)
def callPeaksOnIndividualReplicates(infile, outfile):
    infile = P.snip(infile, ".sentinel") + ".bam"

    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks
    IDR.callIDRPeaks(infile,
                     outfile,
                     PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"],
                     PARAMS_PEAKCALLER)

    P.touch(outfile)
def genReplicateData(infile, outfile):
    '''
    Split each replicate into a separate file for clustering
    within each replicate.  Relies on each replicate being the
    same across the whole time series.
    '''
    outdir = outfile.split("/")[0]
    Timeseries.splitReplicates(infile=infile,
                               axis="column",
                               group_var="replicates",
                               outdir=outdir)

    P.touch(outfile)
def splitPooledBamfiles(infile, sentinel):
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    # split each bamfile in two
    params = '2'
    # P.snip raises ValueError if the suffix does not match, so fall
    # back to ".pyc" when IDR was imported from compiled bytecode
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)

    P.touch(sentinel)
def poolInputBamfiles(infiles, sentinel):
    """
    Merge filtered input files for each tissue, with the option of
    excluding undesirable libraries.
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["filter_remove_inputs"].split(",")

    if len(infiles) > 1:
        to_merge = IDR.filterBadLibraries(infiles, bad_samples)
        IDR.mergeBams(to_merge, outfile)
    else:
        os.symlink(os.path.abspath(infiles[0]), outfile)
        os.symlink(os.path.abspath(infiles[0]) + ".bai", outfile + ".bai")

    P.touch(sentinel)
def timePointDiffExpression(infile, outfile):
    '''
    Within each condition, test for genes differentially expressed
    against the baseline time point.  Uses DESeq2.
    '''
    statement = '''
    cgat timeseries2diffgenes
    --log=%(outfile)s.log
    --method=timepoint
    --alpha=%(deseq_alpha)s
    --results-directory=diff_timepoints.dir
    %(infile)s
    '''
    P.run()
    P.touch(outfile)
def summariseReadStart(infile, outfile):
    # this only works for fastq files; it fails with .sra files.
    # This function and the next section should be replaced with a call
    # to fastq-dump if the file ends with .sra, followed by the functions
    # of the fastq module to count the first bases in the fastq records.
    # For now, create an empty outfile.
    if infile.endswith(".sra"):
        P.touch(outfile)
    else:
        statement = '''zcat %(infile)s
        | paste - - - - | cut -f2 | cut -c1-3
        | sort | uniq -c | sort -nk1
        | awk -F' ' 'BEGIN {total=0; sum=0}
          {total+=$1; OFS="\\t";
           if ($2=="CGG"||$2=="TGG"||$2=="CGA"||$2=="TGA")
               {sum+=$1; print $1, $2}}
          END {print total-sum, "others"}'
        > %(outfile)s ''' % locals()
        P.run()
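# A hedged sketch of the fastq-based replacement suggested in the comment
# above: count read-start trinucleotides in pure Python instead of piping
# through awk. `count_read_starts` is a hypothetical helper (it is not part
# of the fastq module) and assumes a plain gzipped fastq file; .sra input
# would still need fastq-dump first.
def count_read_starts(fastq_path, n_bases=3):
    '''return a collections.Counter of the first `n_bases` of each read.'''
    import collections
    import gzip

    counts = collections.Counter()
    with gzip.open(fastq_path, "rt") as handle:
        for i, line in enumerate(handle):
            # the sequence is the second line of every 4-line fastq record
            if i % 4 == 1:
                counts[line[:n_bases]] += 1
    return counts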
def loadGeneSummary(infile, outfile):
    '''summarize binding information per gene.'''

    dbh = connect()
    table = P.toTable(outfile)

    cc = dbh.cursor()
    cc.execute("""DROP TABLE IF EXISTS %(table)s """ % locals())
    cc.execute("""CREATE TABLE %(table)s AS
    SELECT gene_id,
           SUM(tata) AS tata,
           SUM(cpg) AS cpg
    FROM promotorinfo_transcripts AS p,
         annotations.transcript_info AS i
    WHERE i.transcript_id = p.transcript_id
    GROUP BY gene_id""" % locals())
    cc.close()

    P.touch(outfile)
def splitBamfiles(infile, sentinel):
    """
    For all tracks, split the filtered bamfile in two using pysam
    """
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    # as above, fall back to ".pyc" if IDR was imported from bytecode
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)

    P.touch(sentinel)
def subtractBedFiles(infile, subtractfile, outfile):
    '''subtract intervals in *subtractfile* from *infile*
    and store in *outfile*.
    '''

    if iotools.isEmpty(subtractfile):
        shutil.copyfile(infile, outfile)
        return
    elif iotools.isEmpty(infile):
        P.touch(outfile)
        return

    statement = '''
    intersectBed -v -a %(infile)s -b %(subtractfile)s
    | cut -f 1,2,3,4,5
    | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
    | bgzip > %(outfile)s ;
    tabix -p bed %(outfile)s
    '''
    P.run()
def conditionDiffExpression(infile, outfile):
    '''
    Call genes whose expression differs significantly based on the
    interaction terms between condition and time point.  Uses DESeq2.
    '''
    job_options = "-l mem_free=4G"

    statement = '''
    zcat %(infile)s |
    cgat timeseries2diffgenes
    --log=%(outfile)s.log
    --method=condition
    --alpha=%(deseq_alpha)s
    --results-directory=diff_condition.dir
    '''
    P.run()
    P.touch(outfile)
def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within a file are
    merged) before intersection.

    Intervals are renumbered starting from 1.
    '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:
        if iotools.isEmpty(infiles[0]) or iotools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = '''
            intersectBed -u -a %s -b %s
            | cut -f 1,2,3,4,5
            | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
            | bgzip > %%(outfile)s
            ''' % (infiles[0], infiles[1])
            P.run()

    else:
        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if iotools.isEmpty(fn):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            if iotools.isEmpty(fn):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s
            | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp;
            mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip
        > %(outfile)s '''
        P.run()

        os.unlink(tmpfile)
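# Hypothetical usage of intersectBedFiles, for orientation: intersect three
# replicate peak sets so that only intervals present in all three files
# survive, with coordinates taken from the first file. File names are
# illustrative only.
#
#   intersectBedFiles(["replicate1.bed.gz",
#                      "replicate2.bed.gz",
#                      "replicate3.bed.gz"],
#                     "shared_intervals.bed.gz")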
def genResampleData(infile, outfile):
    '''
    Resample the data n times with replacement - generates n flat files
    which are then propagated at later stages.  Files are generally
    small though.
    '''
    time_agg = list(TIME.__dict__['track2groups'].keys())
    time_points = [int(str(x).split("-")[1]) for x in time_agg]
    time_points.sort()
    time_points = list(set(time_points))
    rep_agg = list(REPLICATE.__dict__['track2groups'].keys())
    replicates = [str(x).split("-")[2] for x in rep_agg]

    time_rep_comb = [x for x in itertools.product(time_points, replicates)]
    time_cond = ro.StrVector([x[0] for x in time_rep_comb])
    rep_cond = ro.StrVector([x[1] for x in time_rep_comb])

    ref_gtf = str(infile).split("-")[1]
    condition = (str(infile).split("-")[0]).strip("deseq.dir/")

    time_points = ",".join([str(i) for i in time_points])
    replicates = ",".join(replicates)

    statement = '''
    cgat data2resamples
    --log=%(outfile)s.log
    --time=%(time_points)s
    --replicates=%(replicates)s
    --condition=%(condition)s
    --resamples=%(resampling_resample)s
    --input-gtf=%(ref_gtf)s
    --output-file-directory=clustering.dir
    --seed=%(resampling_seed)s
    %(infile)s
    '''
    P.run()
    P.touch(outfile)
def summarizeEffectsPerGene(infile, outfile):
    '''summarize effects on a per-gene level.'''

    tablename = outfile[:-len(".load")]
    track = infile[:-len("_effects.load")]

    dbhandle = connect()

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT DISTINCT
        gene_id,
        COUNT(*) AS ntranscripts,
        MIN(e.nalleles) AS min_nalleles,
        MAX(e.nalleles) AS max_nalleles,
        MIN(e.stop_min) AS min_stop_min,
        MAX(e.stop_min) AS max_stop_min,
        MIN(e.stop_max) AS min_stop_max,
        MAX(e.stop_max) AS max_stop_max,
        SUM(CASE WHEN stop_min > 0 AND
                      cds_len - stop_min * 3 < last_exon_start
            THEN 1 ELSE 0 END) AS nmd_knockout,
        SUM(CASE WHEN stop_max > 0 AND
                      cds_len - stop_max * 3 < last_exon_start
            THEN 1 ELSE 0 END) AS nmd_affected
    FROM annotations.transcript_info AS i,
         %(track)s_effects AS e
    WHERE i.transcript_id = e.transcript_id
    GROUP BY i.gene_id
    ''' % locals()

    Database.executewait(
        dbhandle, "DROP TABLE IF EXISTS %(tablename)s" % locals())
    Database.executewait(dbhandle, statement)
    Database.executewait(
        dbhandle,
        "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)"
        % locals())
    dbhandle.commit()

    P.touch(outfile)
def makeSummaryPlots(infile, outfile):
    job_options = "-l mem_free=48G"

    RRBS.summaryPlots(infile, outfile,
                      submit=True, job_options=job_options)
    P.touch(outfile)
def loadSummary(infile, outfile):
    '''load several rates into a single convenience table.
    '''
    stmt_select = []
    stmt_from = []
    stmt_where = ["1"]

    track = infile[:-len(".gtf.gz")]
    tablename = "%s_evol" % track

    if os.path.exists("%s_rates.load" % track):
        stmt_select.append("a.distance AS ks, a.aligned AS aligned")
        stmt_from.append('''LEFT JOIN %(track)s_rates AS a
        ON r.gene_id = a.gene_id AND
        a.aligned >= %(rates_min_aligned)i AND
        a.distance <= %(rates_max_rate)f''')

    if os.path.exists("%s_coverage.load" % track):
        stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage")
        stmt_from.append(
            "LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id")

    if os.path.exists("%s_repeats_gc.load" % track):
        stmt_select.append("ar_gc.exons_mean AS repeats_gc")
        stmt_from.append(
            "LEFT JOIN %(track)s_repeats_gc AS ar_gc "
            "ON r.gene_id = ar_gc.gene_id")

    if os.path.exists("%s_repeats_rates.load" % track):
        stmt_select.append(
            "ar.exons_length AS ar_aligned, "
            "ar.exons_median AS ka, "
            "a.distance/ar.exons_median AS kska")
        stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar
        ON r.gene_id = ar.gene_id AND
        ar.exons_nval >= %(rates_min_repeats)i''')

    if os.path.exists("%s_introns_rates.load" % track):
        stmt_select.append(
            "ir.aligned AS ir_aligned, "
            "ir.distance AS ki, "
            "a.distance/ir.distance AS kski")
        stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir
        ON r.gene_id = ir.gene_id AND
        ir.aligned >= %(rates_min_aligned)i''')

    x = locals()
    x.update(PARAMS)

    stmt_select = ", ".join(stmt_select) % x
    stmt_from = " ".join(stmt_from) % x
    stmt_where = " AND ".join(stmt_where) % x

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    Database.executewait(
        dbhandle, "DROP TABLE IF EXISTS %(tablename)s " % locals())

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT
        CAST(r.gene_id AS TEXT) AS gene_id,
        r.exons_sum AS length,
        r.exons_pGC AS pgc,
        %(stmt_select)s
    FROM
        %(track)s_annotation AS r
        %(stmt_from)s
    WHERE %(stmt_where)s
    ''' % locals()

    Database.executewait(dbhandle, statement)
    dbhandle.commit()
    P.touch(outfile)
def loadMACS(infile, outfile, bamfile, tablename=None):
    '''load MACS results into *tablename*.

    This method loads only positive peaks.  It filters peaks by p-value,
    q-value and fold change, loads the diagnostic data and re-calculates
    peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`,
    where track is derived from ``infile`` and assumed to end in
    :file:`.macs`.

    This method creates two optional additional files:

    * if the file :file:`<track>_diag.xls` is present, load MACS
      diagnostic data into the table :file:`<track>_macsdiag`.

    * if the file :file:`<track>_model.r` is present, call R to create a
      MACS peak-shift plot and save it as :file:`<track>_model.pdf` in
      the :file:`export/MACS` directory.

    This method creates :file:`<outfile>.tsv.gz` with the results of the
    filtering.
    '''
    track = P.snip(os.path.basename(infile), ".macs")
    folder = os.path.dirname(infile)

    if len(folder) > 0:
        infilename = folder + "/" + track + "_peaks.xls"
        filename_diag = folder + "/" + track + "_diag.xls"
        filename_r = folder + "/" + track + "_model.r"
        filename_rlog = folder + "/" + track + ".r.log"
        filename_pdf = track + "_model.pdf"
    else:
        infilename = track + "_peaks.xls"
        filename_diag = track + "_diag.xls"
        filename_r = track + "_model.r"
        filename_rlog = track + ".r.log"
        filename_pdf = track + "_model.pdf"

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
        P.touch(outfile)
        return

    # create plot by calling R
    if os.path.exists(filename_r):
        if len(folder) > 0:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s;
            mv %(filename_pdf)s %(folder)s/%(filename_pdf)s; '''
        else:
            statement = '''R --vanilla < %(filename_r)s
            > %(filename_rlog)s; '''
        P.run()

    # filter peaks
    shift = getPeakShiftFromMacs(infile)
    assert shift is not None, \
        "could not determine peak shift from MACS file %s" % infile

    E.info("%s: found peak shift of %i" % (track, shift))

    samfiles = [pysam.Samfile(bamfile, "rb")]
    # integer division: offsets must be integral read positions
    offsets = [shift // 2]

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval", "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")

    id = 0

    # get thresholds
    max_qvalue = float(PARAMS["macs_max_qvalue"])
    # min, as it is -10log10
    min_pvalue = float(PARAMS["macs_min_pvalue"])

    counter = E.Counter()
    with iotools.openFile(infilename, "r") as ins:
        for peak in WrapperMACS.iteratePeaks(ins):

            if peak.fdr > max_qvalue:
                counter.removed_qvalue += 1
                continue
            elif peak.pvalue < min_pvalue:
                counter.removed_pvalue += 1
                continue

            assert peak.start < peak.end

            npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(
                peak.contig, peak.start, peak.end, samfiles, offsets)

            outtemp.write("\t".join(map(str, (
                id, peak.contig, peak.start, peak.end,
                npeaks, peakcenter, length, avgval, peakval, nreads,
                peak.pvalue, peak.fold, peak.fdr,
                peak.start + peak.summit - 1,
                peak.tags))) + "\n")
            id += 1
            counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = iotools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_macs_intervals" % track

    statement = '''cgat csv2db %(csv2db_options)s
    --allow-empty-file
    --add-index=interval_id
    --add-index=contig,start
    --table=%(tablename)s
    < %(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)

    # load diagnostic data
    if os.path.exists(filename_diag):
        tablename = "%s_macsdiag" % track

        statement = '''
        cat %(filename_diag)s
        | sed "s/FC range.*/fc\\tnpeaks\\tp90\\tp80\\tp70\\tp60\\tp50\\tp40\\tp30\\tp20/"
        | cgat csv2db %(csv2db_options)s
              --map=fc:str
              --table=%(tablename)s
        >> %(outfile)s
        '''
        P.run()
def plotHeatmap(results, norm_matrix, threshold_stat, p_threshold,
                fc_threshold, outfile):
    '''
    plot heatmap of differentially abundant genes
    '''
    if threshold_stat == "p":
        p = "P.Value"
    elif threshold_stat == "padj":
        p = "adj.P.Val"
    else:
        p = "adj.P.Val"

    temp = P.getTempFilename(".")

    R('''library(gplots)''')
    R('''library(gtools)''')

    E.info("reading data")
    R('''mat <- read.csv("%s",
    header=T, stringsAsFactors=F, sep="\t")''' % norm_matrix)
    R('''rownames(mat) <- mat$taxa
    mat <- as.matrix(mat[, 1:ncol(mat) - 1])''')
    R('''dat <- read.csv("%s",
    header=T, stringsAsFactors=F, sep="\t")''' % results)
    E.info("data loaded")

    R('''t <- dat$taxa[dat$%s < %f & abs(dat$logFC) > %f]''' %
      (p, p_threshold, fc_threshold))
    R('''diff.genes <- unique(t)''')

    # this is a hack to avoid errors when only a single differentially
    # abundant feature is found
    R('''write.table(diff.genes,
    file="%s", row.names=F, sep="\t")''' % temp)

    tmp = open(temp)
    tmp.readline()
    if len(tmp.readlines()) == 1:
        P.touch(outfile)
    else:
        R('''mat <- mat[as.character(diff.genes), ]
        samples <- colnames(mat)
        mat <- as.data.frame(t(apply(mat, 1, scale)))
        colnames(mat) <- samples
        mat <- mat[, mixedsort(colnames(mat))]
        colours = colorRampPalette(c("blue", "white", "red"))(75)
        pdf("%s", height=12, width=12)
        heatmap.2(as.matrix(mat),
                  trace="none",
                  scale="none",
                  col=colours,
                  Colv=F,
                  dendrogram="row",
                  margins=c(18, 18))
        dev.off()''' % outfile)

    os.unlink(temp)
def filterBamfiles(infile, sentinel):
    """
    Pre-process bamfiles prior to peak calling.
    i) sort bamfiles
    ii) remove unmapped reads with bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picard's MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index
    """
    # create tempfile for Picard's MarkDuplicates
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted
    statement = ["samtools sort @IN@ -o @[email protected]", ]

    # remove unmapped reads
    statement.append("cgat bam2bam"
                     " --method=filter --filter-method=mapped"
                     " --log=%(outfile)s.log"
                     " < @[email protected]"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("cgat bam2bam"
                         " --method=filter --filter-method=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals are supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_memory = "5G"
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinel)
    shutil.rmtree(picard_tmp)
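# For orientation: a minimal sketch of the @IN@/@OUT@ chaining used above,
# assuming P.joinStatements threads each step's output into the next step's
# input via intermediate files. This is an illustration of the idea only,
# not CGAT's actual implementation (which also handles suffixed
# placeholders such as @[email protected]).
def join_statements_sketch(statements, infile):
    '''chain shell fragments, wiring @OUT@ of one step to @IN@ of the next.'''
    joined = []
    current_input = infile
    for i, step in enumerate(statements):
        tmp_output = "%s.step%i" % (infile, i)
        joined.append(step.replace("@IN@", current_input)
                          .replace("@OUT@", tmp_output))
        current_input = tmp_output
    return "; ".join(joined)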