def summarizeMACSFDR(infiles, outfile): '''compile table with peaks that would remain after filtering by fdr. ''' fdr_thresholds = numpy.arange(0, 1.05, 0.05) outf = iotools.openFile(outfile, "w") outf.write("track\t%s\n" % "\t".join(map(str, fdr_thresholds))) for infile in infiles: called = [] track = P.snip(os.path.basename(infile), ".macs") infilename = infile + "_peaks.xls.gz" inf = iotools.openFile(infilename) peaks = list(WrapperMACS.iteratePeaks(inf)) for threshold in fdr_thresholds: called.append(len([x for x in peaks if x.fdr <= threshold])) outf.write("%s\t%s\n" % (track, "\t".join(map(str, called)))) outf.close()
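# Illustrative sketch (not part of the pipeline): the core of
# summarizeMACSFDR is a cumulative count of peaks passing each FDR
# threshold. The per-peak FDR values below are hypothetical.
def _example_fdr_counts():
    import numpy
    fdr_thresholds = numpy.arange(0, 1.05, 0.05)
    peak_fdrs = [0.01, 0.04, 0.20, 0.75]  # hypothetical per-peak FDRs
    called = [len([x for x in peak_fdrs if x <= t]) for t in fdr_thresholds]
    # called[0] == 0 (threshold 0.0), called[1] == 2 (threshold 0.05), ...
    return list(zip(fdr_thresholds, called))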
def loadTranscriptomeValidation(infiles, outfile): '''load transcriptome validation data into database.''' to_cluster = USECLUSTER headers = ",".join( [P.tablequote(P.snip(x, ".accepted.bam")) for x in infiles]) infiles = " ".join(["%s.log" % x for x in infiles]) tablename = P.toTable(outfile) statement = ''' cgat combine_tables --header-names=%(headers)s %(infiles)s | cgat table2table --transpose | perl -p -e "s/bin/track/" | cgat csv2db --table=%(tablename)s > %(outfile)s ''' P.run()
def makeSalmonIndex(infile, outfile): # Long transcripts cause indexing to use lots of memory? job_memory = "64G" job_threads = 1 gtf_basename = P.snip(os.path.basename(infile), ".gtf.gz") transcript_fasta = "salmon_index/" + gtf_basename + "transcripts.fa" fastaref = PARAMS["portcullis_fastaref"] index_options = PARAMS["salmon_indexoptions"] tmpfile = P.get_temp_filename() statement = ''' gunzip -c %(infile)s > %(tmpfile)s; gffread %(tmpfile)s -g %(fastaref)s -w %(transcript_fasta)s; salmon index -p %(job_threads)s %(index_options)s -t %(transcript_fasta)s -i %(outfile)s --perfectHash; rm %(tmpfile)s ''' P.run(statement)
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"): '''export sequences for intervals in :term:`bed`-formatted *infile* to :term:`fasta` formatted *outfile* ''' track = P.snip(infile, ".bed.gz") fasta = IndexedFasta.IndexedFasta( os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"])) outs = iotools.open_file(outfile, "w") ids, seqs = [], [] for bed in Bed.setName(Bed.iterator(iotools.open_file(infile))): lcontig = fasta.getLength(bed.contig) if mode == "intervals": seqs.append(fasta.getSequence(bed.contig, "+", bed.start, bed.end)) ids.append("%s_%s %s:%i..%i" % (track, bed.name, bed.contig, bed.start, bed.end)) elif mode == "leftright": l = bed.end - bed.start start, end = max(0, bed.start - l), bed.end - l ids.append("%s_%s_l %s:%i..%i" % (track, bed.name, bed.contig, start, end)) seqs.append(fasta.getSequence(bed.contig, "+", start, end)) start, end = bed.start + l, min(lcontig, bed.end + l) ids.append("%s_%s_r %s:%i..%i" % (track, bed.name, bed.contig, start, end)) seqs.append(fasta.getSequence(bed.contig, "+", start, end)) masked = maskSequences(seqs, masker) outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)])) outs.close()
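# Illustrative sketch (not part of the pipeline): in "leftright" mode the
# function exports two flanks of the same width as the interval itself,
# clipped to the contig boundaries. The toy coordinates are hypothetical.
def _example_leftright_flanks(start=100, end=150, lcontig=1000):
    l = end - start
    left = (max(0, start - l), end - l)          # (50, 100) - upstream flank
    right = (start + l, min(lcontig, end + l))   # (150, 200) - downstream flank
    return left, right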
def mergeSummarizedContextStats(infiles, outfile, samples_in_columns=False):
    """combine output from :func:`summarizeTagsWithinContext`.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format
    outfile : string
        Output filename in :term:`tsv` format.
    samples_in_columns : bool
        If True, put samples in columns. The default is to put them
        in rows.
    """

    header = ",".join(
        [P.snip(os.path.basename(x), ".contextstats.tsv.gz")
         for x in infiles])
    filenames = " ".join(infiles)

    if not samples_in_columns:
        transpose_cmd = \
            """| cgat table2table --transpose"""
    else:
        transpose_cmd = ""

    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --skip-titles
    %(filenames)s
    | perl -p -e "s/bin/track/; s/\?/Q/g"
    %(transpose_cmd)s
    | gzip > %(outfile)s
    """

    P.run(statement)
def loadBigWigStats(infiles, outfile): '''merge and load bigwig summary for all wiggle files. Summarise and merge bigwig files for all samples and load into a table called bigwig_stats Parameters ---------- infiles : list Input filenames in :term:`bigwig` format outfile : string Output filename, the table name is derived from `outfile`. ''' data = " ".join([ '<( bigWigInfo %s | perl -p -e "s/:/\\t/; s/ //g; s/,//g")' % x for x in infiles ]) headers = ",".join([P.snip(os.path.basename(x), ".bw") for x in infiles]) load_statement = P.build_load_statement(P.toTable(outfile), options="--add-index=track") statement = '''cgat combine_tables --header-names=%(headers)s --skip-titles --missing-value=0 --ignore-empty %(data)s | perl -p -e "s/bin/track/" | cgat table2table --transpose | %(load_statement)s > %(outfile)s ''' P.run()
def runPCA(infile, outfile, rownames=1):
    '''
    run principal components analysis on normalised matrix
    '''
    # ncol = len(open(infile).readline().strip("\n").split("\t"))

    # read in and format data
    R('''dat <- read.csv("%s",
                         header=T,
                         stringsAsFactors=F,
                         sep="\t",
                         row.names=%i)''' % (infile, rownames))

    # run PCA
    R('''pc.dat <- prcomp(as.matrix(t(dat)))''')

    # get scores and put the sample column first
    R('''pc.dat.scores <- data.frame(pc.dat$x)''')
    R('''pc.dat.scores$sample <- rownames(pc.dat.scores)''')
    R('''pc.dat.scores <- pc.dat.scores[, c("sample",
           colnames(pc.dat.scores)[1:(ncol(pc.dat.scores) - 1)])]''')
    R('''write.table(pc.dat.scores, file="%s", sep="\t",
                     quote=F, row.names=F)''' % outfile)

    # get the variance explained
    outf_ve = P.snip(outfile, ".tsv") + ".ve.tsv"
    R('''ve <- data.frame(summary(pc.dat)$importance)''')
    R('''ve <- ve[2, ]''')
    R('''write.table(ve, file="%s", sep="\t",
                     quote=F, row.names=F)''' % outf_ve)
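# A minimal sketch of the same analysis in pure Python, assuming
# scikit-learn and pandas are available; the pipeline itself uses R's
# prcomp via rpy2 as above. Samples are rows after the transpose.
def _example_pca_scores(infile, rownames=0):
    import pandas as pd
    from sklearn.decomposition import PCA
    dat = pd.read_csv(infile, sep="\t", index_col=rownames)
    pca = PCA()
    scores = pca.fit_transform(dat.T.values)   # rows = samples
    scores = pd.DataFrame(
        scores, index=dat.columns,
        columns=["PC%i" % (i + 1) for i in range(scores.shape[1])])
    variance_explained = pca.explained_variance_ratio_
    return scores, variance_explained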
def loadPicardAlignStats(infiles, outfile): '''Merge Picard alignment stats into single table and load into SQLite.''' # Join data for all tracks into single file outf = P.getTempFile() first = True for f in infiles: track = P.snip(os.path.basename(f), ".alignstats") if not os.path.exists(f): E.warn("File %s missing" % f) continue lines = [ x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()] if first: outf.write("%s\t%s" % ("track", lines[0])) first = False for i in range(1, len(lines)): outf.write("%s\t%s" % (track, lines[i])) outf.close() # Load into database P.load(outf.name, outfile, options="--add-index=track") os.unlink(outf.name)
def buildContigBed(infile, outfile): ''' Gets the contig sizes and co-ordinates from an indexed genome :term:`fasta` file and outputs them to :term:`BED` format Parameters ---------- infile : str infile is constructed from `PARAMS` variable to retrieve the `genome` :term:`fasta` file Returns ------- outfile : str :term:`BED` format file containing contig name, value (0) and contig size in nucleotides. The output file name is defined in `PARAMS: interface_contigs_bed` ''' prefix = P.snip(infile, ".fasta") fasta = IndexedFasta.IndexedFasta(prefix) outs = iotools.open_file(outfile, "w") for contig, size in fasta.getContigSizes(with_synonyms=False).items(): outs.write("%s\t%i\t%i\n" % (contig, 0, size)) outs.close()
def build_bam_stats(infiles, outfile): '''count number of reads mapped, duplicates, etc. Excludes regions overlapping repetitive RNA sequences Parameters ---------- infiles : list infiles[0] : str Input filename in :term:`bam` format infiles[1] : str Input filename with number of reads per sample outfile : str Output filename with read stats annotations_interface_rna_gtf : str :term:`PARMS`. :term:`gtf` format file with repetitive rna ''' job_memory = "32G" # Only one sample if len(infiles) == 3: bamfile, readsfile, rna_file = infiles # If there are multiple samples, programme specifies which .nreads file to use, by matching name to bam file else: bamfile = infiles[0] rna_file = infiles[-1] # Split file name up into directory and file name(/), then further split up by file name and file type and take file name (.) bam_name = bamfile.split('/')[1].split('.')[0] for i in range(1, len(infiles) - 1): nread_name = infiles[i].split('/')[1].split('.')[0] if bam_name == nread_name: readsfile = infiles[i] break else: continue nreads = ModuleTrna.getNumReadsFromReadsFile(readsfile) track = P.snip(os.path.basename(readsfile), ".nreads") # if a fastq file exists, submit for counting if os.path.exists(track + ".fastq.gz"): fastqfile = track + ".fastq.gz" elif os.path.exists(track + ".fastq.1.gz"): fastqfile = track + ".fastq.1.gz" else: fastqfile = None if fastqfile is not None: fastq_option = "--fastq-file=%s" % fastqfile else: fastq_option = "" statement = ''' cgat bam2stats %(fastq_option)s --force-output --mask-bed-file=%(rna_file)s --ignore-masked-reads --num-reads=%(nreads)i --output-filename-pattern=%(outfile)s.%%s < %(bamfile)s > %(outfile)s ''' P.run(statement)
def buildUngappedContigBed(infile, outfiles): ''' Constructs :term:`BED` format files containing both gapped and ungapped contig sizes from an index genome :term:`fasta` file. Parameters ---------- infile: str infile is constructed from `PARAMS` variable to retrieve the `genome` :term:`fasta` file assembly_gaps_min_size: int `PARAMS` - the minimum size (in nucleotides) for an assembly gap Returns ------- outfiles: list two separate :term:`BED` format output files containing the contig sizes for contigs with and without gaps. The names are defined in the `PARAMS` `interface_contigs_ungapped_bed` and `interface_gaps_bed` parameters. ''' prefix = P.snip(infile, ".fasta") fasta = IndexedFasta.IndexedFasta(prefix) outs_nogap = iotools.open_file(outfiles[0], "w") outs_gap = iotools.open_file(outfiles[1], "w") min_gap_size = PARAMS["assembly_gaps_min_size"] for contig, size in fasta.getContigSizes(with_synonyms=False).items(): seq = fasta.getSequence(contig) def gapped_regions(seq): is_gap = seq[0] == "N" last = 0 for x, c in enumerate(seq): if c == "N": if not is_gap: last = x is_gap = True else: if is_gap: yield (last, x) last = x is_gap = False if is_gap: yield last, size last_end = 0 for start, end in gapped_regions(seq): if end - start < min_gap_size: continue if last_end != 0: outs_nogap.write("%s\t%i\t%i\n" % (contig, last_end, start)) outs_gap.write("%s\t%i\t%i\n" % (contig, start, end)) last_end = end if last_end < size: outs_nogap.write("%s\t%i\t%i\n" % (contig, last_end, size)) outs_nogap.close() outs_gap.close()
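# Illustrative sketch (not part of the pipeline): the nested
# gapped_regions() generator yields half-open (start, end) coordinates of
# runs of "N" in a contig sequence. A toy sequence is used below.
def _example_gapped_regions():
    seq = "ACGTNNNNACGTNNAC"
    regions, last, is_gap = [], 0, seq[0] == "N"
    for x, c in enumerate(seq):
        if c == "N":
            if not is_gap:
                last = x
            is_gap = True
        else:
            if is_gap:
                regions.append((last, x))
            is_gap = False
    if is_gap:
        regions.append((last, len(seq)))
    return regions  # [(4, 8), (12, 14)]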
# Pipeline configuration ################################################### PARAMS = P.get_parameters([ "%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml", "pipeline.yml" ]) # Add automatically created files to the interface. This is required # when the pipeline is peek'ed. The statement below will # add the following to the dictionary: # # "geneset.dir/lincrna_gene_tss.bed.gz" maps to # "interface_geneset_lincrna_gene_tss_bed" PARAMS.update( dict([("interface_geneset_%s" % re.sub("[.]", "_", os.path.basename(P.snip(x, ".gz"))), x) for x in glob.glob('geneset.dir/*.bed.gz')])) def connect(): '''connect to database.''' dbh = sqlite3.connect(PARAMS["database_name"]) return dbh def connectToUCSC(): return gtfsubset.connectToUCSC(host=PARAMS["ucsc_host"], user=PARAMS["ucsc_user"], database=PARAMS["ucsc_database"])
def matchBgSequenceComposition(gc_load_files, background, foreground, fasta_file, outfile, database="csvdb", header_line=True, bg_stat="pCpG", stat="fisher"): ''' take the background set and subset it for intervals with a sequence composition distribution that is the same as the foreground set. Subsetting is done without replacement. This requires that the background set is sufficiently large, if the returned matched_background set is <90% size of foreground set, the pipeline will crash. ''' background_file = open(background) foreground_file = open(foreground) if header_line: background_file.readline() foreground_file.readline() background_set = set() foreground_set = set() for interval_id in background_file.readlines(): background_set.add(interval_id[:-1]) for interval_id in foreground_file.readlines(): foreground_set.add(interval_id[:-1]) dbh = sqlite3.connect(database) cc = dbh.cursor() tablenames = [ filenameToTablename(P.snip(os.path.basename(x), ".load")) for x in gc_load_files ] # jj: cpg scores rounded to three dp. # background dict - key <cpg score>: val <set of gene_ids with that score> # foreground dict - key <gene_id>: val <cpg score> gc = {"background": collections.defaultdict(set), "foreground": {}} for tablename in tablenames: # MM: need to make sure `-` in filenames don't break the sql statement tablename = tablename.replace("-", "_") for data in cc.execute("""SELECT * FROM %s;""" % tablename): interval_id = data[3].split(" ")[0] cpg = data[2] # jj: store background in 1 percent bins cpg_str = "%.3f" % cpg if re.search("background", tablename): if interval_id in background_set: gc["background"][cpg_str].add(interval_id) elif re.search("foreground", tablename): if interval_id in foreground_set: gc["foreground"][interval_id] = cpg_str else: raise ValueError("Unrecognized table name %s. Should contain" "'foreground' or 'background'" % tablename) # debug: pickle and dump the gc dict pickle_file = P.snip(foreground, ".foreground.tsv") + ".p" pickle.dump(gc, open(pickle_file, "wb")) # match the background set to the foreground set by taking a random # background interval with the the same sequence composition as each # foreground interval. 
    outf = open(outfile, "w")
    if header_line:
        outf.write("gene_id\n")

    # jj: sample background gene_ids without replacement
    matched_background = set()
    X = 0
    for interval, cpg in gc["foreground"].items():
        # print("Finding background for foreground gene: %s (%s)" %
        #       (interval, cpg))
        if cpg in gc["background"]:
            # get set of bg gene_ids with relevant cpg score
            bg_gene_ids = gc["background"][cpg]
            # print("There are %i background genes in total" % len(bg_gene_ids))
            # remove foreground genes from background set
            bg_gene_ids = bg_gene_ids - foreground_set
            # print("There are %i background genes after removing foreground" %
            #       len(bg_gene_ids))
            if bg_gene_ids:
                # select one gene_id from the filtered background set
                bg_id = random.sample(sorted(bg_gene_ids), 1)[0]
                matched_background.add(bg_id)
                # remove selected background gene_id from set
                gc["background"][cpg].remove(bg_id)
            else:
                X += 1
                E.warn("Missing background gene for %s %s, no gene with"
                       " matching %s" % (foreground_file.name, interval, bg_stat))
        else:
            X += 1
            E.warn("Missing background gene for %s %s, no gene with"
                   " matching %s" % (foreground_file.name, interval, bg_stat))

    # jj: check that the matched background is no more than 10% smaller
    # than the foreground set
    # MM: only need to check sufficient background size for Fisher's exact test
    if stat == "fisher":
        assert len(matched_background) > 0.9 * len(foreground_set), (
            "There are insufficient genes with matched background to perform"
            " test for sample %s" % foreground_file.name)

    print("Number of genes with no available background: %i" % X)
    print("Foreground set: %i" % len(foreground_set))
    print("Background set: %i" % len(matched_background))

    outf.write("\n".join(matched_background) + "\n")
    outf.close()
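# Illustrative sketch (not part of the pipeline): matching samples one
# background gene per foreground gene from the same (rounded) CpG bin,
# without replacement. The gene identifiers below are hypothetical.
def _example_matched_sampling():
    import random
    foreground = {"geneA": "0.500", "geneB": "0.500", "geneC": "0.750"}
    background = {"0.500": {"bg1", "bg2"}, "0.750": {"bg3"}}
    matched = set()
    for gene, cpg in foreground.items():
        candidates = background.get(cpg, set()) - set(foreground)
        if candidates:
            choice = random.sample(sorted(candidates), 1)[0]
            matched.add(choice)
            background[cpg].discard(choice)
    return matched  # e.g. {"bg1", "bg2", "bg3"}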
def buildStatement(self, *args, **PARAMS): """ Generate run statement for processing single, paired, or paired + singleton samples. Required arguments: index reference """ run_options = PARAMS["sortmerna_run_options"] threads = PARAMS["sortmerna_threads"] # A comma separated list of references references = PARAMS["sortmerna_reference"] references = ' --ref '.join(references.split(',')) # All listed references must be pre-indexed in this location index_dir = PARAMS[ "sortmerna_index"] # Check this isn't automatically passed. tmpf = P.get_temp_dir('.') tmpf_kvdb = os.path.join(tmpf, 'kvdb') tmpf_readb = os.path.join(tmpf, 'readb') if not self.fastn2: # Run sortMeRNA for single reads in_fastn1 = self.fastn1 in_prefix = P.snip(in_fastn1, self.fn_suffix, strip_path=True) out_prefix = os.path.join(self.outdir, in_prefix) # Run sortMeRNA for single reads statement = ( "sortmerna" " --index 0" # skip indexing, assume in idx-dir " --fastx" " --reads %(in_fastn1)s" " --ref %(references)s" " --idx-dir %(index_dir)s" # location of reference indexes " --aligned %(out_prefix)s_aligned" # output location of aligned seq " --other %(out_prefix)s_unaligned" # output location of unalinged seq " --readb %(tmpf_readb)s" # location of tmp file for reads " --kvdb %(tmpf_kvdb)s" # location of tmp file for kv pairs " --threads %(threads)s" " --zip-out" % locals()) else: # Run sortMeRNA for paired reads in_fastn1 = self.fastn1 in_fastn2 = self.fastn2 in_prefix = P.snip(in_fastn1, self.fn_suffix, strip_path=True) out_prefix = os.path.join(self.outdir, in_prefix) # Run sortMeRNA for single reads statement = ( "sortmerna" " --index 0" # skip indexing, assume in idx-dir " --fastx" " --reads %(in_fastn1)s" # First read file " --reads %(in_fastn2)s" # Second read file " --ref %(references)s" " --idx-dir %(index_dir)s" # location of reference indexes " --aligned %(out_prefix)s_aligned" # output location of aligned seq " --other %(out_prefix)s_unaligned" # output location of unalinged seq " --readb %(tmpf_readb)s" # location of tmp file for reads " --kvdb %(tmpf_kvdb)s" # location of tmp file for kv pairs " --paired_in" # If one read is aligned, both are output to aligned file " --out2" # Output paired reads to separate files " --threads %(threads)s" " --zip-out" % locals()) if self.fastn3 and not PARAMS.get('sortmerna_skip_singletons', False): in_fastn3 = self.fastn3 statement_2 = ( "sortmerna" " --index 0" # skip indexing, assume in idx-dir " --fastx" " --reads %(in_fastn3)s" " --idx-dir %(index_dir)s" # location of reference indexes " --ref %(references)s" " --aligned %(out_prefix)s_aligned_singleton" # output location of aligned seq " --other %(out_prefix)s_unaligned_singleton" # output location of unalinged seq " --readb %(tmpf_readb)s" # location of tmp file for reads " --kvdb %(tmpf_kvdb)s" # location of tmp file for kv pairs " --threads %(threads)s" " --zip-out" % locals()) statement = " && ".join([ statement, "rm -rf %(tmpf)s/*" % locals(), # location of tmp_readb & kvdb statement_2, "rm -rf %(tmpf)s" % locals() ]) return statement, run_options
def loadBAMStats(infiles, outfile): '''load output of :func:`buildBAMStats` into database. Arguments --------- infiles : string Input files, output from :func:`buildBAMStats`. outfile : string Logfile. The table name will be derived from `outfile`. ''' header = ",".join([P.snip(os.path.basename(x), ".readstats") for x in infiles]) filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles]) tablename = P.to_table(outfile) load_statement = P.build_load_statement( tablename, options="--add-index=track " " --allow-empty-file") E.info("loading bam stats - summary") statement = """cgat combine_tables --header-names=%(header)s --missing-value=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | cgat table2table --transpose | %(load_statement)s > %(outfile)s""" to_cluster = False P.run(statement) for suffix in ("nm", "nh"): E.info("loading bam stats - %s" % suffix) filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles]) load_statement = P.build_load_statement( "%s_%s" % (tablename, suffix), options="--allow-empty-file") statement = """cgat combine_tables --header-names=%(header)s --skip-titles --missing-value=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/%(suffix)s/" | %(load_statement)s >> %(outfile)s """ to_cluster = False P.run(statement) # load mapping qualities, there are two columns per row # 'all_reads' and 'filtered_reads' # Here, only filtered_reads are used (--take=3) for suffix in ("mapq",): E.info("loading bam stats - %s" % suffix) filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles]) load_statement = P.build_load_statement( "%s_%s" % (tablename, suffix), options=" --allow-empty-file") statement = """cgat combine_tables --header-names=%(header)s --skip-titles --missing-value=0 --ignore-empty --take=3 %(filenames)s | perl -p -e "s/bin/%(suffix)s/" | %(load_statement)s >> %(outfile)s """ to_cluster = False P.run(statement)
def getID(infile): return P.snip(os.path.basename(infile), ".mutect.snp.annotated.filtered.vcf")
import os
import glob
from pathlib import Path

from ruffus import *
from cgatcore import pipeline as P

# load options from the config file
PARAMS = P.get_parameters(
    ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
     "pipeline.yml"])

# get all files within the directory to process
SEQUENCEFILES = "*fastq.gz"
SEQUENCEFILES_REGEX = regex(r"(\S+).(fastq.gz)")

scriptsdir = os.path.dirname(os.path.abspath(__file__))
scriptsdir = P.snip(scriptsdir, "pipelines") + "scripts"
PARAMS["scriptsdir"] = scriptsdir

reportdir = os.path.dirname(os.path.abspath(__file__))
reportdir = os.path.join(reportdir, "pipeline_docs", "Rmd")
PARAMS["reportdir"] = reportdir

########################################################
########################################################
########################################################
# Run humann3 on concatenated fastq.gz
# produces a humann3.dir which contains
# a folder for each sample, which contains
# pathcoverage, pathabundance and genefamilies files.
########################################################
def maskLowComplexity(fastq1, outfile): '''Either softmask low complexity regions, or remove reads with a large proportion of low complexity. Uses BBTools scripts bbduk.sh (removal), or bbmask.sh. Entropy is calculated as shannon entropy for of kmers with a specified size within a sliding window. Ranges from 0: mask nothing, 0.0001: mask homopolymers, 1: mask everything. ''' bb_options = ' '.join(PARAMS['dust_options'].split(',')) # bbmap assumes the file format based on the output being *fq.gz # I can't find any instructions as to how to override this. if IS_PAIRED: fastq2 = P.snip(fastq1, '.1.gz') + '.2.gz' fastq3 = P.snip(fastq1, '.1.gz') + '.3.gz' outfile1 = P.snip(outfile, '.1.gz') + '.1.fq.gz' outfile2 = P.snip(outfile, '.1.gz') + '.2.fq.gz' outfile3 = P.snip(outfile, '.1.gz') + '.3.fq.gz' out_disc1 = P.snip(outfile, '_masked.fastq.1.gz') + '_discarded.fastq.1.fq.gz' out_disc2 = P.snip(outfile, '_masked.fastq.1.gz') + '_discarded.fastq.2.fq.gz' out_disc3 = P.snip(outfile, '_masked.fastq.1.gz') + '_discarded.fastq.3.fq.gz' if PARAMS['dust_discard_low_complexity']: statement1 = ("bbduk.sh" " in=%(fastq1)s" " in2=%(fastq2)s" " out=%(outfile1)s" " out2=%(outfile2)s" " outm=%(out_disc1)s" " outm2=%(out_disc2)s" " entropy=%(dust_entropy)s" " threads=%(dust_threads)s" " %(bb_options)s" " &> %(outfile)s.log") if IOTools.open_file(fastq3).read(1): statement2 = (" bbduk.sh" " in=%(fastq3)s" " out=%(outfile3)s" " outm=%(out_disc3)s" " entropy=%(dust_entropy)s" " threads=%(dust_threads)s" " %(bb_options)s" " &>> %(outfile)s.log") else: statement2 = (" touch %(outfile3)s %(out_disc3)s") statement = " && ".join([statement1, statement2]) P.run(statement, job_options=PARAMS['dust_run_options']) else: statement1 = ("bbmask.sh" " in=%(fastq1)s" " out=%(outfile1)s" " entropy=%(dust_entropy)s" " threads=%(dust_threads)s" " overwrite=t" " lowercase=t" " %(bb_options)s" " &> %(outfile)s.log &&" " bbmask.sh" " in=%(fastq2)s" " out=%(outfile2)s" " entropy=%(dust_entropy)s" " threads=%(dust_threads)s" " overwrite=t" " lowercase=t" " %(bb_options)s" " &>> %(outfile)s.log") if IOTools.open_file(fastq3).read(1): statement2 = (" bbmask.sh" " in=%(fastq3)s" " out=%(outfile3)s" " entropy=%(dust_entropy)s" " threads=%(dust_threads)s" " overwrite=t" " lowercase=t" " %(bb_options)s" " &>> %(outfile)s.log") else: statement2 = (" touch %(outfile3)s") statement = " && ".join([statement1, statement2]) P.run(statement, job_options=PARAMS['dust_run_options']) # Renaming files because of bbmap idiosyncracies of1 = P.snip(outfile1, '.fq.gz') + '.gz' of2 = P.snip(outfile2, '.fq.gz') + '.gz' of3 = P.snip(outfile3, '.fq.gz') + '.gz' os.rename(outfile1, of1) os.rename(outfile2, of2) os.rename(outfile3, of3) if PARAMS['dust_discard_low_complexity']: od1 = P.snip(out_disc1, '.fq.gz') + '.gz' od2 = P.snip(out_disc2, '.fq.gz') + '.gz' od3 = P.snip(out_disc3, '.fq.gz') + '.gz' os.rename(out_disc1, od1) os.rename(out_disc2, od2) os.rename(out_disc3, od3) else: outfile1 = P.snip(outfile, '.gz') + '.fq.gz' out_disc = P.snip(outfile, '_masked.fastq.1.gz') + '_discarded.fastq.1.fq.gz' if PARAMS['dust_discard_low_complexity']: statement = ("bbduk.sh" " in=%(fastq1)s" " out=%(outfile1)s" " outm=%(out_disc)s" " entropy=%(dust_entropy)s" " threads=%(dust_threads)s" " lowercase=t" " %(bb_options)s" " &> %(outfile)s.log") P.run(statement, job_options=PARAMS['dust_run_options']) else: statement = ("bbmask.sh" " in=%(fastq1)s" " out=%(outfile1)s" " entropy=%(dust_entropy)s" " threads=%(dust_threads)s" " lowercase=t" " %(bb_options)s" " &> 
%(outfile.log") P.run(statement, job_options=PARAMS['dust_run_options']) os.rename(outfile1, outfile) if PARAMS['dust_discard_low_complexity']: od1 = P.snip(out_disc, '.fq.gz') + '.gz' os.rename(out_disc, od1)
def removeHost(fastq1, outfile): '''Remove host contamination using bmtagger''' outf_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.txt' outf_host_stub = P.snip(outf_host, '.txt') + '_toremove' # Currently disabled. Has no effect. See drop_fastq.py # # Whether to keep pair if a read is identified as host. # if PARAMS['bmtagger_keep_pairs']: # keep_pairs = True # E.info("BMTagger: reads with a pair identified as host will be" # " discarded") # else: # keep_pairs = False # E.info("BMTagger: reads with a pair identified as host will be" # " kept as singletons (assuming they are not also identified" # " as host)") if IS_PAIRED: fastq2 = P.snip(fastq1, '.1.gz') + '.2.gz' fastq3 = P.snip(fastq1, '.1.gz') + '.3.gz' to_remove_paired = P.get_temp_filename('.') to_remove_singletons = P.get_temp_filename('.') # In some cases, it may be desirable to screen against multiple hosts. indexes = zip(PARAMS['bmtagger_bitmask'].split(','), PARAMS['bmtagger_srprism'].split(',')) for n, indexes in enumerate(indexes, 1): n = str(n) bitmask, srprism = indexes # Screen the paired reads, then singletons tmpdir1 = P.get_temp_dir('.') tmpdir2 = P.get_temp_dir('.') tmpf1 = P.get_temp_filename('.') tmpf2 = P.get_temp_filename('.') tmpf3 = P.get_temp_filename('.') # bmtagger truncates fasta headers... sed 's/[[:space:]]\+/__/g' # It won't accept... sed 's|[[:space:]].*$|/1|' # It also fails if fastq1 header differs from fastq2 statement1 = ( "zcat %(fastq1)s > %(tmpf1)s &&" " zcat %(fastq2)s > %(tmpf2)s &&" " bmtagger.sh" " -b %(bitmask)s" " -x %(srprism)s" " -T %(tmpdir1)s" " -q1" # Input is fastq " -1 %(tmpf1)s" " -2 %(tmpf2)s" " -o %(outf_host_stub)s_paired%(n)s" " &> %(outfile)s.log &&" " cat %(outf_host_stub)s_paired%(n)s" " >> %(to_remove_paired)s &&" " rm -rf %(tmpdir1)s %(tmpf1)s %(tmpf2)s" " %(outf_host_stub)s_paired%(n)s") # Screen the singletons if IOTools.open_file(fastq3).read(1): statement2 = ( "zcat %(fastq3)s > %(tmpf3)s &&" " bmtagger.sh" " -b %(bitmask)s" " -x %(srprism)s" " -T %(tmpdir2)s" " -q1" # Input is fastq " -1 %(tmpf3)s" " -o %(outf_host_stub)s_singletons%(n)s" " &>> %(outfile)s.log &&" " cat %(outf_host_stub)s_singletons%(n)s" " >> %(to_remove_singletons)s &&" " rm -rf %(tmpdir2)s %(tmpf3)s" " %(outf_host_stub)s_singletons%(n)s") else: statement2 = ("touch %(to_remove_singletons)s &&" " rm -rf %(tmpdir2)s %(tmpf3)s") statement = " && ".join([statement1, statement2]) P.run(statement, job_options=PARAMS['bmtagger_run_options']) # Drop host contaminated reads # A hack due to the fact that BMTagger truncates fastq identifiers # TO DO: Look at bmtagger/.../bin/extract_fullseq drop_script = os.path.join( os.path.splitext(__file__)[0], 'drop_fastqs.py') fastq1_out = outfile fastq2_out = P.snip(outfile, '.1.gz') + '.2.gz' fastq3_out = P.snip(outfile, '.1.gz') + '.3.gz' fastq1_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.fastq.1.gz' fastq2_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.fastq.2.gz' fastq3_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.fastq.3.gz' statement = ("python %(drop_script)s" " --fastq1 %(fastq1)s" " --fastq2 %(fastq2)s" " --fastq3 %(fastq3)s" " --to-drop-paired %(to_remove_paired)s" " --to-drop-single %(to_remove_singletons)s" " --fastq-out1 %(fastq1_out)s" " --fastq-out2 %(fastq2_out)s" " --fastq-out3 %(fastq3_out)s" " --fastq-drop1 %(fastq1_host)s" " --fastq-drop2 %(fastq2_host)s" " --fastq-drop3 %(fastq3_host)s" " &>> %(outfile)s.log") P.run(statement) os.unlink(to_remove_paired) os.unlink(to_remove_singletons) else: indexes = 
zip(PARAMS['bmtagger_bitmask'].split(','), PARAMS['bmtagger_srprism'].split(',')) to_remove = P.get_temp_filename('.') for n, indexes in enumerate(indexes, 1): n = str(n) bitmask, srprism = indexes # Screen the singletons tmpdir1 = P.get_temp_dir('.') tmpf = P.get_temp_filename('.') statement = ( "zcat %(fastq1)s > %(tmpf)s &&" " bmtagger.sh" " -b %(bitmask)s" " -x %(srprism)s" " -T %(tmpdir1)s" " -q1" # Input is fastq " -1 %(tmpf)s" " -o %(outf_host_stub)s_%(n)s" " &>> %(outfile)s.log &&" " cat %(outf_host_stub)s_%(n)s >> %(to_remove)s &&" " rm -rf %(tmpdir1)s %(tmpf)s %(outf_host_stub)s_%(n)s") P.run(statement, job_options=PARAMS['bmtagger_run_options']) # Drop host contaminated reads drop_script = os.path.join( os.path.splitext(__file__)[0], 'drop_single_fastqs.py') fastq_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.fastq.1.gz' statement = ("python %(drop_script)s" " --fastq1 %(fastq1)s" " --to-drop-single %(to_remove)s" " --fastq-out1 %(outfile)s" " --fastq-drop1 %(fastq_host)s" " &>> %(outfile)s.log") P.run(statement) os.unlink(to_remove)
def removeAdapters(fastq1, outfile1):
    '''Remove adapters using Trimmomatic'''

    if IS_PAIRED:
        fastq2 = P.snip(fastq1, FASTQ1_SUFFIX) + FASTQ2_SUFFIX
        outfile2 = P.snip(outfile1, '.fastq.1.gz') + '.fastq.2.gz'
        outf1_singletons = P.snip(outfile1, '.fastq.1.gz') + '.fastq.1s.gz'
        outf2_singletons = P.snip(outfile1, '.fastq.1.gz') + '.fastq.2s.gz'
        outf_singletons = P.snip(outfile1, '.fastq.1.gz') + '.fastq.3.gz'
        logfile = P.snip(outfile1, '.fastq.1.gz') + '.trim.log'
        logfile2 = P.snip(outfile1, '.fastq.1.gz') + '.log'

        statement = (
            "java -Xmx5g -jar %(trimmomatic_jar_path)s PE"
            " -threads %(trimmomatic_n_threads)s"
            " -phred%(phred_format)s"
            " -trimlog %(logfile)s"
            " %(fastq1)s"  # input read 1
            " %(fastq2)s"  # input read 2
            " %(outfile1)s"  # output read 1
            " %(outf1_singletons)s"  # output unpaired read 1
            " %(outfile2)s"  # output read 2
            " %(outf2_singletons)s"  # output unpaired read 2
            " ILLUMINACLIP:"
            "%(trimmomatic_adapters)s:"
            "%(trimmomatic_seed_mismatches)s:"
            "%(trimmomatic_score_palendromic)s:"
            "%(trimmomatic_score_simple)s:"
            "%(trimmomatic_min_adapter_len)s:"
            "%(trimmomatic_keep_both_reads)s"
            " LEADING:%(trimmomatic_quality_leading)s"
            " TRAILING:%(trimmomatic_quality_trailing)s"
            " MINLEN:%(trimmomatic_minlen)s"
            " &> %(logfile2)s &&"
            " gzip -f %(logfile)s &&"
            " cat %(outf1_singletons)s %(outf2_singletons)s "
            " > %(outf_singletons)s &&"
            " rm -f %(outf1_singletons)s && rm -f %(outf2_singletons)s")

        P.run(statement, job_options=PARAMS['trimmomatic_run_options'])

    else:
        logfile = P.snip(outfile1, '.fastq.1.gz') + '.trim.log'
        logfile2 = P.snip(outfile1, '.fastq.1.gz') + '.log'

        # single-end data: run Trimmomatic in SE mode with one input and
        # one output file
        statement = (
            "java -Xmx5g -jar %(trimmomatic_jar_path)s SE"
            " -threads %(trimmomatic_n_threads)s"
            " -phred%(phred_format)s"
            " -trimlog %(logfile)s"
            " %(fastq1)s"  # input read 1
            " %(outfile1)s"  # output read 1
            " ILLUMINACLIP:"
            "%(trimmomatic_adapters)s:"
            "%(trimmomatic_seed_mismatches)s:"
            "%(trimmomatic_score_palendromic)s:"
            "%(trimmomatic_score_simple)s:"
            "%(trimmomatic_min_adapter_len)s:"
            "%(trimmomatic_keep_both_reads)s"
            " LEADING:%(trimmomatic_quality_leading)s"
            " TRAILING:%(trimmomatic_quality_trailing)s"
            " MINLEN:%(trimmomatic_minlen)s"
            " &> %(logfile2)s &&"
            " gzip -f %(logfile)s")

        P.run(statement, job_options=PARAMS['trimmomatic_run_options'])
''' generic split by newline and tab for reading tsv files ''' return line[:-1].split("\t") ######################################################################### ######################################################################### ######################################################################### @follows(mkdir("gtfs")) @merge([PARAMS["genesets_abinitio_coding"], PARAMS["genesets_reference"]], os.path.join( "gtfs", P.snip(PARAMS["genesets_abinitio_coding"], ".gtf.gz") + "_coding.gtf.gz")) def buildCodingGeneSet(infiles, outfile): ''' takes the output from cuffcompare of a transcript assembly and filters for annotated protein coding genes. NB "pruned" refers to nomenclature in the transcript building pipeline - transcripts that appear in at least two samples. Because an abinitio assembly will often contain fragments of known transcripts and describe them as novel, the default behaviour is to produce a set that is composed of 'complete' or 'contained' transcripts
def loadPicardMetrics(infiles, outfile, suffix, pipeline_suffix=".picard_stats", tablename=None): '''load picard metrics. Arguments --------- infiles : string Filenames of files with picard metric information. Each file corresponds to a different track. outfile : string Logfile. suffix : string Suffix to append to table name. pipeline_suffix : string Suffix to remove from track name. tablename : string Tablename to use. If unset, the table name will be derived from `outfile` and suffix as ``to_table(outfile) + "_" + suffix``. ''' if not tablename: tablename = "%s_%s" % (P.to_table(outfile), suffix) outf = P.get_temp_file(".") filenames = ["%s.%s" % (x, suffix) for x in infiles] first = True for filename in filenames: track = P.snip(os.path.basename(filename), "%s.%s" % (pipeline_suffix, suffix)) if not os.path.exists(filename): E.warn("File %s missing" % filename) continue lines = iotools.open_file(filename, "r").readlines() # extract metrics part rx_start = re.compile("## METRICS CLASS") for n, line in enumerate(lines): if rx_start.search(line): lines = lines[n + 1:] break for n, line in enumerate(lines): if not line.strip(): lines = lines[:n] break if len(lines) == 0: E.warn("no lines in %s: %s" % (track, filename)) continue if first: outf.write("%s\t%s" % ("track", lines[0])) fields = lines[0][:-1].split("\t") else: f = lines[0][:-1].split("\t") if f != fields: raise ValueError( "file %s has different fields: expected %s, got %s" % (filename, fields, f)) first = False for i in range(1, len(lines)): outf.write("%s\t%s" % (track, lines[i])) outf.close() P.load(outf.name, outfile, tablename=tablename, options="--add-index=track --allow-empty-file") os.unlink(outf.name)
def loadPicardHistogram(infiles, outfile, suffix, column, pipeline_suffix=".picard_stats", tablename=False): '''extract a histogram from a picard output file and load it into database. Arguments --------- infiles : string Filenames of files with picard metric information. Each file corresponds to a different track. outfile : string Logfile. suffix : string Suffix to append to table name. column : string Column name to take from the histogram. pipeline_suffix : string Suffix to remove from track name. tablename : string Tablename to use. If unset, the table name will be derived from `outfile` and suffix as ``to_table(outfile) + "_" + suffix``. ''' if not tablename: tablename = "%s_%s" % (P.to_table(outfile), suffix) tablename = tablename.replace("_metrics", "_histogram") # some files might be missing xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))] if len(xfiles) == 0: E.warn("no files for %s" % tablename) return header = ",".join([P.snip(os.path.basename(x), pipeline_suffix) for x in xfiles]) filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles]) # there might be a variable number of columns in the tables # only take the first ignoring the rest load_statement = P.build_load_statement( tablename, options="--add-index=track " " --header-names=%s,%s" " --allow-empty-file" " --replace-header" % (column, header)) statement = """cgat combine_tables --regex-start="## HISTOGRAM" --missing-value=0 --take=2 %(filenames)s | %(load_statement)s >> %(outfile)s """ to_cluster = False P.run(statement)
def buildGeneListMatrix(infiles, outfile): '''build a gene list matrix for simple pathway analysis based on hypergeometric test. A gene list is derived from a gene set by applying thresholds to the input data set. The thresholds are defined in the configuration file. ''' genesets = [] backgrounds = [] headers = [] for infile in infiles: genelist = pandas.read_csv( iotools.openFile(infile), index_col=0, sep='\t') track = P.snip(os.path.basename(infile), ".tsv.gz") headers.append(track) field = PARAMS[P.matchParameter("%s_foreground_field" % track)] min_threshold = PARAMS[P.matchParameter( "%s_foreground_min_threshold" % track)] max_threshold = PARAMS[P.matchParameter( "%s_foreground_max_threshold" % track)] genesets.append(set(genelist[ (genelist[field] >= min_threshold) & (genelist[field] <= max_threshold)].index)) E.info('%s: foreground: %f <= %s <= %f' % (track, min_threshold, field, max_threshold)) field = PARAMS[P.matchParameter("%s_background_field" % track)] min_threshold = PARAMS[P.matchParameter( "%s_background_min_threshold" % track)] max_threshold = PARAMS[P.matchParameter( "%s_background_max_threshold" % track)] E.info('%s: background: %f <= %s <= %f' % (track, min_threshold, field, max_threshold)) backgrounds.append(set(genelist[ (genelist[field] >= min_threshold) & (genelist[field] <= max_threshold)].index)) E.info("%s: fg=%i, bg=%i" % (track, len(genesets[-1]), len(backgrounds[-1]))) E.info("writing gene list matrix") with iotools.openFile(outfile, "w") as outf: SetTools.writeSets(outf, genesets, labels=headers) with iotools.openFile(outfile + ".bg.tsv.gz", "w") as outf: SetTools.writeSets(outf, backgrounds, labels=headers) E.info("writing intersection/union matrix") # build set intersection matrix matrix = SetTools.unionIntersectionMatrix(genesets) with iotools.openFile(outfile + ".matrix.gz", "w") as outf: iotools.writeMatrix(outf, matrix, headers, headers) matrix = SetTools.unionIntersectionMatrix(backgrounds) with iotools.openFile(outfile + ".bg.matrix.gz", "w") as outf: iotools.writeMatrix(outf, matrix, headers, headers)
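# Illustrative sketch (not part of the pipeline): the foreground /
# background gene lists are just index sets obtained by thresholding one
# column of the input table. The column name and cut-offs are hypothetical.
def _example_threshold_geneset():
    import pandas
    genelist = pandas.DataFrame(
        {"l2fold": [2.5, -0.1, 1.2, 0.0]},
        index=["g1", "g2", "g3", "g4"])
    min_threshold, max_threshold = 1.0, 10.0
    selected = set(genelist[
        (genelist["l2fold"] >= min_threshold) &
        (genelist["l2fold"] <= max_threshold)].index)
    return selected  # {"g1", "g3"}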
--output-filename-pattern=%%DIR%%/ --deseq-fit-type=%(deseq_fit_type)s --deseq-dispersion-method=%(deseq_dispersion_method)s --log=%(outfile)s.log --fdr=%(edger_fdr)f" | grep -v "warnings" | gzip > %(outfile)s ''' P.run() @follows(aggregateTiledReadCounts, mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation"))) @files([((data, design), "diff_methylation/%s_%s.deseq.gz" % (P.snip(os.path.basename(data), ".counts.tsv.gz"), P.snip(os.path.basename(design), ".tsv"))) for data, design in itertools.product( glob.glob("diff_methylation/*.counts.tsv.gz"), P.asList(PARAMS["deseq_designs"]))]) def runDESeq(infiles, outfile): '''estimate differential expression using DESeq. The final output is a table. It is slightly edited such that it contains a similar output and similar fdr compared to cuffdiff. ''' runDE(infiles, outfile, "deseq") ######################################################################### #########################################################################
def loadGeneListMatrix(infile, outfile):
    '''load gene list matrix into table.'''
    track = P.snip(infile, ".tsv.gz")
    P.load(infile, outfile, tablename="%s_foreground" % track)
    P.load(infile + ".bg.tsv.gz", outfile, tablename="%s_background" % track)
def getRunStatement(self, infile, outfile, controlfile): """ Generate a specific run statement for each peakcaller class """ # generate outfile prefix dir_name = os.path.dirname(outfile) infile_stub = P.snip(os.path.basename(infile), ".bam") control_stub = P.snip(os.path.basename(controlfile), ".bam") outfile_stub = infile_stub + "_VS_" + control_stub outfile_stub = os.path.join(dir_name, outfile_stub) # build macs2 commandline statement statement = [("macs2 callpeak" " --treatment %(infile)s" " --control %(controlfile)s" " --verbose=10")] # add additional parameters # currently the input read format has to be bam bc of ruffus regex statement.append("--format BAM") statement.append("--name %s" % outfile_stub) # require genome size, if it is not specified try to take from genome if not re.search("-g\s|--gsize", self.PARAMS_PEAKCALLER["macs2_options_parameters"]): statement.append( "--gsize %s" % self.PARAMS_PEAKCALLER["macs2_options_genome_prefix"][:2]) # set threshold for lax peak calling if self.PARAMS_PEAKCALLER["macs2_options_fdr"]: if self.PARAMS_PEAKCALLER["macs2_options_pvalue"]: raise Exception("Value specified for both macs2 options" " -pvalue and -fdr please select one or" " other option, but not both") else: threshold = "--qvalue " + \ str(self.PARAMS_PEAKCALLER["macs2_options_fdr"]) elif self.PARAMS_PEAKCALLER["macs2_options_pvalue"]: threshold = "--pvalue=" + \ str(self.PARAMS_PEAKCALLER["macs2_options_pvalue"]) else: raise Exception("Must specify a value for either" " macs2_options_pvalue or macs2_options_fdr," " but not both") statement.append(threshold) # deal with duplicate reads if self.PARAMS_PEAKCALLER["macs2_options_keep_duplicates"]: statement.append( "--keep-dup %s" % self.PARAMS_PEAKCALLER["macs2_options_keep_duplicates"]) # add additional parameters statement.append(self.PARAMS_PEAKCALLER["macs2_options_parameters"]) # write log information to sentinel file statement.append(">& %(outfile)s") statement = (" ".join(statement) % locals()) return statement
def buildDMRStats(tables, method, outfile, dbhandle): '''build dmr summary statistics. This method counts the number of up/down, 2fold up/down, etc. genes in output from (:mod:`scripts/runExpression`). This method also creates diagnostic plots in the <exportdir>/<method> directory. Tables should be labeled <tileset>_<design>_<method>. Arguments --------- tables ; list List of tables with DMR output method : string Method name outfile : string Output filename. Tab separated file summarizing ''' def togeneset(tablename): return re.match("([^_]+)_", tablename).groups()[0] keys_status = "OK", "NOTEST", "FAIL", "NOCALL" outf = iotools.openFile(outfile, "w") outf.write("\t".join(( "tileset", "design", "track1", "track2", "tested", "\t".join(["status_%s" % x for x in keys_status]), "significant", "up", "down", "twofold", "twofold_up", "twofold_down", )) + "\n") all_tables = set(Database.getTables(dbhandle)) outdir = os.path.join(PARAMS["exportdir"], "diff_methylation") for tablename in tables: prefix = P.snip(tablename, "_%s" % method) tileset, design = prefix.split("_") def toDict(vals, l=2): return collections.defaultdict(int, [(tuple(x[:l]), x[l]) for x in vals]) E.info("collecting data from %s" % tablename) tested = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s GROUP BY treatment_name,control_name""" % locals()).fetchall()) status = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, status, COUNT(*) FROM %(tablename)s GROUP BY treatment_name,control_name,status""" % locals()).fetchall(), 3) signif = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE significant GROUP BY treatment_name,control_name""" % locals()).fetchall()) fold2 = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE (l2fold >= 1 or l2fold <= -1) AND significant GROUP BY treatment_name,control_name,significant""" % locals()).fetchall()) up = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold > 0 AND significant GROUP BY treatment_name,control_name,significant""" % locals()).fetchall()) down = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold < 0 AND significant GROUP BY treatment_name,control_name,significant""" % locals()).fetchall()) fold2up = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold > 1 AND significant GROUP BY treatment_name,control_name,significant""" % locals()).fetchall()) fold2down = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold < -1 AND significant GROUP BY treatment_name,control_name,significant""" % locals()).fetchall()) groups = list(tested.keys()) for treatment_name, control_name in groups: k = (treatment_name, control_name) outf.write("\t".join( map(str, (tileset, design, treatment_name, control_name, tested[k], "\t".join([ str(status[(treatment_name, control_name, x)]) for x in keys_status ]), signif[(k)], up[k], down[k], fold2[k], fold2up[k], fold2down[k]))) + "\n") ########################################### ########################################### ########################################### # plot length versus P-Value data = Database.executewait( dbhandle, '''SELECT end - start, pvalue FROM 
%(tablename)s WHERE significant''' % locals()).fetchall() # require at least 10 datapoints - otherwise smooth scatter fails if len(data) > 10: data = list(zip(*data)) pngfile = "%(outdir)s/%(tileset)s_%(design)s_%(method)s_pvalue_vs_length.png" % locals( ) R.png(pngfile) R.smoothScatter(R.log10(ro.FloatVector(data[0])), R.log10(ro.FloatVector(data[1])), xlab='log10(length)', ylab='log10(pvalue)', log="x", pch=20, cex=.1) R['dev.off']() outf.close()
def intersectionHeatmap(infiles, outfile): ''' calculate the intersection between the infiles and plot''' pandas2ri.activate() name2genes = {} df = pd.DataFrame(columns=["id_1", "id_2", "intersection", "perc"]) ix = 0 for inf in infiles: name = P.snip(os.path.basename(inf)).split(".")[0] name = name.replace(".", "_") with iotools.open_file(inf, "r") as f: genes = set() for line in f: if line[0] == "#": continue values = line.strip().split("\t") info = values[7].split(";") for x in info: if x.split("=")[0] == "SNPEFF_GENE_NAME": gene_name = x.split("=")[1] break # if no gene name found, line is skipped if gene_name: genes.update((gene_name, )) name2genes[name] = genes df.loc[ix] = [name, name, len(genes), 1.0] ix += 1 for pair in itertools.permutations(list(name2genes.keys()), 2): id_1, id_2 = pair intersection = len(name2genes[id_1].intersection(name2genes[id_2])) not_intersecting = len(name2genes[id_1].symmetric_difference( name2genes[id_2])) intersection_perc = float(intersection) / (intersection + not_intersecting) df.loc[ix] = [id_1, id_2, intersection, intersection_perc] ix += 1 variant = os.path.basename(outfile).replace("overlap_", "").replace( "_heatmap.png", "") plotIntersectionHeatmap = R(''' function(df){ library(ggplot2) m_txt = element_text(size=15) m_txt_90 = element_text(size=15, angle=90, vjust=0.5, hjust=1) l_txt = element_text(size=20) p = ggplot(df, aes(id_1, id_2, fill=100*perc)) + geom_tile() + geom_text(aes(label=intersection), size=3) + scale_fill_gradient(name="Intersection (%%)", limits=c(0,100), low="yellow", high="dodgerblue4") + theme(axis.text.x = m_txt_90, axis.text.y = m_txt, legend.text = m_txt, legend.title = m_txt, aspect.ratio=1) + xlab("") + ylab("") + ggtitle("%(variant)s") ggsave("%(outfile)s", width=10, height=10) }''' % locals()) plotIntersectionHeatmap(df)
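# Illustrative sketch (not part of the pipeline): the "perc" column is
# the intersection divided by the union of the two gene sets (a Jaccard
# index), computed here for two hypothetical sets.
def _example_intersection_perc():
    genes_1 = {"TP53", "BRCA1", "EGFR"}
    genes_2 = {"TP53", "EGFR", "KRAS", "MYC"}
    intersection = len(genes_1 & genes_2)      # 2
    not_intersecting = len(genes_1 ^ genes_2)  # 3
    perc = float(intersection) / (intersection + not_intersecting)  # 0.4
    return intersection, perc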
def sortByPosition(infile, outfile):
    '''sort BAM file by position'''
    to_cluster = USECLUSTER
    track = P.snip(outfile, ".bam")
    statement = '''samtools sort %(infile)s %(track)s;'''
    P.run()