def generate_dexseq_design_files(infiles, outfile):
    '''Write a DEXSeq design table for *infiles*.

    *infiles* is a nested sequence of BAM paths.  Each filename is
    stripped of its ``.bam`` suffix to give the track name; the condition
    is taken as the second ``-``-separated field of that track name.  A
    two-column (track, condition) table is written to *outfile*.
    '''
    # Flatten the nested list of BAM paths into bare track names.
    # (The original also computed a `track` from infiles[0][0] that was
    # never used; it has been removed.)
    files = [P.snip(os.path.basename(f), ".bam")
             for p in infiles for f in p]
    # Condition is encoded as the second dash-separated token of the track.
    files = [(f, f.split("-")[1]) for f in files]
    IOTools.writeLines(outfile, files, header=["track", "condition"])
def countTagsInClusters(bedfile, bamfile, outfile):
    '''Count iCLIP tags from *bamfile* that fall within each cluster
    interval of *bedfile*, writing a (position, count) table to
    *outfile*.'''
    bam = pysam.AlignmentFile(bamfile)
    rows = []
    for cluster in Bed.iterator(IOTools.openFile(bedfile)):
        tag_total = iCLIP.count_intervals(
            bam, [(cluster.start, cluster.end)], cluster.contig).sum()
        position = "%s:%i-%i" % (cluster.contig, cluster.start, cluster.end)
        rows.append([position, str(tag_total)])
    IOTools.writeLines(outfile, rows, header=["position", "count"])
def countTagsInClusters(bedfile, bamfile, outfile):
    '''Count iCLIP tags from *bamfile* overlapping each interval of
    *bedfile*; write a two-column (position, count) table to *outfile*.
    '''
    bam = pysam.AlignmentFile(bamfile)
    outlines = []
    for bed in Bed.iterator(IOTools.openFile(bedfile)):
        interval = (bed.start, bed.end)
        # NOTE(review): count_intervals presumably returns a per-position
        # series; .sum() gives the total tag count for the interval --
        # confirm against the iCLIP module.
        counts = iCLIP.count_intervals(bam, [interval], bed.contig).sum()
        # position formatted as "contig:start-end"
        outlines.append(
            ["%s:%i-%i" % (bed.contig, bed.start, bed.end), str(counts)])
    IOTools.writeLines(outfile, outlines, header=["position", "count"])
def loadClusterCounts(infiles, outfile):
    '''Find the number of significant clusters found in each sample and
    load the results into the database.

    Each input path is expected to look like
    ``dedup_<method>.dir/<track>.clusters.bedgraph``; the dedup method
    and track are parsed from the path and the number of lines
    (clusters) in each file is counted.  Results are loaded via a
    shared temporary file.
    '''
    tmp = P.getTempFilename(shared=True)
    try:
        results = []
        for infile in infiles:
            count = IOTools.getNumLines(infile)
            # raw string: "\." in a non-raw literal is an invalid escape
            # sequence (warning in Python 3.6+)
            method, track = re.match(
                r"dedup_(.+).dir/(.+)\.clusters.bedgraph", infile).groups()
            results.append((method, track, count))
        IOTools.writeLines(tmp, results, header=["method", "track", "count"])
        P.load(tmp, outfile)
    finally:
        # remove the temp file even if counting/loading fails
        os.unlink(tmp)
def generateDaParsTranscriptsToGenes(infile, outfile):
    '''Write a two-column transcript_id -> gene_id map for DaPars.

    Prefers the ``ref_gene_id`` attribute when a GTF entry carries one,
    falling back to the plain ``gene_id``.  Duplicate pairs are removed
    and the output sorted so the file is deterministic across runs.
    '''
    import CGAT.GTF as GTF
    outlines = []
    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(infile))):
        try:
            gene_id = transcript[0].ref_gene_id
        except AttributeError:
            # entry has no ref_gene_id attribute: use gene_id instead
            gene_id = transcript[0].gene_id
        outlines.append((transcript[0].transcript_id, gene_id))
    # sorted(set(...)) deduplicates AND gives a stable ordering;
    # list(set(...)) produced an arbitrary, nondeterministic order.
    outlines = sorted(set(outlines))
    IOTools.writeLines(outfile, outlines,
                       header=["#transcript_id", "gene_id"])
def generateReaperMetaData(infile, outfile,
                           adaptor_5prime="AGATCGGAAGAGCGACGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT",
                           adaptor_3prime="AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG"):
    '''Take the sample_table and use it to generate a metadata table
    for reaper in the correct format.

    *infile* is a pair ``(fastq, sample_table)``.  The lane name is the
    fastq filename without its ``.fastq.gz`` suffix; every barcode whose
    lane list (last tab-separated column) contains this lane gets one
    output row of (barcode, 3'-adaptor, tabu, 5'-si).

    The adaptor sequences, previously hard-coded, are now overridable
    keyword arguments with unchanged defaults.
    '''
    outlines = []
    lane = P.snip(infile[0], ".fastq.gz")
    for line in IOTools.openFile(infile[1]):
        fields = line.split("\t")
        barcode = fields[1]
        # last column is a comma-separated list of lanes using this barcode
        lanes = fields[-1].strip().split(",")
        if lane in lanes:
            outlines.append([barcode, adaptor_3prime, adaptor_5prime, "-"])
    header = ["barcode", "3p-ad", "tabu", "5p-si"]
    IOTools.writeLines(outfile, outlines, header)
def generateReaperMetaData(infile, outfile):
    '''Take the sample_table and use it to generate a metadata table
    for reaper in the correct format.

    *infile* is a pair ``(fastq, sample_table)``; one output row of
    (barcode, 3'-adaptor, tabu, 5'-si) is written for every barcode
    whose lane list contains this fastq's lane.
    '''
    five_prime = PARAMS["reads_5prime_adapt"]
    three_prime = PARAMS["reads_3prime_adapt"]
    lane = P.snip(infile[0], ".fastq.clean.gz")
    rows = []
    for record in IOTools.openFile(infile[1]):
        columns = record.split("\t")
        # skip barcodes whose lane list doesn't include this lane
        if lane not in columns[-1].strip().split(","):
            continue
        rows.append([columns[1], three_prime, five_prime, "-"])
    IOTools.writeLines(outfile, rows,
                       header=["barcode", "3p-ad", "tabu", "5p-si"])
def generate_dexseq_design_files(infile, outfiles):
    '''Take the design specification for the pipeline and convert it
    into dexseq design matrices.

    ``design.tsv`` holds ``name pattern1 pattern2`` lines; BAM tracks
    whose names match *pattern1* are labelled "test" and those matching
    *pattern2* "control", and one design file is written per comparison.
    '''
    bamfiles = glob.glob("*.bam")
    bamfiles = [P.snip(os.path.basename(f), ".bam") for f in bamfiles]
    comparisons = [line.split()
                   for line in IOTools.openFile("design.tsv")
                   if not line.startswith("#")]
    for name, pat1, pat2 in comparisons:
        # BUG FIX: re.match takes (pattern, string); the original call
        # passed them reversed, treating the filename as the regex.
        condition1_files = [(f, "test")
                            for f in bamfiles if re.match(pat1, f)]
        condition2_files = [(f, "control")
                            for f in bamfiles if re.match(pat2, f)]
        # NOTE(review): "anlysis" is misspelled but kept byte-identical --
        # the directory name must match what the rest of the pipeline uses.
        IOTools.writeLines("alt_utr_anlysis.dir/%s.design.tsv" % name,
                           condition1_files + condition2_files,
                           header=["track", "condition"])
def get_contigs(infile, outfile):
    '''Generate a pseudo-contigs file from the geneset, where the length
    of each contig is determined by the GTF entry with the highest end
    coordinate.  Will not stop things going off the end of contigs, but
    that doesn't really matter for our purposes.

    Assumes the GTF is grouped by contig (entries for one contig are
    contiguous in the file).
    '''
    last_contig = None
    max_end = 0
    outlines = []
    for entry in GTF.iterator(IOTools.openFile(infile)):
        if last_contig and entry.contig != last_contig:
            # BUG FIX: flush the *previous* contig with its max_end.
            # The original appended entry.contig (the new contig) paired
            # with the old contig's max_end, shifting every length by one
            # contig.
            outlines.append([last_contig, str(max_end)])
            max_end = 0
        max_end = max(max_end, entry.end)
        last_contig = entry.contig
    # flush the final contig; guard against a completely empty input,
    # which would otherwise emit a bogus [None, "0"] row
    if last_contig is not None:
        outlines.append([last_contig, str(max_end)])
    IOTools.writeLines(outfile, outlines, header=None)
def count_bams(infiles, outfile):
    '''Count the number of alignments both pre and post dedup.

    The dedup method is parsed from directory names of the form
    ``dedup_<method>.dir``; files outside such a directory get method
    "none".  Counts come from ``samtools idxstats`` (BAMs must be
    indexed) summed over all contigs.
    '''
    outlines = []
    for infile in infiles:
        # raw strings: "\/" in a non-raw literal is an invalid escape
        # sequence (warning in Python 3.6+); "/" needs no escaping
        method_match = re.match(r"dedup_(.+).dir/.+", infile)
        method = method_match.groups()[0] if method_match else "none"
        track = re.search(r"([^/]+).bam", infile).groups()[0]
        # %(infile)s is interpolated from local variables by P.execute
        statement = '''samtools idxstats %(infile)s | awk '{sum+=$3} END{print sum}' '''
        count, _ = P.execute(statement)
        outlines.append([method, track, count.strip()])
    IOTools.writeLines(outfile, outlines, header=["method", "track", "count"])