def generate_dexseq_design_files(infiles, outfile):
    '''Write a DEXSeq design table for `infiles`.

    Each BAM basename (minus ".bam") becomes a track; the condition is
    the second "-"-separated field of the basename.  Output columns are
    "track" and "condition".

    Removed: a `track` local that was computed from the first input file
    but never used (dead code).
    '''
    files = [
        P.snip(os.path.basename(f), ".bam") for p in infiles for f in p
    ]
    # condition = second hyphen-delimited field of the track name
    files = [(f, f.split("-")[1]) for f in files]
    IOTools.writeLines(outfile, files, header=["track", "condition"])
def countTagsInClusters(bedfile, bamfile, outfile):
    '''Count iCLIP tags that fall within each cluster interval of `bedfile`.

    Writes a two-column table ("position", "count") where position is
    formatted as "contig:start-end".
    '''
    alignments = pysam.AlignmentFile(bamfile)

    rows = []
    for cluster in Bed.iterator(IOTools.openFile(bedfile)):
        span = (cluster.start, cluster.end)
        total = iCLIP.count_intervals(alignments, [span], cluster.contig).sum()
        position = "%s:%i-%i" % (cluster.contig, cluster.start, cluster.end)
        rows.append([position, str(total)])

    IOTools.writeLines(outfile, rows, header=["position", "count"])
# Example 3
def countTagsInClusters(bedfile, bamfile, outfile):
    '''Count iCLIP tags per cluster interval and write a position/count table.

    One output row per bed interval: "contig:start-end" and the summed
    tag count over that interval.
    '''
    bam = pysam.AlignmentFile(bamfile)

    def _record(b):
        # one ("contig:start-end", count) record per bed interval
        n = iCLIP.count_intervals(bam, [(b.start, b.end)], b.contig).sum()
        return ["%s:%i-%i" % (b.contig, b.start, b.end), str(n)]

    records = [_record(b) for b in Bed.iterator(IOTools.openFile(bedfile))]

    IOTools.writeLines(outfile, records, header=["position", "count"])
# Example 4
def loadClusterCounts(infiles, outfile):
    '''Find the number of significant clusters found in each sample
    and load the (method, track, count) table into the database.

    Fixes: the regex literal was not a raw string ("\\." is an invalid
    escape sequence and a SyntaxWarning on modern Python), and the
    temporary file leaked if P.load raised.
    '''
    tmp = P.getTempFilename(shared=True)
    try:
        results = []
        for infile in infiles:
            count = IOTools.getNumLines(infile)
            method, track = re.match(
                r"dedup_(.+).dir/(.+)\.clusters.bedgraph", infile).groups()
            results.append((method, track, count))

        IOTools.writeLines(tmp, results, header=["method", "track", "count"])
        P.load(tmp, outfile)
    finally:
        # always remove the temp file, even if loading fails
        os.unlink(tmp)
# Example 5
def generateDaParsTranscriptsToGenes(infile, outfile):
    '''Write a transcript_id -> gene_id mapping table for DaPars.

    Uses ref_gene_id when the GTF record carries one, otherwise falls
    back to gene_id.  Duplicate pairs are removed.

    Fix: list(set(...)) produced a nondeterministic row order between
    runs; dict.fromkeys de-duplicates while keeping first-seen order.
    '''
    import CGAT.GTF as GTF

    outlines = []

    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(infile))):
        try:
            gene_id = transcript[0].ref_gene_id
        except AttributeError:
            # record has no ref_gene_id attribute; use the plain gene_id
            gene_id = transcript[0].gene_id

        outlines.append((transcript[0].transcript_id, gene_id))

    # order-preserving de-duplication (deterministic output)
    outlines = list(dict.fromkeys(outlines))

    IOTools.writeLines(outfile, outlines, header=["#transcript_id", "gene_id"])
# Example 6
def generateReaperMetaData(infile, outfile):
    '''Take the sample_table and use it to generate a metadata table
    for reaper in the correct format.

    infile is a (fastq, sample_table) pair: one output row is written
    per barcode whose comma-separated lane list contains this lane.
    '''
    adaptor_5prime = "AGATCGGAAGAGCGACGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"
    adaptor_3prime = "AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG"

    lane = P.snip(infile[0], ".fastq.gz")

    rows = []
    for record in IOTools.openFile(infile[1]):
        cols = record.split("\t")
        # last column lists the lanes this barcode belongs to
        if lane in cols[-1].strip().split(","):
            rows.append([cols[1], adaptor_3prime, adaptor_5prime, "-"])

    IOTools.writeLines(outfile, rows, ["barcode", "3p-ad", "tabu", "5p-si"])
# Example 7
def generateReaperMetaData(infile, outfile):
    '''Take the sample_table and use it to generate a metadata table
    for reaper in the correct format.

    Adaptor sequences come from the pipeline PARAMS.  infile is a
    (fastq, sample_table) pair; a row is emitted for each barcode
    whose lane list includes this fastq's lane.
    '''
    adaptor_5prime = PARAMS["reads_5prime_adapt"]
    adaptor_3prime = PARAMS["reads_3prime_adapt"]

    this_lane = P.snip(infile[0], ".fastq.clean.gz")

    table_rows = []
    for raw in IOTools.openFile(infile[1]):
        parts = raw.split("\t")
        barcode, lane_list = parts[1], parts[-1].strip().split(",")
        if this_lane in lane_list:
            table_rows.append([barcode, adaptor_3prime, adaptor_5prime, "-"])

    IOTools.writeLines(outfile, table_rows,
                       ["barcode", "3p-ad", "tabu", "5p-si"])
    def generate_dexseq_design_files(infile, outfiles):
        '''Take the design specification for the pipeline and convert
        into DEXSeq design matrices.

        For each non-comment line (name, pattern1, pattern2) of
        design.tsv, writes alt_utr_anlysis.dir/<name>.design.tsv pairing
        BAM tracks matching pattern1 ("test") against those matching
        pattern2 ("control").
        '''
        bamfiles = glob.glob("*.bam")
        bamfiles = [P.snip(os.path.basename(f), ".bam") for f in bamfiles]
        comparisons = [
            line.split() for line in IOTools.openFile("design.tsv")
            if not line.startswith("#")
        ]

        for name, pat1, pat2 in comparisons:
            # BUG FIX: re.match takes (pattern, string).  The original
            # re.match(f, pat1) used the track name as the pattern, so
            # tracks were matched against the wrong operand.
            condition1_files = [(f, "test") for f in bamfiles
                                if re.match(pat1, f)]
            condition2_files = [(f, "control") for f in bamfiles
                                if re.match(pat2, f)]
            # NOTE(review): "anlysis" looks like a typo, but the directory
            # name may be shared with other pipeline steps, so it is kept.
            IOTools.writeLines("alt_utr_anlysis.dir/%s.design.tsv" % name,
                               condition1_files + condition2_files,
                               header=["track", "condition"])
def get_contigs(infile, outfile):
    '''Generate a pseudo-contigs file from the geneset, where the length of
    each contig is determined by the GTF entry with the highest end
    coordinate.  Will not stop things going off the end of contigs, but that
    doesn't really matter for our purposes.

    Fixes: on a contig change the original appended the NEW contig name
    paired with the PREVIOUS contig's max end, so the first contig was
    never written and every length was attributed to the wrong contig
    (the last contig appeared twice).  Also guards against empty input,
    which previously emitted a bogus [None, "0"] row.
    '''
    last_contig = None
    max_end = 0
    outlines = []
    for entry in GTF.iterator(IOTools.openFile(infile)):

        if last_contig and entry.contig != last_contig:
            # flush the finished contig with ITS max end, not the new one
            outlines.append([last_contig, str(max_end)])
            max_end = 0

        max_end = max(max_end, entry.end)
        last_contig = entry.contig

    # flush the final contig, but only if we saw any entries at all
    if last_contig is not None:
        outlines.append([last_contig, str(max_end)])
    IOTools.writeLines(outfile, outlines, header=None)
# Example 10
def count_bams(infiles, outfile):
    '''Count the number of alignments both pre and post dedup.

    The dedup method is parsed from the directory name
    ("dedup_<method>.dir/..."); files outside such a directory get
    method "none".  Counts come from `samtools idxstats`.

    Fixes: regex literals are now raw strings ("\\/" was an invalid
    escape sequence), and the "." before "bam" is escaped so it no
    longer matches an arbitrary character.
    '''
    outlines = []

    for infile in infiles:

        method = re.match(r"dedup_(.+)\.dir/.+", infile)
        if method:
            method = method.groups()[0]
        else:
            # file is not inside a dedup_*.dir directory
            method = "none"

        track = re.search(r"([^/]+)\.bam", infile).groups()[0]

        # sum mapped-read counts (3rd column of idxstats) over all contigs
        statement = '''samtools idxstats %(infile)s
                 | awk '{sum+=$3} END{print sum}' '''

        count, _ = P.execute(statement)

        outlines.append([method, track, count.strip()])

    IOTools.writeLines(outfile,
                       outlines, header=["method", "track", "count"])