Example #1
    def bam_stats(bam_file):
        """
        Wrapper for the pysam SAMtools flagstat function

        Parameters
        ----------
        bam_file : str
            Location of the bam file

        Returns
        -------
        list of dict
            qc_passed : int
            qc_failed : int
            description : str
        """
        results = pysam.flagstat(bam_file)  # pylint: disable=no-member
        separate_results = results.strip().split("\n")
        return [
            {
                "qc_passed": int(element[0]),
                "qc_failed": int(element[2]),
                "description": " ".join(element[3:])
            } for element in [row.split(" ") for row in separate_results]
        ]
Example #2
def buildBAMStats( infile, outfile ):
    '''calculate bamfile statistics
    '''

    # no bedToBigBed
    # to_cluster = True
    outs = open(outfile, "w" )
    outs.write( "reads\tcategory\n" )
    for line in pysam.flagstat( infile ):
        data = line[:-1].split( " ")
        outs.write( "%s\t%s\n" % (data[0], " ".join(data[1:]) ) )
    pysam_in = pysam.Samfile( infile, "rb" )


    outs_dupl = open( outfile + ".duplicates", "w" )
    outs_dupl.write( "contig\tpos\tcounts\n" )

    outs_hist = open( outfile + ".histogram", "w" )
    outs_hist.write( "duplicates\tcounts\tcumul\tfreq\tcumul_freq\n" )

    last_contig, last_pos = None, None
    ninput, nduplicates = 0, 0

    duplicates = collections.defaultdict( int )
    counts = collections.defaultdict( int )
    count = 0

    for read in pysam_in.fetch():

        ninput += 1

        if read.rname == last_contig and read.pos == last_pos:
            count += 1
            nduplicates += 1
            continue

        if count > 1:
            outs_dupl.write("%s\t%i\t%i\n" % (last_contig, last_pos, count) )
            counts[count] += 1

        count = 1
        last_contig, last_pos = read.rname, read.pos

    outs.write("%i\tduplicates (%5.2f%%)\n" % (nduplicates, 100.0* nduplicates / ninput))
    outs.write("%i\twithout duplicates (%5.2f%%)\n" % (ninput - nduplicates,
                                                       100.0*(ninput - nduplicates)/ninput))
    pysam_in.close()
    outs.close()
    outs_dupl.close()

    keys = counts.keys()
    # count per position (not the same as nduplicates, which is # of reads)
    c = 0
    total = sum( counts.values() )
    for k in sorted(keys):
        c += counts[k]
        outs_hist.write("%i\t%i\t%i\t%f\t%f\n" % (k, counts[k], c, 
                                                  100.0 * counts[k] / total,
                                                  100.0 * c / total) )
    outs_hist.close()
Example #3
def is_paired_sequencing(bamfile):
    # TODO: this is scary. Should check for unpaired being 0, and number paired == total number
    r = pysam.flagstat(bamfile)
    paired = int(r[5].split()[0])
    if paired != 0:
        return True
    else:
        return False
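The TODO above is fair: indexing flagstat output by line number breaks across samtools versions, and in modern pysam flagstat() returns the whole report as a single string rather than a list of lines. A more defensive sketch under those assumptions, not the original author's code:

import pysam

def is_paired_sequencing_safe(bamfile):
    # Locate the "paired in sequencing" line by content, not by position.
    for line in pysam.flagstat(bamfile).splitlines():
        if "paired in sequencing" in line:
            return int(line.split()[0]) > 0
    return False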
Example #4
def proc_sam(arg):
    samfile = arg[0]
    rmdup = arg[1]
    #se = arg[2]
    print samfile
    print rmdup
    
    sam_dir = "/".join(samfile.split("/")[:-1]) + "/"
    sam_prefix = os.path.basename(samfile).split(".sam")[0]
    mapped_sam = sam_dir + sam_prefix + "_mapped.sam"
    rmdup_sam = sam_dir + sam_prefix + "_rmdup.sam"
    sort_sam = sam_dir + sam_prefix + "_sort"
    
    if not os.path.exists(mapped_sam):
        print "Removing unmapped..."
        sam = pysam.Samfile(samfile, 'r')
        mb = pysam.Samfile(mapped_sam, 'w', template=sam)
        for read in sam:
            if not read.is_unmapped:
                mb.write(read)
        mb.close()
        print "Finished removing unmapped."
    if not os.path.exists(rmdup_sam) and rmdup == "True":
        print "Removing duplicates..."
        pysam.rmdup("-S", mapped_sam, rmdup_sam)
        os.remove(mapped_sam)
        print "Sorting..."
        pysam.sort(rmdup_sam, sort_sam)
        os.remove(rmdup_sam)
    else:
        print "Sorting..."
        pysam.sort(mapped_sam, sort_sam)
        os.remove(mapped_sam)
    print "Indexing..."
    sort_sam = sort_sam + ".sam"
    pysam.index(sort_sam)

    samfile_fs = open(samfile + "_stat", 'w')
    for line in pysam.flagstat(samfile):
        samfile_fs.write(line)
    samfile_fs.close()
    sort_sam_fs = open(sort_sam + "_stat", 'w')
    for line in pysam.flagstat(sort_sam):
        sort_sam_fs.write(line)
    sort_sam_fs.close()
Example #6
 def __init__(self, bamname, outpath, window_size):
     self.bamname = bamname
     self.bamfile = pysam.Samfile(bamname, 'rb')
     self.outpath = outpath
     self.window_size = atoi(window_size)
     self.nreads = atoi(pysam.flagstat(bamname)[0].split()[0])
     self.chr_lengths = self.bamfile.lengths
     self.chrs_queue = []
     for index in range(self.bamfile.nreferences):
         self.chrs_queue.append((self.bamfile.references[index], self.bamfile.lengths[index]))
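atoi here is the Python 2 string.atoi helper, and flagstat() is indexed as a list of lines, which only works with old pysam. Under Python 3 with a pysam that returns a single string, the nreads line would look closer to this sketch ("sample.bam" is a placeholder):

import pysam

# First flagstat line is "N + M in total (...)"; token 0 is the QC-passed total.
nreads = int(pysam.flagstat("sample.bam").splitlines()[0].split()[0])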
Example #7
def calculateChromdata(samfile, ontarget):
    sys.stdout.write('\rFinalizing analysis ... 0.0%')
    sys.stdout.flush()

    chromdata = dict()
    chroms = samfile.references
    chromsres = []
    alltotal = 0
    allon = 0
    alloff = 0

    i = 0
    for chrom in chroms:

        total = sum(1 for _ in samfile.fetch(chrom))

        if 'chr' + chrom in ontarget.keys():
            on = int(ontarget['chr' + chrom])
            off = total - on
        else:
            on = 0
            off = 0
        chromsres.append({
            'CHROM': chrom,
            'RC': total,
            'RCIN': on,
            'RCOUT': off
        })
        alltotal += total
        allon += on
        alloff += off

        i += 1

        x = round(100 * i / len(chroms), 1)
        x = min(x, 100.0)
        sys.stdout.write('\rFinalizing analysis ... ' + str(x) + '%')
        sys.stdout.flush()

    chromdata['Chroms'] = chromsres
    chromdata['Mapped'] = {'RC': alltotal, 'RCIN': allon, 'RCOUT': alloff}

    allreads = pysam.flagstat(options.input)[0]
    allreads = allreads[:allreads.find('+')]
    allreads = int(allreads.strip())

    chromdata['Total'] = allreads
    chromdata['Unmapped'] = allreads - alltotal

    sys.stdout.write('\rFinalizing analysis ... 100.0% - Done')
    sys.stdout.flush()
    print ''

    return chromdata
Example #8
def run_flagstat(bamfile):
    stats = {}
    for l in pysam.flagstat(bamfile):
        if 'QC-passed' in l:
            stats['QC-passed reads'] = l.rstrip().split(' ', 1)[0]
        elif 'mapped' in l:
            stats['% Mapped'] = perc_from_flagstat_line(l)
        elif 'properly paired' in l:
            stats['% Properly paired'] = perc_from_flagstat_line(l)
            break
    sys.stderr.write("{}\n".format(stats))
    return stats
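perc_from_flagstat_line() is not shown in this example. One plausible implementation, assuming the usual flagstat formatting such as "... mapped (95.00% : N/A)", is:

import re

def perc_from_flagstat_line(line):
    # Pull the first percentage inside the parentheses; fall back to "N/A".
    match = re.search(r"\(([\d.]+)%", line)
    return match.group(1) if match else "N/A"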
Example #9
def get_samtools_flagstat(bam_file):
    '''
    Runs samtools flagstat to get read metrics
    '''
    logging.info('samtools flagstat...')
    results = pysam.flagstat(bam_file, split_lines=True)
    flagstat = ''
    for line in results:
        logging.info(line.strip())
        flagstat += line
        if "mapped" in line and "mate" not in line:
            mapped_reads = int(line.split('+')[0].strip())
    return flagstat, mapped_reads
Example #10
def getBamReadMean(regions, bam_file,
                   non_nan):  # bam file should be samtools indexed
    samfile = pysam.AlignmentFile(bam_file, "rb")
    if os.path.isfile(bam_file + '.reads'):  #total number of reads in a genome
        with open(bam_file + '.reads') as f:
            reads = int(f.readline().rstrip())
    else:  # if genomic reads not provided, normalize with the given bam file
        print 'Please wait, getting total number of (properly paired) mapped reads...'
        print 'assuming bam file is for whole genome region'
        flags = pysam.flagstat(bam_file)
        for flag in flags:
            if (flag.split(' ')[3] == 'properly'
                    and flag.split(' ')[4] == 'paired'):
                reads = (int(flag.split(' ')[0]) + int(flag.split(' ')[2]))
                break
    reads = reads / 1000000.0
    print reads, ' million reads'
    Profile = []
    if non_nan == 1:  #average over non_nan region
        for region in regions:
            sumRegion = 0
            sumNonNan = 0
            iter = samfile.pileup(str(region.chrom), region.start, region.stop)
            for x in iter:
                if (x.reference_pos < region.stop) and (x.reference_pos >=
                                                        region.start):
                    #sumRegion+=x.nsegments
                    #if not x.nsegments ==0:
                    #    sumNonNan+=1
                    npp = 0
                    for y in x.pileups:
                        if y.alignment.is_proper_pair:  # counting only properly paired
                            sumRegion += 1
                            npp += 1
                    if npp > 0:
                        sumNonNan += 1
            Profile.append((sumRegion / reads) / sumNonNan)
    else:  #average over whole region, default
        for region in regions:
            sumRegion = 0
            iter = samfile.pileup(str(region.chrom), region.start, region.stop)
            for x in iter:
                if (x.reference_pos < region.stop) and (x.reference_pos >=
                                                        region.start):
                    #sumRegion+=x.nsegments
                    for y in x.pileups:
                        if y.alignment.is_proper_pair:
                            sumRegion += 1
            Profile.append((sumRegion / reads) / (region.stop - region.start))
    return Profile
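The positional test above (tokens 3 and 4 equal to "properly" and "paired") matches lines like "42 + 0 properly paired (...)". A content-based variant that also copes with flagstat() returning a single string, sketched under those assumptions:

import pysam

def properly_paired_count(bam_file):
    # Sum the QC-passed and QC-failed counts from the "properly paired" line.
    for line in pysam.flagstat(bam_file).splitlines():
        parts = line.split(" ")
        if len(parts) > 4 and parts[3] == "properly" and parts[4] == "paired":
            return int(parts[0]) + int(parts[2])
    raise ValueError("no 'properly paired' line in flagstat output")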
Example #11
def calculateChromdata(samfile, ontarget):
    sys.stdout.write('\rFinalizing analysis ... 0.0%')
    sys.stdout.flush()

    chromdata = dict()
    chroms = samfile.references
    chromsres = []
    alltotal = 0
    allon = 0
    alloff = 0

    i = 0
    for chrom in chroms:

        total = sum(1 for _ in samfile.fetch(chrom))

        if 'chr' + chrom in ontarget.keys():
            on = int(ontarget['chr' + chrom])
            off = total - on
        else:
            on = 0
            off = 0
        chromsres.append({'CHROM': chrom, 'RC': total, 'RCIN': on, 'RCOUT': off})
        alltotal += total
        allon += on
        alloff += off

        i += 1

        x = round(100 * i / len(chroms), 1)
        x = min(x, 100.0)
        sys.stdout.write('\rFinalizing analysis ... ' + str(x) + '%')
        sys.stdout.flush()

    chromdata['Chroms'] = chromsres
    chromdata['Mapped'] = {'RC': alltotal, 'RCIN': allon, 'RCOUT': alloff}

    allreads = pysam.flagstat(options.input)[0]
    allreads = allreads[:allreads.find('+')]
    allreads = int(allreads.strip())

    chromdata['Total'] = allreads
    chromdata['Unmapped'] = allreads - alltotal

    sys.stdout.write('\rFinalizing analysis ... 100.0% - Done')
    sys.stdout.flush()
    print ''

    return chromdata
Example #12
def getBamReadMean(regions, bam_file, non_nan): # bam file should be samtools indexed
    samfile=pysam.AlignmentFile(bam_file,"rb")
    if os.path.isfile(bam_file+'.reads'):  #total number of reads in a genome
        with open(bam_file+'.reads') as f:
            reads=int(f.readline().rstrip())
    else:                       # if genomic reads not provided, normalize with the given bam file
        print 'Please wait, getting total number of (properly paired) mapped reads...'
        print 'assuming bam file is for whole genome region'
        flags=pysam.flagstat(bam_file)
        for flag in flags:
            if (flag.split(' ')[3]=='properly' and flag.split(' ')[4]=='paired'):
                reads=(int(flag.split(' ')[0])+int(flag.split(' ')[2]))
                break
    reads=reads/1000000.0
    print reads, ' million reads'
    Profile=[]
    if non_nan ==1: #average over non_nan region
        for region in regions:
            sumRegion=0
            sumNonNan=0
            iter=samfile.pileup(str(region.chrom),region.start, region.stop)
            for x in iter:
                if (x.reference_pos<region.stop) and (x.reference_pos>=region.start):
                    #sumRegion+=x.nsegments
                    #if not x.nsegments ==0:
                    #    sumNonNan+=1
                    npp=0
                    for y in x.pileups:
                        if y.alignment.is_proper_pair: # counting only properly paired
                            sumRegion+=1
                            npp+=1
                    if npp>0:
                        sumNonNan+=1
            Profile.append((sumRegion/reads)/sumNonNan)
    else:           #average over whole region, default
        for region in regions:
            sumRegion=0
            iter=samfile.pileup(str(region.chrom),region.start, region.stop)
            for x in iter:
                if (x.reference_pos<region.stop) and (x.reference_pos>=region.start):
                    #sumRegion+=x.nsegments
                    for y in x.pileups:
                        if y.alignment.is_proper_pair:
                            sumRegion+=1
            Profile.append((sumRegion/reads)/(region.stop-region.start))
    return Profile
Example #13
def calculateChromdata_minimal(samfile):
    print ''
    sys.stdout.write('\rRunning analysis ... 0.0%')
    sys.stdout.flush()

    chromdata = dict()
    chroms = samfile.references
    chromsres = []
    alltotal = 0
    allon = 0
    alloff = 0

    i = 0
    for chrom in chroms:
        total = sum(1 for _ in samfile.fetch(chrom))
        chromsres.append({'CHROM': chrom, 'RC': total})
        alltotal += total

        i += 1

        x = round(100 * i / len(chroms), 1)
        x = min(x, 100.0)
        sys.stdout.write('\rRunning analysis ... ' + str(x) + '%')
        sys.stdout.flush()

    chromdata['Chroms'] = chromsres
    chromdata['Mapped'] = {'RC': alltotal}

    allreads = pysam.flagstat(options.input)[0]
    allreads = allreads[:allreads.find('+')]
    allreads = int(allreads.strip())

    chromdata['Total'] = allreads
    chromdata['Unmapped'] = allreads - alltotal

    sys.stdout.write('\rRunning analysis ... 100.0% - Done')
    sys.stdout.flush()
    print ''

    return chromdata
Example #15
def proc(arg):
    bamfile = arg[0]
    rmdup = arg[1]
    errorlog = arg[2]
    if errorlog == "stderr":
        errorlog = sys.stderr
    if rmdup == "False": rmdup = False
    
    bam_dir = "/".join(bamfile.split("/")[:-1]) + "/"
    bam_prefix = os.path.basename(bamfile).split(".bam")[0]
    mapped_bam = bam_dir + bam_prefix + "_mapped.bam"
    rmdup_bam = bam_dir + bam_prefix + "_rmdup.bam"
    sort_bam = bam_dir + bam_prefix + "_sort"
    
    stat_dir = bam_dir + "stat/"
    if not os.path.exists(stat_dir): os.makedirs(stat_dir)
    
    if not os.path.exists(mapped_bam):
        print>>errorlog, "Removing unmapped..."
        mapped = 0
        unmapped = 0
        bam = pysam.Samfile(bamfile, 'rb')
        mb = pysam.Samfile(mapped_bam, 'wb', template=bam)
        try:
            for read in bam:
                if not read.is_unmapped:
                    mapped = mapped + 1
                    mb.write(read)
                else:
                    unmapped = unmapped + 1
        except:
            errorlog.write("Failed to remove unmapped reads: read number {0}\n".format(mapped + unmapped))
            raise
        else:
            errorlog.write("Unmapped read removal successful: Mapped {0}/Unmapped {1}\n".format(mapped, unmapped))

        bam.close()
        mb.close()
    
    if not os.path.exists(sort_bam + ".bam"):

        print>>errorlog, "Sorting..."

        try:
            cmd_args = ['java', '-Xmx2g', '-jar', '/seq/picard/SortSam.jar',
                        "=".join(["INPUT", mapped_bam]),
                        "=".join(["OUTPUT", sort_bam + ".bam"]),
                        "=".join(["SORT_ORDER", "coordinate"])]
            p = Popen(cmd_args, stdout=errorlog, stderr=errorlog)
            p.wait()
        except:
            errorlog.write("Sorting failed.\n")
            raise
        else:
            os.remove(mapped_bam)

    if not os.path.exists(rmdup_bam) and rmdup:   
        print "Removing duplicates..."
        rmdup_metrics = stat_dir + bam_prefix + "_rmdup_metrics"
        cmd_args = ['java', '-Xmx2g', '-jar', '/seq/picard/MarkDuplicates.jar',
                    "=".join(["INPUT", sort_bam + ".bam"]),
                    "=".join(["OUTPUT", rmdup_bam]),
                    "=".join(["METRICS_FILE", rmdup_metrics]),
                    "=".join(["REMOVE_DUPLICATES", "true"]),
                    "=".join(["ASSUME_SORTED", "true"])]
        try:
            p = Popen(cmd_args, stdout=errorlog, stderr=errorlog)
            p.wait()
        except:
            errorlog.write("Failed to remove duplicates.\n")
            raise
         
        try:
            print>>errorlog, "Indexing..."
            pysam.index(rmdup_bam)
        except SamtoolsError as detail:
            print>>errorlog, "Indexing failed: ",detail
    else:
        try:
            print>>errorlog, "Indexing..."
            sort_bam = sort_bam + ".bam"
            pysam.index(sort_bam)
        except SamtoolsError as detail:
            print>>errorlog, "Indexing failed: ", detail
    
 
    bamfile_fs = open(bam_dir + "stat/" + bam_prefix + "_stat", 'w')
    for line in pysam.flagstat(bamfile):
        bamfile_fs.write(line)
    bamfile_fs.close()

    return 0
Example #16
def get_coverage(bamfile, match_string):
    """Get total reads using samtools flagstat """
    o = pysam.flagstat(bamfile)
    total_reads = int([s for s in o.split('\n') if match_string in s][0].split()[0])
    return total_reads  
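Example call for the helper above, with sample.bam as a placeholder; "in total" matches the first flagstat line, so this returns the total read count:

total_reads = get_coverage("sample.bam", "in total")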
Example #17
 def testWithoutRedirectedStdout(self):
     r = pysam.flagstat(os.path.join(DATADIR, "ex1.bam"),
                        catch_stdout=False)
     self.assertTrue(len(r) == 0)
Example #18
 def testWithRedirectedStdout(self):
     r = pysam.flagstat(os.path.join(DATADIR, "ex1.bam"))
     self.assertTrue(len(r) > 0)
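Taken together, the two tests above document pysam's capture modes: by default flagstat() returns the captured samtools output, while catch_stdout=False lets the text go to the terminal and leaves nothing to return. Roughly (paths shortened; the tests resolve them against DATADIR):

import pysam

report = pysam.flagstat("ex1.bam")             # captured report text
pysam.flagstat("ex1.bam", catch_stdout=False)  # printed to stdout instead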
Example #19
def main():
    """The main function"""
    current_dir = os.getcwd()

    #mypath = current_dir + "/bams/"
    mypath = current_dir + "/alignments/"
    #bed_file = "/vlsci/UOM0040/shared/djp/rover_file/crc/CRC_10g_23May16.final.rover.bed"
    fastq_dir = current_dir + "/fastqs/"
    config_file = "pipeline.config"
    with open(config_file, 'r') as stream:
        try:
            bed_file = yaml.load(stream)['target_bed']
        except yaml.YAMLError as exc:
            print("Error with config file: " + exc)

    onlyfiles = []

    for root, dirs, files in os.walk(mypath):
        for file in files:
            if file.endswith(".primary.primerclipped.bam"):
                current_file = mypath + str(file)
                onlyfiles.append(os.path.join(root, file))

    #onlyfiles = [files for files in listdir(mypath) if (isfile(join(mypath, files)) and (files.endswith('.primary.primerclipped.bam')))]
    #file_paths = [join(mypath,files) for files in listdir(mypath) if (isfile(join(mypath, files)) and (files.endswith('.bam')))]
    #onlyfiles = [files for files in listdir(mypath) if (files.endswith(''))]

    #print onlyfiles
    #print len(onlyfiles)

    # stats list
    header = '\t'.join([ 'Sample_ID', 'Total_fastq_reads', 'Primary_reads', 'Reads_mapping_to_genome' , 'Reads_mapping_to_target', 'Percent_reads_mapping_to_genome', 'Percent_reads_mapping_to_target', 'Average_depth', \
        'Percent_target_not_covered', 'Percent_target_covered_at_<10X', 'Percent_target_covered_at_10X', 'Percent_target_covered_at_20X', 'Percent_target_covered_at_50X', 'Median_depth', \
        'Percent_target_covered_at_median', \
        'Percent_target_covered_at_median_10_fold', 'Percent_target_covered_at_median_20_fold', 'Percent_target_covered_at_median_30_fold', \
        'Percent_target_covered_at_median_40_fold', 'Percent_target_covered_at_median_50_fold'])
    #, 'Percent_target_covered_at_q50', \
    #'Percent_target_covered_at_q60', 'Percent_target_covered_at_q70', 'Percent_target_covered_at_q80'])

    #header = "Sample\tTotal_reads\tMapped_reads"

    print header

    for bam_file in onlyfiles:
        current_bam_file = join(mypath, bam_file)
        temp_bam_file = os.path.basename(current_bam_file)
        sample = temp_bam_file.replace(".primary.primerclipped.bam", "")

        fastq1 = fastq_dir + sample + "_L01_R1_001.fastq"
        fastq2 = fastq_dir + sample + "_L01_R2_001.fastq"

        fastq1_lc = int(
            subprocess.check_output(["wc", "-l",
                                     fastq1]).lstrip(' ').split(' ')[0])
        fastq2_lc = int(
            subprocess.check_output(["wc", "-l",
                                     fastq2]).lstrip(' ').split(' ')[0])

        total_fastq_lines = fastq1_lc + fastq2_lc
        total_fastq_reads = total_fastq_lines / 4

        flagstats = pysam.flagstat(current_bam_file)
        all_reads = int(flagstats.split('\n')[0].split('+')[0])
        reads_mapping_to_genome = int(flagstats.split('\n')[5].split('+')[0])

        x = pybedtools.example_bedtool(current_bam_file)
        b = pybedtools.example_bedtool(bed_file)
        y = x.intersect(b).moveto(join(mypath, 'temp.bam'))
        c = b.coverage(x)

        average_depth = calculate_average_depth(c)
        median_depth = calculate_median_depth(c)

        percent_target_not_covered = calculate_zero_depth_intervals(c, 0)
        percent_target_covered_at_L10X = calculate_zero_depth_intervals(c, 10)

        percent_target_covered_at_10X = calculate_x_depth_intervals(c, 10)
        percent_target_covered_at_20X = calculate_x_depth_intervals(c, 20)
        percent_target_covered_at_50X = calculate_x_depth_intervals(c, 50)

        percent_target_covered_at_median = calculate_x_depth_intervals_folds(
            c, median_depth, median_depth)
        # Using percentage from median
        #percent_target_covered_at_median_X10 = calculate_x_depth_intervals_folds(c, (median_depth - median_depth * (10.0/100)), (median_depth + median_depth * (10.0/100)))
        '''
        percent_target_covered_at_median_10_fold = calculate_x_depth_intervals_folds(c, (median_depth - (median_depth * 0.10)), (median_depth + (median_depth * 0.10)))
        percent_target_covered_at_median_20_fold = calculate_x_depth_intervals_folds(c, (median_depth - (median_depth * 0.20)), (median_depth + (median_depth * 0.20)))
        percent_target_covered_at_median_50_fold = calculate_x_depth_intervals_folds(c, (median_depth - (median_depth * 0.50)), (median_depth + (median_depth * 0.50)))
        percent_target_covered_at_median_60_fold = calculate_x_depth_intervals_folds(c, (median_depth - (median_depth * 0.60)), (median_depth + (median_depth * 0.60)))
        percent_target_covered_at_median_70_fold = calculate_x_depth_intervals_folds(c, (median_depth - (median_depth * 0.70)), (median_depth + (median_depth * 0.70)))
        percent_target_covered_at_median_80_fold = calculate_x_depth_intervals_folds(c, (median_depth - (median_depth * 0.80)), (median_depth + (median_depth * 0.80)))
        '''

        percent_target_covered_at_median_10_fold = calculate_x_depth_intervals(
            c, (median_depth / (10)))
        percent_target_covered_at_median_20_fold = calculate_x_depth_intervals(
            c, (median_depth / (20)))
        percent_target_covered_at_median_30_fold = calculate_x_depth_intervals(
            c, (median_depth / (30)))
        percent_target_covered_at_median_40_fold = calculate_x_depth_intervals(
            c, (median_depth / (40)))
        percent_target_covered_at_median_50_fold = calculate_x_depth_intervals(
            c, (median_depth / (50)))

        stats_temp = pysam.flagstat(join(mypath, 'temp.bam'))
        on_target_reads = int(stats_temp.split('\n')[0].split('+')[0])
        reads_mapping_to_target = int(stats_temp.split('\n')[5].split('+')[0])

        #percent_reads_mapping_to_genome = ((reads_mapping_to_genome * 1.0)/all_reads)*100.0
        percent_reads_mapping_to_genome = (
            (reads_mapping_to_genome * 1.0) / total_fastq_reads) * 100.0
        #percent_reads_mapping_to_target = ((reads_mapping_to_target * 1.0)/on_target_reads)*100.0
        percent_reads_mapping_to_target = (
            (reads_mapping_to_target * 1.0) / total_fastq_reads) * 100.0

        os.remove(join(mypath, 'temp.bam'))

        print("%s\t%d\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f" % (sample, total_fastq_reads, all_reads, reads_mapping_to_genome, reads_mapping_to_target, \
            percent_reads_mapping_to_genome, percent_reads_mapping_to_target, average_depth, \
            percent_target_not_covered, percent_target_covered_at_L10X, percent_target_covered_at_10X, percent_target_covered_at_20X, percent_target_covered_at_50X, \
            median_depth, percent_target_covered_at_median, \
            percent_target_covered_at_median_10_fold, percent_target_covered_at_median_20_fold, percent_target_covered_at_median_30_fold, \
            percent_target_covered_at_median_40_fold, percent_target_covered_at_median_50_fold))
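This example reads flagstat by fixed line numbers (split('\n')[0] for the total, [5] for mapped reads), which is brittle because the line order has changed between samtools releases. A content-based lookup, sketched here as a hypothetical helper, is more portable:

def flagstat_count(flagstat_text, keyword):
    # Return the leading QC-passed count from the first line containing keyword.
    for line in flagstat_text.splitlines():
        if keyword in line:
            return int(line.split('+')[0])
    raise ValueError("no flagstat line containing %r" % keyword)

# e.g. reads_mapping_to_genome = flagstat_count(flagstats, " mapped (")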
Example #20
def main():

    # Argument parsing
    parser = argparse.ArgumentParser(description="Calculate useful stats for "
                                     "capture-based experiments.")
    parser.add_argument(
        "bam_file",
        help="Input BAM file (sorted, rmduped and indexed). "
        "Assumes that the BAM file contains reads of one length.")
    parser.add_argument("bed_file", help="BED file listing target intervals.")
    parser.add_argument("--output", help="Output TSV stats file.")
    parser.add_argument(
        "--exclude",
        help="BED file listing regions to be excluded. "
        "Useful for pooled libraries sharing the same multiplex index.")
    args = parser.parse_args()

    # Initialize variables
    inbam = pysam.AlignmentFile(args.bam_file, "rb")
    inbed = cancer_api.files.BedFile.open(args.bed_file)
    excl_bed = cancer_api.files.BedFile.open(
        args.exclude) if args.exclude else None
    output = args.output if args.output else inbam.filename + ".capture_stats.tsv"
    stats = OrderedDict()

    # Define helper functions
    def count_interval_reads(bam_file, interval, viewed_reads={}):
        """Counts the number of reads in a given interval in a BAM file.
        Adds viewed reads to dictionary to avoid double-counting of reads.
        """
        reads_iterator = inbam.fetch(interval.chrom, interval.start_pos,
                                     interval.end_pos)
        read_qnames = [
            r.query_name for r in reads_iterator
            if r.query_name not in viewed_reads and not r.is_duplicate
        ]
        num_reads = len(read_qnames)
        for qname in read_qnames:
            viewed_reads[qname] = 1
        return num_reads

    # Calculate overall coverage
    read_length = inbam.head(1).next().query_length
    # Using flagstat in order to not count duplicate reads
    total_index = 0
    dups_index = 3
    flagstat_regex = r"(\d+).*"
    flagstat = pysam.flagstat(inbam.filename)
    num_total = int(re.match(flagstat_regex, flagstat[total_index]).group(1))
    num_dups = int(re.match(flagstat_regex, flagstat[dups_index]).group(1))
    genome_num_mapped = num_total - num_dups
    # Calculate genome length
    genome_length = sum(inbam.lengths)
    if excl_bed:
        # Correct for excluded regions
        excl_length = 0
        excl_num_mapped = 0
        for interval in excl_bed:
            excl_length += interval.length
            excl_num_mapped += count_interval_reads(inbam, interval)
        genome_num_mapped -= excl_num_mapped
        genome_length -= excl_length
    genome_cov = read_length * genome_num_mapped / float(genome_length)
    stats["Genome_Coverage"] = round(genome_cov, 3)

    # Calculate on-target coverage
    target_length = 0
    target_num_mapped = 0
    viewed_reads = {}
    for interval in inbed:
        target_length += interval.length
        target_num_mapped += count_interval_reads(inbam, interval,
                                                  viewed_reads)
    target_cov = read_length * target_num_mapped / float(target_length)
    stats["Target_Coverage"] = round(target_cov, 3)

    # Calculate percent on-target
    percent_on_target = target_num_mapped / float(genome_num_mapped) * 100
    stats["Percent_Reads_On_Target"] = round(percent_on_target, 3)

    # Calculate percent fold enrichment
    percent_fold_enrichment = target_cov / genome_cov * 100
    stats["Percent_Fold_Enrichment"] = round(percent_fold_enrichment, 3)

    # Write out stats to file
    with open(output, "w") as outfile:
        for k, v in stats.items():
            stats[k] = str(v)
        outfile.write("\t".join(stats.keys()) + "\n")
        outfile.write("\t".join(stats.values()) + "\n")

    # Cleanup
    inbam.close()
Example #21
                    default=872674,
                    help='Seed for the sampler.')
args = parser.parse_args()

if args.number_reads and args.same_depth:
    logging.warning("--number-reads was passed; ignoring --same-depth")
    args.same_depth = False

if args.number_reads is None and not args.same_depth:
    logging.error("--number-reads or --same-depth must be passed")
    sys.exit(1)

# Determine the number of QC-passed reads in each of the bam files
logging.info("Determining the number of QC-passed reads in each file")

flagstats = [pysam.flagstat(x).split('\n')[0] for x in args.bam]
number_reads = [
    int(
        re.match(
            r'\A(\d+) \+ (\d+) in total \(QC-passed reads \+ QC-failed reads\)\Z',
            i).group(1)) for i in flagstats
]

for index, bam in enumerate(args.bam):
    logging.info("Found {} reads in {}".format(number_reads[index], bam))

# Now perform the subsampling
for index, bam in enumerate(args.bam):

    bam_prefix = re.match(r'\A(.*).bam\Z', os.path.basename(bam)).group(1)
    subsample_fraction = None
Example #22
def main(options):
    """
    
    Runs all analyses
    
    one thing to do is make graphs fail gracefully 
    
    """

    #from subprocess import Popen, PIPE
    #host = Popen(["hostname"], stdout=PIPE).communicate()[0].strip()

    #gets clusters in a bed tools + names species
    clusters = options.clusters
    species = options.species
    clusters_bed = pybedtools.BedTool(clusters)

    #makes output file names
    clusters = str.replace(clusters, ".BED", "")
    options.k = map(int, options.k)
    outdir = options.outdir

    #sets up output dirs
    make_dir(outdir)

    assigned_dir = os.path.join(outdir, "assigned")
    misc_dir = os.path.join(outdir, "misc")
    fastadir = os.path.join(outdir, "fasta")
    kmerout = os.path.join(outdir, "kmer")
    homerout_base = os.path.join(outdir, "homer")
    make_dir(homerout_base)
    homerout = os.path.join(homerout_base, clusters)

    make_dir(assigned_dir)
    make_dir(misc_dir)
    make_dir(fastadir)
    make_dir(homerout)
    make_dir(kmerout)

    all_regions = (["all", "exon", "UTR3", "UTR5", "proxintron", "distintron"])

    #Not quite sure what's going on here, but it's one logical block
    #either reassigns clusters to genic regions or reads from already
    #made assigned lists

    if options.assign is False:
        try:
            cluster_regions, sizes, Gsizes = build_assigned_from_existing(
                assigned_dir, clusters, all_regions, options.nrand)
            print "I used a pre-assigned set of BED files... score!"
        except:
            print "I had problems retreiving region-assigned BED files from %s, i'll rebuild" % (
                assigned_dir)
            options.assign = True

    if options.assign is True:
        print "Assigning Clusters to Genic Regions"
        cluster_regions, sizes, Gsizes = assign_to_regions(
            clusters_bed,
            options.genome_location,
            options.regions_location,
            species=species,
            getseq=True,
            nrand=options.nrand)
        print "Done Assigning"

        print "Saving BED and Fasta Files...",

        #outputs little files (maybe move inside of assign to regions)
        sizes_out = open(
            os.path.join(assigned_dir, "%s.sizes.pickle" % (clusters)), 'w')
        pickle.dump(sizes, file=sizes_out)
        sizes_out.close()
        Gsizes_out = open(os.path.join(assigned_dir, "Gsizes.pickle"), 'w')
        pickle.dump(Gsizes, file=Gsizes_out)
        Gsizes_out.close()

        #this is where all saving happens for assign to regions
        for region in all_regions:
            of = clusters + "." + region + ".real.BED"
            try:
                cluster_regions[region]['real'].saveas(
                    os.path.join(assigned_dir, of))
            except:
                continue
            for n in range(options.nrand):
                of = clusters + "." + region + ".rand." + str(n) + ".BED"
                try:
                    cluster_regions[region]['rand'][n].saveas(
                        os.path.join(assigned_dir, of))
                except:
                    continue

        print "done"

        #creates pretty file names for all regions
        for region in all_regions:
            try:
                real_fa = fa_file(clusters,
                                  region=region,
                                  fd=fastadir,
                                  type="real")
                rand_fa = fa_file(clusters,
                                  region=region,
                                  fd=fastadir,
                                  type="random")
                cluster_regions[region]['real'].save_seqs(real_fa)

                l = list()  #list of randoms
                for n in cluster_regions[region]['rand'].keys():
                    l.append(cluster_regions[region]['rand'][n])
                write_seqs(rand_fa, l)
            except:
                continue

    print "Counting reads in clusters...",

    #generates data for figure 1 and 2
    #gets reads in clusters (figure 1)
    #gets reads per cluster (figure 2)
    reads_in_clusters = 0
    reads_per_cluster = list()
    for cluster in cluster_regions['all']['real']:
        chr, start, stop, name, score, strand, tstart, tstop = str(
            cluster).strip().split("\t")
        try:
            gene, n, reads = name.split("_")
        except:
            try:
                gene, n, reads = name.split(";")[0].split("_")
            except:
                pass
        if int(reads) > 1:
            reads_per_cluster.append(int(reads))
        reads_in_clusters += int(reads)
    print "done"

    #need to get rid of this pickling business, it's a waste of space and doesn't work with other methods
    #gets total number of reads (figure 1)
    #gets total number of reads from clipper analysis (Need to make clipper automatically output
    #pickle file
    print "Getting total number of reads...",
    total_reads = 0
    try:
        pickle_file = clusters + ".pickle"
        if os.path.exists(pickle_file):
            pf = pickle.load(open(pickle_file, 'rb'))
        else:
            print "Couldn't find %s" % (pickle_file)
        print "Found %s" % (pickle_file)
        for gene in pf:
            total_reads += gene['nreads']

    #if clipper didn't output gets it from flagstat
    except:
        print "Couldn't find a pickled file, resorting to flagstat for total reads. (this includes intergenic reads)"
        flagstats = pysam.flagstat(options.bam)
        total_reads = int(flagstats[2].split(" ")[0])

    print "done, there were %d" % (total_reads)
    print "Gathering bed lengths...",

    #one stat is just generated here
    #generates cluster lengths (figure 3)
    cluster_lengths = bedlengths(cluster_regions['all']['real'])
    print "done"

    ##This should be abstracted to some sort of list or something...
    #figures 5 and 6, builds pre-mrna, mrna exon and intron distributions
    mRNA_positions = list()
    premRNA_positions = list()
    intron_positions = list()
    exon_positions = list()

    #also builds figure 10 (exon distances)
    GENES, Gtypes = build_AS_STRUCTURE_dict(species, options.as_structure)
    types = {}
    for type in ["CE:", "SE:", "MXE:", "A5E:", "A3E:"]:
        types[type] = 0
    print "locating clusters within genes",

    try:
        #counts nearest exon to peak and gets RNA
        #gets rna positon for every line as well
        for line in (cluster_regions['all']['real']):
            mRNA_frac, premRNA_frac, exon_frac, intron_frac, nearest_type = RNA_position(
                line, GENES)
            if mRNA_frac is not None:
                mRNA_positions.append(mRNA_frac)
            if exon_frac is not None:
                exon_positions.append(exon_frac)
            if premRNA_frac is not None:
                premRNA_positions.append(premRNA_frac)
            if intron_frac is not None:
                intron_positions.append(intron_frac)
            if nearest_type is not None:
                try:
                    types[nearest_type] += 1
                except:
                    types[nearest_type] = 1
    except:
        print "there were errors, skipping"
    print "done"

    #gtypes is total genomic content
    #types is what clusters are
    #generates figure 10 (exon distances)
    type_count = [
        types["CE:"], types["SE:"], types["MXE:"], types["A5E:"], types["A3E:"]
    ]
    Gtype_count = [
        Gtypes["CE:"], Gtypes["SE:"], Gtypes["MXE:"], Gtypes["A5E:"],
        Gtypes["A3E:"]
    ]

    ### write fasta files and run homer and/or kmer analysis if at least one analysis is requested
    #runs kmer and homer analysis

    kmer_results = {}
    if options.reMotif is True:
        for region in all_regions:

            #reads nicely named files
            real_fa = fa_file(clusters,
                              region=region,
                              fd=fastadir,
                              type="real")
            rand_fa = fa_file(clusters,
                              region=region,
                              fd=fastadir,
                              type="random")
            if options.k is not None:
                if options.homer is True:
                    region_homer_out = os.path.join(homerout, region)
                    run_homer(real_fa,
                              rand_fa,
                              options.k,
                              outloc=region_homer_out)
                for k in options.k:
                    # accumulate per-k results across regions; re-creating
                    # kmer_results[k] here would discard earlier regions
                    kmer_results.setdefault(k, {})
                    kmer_results[k][region] = kmerdiff(real_fa, rand_fa, k)
                    kmerfile = clusters + ".k" + str(
                        k) + "." + region + ".kmerdiff"
                    kmerfile = os.path.join(kmerout, kmerfile)
                    kmer_sorted_output = run_kmerdiff(real_fa,
                                                      rand_fa,
                                                      outfile=kmerfile,
                                                      k=k)

    #all the different motifs that the user specifies
    motifs = list(options.motif)
    kmer_box_params = [kmerout, clusters, options.k, motifs]

    ###conservation --should use multiprocessing to speed this part up!
    #start of conservation logic, very slow...
    phast_values = list()

    #loads phastcons values of generates them again
    if options.rePhast is False:
        try:
            phast_values = pickle.load(
                open(os.path.join(misc_dir, "%s.phast.pickle" % (clusters))))
        except:
            options.rePhast = True

    #generates again
    if options.rePhast is True:
        print "Fetching Phastcons Scores...",

        #phastcons values for all regions except "all"
        for region in all_regions[1:]:  #skip "all" combine them later
            print("%s..." % (region)),
            try:
                samplesize = 1000

                #because it takes so long to fetch only select 1000 of them, not actually
                #implemented
                if len(cluster_regions[region]['real']) > samplesize:
                    R1 = cluster_regions[region]['real']
                    # R1 = random.sample(cluster_regions[region]['real'], samplesize)
                else:
                    R1 = cluster_regions[region]['real']

                #realPhast = get_phastcons(cluster_regions[region]['real'], species=options.species)
                print "getting real...",

                #gets phastcons values real regions
                realPhast = get_phastcons(R1,
                                          options.phastcons_location,
                                          species=options.species)
                randPhast = list()

                #logic for random stuff (could be precomputed)
                for i in range(options.nrand):
                    if len(cluster_regions[region]['rand'][i]) > samplesize:
                        R2 = cluster_regions[region]['rand'][i]
                        #R2 = random.sample(cluster_regions[region]['rand'][i], samplesize)
                    else:
                        R2 = cluster_regions[region]['rand'][i]
                    print("getting rand %d" % (i)),
                    randPhast.extend(
                        get_phastcons(R2,
                                      options.phastcons_location,
                                      species=options.species).tolist())

                #list of lists for real and random for every genic region
                phast_values.append(realPhast)
                phast_values.append(randPhast)

            except:
                continue

        #hacky selection of real values from phast_values
        all_real = np.concatenate(phast_values[::2])

        #hacky selection of random values from phast_values
        all_rand = np.concatenate(phast_values[1::2])

        #adds back in all and rand to phast_values list
        phast_values.insert(0, all_rand)
        phast_values.insert(0, all_real)
        pickout = open(os.path.join(misc_dir, "%s.phast.pickle" % (clusters)),
                       'w')
        pickle.dump(phast_values, file=pickout)

    Zscores = None  #old. remove

    #build qc figure
    QCfig_params = [
        reads_in_clusters, (total_reads - reads_in_clusters), cluster_lengths,
        reads_per_cluster, premRNA_positions, mRNA_positions, exon_positions,
        intron_positions, Gsizes, sizes, Gtype_count, type_count, Zscores,
        homerout, kmer_box_params, phast_values
    ]

    #save results
    pickout = open(
        os.path.join(outdir, "misc", "%s.qcfig_params.pickle" % (clusters)),
        'w')
    pickle.dump(QCfig_params, file=pickout)
    QCfig = CLIP_Analysis_Display.CLIP_QC_figure(*QCfig_params)
    fn = clusters + ".QCfig.pdf"
    outFig = os.path.join(outdir, fn)

    #TODO Fix output of file (don't know why it's crashing right now)
    print >> sys.stderr, outFig
    QCfig.savefig(outFig)

    ### does something with motifs doesn't appear to work right now

    #reads in existing precompiled motif file
    motifs = list(options.motif)

    if motifs is not None and False:  #TODO hack to get stuff compiling fix soon
        motifBASE = options.motif_location
        fig = pylab.figure(figsize=(8.5, 11))
        colors = [
            "red", "orange", "green", "blue", "purple", "brown", "black",
            "pink", "gray", "cyan", "magenta"
        ]
        for i, motif in enumerate(motifs):
            mf = "motif_" + motif + ".BED"
            mfgz = "motif_" + motif + ".BED.gz"
            print os.path.join(motifBASE, species, mf)
            motifFILE = None

            if os.path.exists(os.path.join(motifBASE, species, mf)):
                motifFILE = os.path.join(motifBASE, species, mf)
            elif os.path.exists(os.path.join(motifBASE, species, mfgz)):
                motifFILE = os.path.join(motifBASE, species, mfgz)
            else:
                print "MOTIF BED FILE for motif: %s is not available, please build it" % (
                    mf)
                continue

            #plots motif distance from the precompiled file to the clusters
            plot_motif_dist(cluster_regions,
                            motifFILE,
                            fig,
                            color=colors[i],
                            species=species,
                            slopsize=200)
        pylab.savefig(clusters + ".motif_distribution.pdf")
Example #23
def main():

    # Argument parsing
    parser = argparse.ArgumentParser(description="Calculate useful stats for "
                                     "capture-based experiments.")
    parser.add_argument("bam_file", help="Input BAM file (sorted, rmduped and indexed). "
                        "Assumes that the BAM file contains reads of one length.")
    parser.add_argument("bed_file", help="BED file listing target intervals.")
    parser.add_argument("--output", help="Output TSV stats file.")
    parser.add_argument("--exclude", help="BED file listing regions to be excluded. "
                        "Useful for pooled libraries sharing the same multiplex index.")
    args = parser.parse_args()

    # Initialize variables
    inbam = pysam.AlignmentFile(args.bam_file, "rb")
    inbed = cancer_api.files.BedFile.open(args.bed_file)
    excl_bed = cancer_api.files.BedFile.open(args.exclude) if args.exclude else None
    output = args.output if args.output else inbam.filename + ".capture_stats.tsv"
    stats = OrderedDict()

    # Define helper functions
    def count_interval_reads(bam_file, interval, viewed_reads={}):
        """Counts the number of reads in a given interval in a BAM file.
        Adds viewed reads to dictionary to avoid double-counting of reads.
        """
        reads_iterator = inbam.fetch(interval.chrom, interval.start_pos, interval.end_pos)
        read_qnames = [r.query_name for r in reads_iterator
                       if r.query_name not in viewed_reads and not r.is_duplicate]
        num_reads = len(read_qnames)
        for qname in read_qnames:
            viewed_reads[qname] = 1
        return num_reads

    # Calculate overall coverage
    read_length = inbam.head(1).next().query_length
    # Using flagstat in order to not count duplicate reads
    total_index = 0
    dups_index = 3
    flagstat_regex = r"(\d+).*"
    flagstat = pysam.flagstat(inbam.filename)
    num_total = int(re.match(flagstat_regex, flagstat[total_index]).group(1))
    num_dups = int(re.match(flagstat_regex, flagstat[dups_index]).group(1))
    genome_num_mapped = num_total - num_dups
    # Calculate genome length
    genome_length = sum(inbam.lengths)
    if excl_bed:
        # Correct for excluded regions
        excl_length = 0
        excl_num_mapped = 0
        for interval in excl_bed:
            excl_length += interval.length
            excl_num_mapped += count_interval_reads(inbam, interval)
        genome_num_mapped -= excl_num_mapped
        genome_length -= excl_length
    genome_cov = read_length * genome_num_mapped / float(genome_length)
    stats["Genome_Coverage"] = round(genome_cov, 3)

    # Calculate on-target coverage
    target_length = 0
    target_num_mapped = 0
    viewed_reads = {}
    for interval in inbed:
        target_length += interval.length
        target_num_mapped += count_interval_reads(inbam, interval, viewed_reads)
    target_cov = read_length * target_num_mapped / float(target_length)
    stats["Target_Coverage"] = round(target_cov, 3)

    # Calculate percent on-target
    percent_on_target = target_num_mapped / float(genome_num_mapped) * 100
    stats["Percent_Reads_On_Target"] = round(percent_on_target, 3)

    # Calculate percent fold enrichment
    percent_fold_enrichment = target_cov / genome_cov * 100
    stats["Percent_Fold_Enrichment"] = round(percent_fold_enrichment, 3)

    # Write out stats to file
    with open(output, "w") as outfile:
        for k, v in stats.items():
            stats[k] = str(v)
        outfile.write("\t".join(stats.keys()) + "\n")
        outfile.write("\t".join(stats.values()) + "\n")

    # Cleanup
    inbam.close()
Example #24
def has_reads_mapped(bam):
    return int(pysam.flagstat(bam).split('\n')[4][0]) > 0
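The one-liner above works only by accident of formatting: [4][0] is the first digit of the count on line five, and the leading digit of a non-zero count is itself non-zero. It still assumes "mapped" is the fifth line. A more explicit sketch:

import pysam

def has_reads_mapped_safe(bam):
    # Find the "mapped (" line by content and parse the full leading count.
    for line in pysam.flagstat(bam).splitlines():
        if " mapped (" in line:
            return int(line.split()[0]) > 0
    return False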
Example #25
def main():
    """Inputs:
    -- Expression output table file
    -- Transcript/gene symbol containing column number
    -- Reads per Tx/gene containing column number
    -- Bam file
    -- List of transcript/gene symbols
    -- Name of condition/sample
    Outputs:
    -- table of transcript/gene symbols, condition/sample name, original read count, and read counts normalized to each million mapped reads."""
    
    
    usage = """python %prog [options]"""
    parser = optparse.OptionParser(usage=usage)
    
    parser.add_option('-t', type="string", default=False,
                      help="""Name of a table file containing Tx/Gene symbols and respective read counts for at least one condition/sample. (default=%default)""")
    parser.add_option('-c', type='string', default=False,
                      help="""Name of the condition/sample. (default=%default)""")
    parser.add_option('--tx-col', dest="tx_col", type='string', default=False,
                      help="""The column number containing Tx/Gene symbols. (default=%default)""")
    parser.add_option('--rd-col', dest="rd_col", type='string', default=False,
                      help="""The column number containing read counts. (default=%default)""")
    parser.add_option('-b', type='string', default=False,
                      help="""Path to the bam file representing the aligned reads for the desired condition/sample. (default=%default)""")
    parser.add_option('-l', type='string', default="All",
                      help="""Quoted, comma-delim'd list of transcript/gene symbols. (default=%default)""")


    
    (opts, args) = parser.parse_args()
    
    if len(sys.argv) == 1:
        parser.print_help()
        exit(1)
    
    if not opts.l == "All":
        opts.l = opts.l.split(',')
    else:
        opts.l = ''

    opts.tx_col = int(opts.tx_col)
    opts.rd_col = int(opts.rd_col)
    
    # get million mapped reads
    milMapped = float(pysam.flagstat(opts.b)[3].split()[0])/1000000
    #milMapped = 7.5 # for quick debugging to bybass the bam filtering
    
    # parse list of tx/gene symbols into dict for tracking successes
    ##symbols = {}
    ##if not opts.l == "All":
        ##for sym in opts.l:
            ##symbols[sym] = []
    
        
    
    # open expression table into rows
    rows = csv.reader(open(opts.t),delimiter='\t')
    
    # for each tx/gene in expFile: output million mapped reads (MMR)
    #   if row[opts.tx_col].startswith(<any of the requested symbols>)
    #   and update symbols with True if a hit is found.
    print "Tx_symbol\tCondition\tOriginal_reads\tReads_per_million_mapped"
    for row in rows:
        if row[opts.tx_col].startswith(tuple(opts.l)):
            try:
                print "%s\t%s\t%s\t%s" % (row[opts.tx_col],
                                      opts.c,
                                      row[opts.rd_col],
                                      float(row[opts.rd_col])/milMapped)
            except:
                print "failed:  %s" % (';'.join(row))
Example #27
def main():

    args = args_setup()

    path_exists(args.input)
    path_dir(args.output)
    check_ext(args.input)

    if args.mapread_stats:
        mapping_stats(args.input)

    if (args.bam_filter):
        bam_filter(args.input, args.ref_match, args.map_qual, args.read_qual,
                   args.pident, args.read_len)

    #Check the case if --bam-filter was not selected but default arguments were changed:
    if (
            args.map_qual or args.read_qual or args.pident
            or args.read_len != 0 or args.ref_match != '.*'
    ):  #if this statement is True, default arguments were changed, next is to check if --bam-filter was selected:
        if (
                not args.bam_filter
        ):  #if this statement is False, --bam-filter was selected & run with default arguments
            sys.exit(
                "Error: -b/--bam_filter argument is missing"
            )  #if True, --bam-filter was not selected but default arguments were changed

    if (args.bamtofastq):
        bam_to_fastq(args.input)

    if (args.mapped_reads):
        extract_mapped_reads(args.input)

    if (args.fastq_concat):
        fastq_concat(args.input)

    if (args.fastq_fasta):
        fastq_to_fasta(args.input)

    if (args.fastq_filter):
        fastq_filter(args.input, args.fastq_filter)

    if (args.vcf_filter):
        vcf_filter(args.input, args.vcf_filter)

    #several samtools commands (samtools functionality with pysam)
    file_name = os.path.split(
        args.input[0])[1]  #obtain only the file name without the path
    if args.sort:
        pos_args_len(args.input, 1)
        ps.sort("-o", file_name + ".sorted.bam", args.input[0])
        print("Sorted bam file <" + file_name + ".sorted.bam" +
              "> was created")
    if args.index:
        pos_args_len(args.input, 1)
        ps.index(args.input[0])
        print("Index file <" + file_name + ".bai" + "> was created")
    if args.flagstat:
        pos_args_len(args.input, 1)
        print("Reads mapping summary for %s" % (file_name))
        print(ps.flagstat(args.input[0]))
Example #29
def proc(arg):
    bamfile = arg[0]
    rmdup = arg[1]
    errorlog = arg[2]
    if errorlog == "stderr":
        errorlog = sys.stderr
    if rmdup == "False": rmdup = False

    bam_dir = "/".join(bamfile.split("/")[:-1]) + "/"
    bam_prefix = os.path.basename(bamfile).split(".bam")[0]
    mapped_bam = bam_dir + bam_prefix + "_mapped.bam"
    rmdup_bam = bam_dir + bam_prefix + "_rmdup.bam"
    sort_bam = bam_dir + bam_prefix + "_sort"

    stat_dir = bam_dir + "stat/"
    if not os.path.exists(stat_dir): os.makedirs(stat_dir)

    if not os.path.exists(mapped_bam):
        print >> errorlog, "Removing unmapped..."
        mapped = 0
        unmapped = 0
        bam = pysam.Samfile(bamfile, 'rb')
        mb = pysam.Samfile(mapped_bam, 'wb', template=bam)
        try:
            for read in bam:
                if not read.is_unmapped:
                    mapped = mapped + 1
                    mb.write(read)
                else:
                    unmapped = unmapped + 1
        except:
            errorlog.write(
                "Failed to remove unmapped reads: read number {0}\n".format(
                    mapped + unmapped))
            raise
        else:
            errorlog.write(
                "Unmapped read removal successful: Mapped {0}/Unmapped {1}\n".
                format(mapped, unmapped))

        bam.close()
        mb.close()

    if not os.path.exists(sort_bam + ".bam"):

        print >> errorlog, "Sorting..."

        try:
            cmd_args = [
                'java', '-Xmx2g', '-jar', '/seq/picard/SortSam.jar',
                'INPUT=' + mapped_bam,
                'OUTPUT=' + sort_bam + '.bam',
                'SORT_ORDER=coordinate'
            ]
            p = Popen(cmd_args, stdout=errorlog, stderr=errorlog)
            p.wait()
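            #note: p.wait() returns Picard's exit status; Popen itself only
            #raises if the java binary cannot be launched, so a failed sort
            #is not caught by this except block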
        except:
            errorlog.write("Sorting failed.\n")
            raise
        else:
            os.remove(mapped_bam)

    if not os.path.exists(rmdup_bam) and rmdup:
        print "Removing duplicates..."
        rmdup_metrics = stat_dir + bam_prefix + "_rmdup_metrics"
        cmd_args = [
            'java', '-Xmx2g', '-jar', '/seq/picard/MarkDuplicates.jar',
            'INPUT=' + sort_bam + '.bam',
            'OUTPUT=' + rmdup_bam,
            'METRICS_FILE=' + rmdup_metrics,
            'REMOVE_DUPLICATES=true',
            'ASSUME_SORTED=true'
        ]
        try:
            p = Popen(cmd_args, stdout=errorlog, stderr=errorlog)
            p.wait()
        except:
            errorlog.write("Failed to remove duplicates.\n")
            raise

        try:
            print >> errorlog, "Indexing..."
            pysam.index(rmdup_bam)
        except SamtoolsError as detail:
            print >> errorlog, "Indexing failed: ", detail
    else:
        try:
            print >> errorlog, "Indexing..."
            sort_bam = sort_bam + ".bam"
            pysam.index(sort_bam)
        except SamtoolsError as detail:
            print >> errorlog, "Indexing failed: ", detail

    bamfile_fs = open(stat_dir + bam_prefix + "_stat", 'w')
    for line in pysam.flagstat(bamfile):
        bamfile_fs.write(line)
    bamfile_fs.close()

    return 0
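The unmapped-read filtering loop above can also be written against pysam's newer AlignmentFile API (Samfile is the legacy name for the same class); a minimal sketch under that assumption:

import pysam

def split_mapped(bam_path, out_path):
    #write only mapped reads to out_path; return (mapped, unmapped) counts
    mapped = unmapped = 0
    with pysam.AlignmentFile(bam_path, "rb") as bam, \
         pysam.AlignmentFile(out_path, "wb", template=bam) as out:
        for read in bam:
            if read.is_unmapped:
                unmapped += 1
            else:
                mapped += 1
                out.write(read)
    return mapped, unmapped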
Example #30
def main(options):
    """
    Runs all analyses.

    One thing to do is make graphs fail gracefully.
    """
    
    #from subprocess import Popen, PIPE
    #host = Popen(["hostname"], stdout=PIPE).communicate()[0].strip()
    
    #gets clusters in a bed tools + names species 
    clusters = options.clusters
    species = options.species
    clusters_bed = pybedtools.BedTool(clusters)

    #makes output file names 
    clusters = str.replace(clusters, ".BED", "")
    options.k = map(int, options.k)
    outdir = options.outdir
    
    #sets up output dirs
    make_dir(outdir)        

    assigned_dir = os.path.join(outdir, "assigned")
    misc_dir = os.path.join(outdir, "misc")
    fastadir = os.path.join(outdir, "fasta")    
    kmerout = os.path.join(outdir, "kmer")
    homerout_base = os.path.join(outdir, "homer")
    make_dir(homerout_base)
    homerout = os.path.join(homerout_base, clusters)    

    make_dir(assigned_dir)
    make_dir(misc_dir)
    make_dir(fastadir)
    make_dir(homerout)
    make_dir(kmerout)

    all_regions = (["all", "exon", "UTR3", "UTR5", "proxintron", "distintron"])    

    #Not quite sure what's going on here, but it's one logical block:
    #either reassigns clusters to genic regions or reads from
    #already-made assigned lists

    if options.assign is False:
        try:
            cluster_regions, sizes, Gsizes = build_assigned_from_existing(assigned_dir, clusters, all_regions, options.nrand)
            print "I used a pre-assigned set of BED files... score!"
        except:
            print "I had problems retrieving region-assigned BED files from %s, I'll rebuild" % (assigned_dir)
            options.assign = True
            
    if options.assign is True:
        print "Assigning Clusters to Genic Regions"
        cluster_regions, sizes, Gsizes = assign_to_regions(clusters_bed,options.genome_location, options.regions_location, species=species, getseq=True, nrand=options.nrand)
        print "Done Assigning"
        
        print "Saving BED and Fasta Files...",

        #outputs little files (maybe move inside of assign to regions)
        sizes_out = open(os.path.join(assigned_dir, "%s.sizes.pickle" %(clusters)), 'w')
        pickle.dump(sizes, file=sizes_out)
        sizes_out.close()    
        Gsizes_out = open(os.path.join(assigned_dir, "Gsizes.pickle"), 'w')
        pickle.dump(Gsizes, file=Gsizes_out)
        Gsizes_out.close()
        
        #this is where all saving happens for assign to regions
        for region in all_regions:
            of = clusters + "." + region+ ".real.BED"
            try:
                cluster_regions[region]['real'].saveas(os.path.join(assigned_dir, of))
            except:
                continue
            for n in range(options.nrand):
                of = clusters + "." + region+ ".rand." + str(n) + ".BED"
                try:
                    cluster_regions[region]['rand'][n].saveas(os.path.join(assigned_dir, of))
                except:
                    continue
                
        print "done"

        #creates pretty file names for all regions
        for region in all_regions:
            try:
                real_fa = fa_file(clusters, region=region, fd = fastadir, type="real")
                rand_fa = fa_file(clusters, region=region, fd = fastadir, type="random")
                cluster_regions[region]['real'].save_seqs(real_fa)

                l = list()#list of randoms
                for n in cluster_regions[region]['rand'].keys():
                    l.append(cluster_regions[region]['rand'][n])
                write_seqs(rand_fa, l)        
            except:
                continue            
                                   
    print "Counting reads in clusters...",
    
    #generates data for figure 1 and 2
    #gets reads in clusters (figure 1)
    #gets reads per cluster (figure 2)
    reads_in_clusters = 0
    reads_per_cluster = list()
    for cluster in cluster_regions['all']['real']:
        chr, start, stop, name, score, strand, tstart, tstop = str(cluster).strip().split("\t")
        try:
            gene, n, reads = name.split("_")
        except:
            try:
                gene, n, reads = name.split(";")[0].split("_")
            except:
                pass
        if int(reads)> 1:
            reads_per_cluster.append(int(reads))
        reads_in_clusters += int(reads)
    print "done"
    
    #need to get rid of this pickling business, it's a waste of space and doesn't work with other methods
    #gets total number of reads (figure 1)
    #gets total number of reads from clipper analysis (need to make clipper
    #automatically output the pickle file)
    print "Getting total number of reads...",
    total_reads = 0
    try:
        pickle_file = clusters + ".pickle"
        if os.path.exists(pickle_file):
            pf = pickle.load(open(pickle_file, 'rb'))
            print "Found %s" % (pickle_file)
        else:
            print "Couldn't find %s" % (pickle_file)
        for gene in pf:
            total_reads += gene['nreads']
    
    #if clipper didn't output gets it from flagstat
    except:
        print "Couldn't find a pickled file, resorting to flagstat for total reads. (this includes intergenic reads)"
        flagstats = pysam.flagstat(options.bam)
        total_reads = int(flagstats[2].split(" ")[0])
        
    print "done, there were %d" %(total_reads)
    print "Gathering bed lengths...",
    
    #one stat is just generated here
    #generates cluster lengths (figure 3)
    cluster_lengths = bedlengths(cluster_regions['all']['real'])
    print "done"
    
    ##This should be abstracted to some sort of list or something...
    #figures 5 and 6, builds pre-mrna, mrna exon and intron distributions 
    mRNA_positions = list()
    premRNA_positions = list()
    intron_positions = list()
    exon_positions = list()
    
    #also builds figure 10 (exon distances)
    GENES, Gtypes = build_AS_STRUCTURE_dict(species, options.as_structure)
    types = {}
    for type in ["CE:", "SE:", "MXE:", "A5E:", "A3E:"]:
        types[type]=0
    print "locating clusters within genes",
    
    
    try:
        #counts nearest exon to peak and gets RNA 
        #gets rna positon for every line as well
        for line in (cluster_regions['all']['real']):
            mRNA_frac, premRNA_frac, exon_frac, intron_frac, nearest_type = RNA_position(line, GENES)
            if mRNA_frac is not None:
                mRNA_positions.append(mRNA_frac)
            if exon_frac is not None:
                exon_positions.append(exon_frac)
            if premRNA_frac is not None:
                premRNA_positions.append(premRNA_frac)
            if intron_frac is not None:
                intron_positions.append(intron_frac)
            if nearest_type is not None:
                try:
                    types[nearest_type] += 1
                except:
                    types[nearest_type] =1
    except:
        print "there were errors, skipping"
    print "done"
    
    #gtypes is total genomic content 
    #types is what clusters are
    #generates figure 10 (exon distances)
    type_count = [types["CE:"], types["SE:"], types["MXE:"], types["A5E:"], types["A3E:"]]
    Gtype_count = [Gtypes["CE:"], Gtypes["SE:"], Gtypes["MXE:"], Gtypes["A5E:"], Gtypes["A3E:"]]    

    ### write fasta files and run homer and/or kmer analysis if at least one analysis is requested
    #runs kmer and homer analysis
    
    kmer_results = {} 
    if options.reMotif is True:
        for region in all_regions:

            #reads nicely named files 
            real_fa = fa_file(clusters, region=region, fd =  fastadir, type="real")
            rand_fa = fa_file(clusters, region=region, fd =  fastadir, type="random")
            if options.k is not None:
                if options.homer is True:
                    region_homer_out = os.path.join(homerout, region)
                    run_homer(real_fa, rand_fa, options.k,  outloc=region_homer_out)
                for k in options.k:
                    #initialise once per k so results for earlier regions are kept
                    kmer_results.setdefault(k, {})
                    kmer_results[k][region] = kmerdiff(real_fa, rand_fa, k)
                    kmerfile = clusters + ".k" + str(k) + "." + region + ".kmerdiff"
                    kmerfile = os.path.join(kmerout, kmerfile)
                    kmer_sorted_output = run_kmerdiff(real_fa, rand_fa, outfile=kmerfile, k=k)

            
    #all the different motifs that the user specifies
    motifs = list(options.motif)
    kmer_box_params = [kmerout, clusters, options.k, motifs]

    ###conservation --should use multiprocessing to speed this part up!
    #start of conservation logic, very slow...
    phast_values = list()
    
    #loads phastcons values or generates them again
    if options.rePhast is False:
        try:
            phast_values = pickle.load(open(os.path.join(misc_dir, "%s.phast.pickle" %(clusters))))
        except:
            options.rePhast = True

    #generates again
    if options.rePhast is True:
        print "Fetching Phastcons Scores...",
        
        #phastcons values for all regions except "all"
        for region in all_regions[1:]: #skip "all" combine them later
            print ("%s..." %(region)),
            try:
                samplesize = 1000

                #because fetching takes so long, only select 1000 of them
                #(the sampling is not actually implemented)
                if len(cluster_regions[region]['real']) > samplesize:
                    R1 = cluster_regions[region]['real']                
                    # R1 = random.sample(cluster_regions[region]['real'], samplesize)
                else:
                    R1 = cluster_regions[region]['real']

                #realPhast = get_phastcons(cluster_regions[region]['real'], species=options.species)
                print "getting real...",
                
                #gets phastcons values real regions 
                realPhast = get_phastcons(R1, options.phastcons_location, species=options.species)
                randPhast = list()
                
                #logic for random stuff (could be precomputed)
                for i in range(options.nrand):
                    if len(cluster_regions[region]['rand'][i]) > samplesize:
                        R2 = cluster_regions[region]['rand'][i]                    
                        #R2 = random.sample(cluster_regions[region]['rand'][i], samplesize)
                    else:
                        R2 = cluster_regions[region]['rand'][i]
                    print ("getting rand %d" %(i)),
                    randPhast.extend(get_phastcons(R2, options.phastcons_location, species=options.species).tolist())
                
                #list of lists for real and random for every genic region
                phast_values.append(realPhast)
                phast_values.append(randPhast)
            
            except:
                continue
            
        #hacky selection of real values from phast_values
        all_real = np.concatenate(phast_values[::2])
        
        #hacky selection of random values from phast_values
        all_rand = np.concatenate(phast_values[1::2])
        
        #adds back in all and rand to phast_values list
        phast_values.insert(0,all_rand)
        phast_values.insert(0,all_real)
        pickout = open(os.path.join(misc_dir, "%s.phast.pickle" %(clusters)), 'w')
        pickle.dump(phast_values, file = pickout)
    
    
    Zscores = None  #old. remove
    
    #build qc figure
    QCfig_params = [reads_in_clusters, (total_reads - reads_in_clusters), cluster_lengths, reads_per_cluster, premRNA_positions, mRNA_positions, exon_positions, intron_positions, Gsizes, sizes, Gtype_count, type_count, Zscores, homerout, kmer_box_params, phast_values]

    #save results 
    pickout = open(os.path.join(outdir, "misc", "%s.qcfig_params.pickle" %(clusters)), 'w')
    pickle.dump(QCfig_params, file = pickout)
    QCfig = CLIP_Analysis_Display.CLIP_QC_figure(*QCfig_params)
    fn = clusters + ".QCfig.pdf"
    outFig = os.path.join(outdir, fn)
    
    #TODO: fix output of file (don't know why it's crashing right now)
    print >> sys.stderr, outFig
    QCfig.savefig(outFig)
                    
    ### does something with motifs; doesn't appear to work right now
    
    #reads in existing precompiled motif file
    motifs = list(options.motif)
    
    if motifs is not None and False: #TODO: hack to disable this block until it works; fix soon
        motifBASE  = options.motif_location
        fig = pylab.figure(figsize=(8.5, 11))
        colors = ["red", "orange", "green", "blue", "purple", "brown", "black", "pink", "gray", "cyan", "magenta"]
        for i, motif in enumerate(motifs):
            mf = "motif_" + motif + ".BED"
            mfgz = "motif_" + motif + ".BED.gz"
            print os.path.join(motifBASE,species,mf)
            motifFILE = None

            if os.path.exists(os.path.join(motifBASE,species, mf)):
                motifFILE = os.path.join(motifBASE,species, mf)
            elif os.path.exists(os.path.join(motifBASE,species, mfgz)):
                motifFILE= os.path.join(motifBASE,species, mfgz)
            else:
                print "MOTIF BED FILE for motif: %s is not available, please build it" %(mf)
                continue
            
            #plots motif distance from the precompiled file to the clusters 
            plot_motif_dist(cluster_regions, motifFILE, fig, color = colors[i], species=species, slopsize=200)
        pylab.savefig(clusters + ".motif_distribution.pdf")
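Both this example and the next recover the total read count by indexing flagstat output by line number, which is brittle across samtools versions. When the BAM file is indexed, pysam exposes the same counts directly from the index; a minimal sketch (clip.bam is a placeholder path):

import pysam

with pysam.AlignmentFile("clip.bam", "rb") as bam:  #placeholder path; needs a .bai index
    #mapped/unmapped come from the BAM index, so no full pass over reads is needed
    total_reads = bam.mapped + bam.unmapped
print(total_reads)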
Example #31
def main(options):
    from subprocess import Popen, PIPE
    host = Popen(["hostname"], stdout=PIPE).communicate()[0].strip()
    #print host
    #print mpl.get_backend()
    
    clusters = options.clusters
    species = options.species
    CLUSTERS = pybedtools.BedTool(clusters)

    clusters = str.replace(clusters, ".BED", "")
    options.k = map(int, options.k)
    outdir = options.outdir

    def make_dir(dir_name):
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)

    make_dir(outdir)        

    assigned_dir = os.path.join(outdir, "assigned")
    misc_dir = os.path.join(outdir, "misc")
    fastadir = os.path.join(outdir, "fasta")    
    kmerout = os.path.join(outdir, "kmer")
    homerout_base = os.path.join(outdir, "homer")
    make_dir(homerout_base)
    homerout = os.path.join(homerout_base, clusters)    

    make_dir(assigned_dir)
    make_dir(misc_dir)
    make_dir(fastadir)
    make_dir(homerout)
    make_dir(kmerout)

    all_regions = (["all", "exon", "UTR3", "UTR5", "proxintron", "distintron"])    


    def fa_file(filename, region=None, fd=fastadir, type="real"):
        if not os.path.exists(fd):
            raise Exception("fasta directory %s does not exist" % fd)
        if region is not None:
            x = filename + "." + region + "." + type + ".fa"
            return os.path.join(fd, x)
        else:
            x = filename + "." + type + ".fa"
            return os.path.join(fd, x)

    if options.assign is False:
        try:
            CLUS_regions, sizes, Gsizes = build_assigned_from_existing(assigned_dir, clusters, all_regions, options.nrand)
            print "I used a pre-assigned set of BED files... score!"
        except:
            print "I had problems retrieving region-assigned BED files from %s, I'll rebuild" % (assigned_dir)
            options.assign = True
            
    if options.assign is True:
        print "Assigning Clusters to Genic Regions"
        CLUS_regions, sizes, Gsizes = assign_to_regions(CLUSTERS, species=species, getseq=True, nrand=options.nrand)
        print "Done Assigning"

        print "Saving BED and Fasta Files...",

        sizes_out = open(os.path.join(assigned_dir, "%s.sizes.pickle" %(clusters)), 'w')
        pickle.dump(sizes, file=sizes_out)
        sizes_out.close()    
        Gsizes_out = open(os.path.join(assigned_dir, "Gsizes.pickle"), 'w')
        pickle.dump(Gsizes, file=Gsizes_out)
        Gsizes_out.close()

        for region in all_regions:
            of = clusters + "." + region+ ".real.BED"
            try:
                CLUS_regions[region]['real'].saveas(os.path.join(assigned_dir, of))
            except:
                continue
            for n in range(options.nrand):
                of = clusters + "." + region+ ".rand." + str(n) + ".BED"
                try:
                    CLUS_regions[region]['rand'][n].saveas(os.path.join(assigned_dir, of))
                except:
                    continue
                
        print "done"

        for region in all_regions:
            try:
                real_fa = fa_file(clusters, region=region, type="real")
                rand_fa = fa_file(clusters, region=region, type="random")
                CLUS_regions[region]['real'].save_seqs(real_fa)

                l = list()#list of randoms
                for n in CLUS_regions[region]['rand'].keys():
                    l.append(CLUS_regions[region]['rand'][n])
                write_seqs(rand_fa, l)        
            except:
                continue            
                                   
    print "Counting reads in clusters...",
    reads_in_clusters = 0
    reads_per_cluster = list()
    for cluster in CLUS_regions['all']['real']:
        chr, start, stop, name, score, strand, tstart, tstop = str(cluster).strip().split("\t")
        try:
            gene, n, reads = name.split("_")
        except:
            try:
                gene, n, reads = name.split(";")[0].split("_")
            except:
                pass
        if int(reads)> 1:
            reads_per_cluster.append(int(reads))
        reads_in_clusters += int(reads)
    print "done"
    #bamfile = pysam.Samfile(options.bam, 'rb')
    print "Getting total number of reads...",
    total_reads = 0
    try:
        pickle_file = clusters + ".pickle"
        if os.path.exists(pickle_file):
            pf = pickle.load(open(pickle_file, 'rb'))
            print "Found %s" % (pickle_file)
        else:
            print "Couldn't find %s" % (pickle_file)
        for gene in pf:
            total_reads += gene['nreads']
            
    except:
        print "Couldn't find a pickled file, resorting to flagstat for total reads. (this includes intergenic reads)"
        flagstats = pysam.flagstat(options.bam)
        total_reads = int(flagstats[2].split(" ")[0])
        
    print "done, there were %d" %(total_reads)
    print "Gathering bed lengths...",
    cluster_lengths = bedlengths(CLUS_regions['all']['real'])
    print "done"
##     
    mRNA_positions = list()
    premRNA_positions = list()
    intron_positions = list()
    exon_positions = list()
    GENES, Gtypes = build_AS_STRUCTURE_dict(species)
    types = {}
    for type in ["CE:", "SE:", "MXE:", "A5E:", "A3E:"]:
        types[type]=0
    print "locating clusters within genes",
    try:
        for line in (CLUS_regions['all']['real']):
            mRNA_frac, premRNA_frac, exon_frac, intron_frac, nearest_type = RNA_position(line, GENES)
            if mRNA_frac is not None:
                mRNA_positions.append(mRNA_frac)
            if exon_frac is not None:
                exon_positions.append(exon_frac)
            if premRNA_frac is not None:
                premRNA_positions.append(premRNA_frac)
            if intron_frac is not None:
                intron_positions.append(intron_frac)
            if nearest_type is not None:
                try:
                    types[nearest_type] += 1
                except:
                    types[nearest_type] =1
    except:
        print "there were errors, skipping"
    print "done"
                                     
    type_count = [types["CE:"], types["SE:"], types["MXE:"], types["A5E:"], types["A3E:"]]
    Gtype_count = [Gtypes["CE:"], Gtypes["SE:"], Gtypes["MXE:"], Gtypes["A5E:"], Gtypes["A3E:"]]    

    ### write fasta files and run homer and/or kmer analysis if at least one analysis is requested
    if options.reMotif is True:
       
        for region in all_regions:
            try:
                real_fa = fa_file(clusters, region=region, type="real")
                rand_fa = fa_file(clusters, region=region, type="random")
                if options.k is not None:
                    if options.homer is True:
                        region_homer_out = os.path.join(homerout, region)
                        run_homer(real_fa, rand_fa, options.k,  outloc=region_homer_out)
                    for k in options.k:                    
                        kmerfile = clusters + ".k" + str(k) + "." + region + ".kmerdiff"
                        kmerfile = os.path.join(kmerout, kmerfile)
                        kmer_sorted_output = run_kmerdiff(real_fa, rand_fa, outfile=kmerfile, k=k)
            except:
                continue

    motifs = list(options.motif)
    kmer_box_params = [kmerout, clusters, options.k, motifs]

    ###conservation --should use multiprocessing to speed this part up!
    phast_values = list()
    if options.rePhast is False:
        try:
            phast_values = pickle.load(open(os.path.join(misc_dir, "%s.phast.pickle" %(clusters))))
        except:
            options.rePhast = True


    if options.rePhast is True:
        print "Fetching Phastcons Scores...",
        for region in all_regions[1:]:#skip "all" combine them later
            print ("%s..." %(region)),
            try:
                samplesize = 1000
                if len(CLUS_regions[region]['real']) > samplesize:
                    R1 = CLUS_regions[region]['real']                
                    # R1 = random.sample(CLUS_regions[region]['real'], samplesize)
                else:
                    R1 = CLUS_regions[region]['real']

                #realPhast = get_phastcons(CLUS_regions[region]['real'], species=options.species)
                print "getting real...",
                realPhast = get_phastcons(R1, species=options.species)
                randPhast = list()
                for i in range(options.nrand):
                    if len(CLUS_regions[region]['rand'][i]) > samplesize:
                        R2 = CLUS_regions[region]['rand'][i]                    
                        #R2 = random.sample(CLUS_regions[region]['rand'][i], samplesize)
                    else:
                        R2 = CLUS_regions[region]['rand'][i]
                    print ("getting rand %d" %(i)),
                    randPhast.extend(get_phastcons(R2, species=options.species).tolist())
                phast_values.append(realPhast)
                phast_values.append(randPhast)
            except:
                continue
        all_real = np.concatenate(phast_values[::2])
        all_rand = np.concatenate(phast_values[1::2])
        phast_values.insert(0,all_rand)
        phast_values.insert(0,all_real)
        pickout = open(os.path.join(misc_dir, "%s.phast.pickle" %(clusters)), 'w')
        pickle.dump(phast_values, file = pickout)
    Zscores = None  #old. remove

    QCfig_params = [reads_in_clusters, (total_reads - reads_in_clusters), cluster_lengths, reads_per_cluster, premRNA_positions, mRNA_positions, exon_positions, intron_positions, Gsizes, sizes, Gtype_count, type_count, Zscores, homerout, kmer_box_params, phast_values]

    pickout = open(os.path.join(outdir, "misc", "%s.qcfig_params.pickle" %(clusters)), 'w')
    pickle.dump(QCfig_params, file = pickout)
    QCfig = CLIP_QC_figure(*QCfig_params)
    fn = clusters + ".QCfig.pdf"
    outFig = os.path.join(outdir, fn)
    QCfig.savefig(outFig)
                    
###
    motifs = list(options.motif)
    motifBASE = basedir + "/lovci/projects/ucscBED"
    if motifs is not None:
        fig = pylab.figure(figsize=(8.5, 11))
        colors = ["red", "orange", "green", "blue", "purple", "brown", "black", "pink", "gray", "cyan", "magenta"]
        for i, motif in enumerate(motifs):
            mf = "motif_" + motif + ".BED"
            mfgz = "motif_" + motif + ".BED.gz"
            print os.path.join(motifBASE,species,mf)
            motifFILE = None
#            import code
#            code.interact(local=locals())
            if os.path.exists(os.path.join(motifBASE,species, mf)):
                motifFILE = os.path.join(motifBASE,species, mf)
            elif os.path.exists(os.path.join(motifBASE,species, mfgz)):
                motifFILE= os.path.join(motifBASE,species, mfgz)
            else:
                print "MOTIF BED FILE for motif: %s is not available, please build it" %(mf)
                continue
            plot_motif_dist(CLUS_regions, motifFILE, fig, color = colors[i], species=species, slopsize=200)
        pylab.savefig(clusters + ".motif_distribution.pdf")
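Finally, the conservation blocks in the last two examples note that sampling down to samplesize is "not actually implemented". A minimal sketch of that missing step, assuming the pybedtools intervals can be materialised into a list (the helper name subsample_intervals is hypothetical):

import random

def subsample_intervals(bedtool, samplesize=1000):
    #return at most samplesize intervals, sampled without replacement
    intervals = list(bedtool)
    if len(intervals) <= samplesize:
        return intervals
    return random.sample(intervals, samplesize)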