示例#1
0
def coverages(intersect):
    """Record, per query SV, the coverage fraction from same-type overlaps.

    Each row of *intersect* is read as eleven fields: the query SV
    (chrom, start, end, svtype, sv_id) followed by the overlapping SV
    (chrom, start, end, svtype, source, sv_id).  Only overlaps whose svtype
    equals the query's svtype are kept.  For every query with at least one
    such overlap, ``BedTool.coverage`` is run and field 7 of each result row
    is stored as a float in ``covs[sv_id]``.

    NOTE(review): ``covs`` is not defined inside this function and nothing
    is returned; it is presumably a dict in the enclosing scope -- confirm.
    """
    query_records = {}
    same_type_hits = {}
    for row in intersect:
        chrom1, start1, end1, svtype1, sv_id1 = row[0:5]
        # Only the first row seen for a query id contributes its coordinates.
        if sv_id1 not in query_records:
            query_records[sv_id1] = [[chrom1, start1, end1, svtype1]]
        chrom2, start2, end2, svtype2, source2, sv_id2 = row[5:11]
        hits = same_type_hits.setdefault(sv_id1, [])
        if svtype1 == svtype2:
            hits.append([chrom2, start2, end2, svtype2, source2, sv_id2])
    for sv_id, hits in same_type_hits.items():
        if not hits:
            continue
        query_bed = BedTool(query_records[sv_id])
        hits_bed = BedTool(hits)
        for record in query_bed.coverage(hits_bed):
            covs[sv_id] = float(record[7])
示例#2
0
def pybedtoolcoverage(dicoInit, dicoThread, thread_name):
    """Run ``BedTool.coverage`` for one thread and move the result file.

    Reads the ``'bed'``, ``'bam'`` and ``'bam_num'`` entries of
    ``dicoThread[thread_name]``, computes coverage of the BED against the
    BAM with ``d=True`` and moves the resulting file to
    ``<dicoInit['tmp']>/<bam_num>.cov``.
    """
    regions = BedTool(dicoThread[thread_name]['bed'])
    alignments = BedTool(dicoThread[thread_name]['bam'])
    result = regions.coverage(alignments, d=True)
    source_path = result.fn
    target_path = (dicoInit['tmp'] + "/" +
                   str(dicoThread[thread_name]['bam_num']) + ".cov")
    shutil.move(source_path, target_path)
示例#3
0
def compute_unique_coverage(unique_bam, repnames_bedfile):
    """Use bedtools to count the unique reads that overlap the repeat ranges.

    Prints a progress message, then returns the result of running
    ``BedTool.coverage`` (with ``counts=True``) of *repnames_bedfile*
    against *unique_bam*.
    """
    print(f"Computing overlaps between {unique_bam} and {repnames_bedfile}...")
    repeat_ranges = BedTool(repnames_bedfile)
    unique_reads = BedTool(unique_bam)
    return repeat_ranges.coverage(unique_reads, counts=True)
示例#4
0
def find_telomeres(window,genomefile,telomeres_gff,percentile,outfilename):
    """Write the genome windows richest in telomere features to a BED-like file.

    The genome described by *genomefile* is cut into windows of size
    *window*, coverage of *telomeres_gff* is computed per window, and the
    windows whose ``depth / len`` ratio is at or above the given
    *percentile* are written to *outfilename* as tab-separated
    ``chr, start, end, num`` rows (no header, no index), where ``num`` is
    ``"tel"`` followed by the row's position among the selected windows.
    """
    import numpy as np
    import pandas as pd
    from pybedtools import BedTool

    column_names = ["chr","start","end","depth","n bp","len","cov"]
    genome_windows = BedTool().window_maker(g=genomefile, w=window)
    telomere_features = BedTool(telomeres_gff)
    window_coverage = genome_windows.coverage(telomere_features)

    table = pd.read_csv(window_coverage.fn, sep="\t", names=column_names)
    table["per_bp"] = table["depth"] / table["len"]
    cutoff = np.percentile(table["per_bp"], percentile)

    # reset_index gives the selected windows a fresh 0..n-1 index that is
    # used to number them.
    selected = table[table["per_bp"] >= cutoff].reset_index()
    selected["num"] = "tel" + selected.index.map(str)
    output = selected[["chr", "start", "end", "num"]]
    output.to_csv(outfilename, sep="\t", header=None, index=None)
示例#5
0
def normalize(geneid2tval):
    """Return a new dict with every value rescaled to parts per million.

    Each value is multiplied by 1,000,000 and divided by the sum of all
    values in *geneid2tval*.
    """
    total = sum(geneid2tval.values())
    return {
        geneid: tval * 1000000 / total
        for geneid, tval in geneid2tval.items()
    }

def get_geneid(intersection):
    """Return the ``ID`` attribute from the second-to-last field of a record.

    The field is parsed as ``key=value`` pairs separated by ``;``.  Each
    pair is split on the first ``=`` only, so values that contain ``=`` are
    preserved.  Fields without ``=`` (including the empty field left by a
    trailing ``;``) are ignored instead of raising.

    Raises:
        KeyError: if no ``ID`` attribute is present.
    """
    attrs = {}
    for field in intersection[-2].split(";"):
        key, separator, value = field.partition("=")
        if separator:
            attrs[key] = value
    return attrs["ID"]

# Load the mapped reads and the gene annotation given on the command line.
mapped_reads = BedTool(args.path)
genes = BedTool(args.genes);
# Maps gene name -> per-gene value filled in by the stranded branch below.
geneid2tval = {};

if(args.stranded):
    # Stranded mode: one coverage call of the genes against the reads
    # (F=0.51, s=True, sorted=True are passed straight to bedtools).
    mapped_to_genes = 0
    for cov in genes.coverage(b=mapped_reads, F=0.51, s=True, sorted=True):
        # Column 9 is read as an integer count and divided by len(cov).
        # NOTE(review): presumably the read count appended after a 9-column
        # GFF record, normalised by feature length -- confirm.
        geneid2tval[cov.name] = int(cov[9])/len(cov);
        mapped_to_genes += int(cov[9])

    # Summary statistics go to stderr.
    sys.stderr.write("\nTotal reads: %d\nReads mapped to genes: %d\nFraction mapped %1.2f\n\n" % (len(mapped_reads), mapped_to_genes, mapped_to_genes/len(mapped_reads)))



else:
    # Unstranded mode: iterate over read/gene intersections
    # (wo=True, f=0.51, sorted=True are passed to bedtools intersect).
    geneid2count = defaultdict(float);
    curname = ''
    cur_geneids = [];
    shared_reads = 0;
    for interval in mapped_reads.intersect(b = genes, wo = True, f=0.51, sorted=True):
        #print(curname);
        #print(interval.name)
示例#6
0
                    counter += 1
                    if stop:
                        break

bins = BedTool(binfile)

# Results[window] = {"coords": [field 0, field 1, field 2],
#                    "coverage": {track name: coverage field 6}}
Results = {}
tracknames = []
for bed in sorted(bedfiles):
    (vol, fname) = os.path.split(bed)
    # Strip the track-file extension to get the track name.  The pattern is
    # a raw string: "\." in a plain literal is an invalid escape sequence.
    # The compiled regex is unchanged.
    fname = re.sub(r'\.(bed|vcf\.gz|gff)', '', fname)
    print(fname)
    tracknames.append(fname)
    bedtrack = BedTool(bed)
    cov = bins.coverage(bedtrack)
    for interval in cov:
        # Field 3 of each coverage row is used as the window key.
        window = interval[3]
        m = Results.setdefault(
            window, {"coords": [interval[0], interval[1], interval[2]]})
        # NOTE(review): which coverage statistic sits at index 6 depends on
        # how many columns binfile has -- confirm against the bin file.
        m.setdefault("coverage", {})[fname] = interval[6]

with open(tracks, "w") as ggtrack:
示例#7
0
def coverage_bed(window_bed, input_coverage_bed, output_coverage_bed):
    """Calculate TFBS %coverage on sliding windows and save it to a file.

    Runs ``BedTool.coverage`` of *window_bed* against *input_coverage_bed*
    and passes *output_coverage_bed* as the ``output`` argument.
    """
    BedTool(window_bed).coverage(BedTool(input_coverage_bed),
                                 output=output_coverage_bed)
# Preview the first ten interval boundaries of every chromosome.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
for k in chromosomes:
    print(chrom_Intervals[k][0:10])

# Turn each pair of consecutive boundaries into one BED row
# (chromosome, boundary[i], boundary[i + 1] - 1).
with open('humanBedInterval.bed', 'w') as f:
    for k in chromosomes:
        boundaries = chrom_Intervals[k]
        f.write('\n'.join(
            '%s\t%d\t%d' % (k, lower, upper - 1)
            for lower, upper in zip(boundaries[:-1], boundaries[1:])) + '\n')

vcf_path = '/mnt/disks/data-vcf/GSN79Tumor_normal.vcf'

a = BedTool('humanBedInterval.bed').sort()
b = BedTool(vcf_path)
print(a.head())
print(b.head())

a.coverage(b).saveas('VCFCoverage.bed')  #,hist=True

# Variant positions (second VCF column) grouped by chromosome (first column).
positionHistogram = defaultdict(list)

with open(vcf_path, 'r') as f:
    for line in f:
        # Skip header/comment lines, and blank lines that would otherwise
        # fail when the position column is read.
        if not line.strip() or line.startswith('#'):
            continue
        fields = line.split('\t')
        positionHistogram[fields[0]].append(int(fields[1]))

for k in chromosomes:
    plt.hist(positionHistogram[k], bins=chrom_Intervals[k])
    plt.savefig(k + 'SNPDense.png')
    # Clear the figure so the next chromosome's PNG does not also contain
    # the histograms drawn for the previous chromosomes.
    plt.clf()
def genDataset(genes,
               testTrain):  # second argument is test or train bed dictionary
    """Build SNP and indel density tracks for genes on chromosomes 1 and 22.

    Each entry of *genes* is a tab-separated string: field 0 is the
    chromosome, fields 1-2 are parsed as integers (start, end) and field 3
    is used in the key.  Only entries starting with ``'1\\t'`` or
    ``'22\\t'`` are processed.  The gene span is cut at 100 bp steps; inside
    each resulting window, 20 bp probe intervals are laid out every 5 bp and
    ``BedTool.coverage`` is run against ``testTrain['SNP']`` and
    ``testTrain['indel']``.  The last column of every coverage row is stored
    as a float array under the key
    ``'<field 3>|<chrom>-<window start>-<window end>'``.

    Written for Python 2: print statements are used and ``map``/``filter``
    are expected to return lists.

    NOTE(review): when ``len(genes) > 1`` only the first ten genes and the
    first four window edges are used, the SNP part is pickled to
    ``testData.p`` and the function falls off the end (returns None); only
    the ``else`` branch returns the dataset.  This looks like leftover debug
    behaviour -- confirm.
    """
    dataset = {'SNP': defaultdict(list), 'indel': defaultdict(list)}
    #random_index = randrange(0,len(genes))
    #with open('out.txt','w') as f:
    if len(genes) > 1:
        for gene in genes[0:10]:  #random_index[0:11]:
            #gene = genes[i]
            print gene

            # NOTE(review): parsed as `(gene and startswith('1\t')) or
            # startswith('22\t')`, so the truthiness guard does not cover
            # the second test -- confirm intent.
            if gene and gene.startswith('1\t') or gene.startswith('22\t'):
                geneInfo = gene.split('\t')
                interval = map(int, geneInfo[1:3])
                #f.write('\t'.join([geneInfo[0]]+geneInfo[1:3])+'\n')
                #bin1 = np.arange(interval[0],interval[1],100)
                #bin2 = np.arange(interval[0]+50,interval[1],100)
                #geneBed = BedTool(gene,from_string=True)
                #SNPtest = testTrain['SNP'].intersect(geneBed,wa=True)
                #indelTest = testTrain['indel'].intersect(geneBed,wa=True)
                #snpDensity = []
                #indelDensity = []
                #print [np.arange(interval[0],interval[1],100),np.arange(interval[0]+50,interval[1],100)]
                # Only the first four 100 bp edges (at most three windows).
                for bin in [np.arange(interval[0], interval[1], 100)[0:4]
                            ]:  #,np.arange(interval[0]+50,interval[1],100)]:
                    for i in range(len(bin) - 1):
                        # `interval` is rebound to the current window's
                        # [start, end) pair; the edges were computed above.
                        interval = bin[i:i + 2]
                        geneNaming = geneInfo[3].strip('\n') + '|' + '-'.join(
                            [geneInfo[0]] + map(str, interval))  #geneInfo[0:3]
                        #f.write('Gene Name: ' + geneNaming + '\n')
                        #try:
                        # Probe intervals (x - 10, x + 10) for x stepping by
                        # 5 from window start + 10 up to window end - 10.
                        densityBedInt = BedTool('\n'.join(
                            np.vectorize(lambda x: geneInfo[0] + '\t%d\t%d' %
                                         (x - 10, x + 10))(np.arange(
                                             interval[0] + 10,
                                             interval[1] - 10, 5))),
                                                from_string=True)
                        #except:
                        #    print '\n'.join(np.vectorize(lambda x: geneInfo[0]+'\t%d\t%d'%(x-5,x+5))(np.arange(interval[0]+5,interval[1]-5)))

                        #try:
                        # Last column of every non-empty coverage row.
                        densitySNP = np.vectorize(
                            lambda line: float(line.split('\t')[-1]))(filter(
                                None,
                                str(densityBedInt.coverage(
                                    testTrain['SNP'])).split('\n')))
                        #except:
                        #    print str(densityBedInt.coverage(testTrain['SNP'])).split('\n')
                        densityIndel = np.vectorize(
                            lambda line: float(line.split('\t')[-1]))(filter(
                                None,
                                str(densityBedInt.coverage(
                                    testTrain['indel'])).split('\n')))
                        dataset['SNP'][geneNaming] = densitySNP
                        dataset['indel'][geneNaming] = densityIndel
                print 'duasdf'
                print dataset
                # NOTE(review): the file object is never closed explicitly
                # and the pickle is rewritten for every matching gene.
                dump(dataset['SNP'],
                     open('testData.p',
                          'wb'))  #.keys(),dataset['SNP'].values()]

            #f.write('FINISH 1\n')#testTrain['SNP'].head()
    else:
        for gene in genes:
            print gene
            if gene and gene.startswith('1\t') or gene.startswith('22\t'):
                geneInfo = gene.split('\t')
                interval = map(int, geneInfo[1:3])
                #f.write('\t'.join([geneInfo[0]]+geneInfo[1:3])+'\n')
                #bin1 = np.arange(interval[0],interval[1],100)
                #bin2 = np.arange(interval[0]+50,interval[1],100)
                #geneBed = BedTool(gene,from_string=True)
                #SNPtest = testTrain['SNP'].intersect(geneBed,wa=True)
                #indelTest = testTrain['indel'].intersect(geneBed,wa=True)
                #snpDensity = []
                #indelDensity = []
                #print [np.arange(interval[0],interval[1],100),np.arange(interval[0]+50,interval[1],100)]
                # Two sets of 100 bp edges, the second shifted by 50 bp.
                # Both are built before `interval` is rebound in the loop.
                for bin in [
                        np.arange(interval[0], interval[1], 100),
                        np.arange(interval[0] + 50, interval[1], 100)
                ]:
                    for i in range(len(bin) - 1):
                        interval = bin[i:i + 2]
                        geneNaming = geneInfo[3].strip('\n') + '|' + '-'.join(
                            [geneInfo[0]] + map(str, interval))  #geneInfo[0:3]
                        #f.write('Gene Name: ' + geneNaming + '\n')
                        #try:
                        # Same probe layout as in the branch above.
                        densityBedInt = BedTool('\n'.join(
                            np.vectorize(lambda x: geneInfo[0] + '\t%d\t%d' %
                                         (x - 10, x + 10))(np.arange(
                                             interval[0] + 10,
                                             interval[1] - 10, 5))),
                                                from_string=True)
                        #except:
                        #    print '\n'.join(np.vectorize(lambda x: geneInfo[0]+'\t%d\t%d'%(x-5,x+5))(np.arange(interval[0]+5,interval[1]-5)))

                        #try:
                        densitySNP = np.vectorize(
                            lambda line: float(line.split('\t')[-1]))(filter(
                                None,
                                str(densityBedInt.coverage(
                                    testTrain['SNP'])).split('\n')))
                        #except:
                        #    print str(densityBedInt.coverage(testTrain['SNP'])).split('\n')
                        densityIndel = np.vectorize(
                            lambda line: float(line.split('\t')[-1]))(filter(
                                None,
                                str(densityBedInt.coverage(
                                    testTrain['indel'])).split('\n')))
                        dataset['SNP'][geneNaming] = densitySNP
                        dataset['indel'][geneNaming] = densityIndel

        print 'FINISH 1'
        print dataset
        # Only this branch returns the dataset.
        return dataset
示例#10
0
    for line in f:
        chromCount += 1
        # bedread = '\n'.join('\t'.join('%s\t%d\t%d'%tuple([line.split('\t')[0]]+sorted(np.vectorize(lambda x: int(x))(line.split('\t')[1:3])))))
        #interval = sorted(np.vectorize(lambda x: int(x))(line.split('\t')[1:3]))
        interval = line.split('\t')[1]
        histInterval[line.split('\t')[0]] = list(
            np.arange(0., int(interval), 250000.)) + [int(interval)]  # [0]
        if chromCount > 22:
            break
    bedHist = BedTool('\n'.join('\n'.join('\t'.join([key] + [
        str(int(x)) for x in [histInterval[key][i], histInterval[key][i + 1]]
    ]) for i in range(len(histInterval[key]) - 1))
                                for key in histInterval.keys()),
                      from_string=True).saveas('BedHist.bed')

# Coverage of the histogram bins by the transposon annotation, sorted and
# saved to 'coverage.bed'.
bedTrans = bedHist.coverage(BedTool(transposonGFF))
bedTrans = bedTrans.sort()
bedTrans = bedTrans.saveas('coverage.bed')

# One row per non-empty coverage line: the first three fields plus the last
# field converted to float.
transposonDensity = [
    fields[0:3] + [float(fields[-1])]
    for fields in (row.split('\t')
                   for row in str(bedTrans).splitlines() if row)
]
transposonDensityArr = np.array(transposonDensity)
# Single-argument print(...) prints the same text as the print statement.
print(transposonDensityArr)
window, centromereRegions, nonCentromereRegions = 21, '', ''
for chrom in histInterval:
    print chrom
    if len(histInterval[chrom]) > window:
        arraySubset = transposonDensityArr[transposonDensityArr[:, 0] == chrom]
示例#11
0
def _read_cds_indices(gtfname):
    """Read CDS coordinate pairs and the strand from a GTF file.

    Returns ``(cds_indices, strand)``.  ``cds_indices`` holds one
    ``(first, second)`` tuple per CDS line, built from the 0-based start
    (column 4 minus one) and column 5 minus one, swapped on the '-' strand.
    ``strand`` is column 7 of the last data line.  It stays '+' when the
    file does not exist or contains no data lines.
    """
    cds_indices = []
    strand = '+'
    if not os.path.isfile(gtfname):
        return cds_indices, strand
    # The context manager closes the handle even if a line fails to parse.
    with open(gtfname, 'r') as gtf:
        for line in gtf:
            # Comment/header and blank lines have no feature columns.
            if not line.strip() or line.startswith('#'):
                continue
            tabbed = line.split('\t')
            feature = tabbed[2]
            strand = tabbed[6]
            start = int(tabbed[3]) - 1
            end = tabbed[4]
            if feature == 'CDS':
                if strand == '-':
                    inds = (int(end) - 1, start)
                else:
                    inds = (start, int(end) - 1)
                cds_indices.append(inds)
    return cds_indices, strand


def plot_gene(gene, outpref, bams, bedname, gtfname, peakname, labels,
              nsubplots, normalize, scale):
    """Plot per-position read coverage for one gene and save it as a PDF.

    Args:
        gene: name used for the x-axis label and the output file name.
        outpref: prefix of the output file
            (``<outpref>_<gene>_coverage.pdf``).
        bams: BAM file names, paired position-wise with *labels*.
        bedname: BED file whose intervals are covered.
        gtfname: optional GTF file; CDS lines are shaded and the strand is
            taken from it ('+' when the file is missing).
        peakname: passed to ``get_peakset`` and ``get_peaks``.
        labels: one label per BAM; repeated labels are plotted once.
        nsubplots: number of vertically stacked subplots.
        normalize: if true, positions found in the peak set are excluded
            from the total handed to ``fill_coverage`` and the y label
            says "normalized".
        scale: if true, every subplot gets its own y-axis limit.

    NOTE(review): ``get_peakset``, ``fill_coverage`` and ``get_peaks`` are
    defined elsewhere.  ``fill_coverage`` is presumably what fills
    ``coverage[label]`` with the ``'gene_ind'`` and ``'count'`` entries read
    below -- confirm.
    """
    cds_indices, strand = _read_cds_indices(gtfname)
    print('{} strand'.format(strand))

    peakset = get_peakset(peakname, normalize)

    coverage = {}
    bed = BedTool(bedname)
    for label, bamname in zip(labels, bams):  # add replicates here
        bam = BedTool(bamname)
        cov = bed.coverage(bam, d=True, stream=True)
        indices = {}
        gene_cov = []
        tot_cov, fullcov = 0, 0
        for i, line in enumerate(cov):
            items = str(line).split('\t')
            # Keep the first two and the last two columns only.
            del items[2:-2]
            csome, start, intind, nreads = items
            # Map genomic position -> running index along the intervals.
            indices[int(start) + int(intind) - 1] = i
            depth = int(nreads)
            gene_cov.append(depth)
            if normalize and (csome, start) not in peakset:
                tot_cov += depth
            fullcov += depth

        if strand == '-':
            # Reverse the running index on the minus strand.
            length_exon = len(indices)
            for ind in indices:
                indices[ind] = length_exon - indices[ind] - 1
        print('coverage', tot_cov, fullcov)
        if not normalize:
            tot_cov = fullcov
        coverage = fill_coverage(coverage, strand, gene_cov, label, normalize,
                                 tot_cov)
        print("coverage calculated for {}: {}...".format(
            label, ','.join([str(x) for x in gene_cov[:10]])))

    # `indices` is the position map of the last BAM processed above.
    peaks = get_peaks(peakname, indices, strand)

    sns.set_style("ticks")
    if not scale:
        figp, axes = plt.subplots(figsize=(5, 2 * nsubplots),
                                  nrows=nsubplots,
                                  sharey=True,
                                  sharex=True)
    else:
        figp, axes = plt.subplots(figsize=(3.5, 1.5 * nsubplots),
                                  nrows=nsubplots,
                                  sharey=False,
                                  sharex=True)  # change size
    if nsubplots == 1:
        axes = [axes]
    print("{} subplots".format(len(axes)))
    # don't like the colours? change them here!
    colours = ['#000000', '#2294a3'] * nsubplots
    gene_len = coverage[labels[-1]]['gene_ind'][-1]
    maxpeak = 10
    sp = 0

    def get_unique(seq):
        """Return the items of *seq* in first-seen order, duplicates removed."""
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]

    uniq_labels = get_unique(labels)
    npersubplot = len(uniq_labels) / nsubplots

    maxpeaks = {subplot: 10 for subplot in range(len(axes))}
    for i, (label, col) in enumerate(zip(uniq_labels, colours)):
        if len(axes) > 1:
            ax = axes[sp]
        else:
            ax = axes[0]
        xs = coverage[label]['gene_ind']
        counts = coverage[label]['count']
        # Decide per label whether replicates exist, so that a label with
        # a single track is never drawn with another label's std band.
        has_reps = len(counts) > 1
        if has_reps:
            std = np.std(counts, axis=0)
            ys = np.mean(counts, axis=0)
        else:
            ys = counts[0]
        if len(xs) == 1:
            xs = xs[0]
        ax.plot(xs, ys, label=label, color=col)
        if has_reps:
            ax.fill_between(xs, ys - std, ys + std, alpha=0.3, facecolor=col)
        maxpeak = max(maxpeak, ax.get_ylim()[1])
        if scale:
            maxpeaks[sp] = max(maxpeaks[sp], maxpeak)
        # Move on to the next subplot once this one has its share of labels.
        if i >= (sp + 1) * (npersubplot) - 1:
            sp += 1
            if scale:
                maxpeak = 10

    for i, ax in enumerate(axes):
        if scale:
            maxpeak = maxpeaks[i]
        # Shade CDS regions in grey and peaks in yellow over the full height.
        for (s, e) in cds_indices:
            rect = matplotlib.patches.Rectangle((indices[s], 0),
                                                indices[e] - indices[s],
                                                maxpeak,
                                                angle=0.0,
                                                alpha=0.1,
                                                color='#a5abaf')
            ax.add_patch(rect)
        for peak in peaks:
            rect = matplotlib.patches.Rectangle((peak[0], 0),
                                                peak[1] - peak[0],
                                                maxpeak,
                                                angle=0.0,
                                                alpha=0.3,
                                                color='#ffff00')
            ax.add_patch(rect)
        ax.xaxis.set_ticks(
            np.arange(0, gene_len, max(100 * np.round(gene_len / 500., 0),
                                       500)))
        ax.set_xlim([0, gene_len + 1])
        ax.set_ylim([0, maxpeak])
        if normalize:
            ax.set_ylabel('normalized\ncoverage')
        else:
            ax.set_ylabel('coverage')
        if i == nsubplots - 1:
            ax.set_xlabel(gene)
        ax.legend(loc=2)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
    plt.tight_layout()
    plt.savefig(outpref + '_' + gene + '_coverage.pdf',
                dpi=500,
                bbox_inches='tight')
示例#12
0
    rpk = coverage.mean() * 1000
    return [
        rpk,
        sum(coverage),
        len(coverage) / 1000,
        coverage.mean(),
        variation(coverage)
    ]


mapping = BedTool(args.path)
annotation = BedTool(args.genes)
# Maps annotation name -> statistics list returned by assign_coverage.
name2stat = {}
curname = ''
curints = []
# Coverage rows arrive grouped by name: collect consecutive rows with the
# same name and summarise each group once the name changes.
for cov in annotation.coverage(b=mapping, F=0.2, s=True, sorted=True, d=True):
    if cov.name == curname:
        curints.append(cov)
    else:
        if curints:
            name2stat[curname] = assign_coverage(curints)
        curints = [cov]
        curname = cov.name
# Flush the final group.  This replaces a `for ... else` whose else clause
# always ran (the loop has no `break`) and so called assign_coverage([])
# under the empty name when the coverage result was empty.
if curints:
    name2stat[curname] = assign_coverage(curints)

### Normalize to RPKM
# Element 1 of each statistics list is divided by the total of all such
# elements (in millions) and by element 2.
normfactor = sum(x[1] for x in name2stat.values() if x[1]) / 1000000
for stat in name2stat.values():
    # With a zero total there is nothing to scale; avoid dividing by zero.
    stat[1] = (stat[1] / normfactor) / stat[2] if normfactor else 0.0
示例#13
0
                        break

bins = BedTool(binfile)
# strains: strain name -> group (columns 2 and 3 of the BAM list).
strains = {}
# Results[window] = {"coords": [field 0, field 1, field 2],
#                    "coverage": {strain: formatted normalised value}}
Results = {}

with open(bamlist, "r") as fh:
    for row in csv.reader(fh, delimiter=","):
        filename, strain, group = row[0], row[1], row[2]
        print(row)
        strains[strain] = group
        bam = BedTool(filename)
        cov = bins.coverage(bam, sorted=True, g=args.genomefile, bed=True)
        for interval in cov:
            window = interval[3]
            print("window is", window)
            m = Results.setdefault(
                window, {"coords": [interval[0], interval[1], interval[2]]})
            # Field 6 divided by the strain's depth, kept as a 2-decimal
            # string.
            normdepth = "%.2f" % (int(interval[6]) / straindepths[strain])
            m.setdefault("coverage", {})[strain] = normdepth

with open(tracks, "w") as ggtrack: