Exemplo n.º 1
0
def prepare_gat(
    df,
    promoter_TATA_intersect_bed,
    TATA_box_locations,
    file_names,
    output_genecat_prefix,
    promoterpref,
    variable1_name,
    variable2_name,
):
    """Prepare the input files for a gat analysis.

    Writes a bed of all constitutive+variable promoters, a 3-column gat
    workspace bed, one bed per gene category, and intersects every
    promoter with the TATA-box locations.
    """
    # snapshot every promoter into an in-memory tsv before filtering
    all_promoters_buffer = io.StringIO()
    df.to_csv(all_promoters_buffer, sep="\t", header=None, index=False)
    all_promoters_buffer.seek(0)
    # keep only the two gene categories of interest
    subset = df[df.gene_type.isin([variable1_name, variable2_name])]
    # reorder columns and sort by genomic position
    sorted_motifs = subset[[
        "chr",
        "start",
        "stop",
        "gene_type",
        "strand",
        "source",
        "attributes",
        "AGI",
    ]].sort_values(["chr", "start"])
    # bed of all retained promoters
    BedTool.from_dataframe(sorted_motifs).saveas(
        f"../../data/output/{file_names}/TATA/{output_genecat_prefix}_{promoterpref}_nocontrol.bed"
    )

    # bedtools intersect between the TATA-box locations and ALL promoters
    BedTool(all_promoters_buffer).intersect(
        BedTool(TATA_box_locations),
        wao=True,
        output=promoter_TATA_intersect_bed,
    )
    # gat workspace file: first 3 columns of every retained promoter
    BedTool.from_dataframe(sorted_motifs[["chr", "start", "stop"]]).saveas(
        f"../../data/output/{file_names}/TATA/gat_analysis/{output_genecat_prefix}_{promoterpref}_workspace.bed"
    )
    # one bed per category: variable promoters first, then constitutive
    for category in (variable2_name, variable1_name):
        per_category = sorted_motifs[
            sorted_motifs["gene_type"] == category].sort_values(
                ["chr", "start"])
        BedTool.from_dataframe(per_category).saveas(
            f"../../data/output/{file_names}/TATA/gat_analysis/{output_genecat_prefix}_{promoterpref}_{category}.bed"
        )
Exemplo n.º 2
0
def Cluster2pIntronRetention(Cluster):
    """Detect partial intron-retention events inside junction clusters.

    Cluster maps a cluster id to {junction-string: value}, e.g.
    'clu_5077': {'chrX\t166458208\t166462315': 0.0195360774825754, ...}.
    Junctions in clusters with more than two members are intersected with
    the module-level Genes bed; qualifying events are written to the
    module-level file handle fo_pfIR. Also relies on module-level Bed,
    genebed and m6AORnot helpers.
    """
    for k1, v1 in Cluster.items():
        # only clusters with more than two junctions are considered
        if len(v1) > 2:
            for k2, v2 in v1.items():
                Junction = BedTool(k2, from_string=True)
                Junction_Bed = Bed(k2.split("\t"))
                # genes overlapped by this junction (gene fields appended)
                X = Junction.intersect(Genes, wb=True)
                m6A_string = []
                splicing_type = []
                if len(X) >= 1:
                    for gene in X:
                        g = genebed(gene[3:])
                        for e in g.Exons():
                            if Junction_Bed.overlap(Junction_Bed, e):
                                # bases of the exon covered by the junction
                                x_l = Junction_Bed.overlapLength(e)
                                if x_l >= 10 and x_l < e.length(
                                ) - 2:  ## if ==, which means the entire exon is inside of the intron
                                    exon = BedTool(str(e), from_string=True)
                                    alternative = Junction.intersect(exon)
                                    splicing_type.append("pfIntronRetention")
                                    m6A_string.append(m6AORnot(alternative))
                # report only when every overlapped exon agrees on m6A status
                if len(set(m6A_string)) == 1 and len(splicing_type) > 0:
                    fo_pfIR.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
                        "|".join(set(splicing_type)),
                        "|".join(set(m6A_string)), k1, k2, v2))
Exemplo n.º 3
0
def get_enrichment(peakfile, length):
    """Compare peak density and coverage inside vs outside a fixed region.

    The region spans args.start..args.end on the chromosome of the first
    peak; `length` is the total reference length used to size the outside
    part. Returns one tab-separated summary line.
    """
    peaks = BedTool(peakfile)
    # region of interest on the chromosome of the first peak
    roi = BedTool([
        Interval(peaks[0].chrom,
                 args.start,
                 args.end,
                 strand='+',
                 score='0',
                 name='region')
    ])
    inside_len = len(roi[0])
    outside_len = length - inside_len

    def topcoverages(**flags):
        # topcoverage values of peaks selected by the given intersect flag
        return [
            float(peak.attrs['topcoverage'])
            for peak in peaks.intersect(roi, f=0.5, **flags)
        ]

    outside = topcoverages(v=True)
    inside = topcoverages(u=True)

    if outside:
        normed_count = (len(inside) / inside_len) / (len(outside) / outside_len)
        normed_sum = (sum(inside) / inside_len) / (sum(outside) / outside_len)
        overrepresented = int(normed_count > 2 or normed_sum > 2)
    else:
        # no peaks outside the region: ratios are undefined
        normed_count = float('nan')
        normed_sum = float('nan')
        overrepresented = -1
    return "%s\t%d\t%d\t%1.1f\t%1.1f\t%1.3f\t%1.3f\t%1.3f\t%1.3f\t%1.3f\t%1.3f\t%d" % (
        os.path.basename(peakfile).split(".")[0], len(inside), len(outside),
        sum(inside), sum(outside), np.mean(inside), np.mean(outside),
        np.median(inside), np.median(outside), normed_count, normed_sum,
        overrepresented)
Exemplo n.º 4
0
def count_stitch(cnv, probefh):
	"""Count runs of same-class, same-chromosome CNV calls whose gap is
	poorly covered by probes (candidates for stitching into one call).

	cnv:     iterable of (chrom, start, end, cnv_class, id, cf) tuples
	probefh: probe bed file path/handle
	Returns the number of contiguous stitchable runs.

	Fixes: the original mixed tabs and spaces in its indentation (a
	TabError under Python 3) and repeated the probe-intersection
	expression three times; behavior is unchanged.
	"""
	probes = BedTool(probefh).sort()
	cnv = toList(BedTool(list(set(cnv))).sort(stream=True))

	def probe_count(chrom, start, end):
		# number of probes overlapping the given interval
		region = BedTool(' '.join(map(str, (chrom, start, end))), from_string=True)
		return len(probes.intersect(region, wa=True, u=True, stream=True))

	to_stitch = []
	first = None
	for i, current in enumerate(cnv):
		if i == 0:
			first = current
			continue
		(c1, s1, e1, cl1, id1, cf1) = first
		(c2, s2, e2, cl2, id2, cf2) = current
		if cl1 != cl2 or c1 != c2:
			# different class or chromosome: restart the run anchor
			first = current
			continue
		# NOTE: as in the original, `first` is NOT advanced on a match, so
		# every call in a run is compared against the run's first call
		g1 = int(e1) + 1
		g2 = int(s2) - 1
		if int(e1) == int(s2) - 1:
			# calls are adjacent: use their own endpoints as the gap
			g1 = int(e1)
			g2 = int(s2)
		max_span = max(probe_count(c1, s1, e1), probe_count(c2, s2, e2))
		# stitch when the gap holds at most half the probes of either call
		if probe_count(c1, g1, g2) <= float(max_span) * 0.5:
			to_stitch.append(i)
	to_stitch.sort()
	# group consecutive indices into runs; the answer is the run count
	runs = [[v[1] for v in vals]
	        for _, vals in itertools.groupby(enumerate(to_stitch),
	                                         key=lambda x: x[1] - x[0])]
	return len(runs)
Exemplo n.º 5
0
def rebin_step1():
    """Intersect the input signal with the target bins.

    Reads module-level globals: input_signal (path or URL), bins, url
    (True when input_signal must be downloaded) and bigWig (True when the
    download needs bigWigToBedGraph conversion) -- assumed from usage,
    confirm against the caller. Returns (AB, AB_inv): signal-vs-bin
    overlaps (-wo) and bins with no signal coverage (-v).
    """
    if url == False:
        A = BedTool(input_signal)
        B = BedTool(bins)
        AB = A.intersect(B, wo=True)
        AB_inv = B.intersect(A, v=True)
        return AB, AB_inv
    elif url == True:
        # download the remote signal file into the working directory
        to_download = "'" + input_signal + "'"
        command = "wget " + to_download
        p = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
        stdout, stderr = p.communicate()
        ninput_signal = os.path.basename(input_signal)
        if bigWig == True:
            # convert bigWig -> bedGraph, then drop the original download
            run_bash("./kent_binaries/bigWigToBedGraph " + ninput_signal +
                     " " +
                     ninput_signal.replace(".bigWig", ".bedGraph").replace(
                         ".bigwig", ".bedGraph"))
            run_bash("rm " + ninput_signal)
            ninput_signal = ninput_signal.replace(".bigWig",
                                                  ".bedGraph").replace(
                                                      ".bigwig", ".bedGraph")
        A = BedTool(ninput_signal)
        B = BedTool(bins)
        AB = A.intersect(B, wo=True)
        AB_inv = B.intersect(A, v=True)
        # clean up the downloaded/converted signal file
        run_bash("rm " + ninput_signal)
        return AB, AB_inv
Exemplo n.º 6
0
def load_beddata(genome, bed_file, use_meta, use_gencode, input_dir, is_sorted, chrom=None):
    """Build a test-data iterator over the windows of bed_file.

    Sorts the bed if needed, drops windows overlapping the blacklist,
    optionally restricts to one chromosome and (with use_gencode) appends
    boolean gencode-annotation features to each window's metadata.
    Returns (bigwig_names, meta_names, datagen_bed, nonblacklist_bools).
    Relies on module-level helpers make_blacklist, subset_chroms,
    load_bigwigs, load_meta, get_bigwig_rc_order and the constant L.
    """
    bed = BedTool(bed_file)
    if not is_sorted:
        print('Sorting BED file')
        bed = bed.sort()
        is_sorted = True
    blacklist = make_blacklist()
    print('Determining which windows are valid')
    # count blacklist overlaps per window (c=True) to build the keep-mask
    bed_intersect_blacklist_count = bed.intersect(blacklist, wa=True, c=True, sorted=is_sorted)
    if chrom:
        nonblacklist_bools = np.array([i.chrom==chrom and i.count==0 for i in bed_intersect_blacklist_count])
    else:
        nonblacklist_bools = np.array([i.count==0 for i in bed_intersect_blacklist_count])
    print('Filtering away blacklisted windows')
    bed_filtered = bed.intersect(blacklist, wa=True, v=True, sorted=is_sorted)
    if chrom:
        print('Filtering away windows not in chromosome:', chrom)
        bed_filtered = subset_chroms([chrom], bed_filtered)
    print('Generating test data iterator')
    bigwig_names, bigwig_files_list = load_bigwigs([input_dir])
    bigwig_files = bigwig_files_list[0]
    if use_meta:
        meta_names, meta_list = load_meta([input_dir])
        meta = meta_list[0]
    else:
        meta = []
        meta_names = None
    
    shift = 0
    
    if use_gencode:
        # per-window overlap counts against each gencode annotation track,
        # appended to the metadata as booleans (any overlap -> True)
        cpg_bed = BedTool('resources/cpgisland.bed.gz')
        cds_bed = BedTool('resources/wgEncodeGencodeBasicV19.cds.merged.bed.gz')
        intron_bed = BedTool('resources/wgEncodeGencodeBasicV19.intron.merged.bed.gz')
        promoter_bed = BedTool('resources/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')
        utr5_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')
        utr3_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')

        peaks_cpg_bedgraph = bed_filtered.intersect(cpg_bed, wa=True, c=True)
        peaks_cds_bedgraph = bed_filtered.intersect(cds_bed, wa=True, c=True)
        peaks_intron_bedgraph = bed_filtered.intersect(intron_bed, wa=True, c=True)
        peaks_promoter_bedgraph = bed_filtered.intersect(promoter_bed, wa=True, c=True)
        peaks_utr5_bedgraph = bed_filtered.intersect(utr5_bed, wa=True, c=True)
        peaks_utr3_bedgraph = bed_filtered.intersect(utr3_bed, wa=True, c=True)

        data_bed = [(window.chrom, window.start, window.stop, 0, bigwig_files, np.append(meta, np.array([cpg.count, cds.count, intron.count, promoter.count, utr5.count, utr3.count], dtype=bool)))
                    for window, cpg, cds, intron, promoter, utr5, utr3 in 
                    itertools.izip(bed_filtered, peaks_cpg_bedgraph,peaks_cds_bedgraph,peaks_intron_bedgraph,peaks_promoter_bedgraph,peaks_utr5_bedgraph,peaks_utr3_bedgraph)]
    else:
        data_bed = [(window.chrom, window.start, window.stop, shift, bigwig_files, meta)
                    for window in bed_filtered]
    from data_iter import DataIterator
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    # L is a module-level constant (presumably the window/sequence length
    # passed to DataIterator) -- confirm at module scope
    datagen_bed = DataIterator(data_bed, genome, 100, L, bigwig_rc_order, shuffle=False)
    return bigwig_names, meta_names, datagen_bed, nonblacklist_bools
Exemplo n.º 7
0
def main():
    """Intersect every ChIP-seq bed file with gene bodies padded by 1 kb."""
    intersections = Path("../output/chipseq-wf/intersections")
    intersections.mkdir(exist_ok=True)

    # reference genes, extended by 1 kb on both sides
    genes = BedTool("../output/chipseq-wf/dmel-all-r6.26_genes.bed").slop(
        b=1_000, genome="dm6")

    for bed_path in Path("../output/chipseq-wf/bed").iterdir():
        BedTool(bed_path).intersect(genes, wb=True).saveas(
            intersections / bed_path.name)
Exemplo n.º 8
0
def get_binned_modules(ma=None,
                       a=annotations450,
                       b='lola_vignette_data/activeDHS_universe.bed',
                       include_last=False,
                       min_capsule_len=2000):
    """Group CpGs into modules by intersecting CpG annotations with regions.

    ma: methylation array-like object whose ma.beta columns are CpG names.
    a:  CpG annotation bed (name column = CpG id).
    b:  region bed defining the candidate modules.
    include_last: append one extra module with all unassigned CpGs.
    min_capsule_len: minimum CpG count for a region to become a module.
    Returns (final_modules, modulecpgs, module_names).

    Fixes: the bare `except:` (which also swallowed KeyboardInterrupt and
    SystemExit) is narrowed to `except Exception:`, and the duplicated
    intersect/groupby code is factored into one helper.
    """
    allcpgs = ma.beta.columns.values
    a_orig = BedTool(a)

    # region bed; synthesize a name column if it is missing
    df_bed = BedTool(b).to_dataframe()
    if df_bed.shape[1] < 4:
        df_bed['features'] = np.arange(df_bed.shape[0])
    b_bed = BedTool.from_dataframe(df_bed)

    def _group_cpgs_by_region(chrom_mapper):
        # restrict the CpG annotation to ma.beta's CpGs (columns reordered
        # to chrom/start/end/name), intersect with the regions and group
        # per region: CpG count + distinct CpG names
        df = a_orig.to_dataframe()
        df.iloc[:, 0] = chrom_mapper(df.iloc[:, 0].astype(str))
        df = df.set_index('name').loc[list(
            ma.beta)].reset_index().iloc[:, [1, 2, 3, 0]]
        cpg_bed = BedTool.from_dataframe(df)
        c = b_bed.intersect(cpg_bed, wa=True, wb=True).sort()
        return c.groupby(g=[1, 2, 3, 4], c=(8, 8), o=('count', 'distinct'))

    try:
        d = _group_cpgs_by_region(lambda s: s)
    except Exception:
        # chromosome naming mismatch (e.g. '1.x' vs 'chr1'): retry with a
        # 'chr' prefix, as the original fallback did
        d = _group_cpgs_by_region(
            lambda s: s.map(lambda x: 'chr' + x.split('.')[0]))
    df2 = d.to_dataframe()
    # keep only regions with enough CpGs to form a capsule
    df3 = df2.loc[df2.iloc[:, -2] > min_capsule_len]
    modules = [cpgs.split(',') for cpgs in df3.iloc[:, -1].values]
    modulecpgs = np.array(list(set(list(reduce(lambda x, y: x + y, modules)))))
    if include_last:
        missing_cpgs = np.setdiff1d(allcpgs, modulecpgs).tolist()
    final_modules = modules + ([missing_cpgs] if include_last else [])
    module_names = (df3.iloc[:, 0] + '_' + df3.iloc[:, 1].astype(str) + '_' +
                    df3.iloc[:, 2].astype(str)).tolist()
    return final_modules, modulecpgs.tolist(), module_names
Exemplo n.º 9
0
def intersector(sampfile, filtfile, specfr):
    """Intersect two bed files.

    specfr selects whether regions of sampfile that overlap filtfile are
    kept ('Keep') or removed ('Discard'); raises ValueError otherwise.

    Fixes: 'Discard' previously passed u=False, which is a no-op (the
    flag is simply not set) and returned the raw overlaps instead of
    discarding them -- bedtools' -v flag is what reports non-overlapping
    intervals. The invalid-argument branch also printed a message and
    then returned the undefined name `endfile` (NameError); it now raises.
    """
    sf = BedTool(sampfile)
    ff = BedTool(filtfile)

    if specfr == 'Keep':
        # -u: report each sampfile interval that overlaps filtfile
        return sf.intersect(ff, u=True)
    if specfr == 'Discard':
        # -v: report sampfile intervals with NO overlap in filtfile
        return sf.intersect(ff, v=True)
    raise ValueError(
        "Incorrect specifying argument given, please use 'Keep' or 'Discard'.")
Exemplo n.º 10
0
def generate_background(foipath,gfpath,background):
	"""Accepts a background filepath; generates a background and returns it
	as a pybedtools.BedTool. Replaces the chrom fields of the foi and the
	gf with the interval id from the background, so downstream tools see
	each background interval as its own 'chromosome'.
	Returns {"foi": BedTool, "gf": BedTool, "background": dict}.
	"""
	bckg = background
	bckgnamed = "" 
	interval = 0 

	#inserts a unique interval id into the backgrounds name field
	for b in bckg:
		bckgnamed +=  "\t".join(b[:3])+'\t{}\t'.format(interval) + "\t".join(b[4:]) + "\n"
		interval += 1
	bckg = BedTool(bckgnamed,from_string=True)
	foi = BedTool(str(foipath))
	gf = BedTool(str(gfpath))
	# get the interval names from the background that the gf intersects with
	gf = bckg.intersect(gf)
	gfnamed = ""

	# insert the interval id into the chrom field of the gf and creates a new bedtool
	for g in gf:
		gfnamed += '{}\t'.format(g.name) + "\t".join(g[1:]) + "\n"
	gf = BedTool(gfnamed,from_string=True)

	# inserts the interval id into the chrom column of the foi and creates a new bedtool
	foi = bckg.intersect(foi)
	foinamed = ""
	for f in foi:
		foinamed += '{}\t'.format(f.name) + "\t".join(f[1:])+"\n" 
	foi = BedTool(foinamed,from_string=True)

	# rewrite the background itself with the interval id as the chrom column
	bckgnamed = ""
	for b in bckg:
		bckgnamed += '{}\t'.format(b.name) + "\t".join(b[1:])+"\n"
	bckg = BedTool(bckgnamed,from_string=True)
	# converts the background to a genome dictionary keyed by interval id
	# (ids are unique by construction above, so no keys collapse)
	chrstartend = [(g.start,g.end) for g in bckg]
	background = dict(zip([g.chrom for g in bckg],chrstartend))
	return {"foi": foi,"gf":gf,"background":background}


	

	run_pvalue=False,run_pybedtool=False,run_jaccard=False,run_proximity=False,run_kolmogorov=False
Exemplo n.º 11
0
def bed_intersect(cpg, data):
    """
    using pybedtools, perform a left-outer-join intersection of the cpg
    file of interest with the cpg count observation file and write the
    result to a sibling *_results.txt file
    :param cpg: cpg file
    :param data: count file
    :return: None
    """
    results_path = cpg.replace(".bed", "_results.txt")  # make results file

    cpg_bt = BedTool(cpg)
    data_bt = BedTool(data)

    # loj=True keeps every cpg interval, matched or not
    cpg_bt.intersect(data_bt, loj=True).moveto(results_path)
Exemplo n.º 12
0
def gene_regions(vf, af):
    """Count, per variant, the annotation region types it overlaps.

    vf: variant bed file path; af: annotation file whose 5th column holds
    the region label. Returns a DataFrame (variants x region labels) of
    overlap counts, 0 where a variant hits no feature of that type.
    """
    print "inside gene regions"
    v = BedTool(vf)
    feats = BedTool(af)
    
    # first establish all the columns in the annotation file
    cols = set(f[4] for f in feats)
 
    results = {}

    intersection = v.intersect(feats, wb=True)

    if len(intersection) > 0:
        # group per variant (cols 1-4) and collapse the matched labels
        annots = intersection.groupby(g=[1,2,3,4], c=9, o='collapse')

        for entry in annots:
            # tally how many times each region label hit this variant
            regions = {}
            for region in entry[4].split(','):  
                if region in regions:
                    regions[region] += 1
                else:
                    regions[region] = 1

            results[entry.name] = Series(regions)

    df = DataFrame(results, index = cols)
    print "exiting gene regions"
    return df.T.fillna(0)
Exemplo n.º 13
0
    def get_feat(self, _input):
        """Annotate SNPs with genome-segmentation region counts.

        _input carries chrom/chromStart/chromEnd/name columns; returns a
        DataFrame of SNP names plus one count column per segmentation
        class (0 where the SNP overlaps nothing of that class).
        """
        snp_dfm = _input.loc[:, ['chrom', 'chromStart', 'chromEnd', 'name']]
        snp_bed = BedTool(
            snp_dfm.to_string(index=False, header=False, index_names=False),
            from_string=True)

        seg_bed = BedTool(os.path.join(self.src_data_dir, self.src_data_fn))

        # The 'intersect' operation is not 'left-join' style so its result
        # might have less entries than the SNP bed
        region_counts = {}
        overlaps = snp_bed.intersect(seg_bed, wb=True)
        if len(overlaps) > 0:
            grouped = overlaps.groupby(g=[1, 2, 3, 4], c=8, o='collapse')
            for row in grouped:
                region_counts[row.name] = pd.Series(row[4].split(',')).value_counts()

        names = {
            'CTCF': 'CTCF_REG',
            'E': 'ENH',
            'PF': 'TSS_FLANK',
            'R': 'REP',
            'T': 'TRAN',
            'TSS': 'TSS',
            'WE': 'WEAK_ENH'
        }

        gwava_dfm = pd.DataFrame(region_counts, index=names.keys()).T.rename(columns=names)

        annotated = snp_dfm.merge(gwava_dfm, how='left', left_on='name',
                                  right_index=True, copy=True)

        return annotated.fillna(0).drop(['chrom', 'chromStart', 'chromEnd'], axis=1)
Exemplo n.º 14
0
def coverage(source: BedTool,
             to_intersect: BedTool,
             presorted: bool = True) -> [float]:
    """
    For each interval in the source compute coverage by to_intersect features. Source intervals must be non overlapping

    Walks the -wao intersection in lockstep with source: consecutive rows
    belonging to the same source interval have their overlap (last field,
    in bp) summed, then normalised by the interval length. AssertionError
    is raised if the intersection rows fall out of step with source.
    """
    if not presorted:
        source, to_intersect = source.sort(), to_intersect.sort()
    intersection = source.intersect(to_intersect, wao=True)
    intersection, source = list(intersection), list(source)

    ind = 0
    curinter = intersection[ind]
    # running sum of overlapped bases for the current source interval
    # NOTE(review): the loop below revisits intersection[0] and appears to
    # add its overlap a second time (curcov is seeded with it here) unless
    # Interval equality distinguishes the rows -- verify
    curcov = float(curinter.fields[-1])
    coverage = []
    assert source[
        ind] == curinter, f"Fail: expected {source[ind]}, got {curinter}"
    for inter in intersection:
        if inter == curinter:
            curcov += float(inter.fields[-1])
        else:
            # finished one source interval: store its fractional coverage
            coverage.append(curcov / curinter.length)
            assert coverage[-1] <= 1
            curinter = inter
            curcov = float(curinter.fields[-1])
            ind += 1
            assert curinter == source[
                ind], f"Fail: expected {source[ind]}, got {curinter}"
    # flush the final interval
    coverage.append(curcov / curinter.length)
    assert len(coverage) == len(source)
    return coverage
Exemplo n.º 15
0
def calculate_ovl(nbedfile, obedfile, opts, scoresfile):
    """Intersect new vs old bed files (-wao) and write matched id/score
    pairs to scoresfile, skipping rows without a match."""
    new_bt = BedTool(nbedfile)
    old_bt = BedTool(obedfile)
    ab = new_bt.intersect(old_bt, wao=True, f=opts.f, r=opts.r, s=opts.s)

    # keep only rows with a real match ('.' means none) and reorder columns
    cmd = """cut -f4,5,10,13 | awk -F $'\t' 'BEGIN { OFS = FS } ($3 != "."){ print $1,$3,$2,$4; }'"""
    sh(cmd, infile=ab.fn, outfile=scoresfile)
Exemplo n.º 16
0
	def load_1kgp(self,raw=None,svtype=None,gen=None,tmp_bed=None):
		"""Annotate raw SV calls with their best-overlapping 1000 Genomes SV.

		raw: iterable of rows whose first four fields are chrom/start/end/info;
		svtype selects matching rows and the annotation track; gen is the
		genome build name; tmp_bed is a scratch file path (deleted on exit).
		Populates self._1kgp with locus -> (1kgp id, overlap fraction),
		keeping the largest overlap per locus.
		"""
		sv = BedTool([(format_chrom(x[0]),x[1],x[2],x[3]) for x in raw if svtype in str(x[3])]).sort()
		# 80% reciprocal overlap against the 1000 Genomes annotation bed
		sv.intersect('{}annotation_files/{}_1000Genomes_{}.bed'.format(Config().resource_path(),gen,svtype), f=0.8, F=0.8, wao=True,output=tmp_bed)
		with open(tmp_bed,'r') as f:
			for l in f:
				x = tuple(l.rstrip().split('\t'))
				locus = tokenize_sv(x)+(str(x[3]),)
				# last field of -wao output: overlap in bp (0 = no hit)
				ovr = int(x[-1])
				if ovr==0: continue
				# overlap fraction relative to the query SV length
				ovr = format(float(x[len(x)-1])/(int(x[2])-int(x[1])),'.2f')
				if self._1kgp.get(locus)==None:
					self._1kgp[locus]=(x[len(x)-2],ovr)
				elif self._1kgp.get(locus)!=None and float(ovr) > float(self._1kgp[locus][1]):
					# keep only the best (largest) overlap for this locus
					self._1kgp[locus]=(x[len(x)-2],ovr)
				else: continue
		os.remove(tmp_bed)
Exemplo n.º 17
0
def load_chip_multiTask(input_dir):
    """Load ChIP peaks and derive positive training windows plus targets.

    Returns (tfs, positive_windows, y_positive, nonnegative_regions_bed)
    where y_positive is a boolean matrix (windows x TFs). Relies on module
    globals genome_sizes_file, genome_window_size, genome_window_step and
    helpers get_chip_beds, get_genome_bed, make_blacklist, intersect_count.
    """
    tfs, chip_beds, merged_chip_bed = get_chip_beds(input_dir)
    print('Removing peaks outside of X chromosome and autosomes')
    chroms, chroms_sizes, genome_bed = get_genome_bed()
    merged_chip_bed = merged_chip_bed.intersect(genome_bed, u=True, sorted=True)

    print('Windowing genome')
    genome_windows = BedTool().window_maker(g=genome_sizes_file, w=genome_window_size,
                                            s=genome_window_step)

    print('Extracting windows that overlap at least one ChIP interval')
    # f: require more than half of each window to sit inside a ChIP interval
    positive_windows = genome_windows.intersect(merged_chip_bed, u=True, f=1.0*(genome_window_size/2+1)/genome_window_size, sorted=True)

    # Exclude all windows that overlap a blacklisted region
    blacklist = make_blacklist()
    
    print('Removing windows that overlap a blacklisted region')
    positive_windows = positive_windows.intersect(blacklist, wa=True, v=True, sorted=True)

    num_positive_windows = positive_windows.count()
    # Binary binding target matrix of all positive windows
    print('Number of positive windows:', num_positive_windows)
    print('Number of targets:', len(tfs))
    # Generate targets
    print('Generating target matrix of all positive windows')
    y_positive = parmap.map(intersect_count, chip_beds, positive_windows.fn)
    y_positive = np.array(y_positive, dtype=bool).T
    print('Positive matrix sparsity', (~y_positive).sum()*1.0/np.prod(y_positive.shape))
    # pad merged peaks so nearby regions are also excluded from negatives
    merged_chip_slop_bed = merged_chip_bed.slop(g=genome_sizes_file, b=genome_window_size)
    # Later we want to gather negative windows from the genome that do not overlap
    # with a blacklisted or ChIP region
    nonnegative_regions_bed = merged_chip_slop_bed.cat(blacklist)
    return tfs, positive_windows, y_positive, nonnegative_regions_bed
Exemplo n.º 18
0
def gene_regions(vf, af):
    """Count, per variant, the annotation region types it overlaps.

    vf: variant bed file path; af: annotation file whose 5th column holds
    the region label. Returns a DataFrame (variants x region labels) of
    overlap counts, 0 where a variant hits no feature of that type.
    """
    v = BedTool(vf)
    feats = BedTool(af)
    
    # first establish all the columns in the annotation file
    cols = set(f[4] for f in feats)

    results = {}

    intersection = v.intersect(feats, wb=True)

    if len(intersection) > 0:
        # NOTE(review): this copy uses ops='collapse' while a sibling
        # definition of gene_regions in this file uses o='collapse'; `ops`
        # is the older pybedtools spelling -- confirm the installed version
        annots = intersection.groupby(g=[1,2,3,4], c=9, ops='collapse')

        for entry in annots:
            # tally how many times each region label hit this variant
            regions = {}
            for region in entry[4].split(','):  
                if region in regions:
                    regions[region] += 1
                else:
                    regions[region] = 1

            results[entry.name] = Series(regions)

    df = DataFrame(results, index = cols)

    return df.T.fillna(0)
Exemplo n.º 19
0
 def cnv_format(self):
     """Write per-sample CNV tables, plots and gene annotations.

     For every sample parsed from the VCF: writes cnv_igv.seg and cnv.bed
     under self.module/<sample>/, draws the CNV plot, intersects the CNVs
     with the gencode gene bed (requiring >=50% of each CNV to overlap a
     gene) and writes the annotated bed plus summary stats.
     """
     samples_sv = self.parse_vcf()
     for sample in samples_sv:
         # ensure the per-sample output directory exists
         if os.path.exists(os.path.join(self.module, sample)):
             pass
         else:
             os.mkdir(os.path.join(self.module, sample))
         df = pd.DataFrame()
         df['Samples'] = [sample for i in range(
             len(samples_sv[sample]['Chrom']))]
         df['Chrom'] = samples_sv[sample]['Chrom']
         df['Start'] = samples_sv[sample]['Start']
         df['End'] = samples_sv[sample]['End']
         df['Length'] = samples_sv[sample]['Length']
         df['Type'] = samples_sv[sample]['Type']
         df['CN'] = samples_sv[sample]['CN']
         # IGV-compatible segment file (with header)
         df.to_csv('{module}/{sample}/cnv_igv.seg'.format(module=self.module,
                                                          sample=sample), index=False, header=True, sep='\t')
         df = df.drop('Samples', axis=1)
         # headerless bed for the intersection below
         df.to_csv('{module}/{sample}/cnv.bed'.format(module=self.module,
                                                      sample=sample), index=False, header=False, sep='\t')
         self.cnv_plot(df, sample)
         cnv = BedTool(
             '{module}/{sample}/cnv.bed'.format(module=self.module, sample=sample))
         gene = BedTool(self.genecode)
         # f=0.5: at least half the CNV must overlap the gene
         intersect = cnv.intersect(gene, wb=True, f=0.5)
         intersect_gene_count = self.gene_count(intersect)
         # e.g chr1	1406909	1406998	18358	DUP	chr1	1406909	1406998	MRPL20
         intersect.moveto(
             '{module}/{sample}/cnv.annotatedGenecodeV31.bed'.format(module=self.module, sample=sample))
         self.cnv_stat(df, sample, intersect_gene_count)
Exemplo n.º 20
0
def segmentations(vf, af):
    """Map each variant to the segmentation state(s) it overlaps.

    vf: variant bed file; af: segmentation bed file. Returns a DataFrame
    indexed by variant name with segmentation codes renamed to readable
    labels (columns absent for codes never observed).
    """
    v = BedTool(vf)
    feats = BedTool(af)
    results = {}
    intersection = v.intersect(feats, wb=True)
    if len(intersection) > 0:
        # rewrite the intersection file in place (the shell 1<>file idiom
        # opens it read/write) so col5 = state code and col6 = the full
        # feature descriptor
        sort_cmd1 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$8"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s 1<>%s' % (intersection.fn, intersection.fn)
        call(sort_cmd1, shell=True)
        # NOTE(review): ops= is the older pybedtools spelling of o=
        annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse')
        for entry in annots: 
            regions = {}
            regions[entry[4]] = entry[5]

            results[entry.name] = Series(regions)

    names = {
        'CTCF': 'CTCF_REG', 
        'E':    'ENH', 
        'PF':   'TSS_FLANK', 
        'R':    'REP', 
        'T':    'TRAN', 
        'TSS':  'TSS', 
        'WE':   'WEAK_ENH'
    }

    return DataFrame(results, index=names.keys()).T.rename(columns=names)   
Exemplo n.º 21
0
def _iter_pairwise_connections(
    clusterable_bedtool: pybedtools.BedTool,
    min_reciprocal_overlap: float,
    min_sample_overlap: float = 0,
    is_carrier: Mapping[Text, numpy.ndarray] = MappingProxyType({})
) -> Iterator[Tuple[Text, Text]]:
    """Yield pairs of distinct variant IDs with sufficient reciprocal overlap.

    Self-intersects clusterable_bedtool requiring min_reciprocal_overlap,
    keeps only pairs with matching sv_type, excludes self-pairs, and (when
    min_sample_overlap > 0) additionally requires the Jaccard index of the
    two variants' carrier samples to reach min_sample_overlap.

    Parameters
    ----------
    clusterable_bedtool: BedTool
        bed object with intervals that may overlap each other
    min_reciprocal_overlap: float
        minimum reciprocal overlap for two intervals to be connected
    min_sample_overlap: float (default=0)
        minimum Jaccard index of carrier samples for two intervals to be connected
    is_carrier: Mapping[Text, numpy.ndarray]
        map from variant ID to carrier status (boolean per sample)

    Yields
    ------
    (variant_id_1, variant_id_2): Tuple[Text, Text]
        successive pairs of variant IDs meeting the overlap requirements
    """
    if len(clusterable_bedtool) == 0:
        return
    # self-intersection with reciprocal-overlap threshold; -wa -wb places
    # both intervals' fields side by side in each output record
    self_overlaps = clusterable_bedtool.intersect(clusterable_bedtool,
                                                  f=min_reciprocal_overlap,
                                                  r=True,
                                                  wa=True,
                                                  wb=True,
                                                  sorted=True,
                                                  nonamecheck=True)
    offset = clusterable_bedtool.field_count()
    require_carrier_overlap = min_sample_overlap > 0
    for record in self_overlaps:
        fields = record.fields
        if fields[sv_type_field] != fields[offset + sv_type_field]:
            continue  # only cluster same sv_type
        first_id = fields[name_field]
        second_id = fields[offset + name_field]
        if first_id == second_id:
            continue  # exclude self-overlaps
        if require_carrier_overlap and not (
                jaccard_index(is_carrier[first_id],
                              is_carrier[second_id]) >= min_sample_overlap):
            continue
        yield first_id, second_id
Exemplo n.º 22
0
def snp_freq_by_window(stat_df, group_label, window_file, outdir):
    """Assign per-group SNP allele-frequency rows to genomic windows.

    stat_df columns: Chr, Pos, Alt-like leading columns then one column
    per group. Caches a 0-based bed of the SNP table under outdir and
    intersects it with window_file; returns a DataFrame of window
    coordinates plus the SNP columns.
    """
    groups = stat_df.columns[3:]
    alt_freq_stat_bed = outdir / f'{group_label}.snp.plot.bed'
    if not alt_freq_stat_bed.is_file():
        # build the SNP bed once: start = Pos - 1 (bed is 0-based)
        bed_df = stat_df.copy()
        bed_df.loc[:, 'start'] = bed_df.Pos - 1
        bed_df.to_csv(alt_freq_stat_bed,
                      sep='\t',
                      columns=['Chr', 'start', 'Pos', 'Alt'] + list(groups),
                      header=None,
                      index=False)
    intersection = BedTool(str(window_file)).intersect(
        BedTool(str(alt_freq_stat_bed)), sorted=True, wo=True)
    col_names = (['Chrom', 'Start', 'End', 'snp_Chr', 'snp_start', 'Pos', 'Alt']
                 + list(groups) + ['overlap'])
    intersect_df = pd.read_csv(StringIO(str(intersection)),
                               sep='\t',
                               header=None,
                               names=col_names)
    # bed bookkeeping columns are not needed downstream
    intersect_df.drop(['snp_Chr', 'snp_start', 'overlap'],
                      axis=1,
                      inplace=True)
    return intersect_df
Exemplo n.º 23
0
def Cluster2A5SSorA3SS(Cluster):
    """Classify two-junction clusters as A5SS/A3SS and check m6A overlap.

    Cluster example: 'clu_5077': {'chrX\t166458208\t166462315': 0.0195...,
    'chrX\t166458343\t166462315': -0.0195...}. For each cluster with
    exactly two junctions, the region unique to the longer junction is
    intersected with the module-level m6A_bed; yields
    ("A5SS_A3SS", m6A-status, cluster id, value of the longer junction).
    Relies on module-level Bed, sortbylength and m6A_bed.
    """
    for k1, v1 in Cluster.items():
        if len(v1) == 2:
            bed2list = []
            for k2, v2 in v1.items():
                bed2list.append(Bed(k2))
            # shortest junction first, longest second
            bed2list.sort(key=sortbylength)
            short, longer = bed2list
            alternative = ""
            ss_type = ""
            if short.start == longer.start:
                # shared 5' boundary: the alternative region is at the 3' side
                alternative = BedTool(short.chr + "\t" + str(short.end) +
                                      "\t" + str(longer.end) + "\t" + k1 +
                                      "alternative" + "\t0\t" + short.strand,
                                      from_string=True)
            elif short.end == longer.end:
                # shared 3' boundary: the alternative region is at the 5' side
                alternative = BedTool(short.chr + "\t" + str(longer.start) +
                                      "\t" + str(short.start) + "\t" + k1 +
                                      "alternative" + "\t0\t" + short.strand,
                                      from_string=True)
            else:
                # NOTE(review): neither boundary shared leaves alternative
                # as "" and the intersect below would raise AttributeError
                pass
            X = alternative.intersect(m6A_bed)
            longer = longer.chr + "\t" + str(longer.start) + "\t" + str(
                longer.end)
            if len(X) >= 1:
                yield "A5SS_A3SS", "m6A", k1, v1[longer]
            else:
                yield "A5SS_A3SS", "nom6A", k1, v1[longer]
Exemplo n.º 24
0
def generate_bed_file_annotations(bed_directory, output_directory, loci):
    """
        Generates the annotation file for every bed file in the bed_directory folder.

        For each locus a temporary BED of its SNPs is intersected with every
        reference bed file in bed_directory, and a binary membership matrix is
        written to <output_directory>/<locus>.annotations.
    """
    # Loop over the bed files in the bed directory.
    bed_file_list = glob.glob(os.path.join(bed_directory, "*.bed"))
    logging.info("Start to generate BED file annotations")
    logging.info("Writing annotation to: {0}/".format(output_directory))
    for locus in loci:
        zscore = os.path.join(output_directory, locus)
        bed_lines, rsids = _bed_from_zscore(zscore)
        # BUGFIX: the original left tmp.bed open (and bound the useless return
        # of writelines()); close the file before BedTool reads it.
        with open("tmp.bed", "w") as tmp_bed:
            tmp_bed.writelines(bed_lines)
        snps = BedTool("tmp.bed")
        no_snps = _get_line_number(zscore)
        a_matrix = AnnotateLociMatrix(len(bed_file_list), no_snps)
        logging.info("Annotating locus: {0}, using VCF file {1}".format(locus, zscore))
        for beds in bed_file_list:
            test_annotation = BedTool(beds)
            inter = snps.intersect(test_annotation)
            # mark each intersected rsid with 1 in the annotation vector
            zeroes = np.zeros(len(rsids))
            for inte in inter:
                zeroes[rsids.index(inte.name)] = 1
            a_matrix.add_annotation(zeroes, beds)
        annotations_file = os.path.join(output_directory, locus + ".annotations")
        logging.info("Writing annotation matrix to: {0}".format(annotations_file))
        a_matrix.write_annotations(annotations_file)
        os.remove("tmp.bed")
Exemplo n.º 25
0
def bed_overlap(ref_bed_filepath, test_bed_filepath, dir_out='.'):
    """ Given two bed files, ref_bed and test_bed, perform bedtools intersect -c to return the number of 
    test_bed regions that overlap any regions in the ref_bed file. Returns the ref_bed file with a column 
    of overlap counts """

    cwd = os.getcwd()

    # specify the reference bed file
    ref_bedtool = BedTool(ref_bed_filepath)
    prfx_ref = ref_bed_filepath.split('/')[-1]
    prfx_ref = prfx_ref.split('.')[0]

    # specify the new ClinVar bed file
    test_bedtool = BedTool(test_bed_filepath)
    prfx_test = test_bed_filepath.split('/')[-1]
    prfx_test = prfx_test.split('.')[0]

    # specify name/path of output bed file
    bed_out = dir_out + '/{}_IN_{}.bed'.format(prfx_test, prfx_ref)

    # run bedtools intersect to get all test_bed regions NOT found in ref_bed (-v option)
    ref_in_test = test_bedtool.intersect(b=ref_bedtool, c=True)

    # save the bed overlap file
    ref_in_test.saveas(bed_out)

    # confirm file saved
    if os.path.isfile(bed_out):
        print('Success!\nFile saved to: \n{}.\n'.format(
            os.path.join(cwd, bed_out)))

    return (ref_in_test)
Exemplo n.º 26
0
def vcf_to_df_worker(arg):
    """Convert one CANVAS vcf into a per-sample copy-number dict (single thread).

    arg is (canvas_vcf_path, exon_bed_path, job_index). Returns a dict with
    "SampleKey" plus "<gene>.avgcn" / "<gene>.medcn" entries.
    """
    canvasvcf, exonbed, job_idx = arg
    logging.debug("Working on job {}: {}".format(job_idx, canvasvcf))
    sample_key = op.basename(canvasvcf).split(".")[0].rsplit("_", 1)[0]
    result = {"SampleKey": sample_key}

    segments = parse_segments(canvasvcf)
    overlaps = BedTool(exonbed).intersect(segments, wao=True)

    # gene label -> {copy number: overlapping base count}
    gcn_store = {}
    for ov in overlaps:
        # Example of ov.fields:
        # [u'chr1', u'11868', u'12227', u'ENSG00000223972.5',
        # u'ENST00000456328.2', u'transcribed_unprocessed_pseudogene',
        # u'DDX11L1', u'.', u'-1', u'-1', u'.', u'0']
        gene_name = "|".join((ov.fields[6], ov.fields[3], ov.fields[5]))
        counter = gcn_store.setdefault(gene_name, defaultdict(int))

        copy_number = ov.fields[-2]
        if copy_number == ".":
            continue
        # cap copy number at 10 so extreme amplifications don't dominate
        copy_number = min(int(copy_number), 10)
        counter[copy_number] += int(ov.fields[-1])

    for gene_name, counter in sorted(gcn_store.items()):
        v_mean, v_median = counter_mean_and_median(counter)
        result[gene_name + ".avgcn"] = v_mean
        result[gene_name + ".medcn"] = v_median
    cleanup()
    return result
Exemplo n.º 27
0
def generate_curve(bedfile, chromosome, region_start, region_stop):
    """Build a per-base coverage curve of bedfile intervals over a region.

    Returns (domain, values): domain covers every position in
    [region_start, region_stop]; values[i] counts the bed intervals
    overlapping domain[i].
    """
    bedtool = BedTool(bedfile)
    region_of_interest = BedTool(chromosome + ' ' + str(region_start) + ' ' +
                                 str(region_stop),
                                 from_string=True)

    plot_region = region_of_interest.intersect(bedtool)

    domain = np.arange(region_start, region_stop + 1)

    values = np.zeros(domain.shape)

    for interval in plot_region:
        # clamp the interval to the plotted region
        start = max(interval.start, region_start)
        finish = min(interval.end, region_stop)

        start_i = start - domain[0]
        # BUGFIX: the original computed finish - domain[-1], which is always
        # <= 0 and made the slice empty; offsets must be relative to domain[0].
        finish_i = finish - domain[0]

        values[start_i:finish_i] = values[start_i:finish_i] + 1

    return (domain, values)
Exemplo n.º 28
0
def compute_fold_change(exp1, exp2):
    """Write per-peak read-count fold changes (exp2 / exp1) to a bed file.

    Peaks with zero reads in exp1 get a sentinel fold change of -1. Output is
    written to <foldchange directory of exp2>/foldchange.bed.
    """
    peaks_file = meta_data.peaks_file(exp1)

    raw_file1 = meta_data.raw_bed_file(exp1)
    raw_file2 = meta_data.raw_bed_file(exp2)

    peaks = BedTool(peaks_file)
    raw1 = BedTool(raw_file1)
    raw2 = BedTool(raw_file2)

    # intersect -c: count reads from each experiment under every peak
    coverage_1 = peaks.intersect(raw1, c=True)
    coverage_2 = peaks.intersect(raw2, c=True)

    output_bed = list()
    # Buffer lines and write at the end: writing to a file while iterating
    # over BedTools proved troublesome.
    for i1, i2 in zip(coverage_1, coverage_2):
        if i1.count == 0:
            cstring = "-1"
        else:
            # BUGFIX: force float division so the ratio is not truncated to an
            # integer under Python 2.
            cstring = "{:f}".format(float(i2.count) / i1.count)
        line = i1.chrom + "\t" + str(i1.start) + "\t" + str(i1.end) + "\t" + cstring + "\n"
        output_bed.append(line)

    out_directory = meta_data.directory_foldchange(exp2)

    if not os.path.isdir(out_directory):
        os.mkdir(out_directory)

    out_file = out_directory + os.sep + "foldchange.bed"

    with open(out_file, 'w') as fout:
        fout.writelines(output_bed)
Exemplo n.º 29
0
def main():
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('bed', help='bed with miRNA as name')
    p.add_argument('--reference-beds', dest='reference', nargs='+', 
        help='reference beds for each feature to annotate')
    p.add_argument('--names', nargs='+', 
        help='names corresponding to reference files')
    args = p.parse_args()
    if not args.names and not args.reference:
        sys.exit(p.print_help())
    
    bed = BedTool(args.bed)
    
    # create the reference beds
    reference = {}
    for refname, refbed in izip(args.names, args.reference):
        reference[refname] = BedTool(refbed)
    
    for refname in args.names:
    
        # intersect the mirna bed with the reference annotations
        for b in bed.intersect(reference[refname], s=True, stream=True):
            # Cytoscape formatting
            fields = (b.name, "=", refname)
            print " ".join(map(str, fields))
Exemplo n.º 30
0
def main():
    """Partition naive-overlap peaks into per-sample-unique and shared sets.

    Each merged naive-overlap region is intersected with every sample's
    optimal peak set; regions seen in exactly one sample are written to
    <prefix>.<sample>, regions seen in all samples to <prefix>.shared.
    Regions seen in an intermediate number of samples are dropped.
    """
    args = parse_args()
    naive_overlap_sorted_merged = BedTool(args.naive_overlap_sorted_merged)
    sample_optimal_sets = list(args.sample_optimal_sets)
    num_samples = len(sample_optimal_sets)
    sample_bedtools = [BedTool(i) for i in sample_optimal_sets]
    # region string -> list of sample names containing it
    region_to_sample = dict()
    for sample_index in range(len(sample_bedtools)):
        sample = sample_bedtools[sample_index]
        sample_name = sample_optimal_sets[sample_index]
        print(sample_name)
        #intersect with the naive_overlap_sorted_merged peak set
        intersections = naive_overlap_sorted_merged.intersect(sample, wa=True)
        for intersection in intersections:
            str_intersection = str(intersection)
            region_to_sample.setdefault(str_intersection, []).append(sample_name)
    output_files = dict()
    output_files['shared'] = open(args.outf_prefix + ".shared", 'w')
    for sample_name in sample_optimal_sets:
        output_files[sample_name] = open(args.outf_prefix + "." + sample_name,
                                         'w')
    try:
        for region in region_to_sample:
            samples = region_to_sample[region]
            if len(samples) == 1:
                #unique to one sample
                output_files[samples[0]].write(region)
            elif len(samples) == num_samples:
                output_files['shared'].write(region)
    finally:
        # BUGFIX: the original never closed its output files, risking lost
        # buffered writes on interpreter teardown.
        for handle in output_files.values():
            handle.close()
Exemplo n.º 31
0
def _bed_intersection(bed: pybedtools.BedTool,
                      path,
                      g,
                      region_index,
                      bed_sorted,
                      fraction=0.2):
    """Return a boolean Series over region_index marking regions that bed
    overlaps (>= fraction of the region) in the query at path."""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        query_bed = _region_bed_sorted(path, g, bed_sorted)
        try:
            overlap_df = bed.intersect(query_bed,
                                       wa=True,
                                       f=fraction,
                                       g=g,
                                       sorted=True).to_dataframe()
            if overlap_df.shape[0] == 0:
                regions_idx = pd.Series([])
            else:
                regions_idx = overlap_df["name"]
        except pd.errors.EmptyDataError:
            regions_idx = pd.Series([])
    hit_regions = pd.Index(regions_idx.values)
    bool_series = pd.Series(region_index.isin(hit_regions), index=region_index)

    query_bed.delete_temporary_history(ask=False)
    return bool_series
Exemplo n.º 32
0
def gene_regions(vf, af):
    """Annotate variants with the gene-region features they overlap.

    vf is a variant bed, af an annotation bed whose 5th column names the
    feature type. Returns a DataFrame indexed by variant name with one column
    per feature type (0 where absent).
    """
    v = BedTool(vf)
    feats = BedTool(af)

    # first establish all the columns in the annotation file
    cols = set(f[4] for f in feats)

    results = {}

    intersection = v.intersect(feats, wb=True)

    if len(intersection) > 0:
        # mkstemp instead of the insecure/deprecated mktemp
        fd, tmp_path = tempfile.mkstemp()
        os.close(fd)
        try:
            # reshape: variant columns, feature type, composite annotation id
            awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s > %s' % (intersection.fn, tmp_path)
            call(awk_cmd, shell=True)
            reshaped = BedTool(tmp_path)
            annots = reshaped.groupby(g=[1,2,3,4,5], c=6, ops='collapse')

            for entry in annots:
                results[entry.name] = Series({entry[4]: entry[5]})
        finally:
            # BUGFIX: the original leaked its temporary file
            os.remove(tmp_path)

    df = DataFrame(results, index=cols)

    return df.T.fillna(0)
Exemplo n.º 33
0
def annotate_peaks(notsif, beds, names):
    """Takes notsif, transforms to bed, and outputs annotation of where the 
    miRNA seed is interrogating via Cytoscape edge attribute file.
    """
    strand = find_strand_from_filename(notsif)

    mirna_bed = BedTool(notsif_to_bed(notsif, strand), from_string=True)

    # create the reference beds
    reference = {}
    for name, bed in izip(names, beds):
        reference[name] = BedTool(bed)

    for name in names:

        # intersect the mirna bed with the reference annotations
        for hit in mirna_bed.intersect(reference[name], s=True, stream=True):

            # name field returned from notsif_to_bed is delimited by "|"
            mirna_name = hit.name.split("|")[0]
            gene_name = hit.name.split("|")[1]
            # Cytoscape formatting
            seed_length = "(%s)" % hit.score
            fields = (mirna_name, seed_length, gene_name, "=", name)
            print " ".join(map(str, fields))
def getGenesOverlappingRegion(rec, genes):
    """
    :param rec:  pysam vcf record
    :param genes: BedTool object with all the gene annotations in bedformat
    :return: tuple of strings (gene_name(s))
    """

    NOGENE = str(set([".___."]))
    # translocations have no single genomic interval to intersect
    if rec.info['SVTYPE'] == "TRA" or ("TRA" in rec.info.keys()
                                       and rec.info['TRA']):
        return NOGENE
    chr1 = rec.chrom
    pos1 = rec.pos
    pos2 = rec.info['ENDPOSSV']

    # normalise so that pos1 <= pos2
    if int(pos2) < int(pos1):
        pos1 = rec.info['ENDPOSSV']
        pos2 = rec.pos
    locus = BedTool(' '.join([chr1, str(pos1 - 1),
                              str(pos2)]),
                    from_string=True)
    isec = locus.intersect(genes, wao=True)
    # BUGFIX: the original compared the BedTool against "" (never true)
    if isec is None or len(isec) == 0:
        return NOGENE  # no intersection with any gene
    gene = set(
        isec.to_dataframe().iloc[0::,
                                 6])  # here we get ALL the Genes in column 6
    # BUGFIX: the original compared the set against the strings "." and
    # "{'.'}", which can never be equal; compare against the set {'.'}.
    if gene == {"."}:
        return NOGENE
    return str(gene)
def get_coverage(bed_prefix, directory, file_prefix, bam):
    """
    Coverage at all positions is calculated. This is then used for coverage analysis and to determine read depth at any
    false negative sites
    :param bed_prefix: all regions in the bed files submitted are in a file generated during intersections
    :param directory: location of patient results
    :param file_prefix: prefix used for all files in pipeline i.e. worklist-patient
    :param bam: path to the BAM file that read depths are computed from
    :return out: filename for coverage stats
    """
    #TODO change BAM path so filename is not required
    print 'Generating coverage stats.'
    whole_bed = '/results/Analysis/MiSeq/MasterBED/GIAB/' + bed_prefix + '.whole.bed'
    out = directory + '/giab_results/whole_bed_coverage.txt'
    # sambamba: per-base depth (min coverage 0, mapping quality >= 29) over
    # every region in the master BED; written to a temporary .tmp file first
    command = '/results/Pipeline/program/sambamba/build/sambamba depth base --min-coverage=0 -q29 -m -L ' + whole_bed + \
              ' ' + bam + ' > ' + out + '.tmp'
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)
    print 'Sambamba complete.'
    #issue with sambamba that leaves out regions that have 0 coverage - intersect regions to find missing and add
    # them to the file at coverage 0
    temp_bed = out.replace('.txt', '.bed.tmp')
    # turn each covered position into a 1-bp BED record; grep drops the header
    command = 'awk \'{print($1"\\t"$2"\\t"$2+1"\\t"$3)}\' ' + out + '.tmp | grep -v "COV" > ' + temp_bed
    print command
    try:
        subprocess.check_call(command, shell=True)
        print 'BED coordinates extracted.'
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)


    coverage_bed = BedTool(temp_bed)
    print 'BED tool created'
    whole_bedtool = BedTool(whole_bed)
    print 'Intersecting'
    # master-BED regions with no coverage record at all (intersect -v)
    missing_regions = whole_bedtool.intersect(coverage_bed, v=True)
    missing_file = directory + '/giab_results/regions_missing'
    missing_regions.moveto(missing_file)
    print 'Generating file'
    # sample id is the 2nd and 3rd '-'-separated parts of the file prefix
    sample_split = file_prefix.split('-')
    sample = sample_split[1] + '-' + sample_split[2]
    # shell loop: expand every missing region into per-base zero-coverage
    # lines tagged with the sample id, appended later at coverage 0
    command = '''while read i; do start=`echo "$i"|cut -f2`; end=`echo "$i"|cut -f3`; chr=`echo "$i"|cut -f1`; end_true=`echo "${end} - 1" | bc`; for j in $(seq $start $end_true); do new_end=`echo -e "${j} + 1" | bc`; echo -e "$chr\\t${j}\\t0\\t0\\t0\\t0\\t0\\t0\\t0\\t''' + sample + '''";done;done < ''' + missing_file + '> ' + directory + '/to_add'
    print command
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)

    # concatenate the sambamba output with the zero-coverage filler lines
    command = 'cat ' + out + '.tmp ' + directory + '/to_add > ' + out
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)
    print 'fix complete.'
    return out
Exemplo n.º 36
0
def main():
    """For each exonic peak, print its distance to the nearest feature of
    interest as a fraction of the transcript's total exonic length
    (negative distances mean the feature lies upstream of the peak)."""
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('peaks', help='peaks bed')
    p.add_argument('exons', help='refseq exons from UCSC')
    p.add_argument('gtf', help='refseq gtf with feature of interest')
    p.add_argument('feature', help='feature of interest in the gtf')
    p.add_argument('-v', '--verbose', action="store_true", help='maximum verbosity')
    args = p.parse_args()
    
    if args.verbose: sys.stderr.write(">> building exon library...\n")
    exon_lib = make_exon_lib(args.exons)
    
    peaks = BedTool(args.peaks)
    exons = BedTool(args.exons)
    full_ref = BedTool(args.gtf)
    
    if args.verbose: sys.stderr.write(">> filtering for feature...\n")
    # keep only gtf records whose feature column matches the requested feature
    filtered_ref = full_ref.filter(lambda gtf: gtf[2] == args.feature)
    
    if args.verbose: sys.stderr.write(">> selecting exonic peaks...\n")
    exonic_peaks = peaks.intersect(exons, wo=True)
    
    if args.verbose: sys.stderr.write(">> calculating distance fractions...\n")
    # D for distance (returns negative if upstream)
    for peak in exonic_peaks.closest(filtered_ref, D="a"):
        try:
            p = ComplexLine(peak)
            corrected_distance = 0.0
            total_exon_length = 0.0
            # parse gtf attrs
            # NOTE(review): lstrip('gene_id "') strips a character SET, not a
            # prefix — it may over-strip gene ids beginning with those
            # characters; confirm intended behaviour. gene_id is also unused.
            gene_id = p.gtfattrs.split(';')[0].rstrip('"').lstrip('gene_id "')

            # looking downstream wrt peak
            if p.gtfdistance > 0:
                # exon with peak: distance from peak end to its exon end...
                corrected_distance = p.exonstop - p.peakstop
                for exon in exon_lib[p.exoninfo.name]:
                    # add downstream exon lengths
                    if exon > p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
                        
            # looking upstream wrt peak
            else:
                # exon with peak: distance from the exon start to the peak...
                corrected_distance = p.peakstart - p.exonstart
                for exon in exon_lib[p.exoninfo.name]:
                    # add upstream exon lengths
                    if exon < p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
            
            # total exonic length of the transcript, for normalisation
            for exon in exon_lib[p.exoninfo.name]:
                total_exon_length += exon_lib[p.exoninfo.name][exon]
            
            # fraction
            print (corrected_distance / total_exon_length)
        
        except ValueError:
            continue
Exemplo n.º 37
0
def cpg_islands(vf, af):
    print "inside cpg_islands"
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    results = dict([ (r.name, 1) for r in overlap ])
    print "exit cpg_islands"
    return Series(results, name="cpg_island")
Exemplo n.º 38
0
def calculate_ovl(nbedfile, obedfile, opts, scoresfile):
    """Intersect new and old beds and write paired name/score rows to scoresfile."""
    from pybedtools import BedTool
    new_bed = BedTool(nbedfile)
    old_bed = BedTool(obedfile)

    overlap = new_bed.intersect(obedtool := old_bed, wao=True, f=opts.f, r=opts.r, s=opts.s)
    # keep only rows with an overlap partner ($3 != ".") and reorder columns
    cmd = """cut -f4,5,10,13 | awk -F $'\t' 'BEGIN { OFS = FS } ($3 != "."){ print $1,$3,$2,$4; }'"""
    sh(cmd, infile=overlap.fn, outfile=scoresfile)
Exemplo n.º 39
0
def parse_bed(cluster, gtruth, omit):
    """Intersect clustered SV breakpoints with truth and omit region beds.

    cluster: tab-separated file whose 7th column encodes breakpoints as
    name~chr:pos[-pos][~chr:pos] and whose 9th column names the call.
    Returns (truth_dict, omit_dict): call names hitting the truth / omit
    regions, each mapped to 1.
    """
    bed = []
    with open(cluster) as f:
        for line in f:
            # hoisted: the original re-split the same columns repeatedly
            fields = line.split('\t')
            parts = fields[6].split('~')
            name = fields[8]
            if len(parts) == 2:
                seg = parts[1].split(':')
                chrm = seg[0]
                coords = seg[1].split('-')
                if len(coords) == 1:
                    # single breakpoint: one 1-bp record
                    coord1 = coords[0]
                    bed.append(chrm + '\t' + coord1 + '\t' + str(int(coord1) + 1) + '\t' + name)
                elif len(coords) == 2:
                    # interval on one chromosome: a 1-bp record per endpoint
                    coord1, coord2 = coords
                    bed.append(chrm + '\t' + coord1 + '\t' + str(int(coord1) + 1) + '\t' + name)
                    bed.append(chrm + '\t' + coord2 + '\t' + str(int(coord2) + 1) + '\t' + name)
            elif len(parts) == 3:
                # two breakpoints, possibly on different chromosomes
                seg1 = parts[1].split(':')
                seg2 = parts[2].split(':')
                chrm1, coord1 = seg1[0], seg1[1]
                chrm2, coord2 = seg2[0], seg2[1]
                bed.append(chrm1 + '\t' + coord1 + '\t' + str(int(coord1) + 1) + '\t' + name)
                bed.append(chrm2 + '\t' + coord2 + '\t' + str(int(coord2) + 1) + '\t' + name)
    bed_str = '\n'.join(bed)
    sv_bed = BedTool(bed_str, from_string=True).sort()
    truth_bed = BedTool(gtruth).sort()
    intersect1 = sv_bed.intersect(truth_bed, wa=True, wb=True)
    # the string comparison is kept deliberately: callers may pass the literal
    # string "None" from the command line as well as an actual None object
    if str(omit) != 'None':
        omit_bed = BedTool(omit).sort()
        intersect2 = sv_bed.intersect(omit_bed, wa=True, wb=True)
    else:
        intersect2 = []
    truth_dict = {}
    omit_dict = {}
    for rec in intersect1:
        truth_dict[list(rec)[3]] = 1
    for rec in intersect2:
        omit_dict[list(rec)[3]] = 1
    return truth_dict, omit_dict
Exemplo n.º 40
0
def motifs(vf, af):
    print "inside motif"
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    results = dict([ (r.name, 1) for r in overlap ])
    print "exit motif"
    return Series(results, name="pwm")
def build_vcf_intervals(reads, vcf_recs, bam_handle):
    """
    Find if any of these reads match a known SUN/indel by simple bedtools intersections
    """
    vcf_bed = BedTool([ChromosomeInterval(rec.CHROM, rec.start, rec.end, None)
                       for rec in vcf_recs])
    # only reads with more than two aligned positions give a usable interval
    read_intervals = [(bam_handle.getrname(r.tid), r.positions[0], r.positions[-1])
                      for r in reads if len(r.positions) > 2]
    reads_bed = BedTool(read_intervals)
    return list(vcf_bed.intersect(reads_bed))
Exemplo n.º 42
0
def repeats(vf, af):
    """Count repeat types overlapping each variant: Series keyed by name."""
    variants = BedTool(vf)
    repeat_feats = BedTool(af)
    intersection = variants.intersect(repeat_feats, wb=True)
    results = {}
    if len(intersection) > 0:
        # collapse repeat types per variant (columns 1-4), then count them
        annots = intersection.groupby(g=[1,2,3,4], c=8, ops='collapse')
        for entry in annots:
            results[entry.name] = len(entry[4].split(','))

    return Series(results, name='repeat')
def main():
    """Histogram intronic reads by distance to the nearest same-strand UTR:
    fine 100-bp bins up to 5 kb, then coarse bins every 50 kb up to the
    maximum observed distance. Prints "upper_bound<TAB>count" lines."""
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('reads', help='bed format file of reads')
    p.add_argument('intron', help='intron coordinates bed')
    p.add_argument('utr', help="UTR coordinates bed")
    p.add_argument('-v', '--verbose', action='store_true',
            help="maximum verbosity")

    args = p.parse_args()
    
    bed = BedTool(args.reads)
    intron = BedTool(args.intron)
    utr = BedTool(args.utr)
    
    # read name -> coordinates, later augmented with distance to nearest UTR
    annotated_bed = {}
    
    if args.verbose: print >>sys.stderr, ">> finding intersection"
    # strand-aware: keep only reads that overlap introns
    intersection = bed.intersect(intron, s=True)
    
    for row in intersection:
        annotated_bed[row.name] = {'chrom':row.chrom, 'start':row.start, \
                                        'stop':row.stop}
    
    if args.verbose: print >>sys.stderr, ">> annotating"
    
    # closest with d=True appends the distance as the last field
    for row in intersection.closest(utr, s=True, d=True):
        annotated_bed[row.name]['utr'] = int(row.fields[-1:][0])

    step = 100
    rangemax = 5000
    
    bin_counts = {}
    max_utr = 0
    
    # fine-grained histogram: 100-bp bins from 0 to 5 kb; also track the
    # maximum distance for the coarse histogram below
    for ubound in range(step, rangemax + 1, step):
        bincount = 0
        for name, val in annotated_bed.iteritems():
            if val['utr'] <= ubound and val['utr'] > (ubound - step):
                bincount += 1
            if val['utr'] > max_utr:
                max_utr = val['utr']
        fields = (ubound, bincount)
        print "\t".join(map(str, fields))
    
    # coarse histogram beyond 5 kb
    # NOTE(review): the bins advance by 50000 but each spans 100000, so
    # adjacent bins overlap — confirm whether that window width is intended.
    for ubound in range(50000, max_utr, 50000):
        bincount = 0
        for name, val in annotated_bed.iteritems():
            if val['utr'] <= ubound and val['utr'] > (ubound - 100000):
                bincount += 1
        fields = (ubound, bincount)
        print "\t".join(map(str, fields))
Exemplo n.º 44
0
def intersect_bed(bed_name, bed_filter):
    """KEEPS regions of annotation of interest that overlap with
    repeat-masked regions
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    if not os.path.isfile(bed_name + '.Rmsk.bed'):
        bed = BedTool(bed_name + '.merged.sorted.bed')
        print "Keeping calls in rmsk from " + bed_name + "..."
        bed_overlap = bed.intersect(bed_filter)
        bed_overlap.saveas(bed_name + '.Rmsk.bed')
        print bed_name + " done!"
    else:
        print bed_name + " rmsk calls already isolated"
Exemplo n.º 45
0
def rebin_step1():
	"""Intersect the input signal with the target bins.

	Relies on module-level globals: url, input_signal, bins, bigWig.
	Returns (AB, AB_inv): signal/bin overlaps (intersect -wo) and the
	bins containing no signal at all (intersect -v).
	"""
	if url == False:
		# local file: intersect directly
		A=BedTool(input_signal)
		B=BedTool(bins)
		AB = A.intersect(B, wo=True)
		AB_inv=B.intersect(A, v=True)
		return AB,AB_inv
	elif url == True:
		# remote file: download it first with wget
		to_download="'"+input_signal+"'"
		command= "wget "+to_download
		p = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
		stdout, stderr = p.communicate()
		ninput_signal=os.path.basename(input_signal) 
		if bigWig ==True:
			# convert bigWig to bedGraph before intersecting, then drop the original
			run_bash("./kent_binaries/bigWigToBedGraph "+ninput_signal+" "+ninput_signal.replace(".bigWig",".bedGraph").replace(".bigwig",".bedGraph"))
			run_bash("rm "+ninput_signal)
			ninput_signal=ninput_signal.replace(".bigWig",".bedGraph").replace(".bigwig",".bedGraph")	
		A=BedTool(ninput_signal)
		B=BedTool(bins)
		AB = A.intersect(B, wo=True)
		AB_inv=B.intersect(A, v=True)
		# clean up the downloaded file
		run_bash("rm "+ninput_signal)
		return AB,AB_inv
Exemplo n.º 46
0
def intersectBamWithBed(inbam, inbed):
    '''
    Intersects reads with genomic features, Transposable elements, and returns separately reads that map sense and antisense to the features.
    Input: paths to bam and bed file
    Output: list of tuples with a name (str) and the reads for sense and antisense piRNAs (bedTool)
    '''
    # convert bam to bed
    print 'Separating sense and antisense piRNAs ' + timeStamp()
    piRNA = BedTool(inbam).bam_to_bed()

    ## create bedtool for genomic features
    bed = BedTool(inbed)

    # outname = inbam.replace('.bam', '')
    # outsense = outname + "sense.bed"
    # outantisense = outname + "antisense.bed"

    antisense = piRNA.intersect(bed, S=True)
    sense = piRNA.intersect(bed, s=True)
    piRNAs = [
        ('sense', sense),
        ('antisense', antisense)]
    return piRNAs
Exemplo n.º 47
0
def bound_motifs(vf, af):
    """Count cell types bound at motifs overlapping each variant."""
    variants = BedTool(vf)
    feats = BedTool(af)
    intersection = feats.intersect(variants, wb=True)
    results = {}
    if len(intersection) > 0:
        # sort in place on the variant columns so groupby sees grouped input
        sort_cmd = 'sort -k6,6 -k7,7n -k8,8n -k9,9 %s -o %s' % (intersection.fn, intersection.fn)
        call(sort_cmd, shell=True)
        annots = intersection.groupby(g=[6,7,8,9], c=4, ops='collapse')
        for entry in annots:
            results[entry.name] = len(entry[4].split(','))

    return Series(results, name='bound_motifs')
Exemplo n.º 48
0
def check_overlap(feature_string, query_string):
    """
    Check overlap between two bed strings.

    Arg1: feature_string -> target string.
    Arg2: query_string -> query string.
    Returns -> True (if has overlap), False (not overlap).

    """
    feature_bed = BedTool(feature_string, from_string=True)
    query_bed = BedTool(query_string, from_string=True)
    # an empty intersection is falsy
    return bool(feature_bed.intersect(query_bed))
Exemplo n.º 49
0
def bed_to_snps(bed, bim_df, mergebed=False):
    """Subset bim_df to SNPs whose positions fall inside the bed mask."""
    from pybedtools import BedTool
    if mergebed:
        print('merging intervals in mask')
        bed = bed.sort().merge()
    print('creating bedtool')
    # one 1-bp interval per SNP, chr-prefixed to match the mask's naming
    iter_bim = [['chr' + str(chrom), bp, bp + 1]
                for (chrom, bp) in np.array(bim_df[['CHR', 'BP']])]
    bimbed = BedTool(iter_bim)
    print('performing bedtools intersection')
    int_bed = bimbed.intersect(bed)
    print('creating df and merging with refpanel')
    df_int = pd.DataFrame({'BP': [iv.start for iv in int_bed]})
    return pd.merge(bim_df, df_int, how='inner', on='BP')
Exemplo n.º 50
0
def get_peak_counter(bed, ref_bed):
    """
    bed is either the pos or neg strand file of peak coordinates
    ref_bed is the reference annotation for exon, intron, etc.
    
    returns defaultdict(Counter)
    """
    peaks_bed = BedTool(bed)
    annotation = BedTool(ref_bed)
    peaks = defaultdict(Counter)
    # require >= 50% overlap; tally one count per peak hitting the gene
    for hit in peaks_bed.intersect(annotation, f=0.5, wo=True):
        gene_name = hit[6]
        peaks[gene_name][gene_name] += 1
    return peaks
Exemplo n.º 51
0
def cpg_islands(vf, af):
    """Annotate variants with the CpG island they overlap.

    Returns a Series keyed by variant name whose value is the composite
    island identifier built by the awk reshape ($5_$6_$7_$8).
    """
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    # mkstemp instead of the insecure/deprecated mktemp
    fd, tmp_path = tempfile.mkstemp()
    os.close(fd)
    try:
        # reshape: variant columns plus a composite island id
        awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$5"_"$6"_"$7"_"$8}\' %s > %s' % (overlap.fn, tmp_path)
        call(awk_cmd, shell=True)
        intersection = BedTool(tmp_path)
        results = {}
        for r in intersection:
            results[r.name] = r[4]
    finally:
        # BUGFIX: the original leaked its temporary file
        os.remove(tmp_path)
    return Series(results, name="cpg_island")
Exemplo n.º 52
0
def consolidate(nbedfile, obedfile, cbedfile):
    """Merge a new bed and an old bed into a consolidated bed (cbedfile).

    Same-strand features present in both files are merged (names collapsed,
    scores averaged); features unique to the new bed are kept as-is; the
    result is coordinate-sorted and duplicate accessions are de-duplicated.
    """
    from pybedtools import BedTool
    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)

    # same-strand features present in both files (-u reports each at most once)
    ab = nbedtool.intersect(obedtool, s=True, u=True)
    ba = obedtool.intersect(nbedtool, s=True, u=True)

    cmd = "cat {0} {1} | sort -k1,1 -k2,2n".format(ab.fn, ba.fn)
    fp = popen(cmd)
    ovl = BedTool(fp.readlines())

    # merge overlaps, collapsing names and averaging scores, then re-sort
    abmerge = ovl.merge(s=True, nms=True, scores="mean").sort()
    cmd = "cat {0}".format(abmerge.fn)
    fp = popen(cmd, debug=False)
    ovl = BedTool(fp.readlines())

    # features of the new bed with no same-strand overlap in the merged set
    notovl = nbedtool.intersect(ovl.sort(), s=True, v=True)

    infile = "{0} {1}".format(notovl.fn, ovl.fn)
    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    cmd = "sort -k1,1 -k2,2n"
    sh(cmd, infile=infile, outfile=tmpfile)

    fp = open(cbedfile, "w")
    bed = Bed(tmpfile)
    for b in bed:
        # collapse duplicate accessions in ";"-joined name fields
        if ";" in b.accn:
            accns = set()
            for accn in b.accn.split(";"):
                accns.add(accn)
            b.accn = ";".join(accns)
        print >> fp, b
    fp.close()
    os.remove(tmpfile)

    sort([cbedfile, "-i"])
Exemplo n.º 53
0
def dnase_fps(vf, af):
    print "inside dnase_fps"
    v = BedTool(vf)
    feats = BedTool(af)
    results = {}
    intersection = feats.intersect(v, wb=True)
    if len(intersection) > 0:
        sort_cmd = 'sort -k6,6 -k7,7n -k8,8n -k9,9 %s -o %s' % (intersection.fn, intersection.fn)
        call(sort_cmd, shell=True)
        annots = intersection.groupby(g=[6,7,8,9], c=4, o='collapse')
        for entry in annots:
            cells = entry[4].split(',') 
            results[entry.name] = len(cells)
    print "exiting dnase_fps"
    return Series(results, name='dnase_fps')
Exemplo n.º 54
0
def motifs(vf, af):
    """Collapse motif annotations per variant into a Series keyed by name."""
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    # sort the intersection in place so groupby receives grouped input
    sort_cmd1 = 'sort -k1,1 -k2,2n -k3,3n -k4,4 %s -o %s' % (overlap.fn, overlap.fn)
    call(sort_cmd1, shell=True)
    # mkstemp instead of the insecure/deprecated mktemp
    fd, tmp_path = tempfile.mkstemp()
    os.close(fd)
    try:
        # reshape: variant columns plus one composite motif id
        awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$5"__"$6"__"$7"__"$8"__"$9"__"$10"__"$11"__"$12"__"$13}\' %s > %s' % (overlap.fn, tmp_path)
        call(awk_cmd, shell=True)
        intersection = BedTool(tmp_path)
        annots = intersection.groupby(g=[1,2,3,4], c=5, ops='collapse')
        results = {}
        for entry in annots:
            results[entry.name] = entry[4]
    finally:
        # BUGFIX: the original leaked its temporary file
        os.remove(tmp_path)
    return Series(results, name="pwm")
Exemplo n.º 55
0
def repeats(vf, af):
    """Collapse repeat annotations per variant into a Series keyed by name."""
    v = BedTool(vf)
    feats = BedTool(af)
    intersection = v.intersect(feats, wb=True)
    results = {}
    if len(intersection) > 0:
        # mkstemp instead of the insecure/deprecated mktemp
        fd, tmp_path = tempfile.mkstemp()
        os.close(fd)
        try:
            # reshape: variant columns plus one composite repeat id
            awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$5"_"$6"_"$7"_"$8"_"$9"_"$10}\' %s > %s' % (intersection.fn, tmp_path)
            call(awk_cmd, shell=True)
            reshaped = BedTool(tmp_path)
            annots = reshaped.groupby(g=[1,2,3,4], c=5, ops='collapse')
            for entry in annots:
                results[entry.name] = entry[4]
        finally:
            # BUGFIX: the original leaked its temporary file
            os.remove(tmp_path)

    return Series(results, name='repeat')
Exemplo n.º 56
0
def make_annot_files(args, bed_for_annot):
    """Write a 0/1 annotation column marking bim SNPs inside bed_for_annot."""
    print('making annot file')
    df_bim = pd.read_csv(args.bimfile,
            delim_whitespace=True, usecols=[0, 1, 2, 3],
            names=['CHR', 'SNP', 'CM', 'BP'])
    # one zero-length interval per SNP, chr-prefixed to match the bed naming
    iter_bim = [['chr' + str(chrom), bp, bp]
                for (chrom, bp) in np.array(df_bim[['CHR', 'BP']])]
    annotbed = BedTool(iter_bim).intersect(bed_for_annot)
    df_int = pd.DataFrame({'BP': [iv.start for iv in annotbed], 'ANNOT': 1})
    df_annot = pd.merge(df_bim, df_int, how='left', on='BP')
    df_annot.fillna(0, inplace=True)
    df_annot = df_annot[['ANNOT']].astype(int)
    if args.annot_file.endswith('.gz'):
        # NOTE(review): binary-mode gzip works under Python 2; Python 3
        # pandas would need text mode ('wt') — confirm target interpreter.
        with gzip.open(args.annot_file, 'wb') as f:
            df_annot.to_csv(f, sep="\t", index=False)
    else:
        df_annot.to_csv(args.annot_file, sep="\t", index=False)
Exemplo n.º 57
0
def snp_stats(vf, af, stat='avg_het', flank=500):
    """Average a per-SNP statistic over +/- flank bp around each variant."""
    variants = BedTool(vf)
    feats = BedTool(af)
    # widen each variant by `flank` bp on both sides (hg19 chromosome sizes)
    flanks = variants.slop(g=pybedtools.chromsizes('hg19'), b=flank)
    intersection = feats.intersect(flanks, wb=True)
    results = {}
    if len(intersection) > 0:
        # sort in place on the variant columns so groupby sees grouped input
        sort_cmd = 'sort -k6,6 -k7,7n -k8,8n -k9,9 %s -o %s' % (intersection.fn, intersection.fn)
        call(sort_cmd, shell=True)
        annots = intersection.groupby(g=[6,7,8,9], c=5, ops='collapse')

        for entry in annots:
            # left-fold of float additions, identical to the original reduce
            total = 0.
            for rate in entry[4].split(','):
                total += float(rate)
            results[entry.name] = total / (flank * 2)

    return Series(results, name=stat)