def ProcessRegion(RawGEMs):
    Temp = RawGEMs.groupby(g=[1, 2, 3, 5],
                           c=[5, 6, 7, 8],
                           o=['count', 'collapse', 'collapse', 'collapse'])
    RefineGEMs = Temp.filter(lambda F: int(F[4]) > 1)
    # saveas() materializes the result of filter(); the temp files can be removed manually after the job
    Test = BedTool(RefineGEMs).saveas()
    Tempstr = ''
    for i in range(Test.count()):
        Start = np.fromstring(Test[i][6], dtype=int, sep=', ')
        End = np.fromstring(Test[i][7], dtype=int, sep=', ')
        Mcount = Test[i][5].count('P')
        Start.sort()
        End.sort()
        # Output fields: chrom, start_min, end_min, GEM ID, #fragments,
        # #P-fragments, middle start,end pairs ending in -1,-1, start_max, end_max
        for j in range(len(Start)):
            if j == 0:
                Tempstr += Test[i][0] + ' ' + str(Start[j]) + ' ' + str(
                    End[j]) + ' ' + Test[i][3] + ' ' + str(
                        len(Start)) + ' ' + str(Mcount) + ' '
            elif len(Start) != 2 and j != (len(Start) - 1):
                Tempstr += str(Start[j]) + ',' + str(End[j]) + ','
            elif j == (len(Start) - 1):
                Tempstr += str('-1,-1') + ' ' + str(Start[j]) + ' ' + str(
                    End[j]) + '\n'

    FinalGEMs = BedTool(Tempstr, from_string=True)
    return FinalGEMs
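A minimal usage sketch (hypothetical data): ProcessRegion expects an eight-column BedTool laid out like the tuples built in mainfunc further down, with rows for the same GEM adjacent so groupby can collapse them.

from pybedtools import BedTool

# Two fragments of one hypothetical GEM:
# (chrom, 0, 0, score, GEM id, P/E annotation, fragment start, fragment end)
raw = BedTool([
    ('chr1', '0', '0', '1', '100', 'P', '1000', '1500'),
    ('chr1', '0', '0', '1', '100', 'E', '2000', '2500'),
])
# Expected: chr1 1000 1500 100 2 1 -1,-1 2000 2500
print(ProcessRegion(raw))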
Example #2
def compute_all_overlaps(*peak_calls):
    """
    Each peak_call in peak_calls should be a namedtuple with fields
    name and file.

    file is a bed file defining peak regions.

    Computes overlaps for every pair, triple, quadruple, etc. of the input peak calls.

    Returns a list of Group(peak_calls, overlap) namedtuples.
    """
    
    result = []

    Group = namedtuple("Group", ["peak_calls", "overlap"])

    # start at 1 so combinations() never yields the empty group
    for num in range(1, len(peak_calls) + 1):
        for group in combinations(peak_calls, num):
            if len(group) > 1:
                count = compute_overlap(group)
                names = [x.name for x in group]
                result.append(Group(names, count))
            else:  # only one group member
                pbed = BedTool(group[0].file)
                count = pbed.count()
                result.append(Group(group[0].name, count))

    return result
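A hedged usage sketch: PeakCall is a hypothetical namedtuple matching the documented fields, the bed paths are placeholders, and compute_overlap is assumed to be defined alongside this function.

from collections import namedtuple

PeakCall = namedtuple("PeakCall", ["name", "file"])
calls = [PeakCall("rep1", "rep1_peaks.bed"), PeakCall("rep2", "rep2_peaks.bed")]
for group in compute_all_overlaps(*calls):
    print(group.peak_calls, group.overlap)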
Example #3
def bed_intersection_scores(region_bedtool, feature_bedtool, f=0.5, F=0.5, e=True, score_index=4, **kwargs):
    """
    Intersects regions with a feature bed and returns 0 where no intersection
    is found, or the feature's score where one is found.
    """
    region_bedtool = BedTool(region_bedtool)

    if feature_bedtool is not None:
        # Get intersection labels and intersecting region subset from a sorted
        # region bedtool
        feature_bedtool = BedTool(feature_bedtool).sort()
	print("Feature bedtool sorted")
        intersect_labels = bed_intersection_labels(
            region_bedtool, feature_bedtool, f=f, F=F)
        intersecting_regions = region_bedtool.at(
            np.where(intersect_labels == 1)[0])
        # For intersecting subset, get matched intersecting feature regions
        matched_intersects = intersecting_regions.intersect(
            BedTool(feature_bedtool), wao=True, f=f, F=F, e=e)
        # Group by the score column in the matched region bedtool.
        # It's the user-defined score index shifted by the total number
        # of fields/columns in the region bed file.
        groupby_col_index = score_index + intersecting_regions.field_count()
        grouped_matched_intersects = matched_intersects.groupby(
            g=[1, 2, 3], c=groupby_col_index, o="max")
        # Initialize score array, store scores where intersection labels are 1
        scores = np.zeros(intersect_labels.shape)
        intersection_scores = [interval.fields[-1]
                               for interval in grouped_matched_intersects]
        scores[intersect_labels == 1] = np.array(
            intersection_scores, dtype=float)
        return scores
    else:
        return -1 * np.ones((region_bedtool.count(), 1))
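For illustration, a hypothetical call with placeholder paths; with a narrowPeak feature file, score_index=7 would pick up the signalValue column.

# scores has shape (n_regions, 1): 0.0 where a region hits no feature,
# otherwise the max feature score among its overlaps.
scores = bed_intersection_scores("regions.bed", "peaks.narrowPeak", score_index=7)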
Example #4
def vcf_intersect(vcf_path, bed_panel):
    """Uses Pybedtoolst a.intersect(b) to ectract from VCF variants mapping in the gene panel intervals

    Args:
        1) a path to a valid VCF file to estract variants from.
        2) bed file containing the genetic intervals of interest.

    Returns:
        A tuple, containing:
            1) a VCF object built from the intervals-filtered VCF
            2) the number of original intervals in the bed panel
            3) the number of variants mapping to these intervals
    """

    try:
        vcf_file = BedTool(vcf_path)
        gene_panel = BedTool(bed_panel) # gene panel doesn't need to be sorted by chrom and position.

        # Do the actual filtering and create a mini VCF with only the variants from the bed file's intervals:

        LOG.info('Computing intersections between interval filter and VCF file..')
        intersections = vcf_file.intersect(gene_panel, header=True)
        panel_intervals = gene_panel.count()
        intersected_vars = intersections.count()

        LOG.info('Extracting %s intervals from the %s total entries of the VCF file.', panel_intervals, vcf_file.count())
        LOG.info('Number of variants found in the intervals:%s', intersected_vars)

        temp_intersections_file = NamedTemporaryFile('w+t', dir=os.getcwd())
        intersections.saveas(temp_intersections_file.name)
        mini_VCF = VCF(temp_intersections_file.name)

        #remove temporary file:
        temp_intersections_file.close()

        # Return a tuple with:
        # a mini-VCF file object
        # the number of original intervals in the bed panel
        # the number of variants mapping to these intervals
        return (mini_VCF, panel_intervals, intersected_vars)

    except Exception as e:

        LOG.critical(e)
        return False
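A hedged usage sketch with placeholder paths; the function returns False on any exception, so check before unpacking.

result = vcf_intersect("sample.vcf", "panel.bed")
if result:
    mini_vcf, panel_intervals, intersected_vars = result
    print(panel_intervals, intersected_vars)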
Example #5
def bed_intersection_labels(region_bedtool, feature_bedtool, f=0.5, F=0.5, e=True, **kwargs):
    """
    Intersects regions with a feature bed and returns binary labels.
    """
    region_bedtool = BedTool(region_bedtool)
    if feature_bedtool is not None:
        try:
            overlap_counts = [interval.count for interval in
                              region_bedtool.intersect(BedTool(feature_bedtool), c=True, f=f, F=F, e=e, **kwargs)]
        except Exception:  # handle unexpected field numbers in the feature bedtool by truncating it to bed3
            feature_df = BedTool(feature_bedtool).to_dataframe()
            feature_bedtool = BedTool.from_dataframe(
                feature_df.iloc[:, [0, 1, 2]])
            overlap_counts = [interval.count for interval in
                              region_bedtool.intersect(feature_bedtool, c=True, f=f, F=F, e=e, **kwargs)]
        labels = np.array(overlap_counts) > 0
        return labels.astype(int)[:, np.newaxis]
    else:
        return (AMBIG_LABEL * np.ones((region_bedtool.count(), 1))).astype(int)
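For illustration, a hypothetical call with placeholder paths.

# labels is an (n_regions, 1) array of 0/1 ints; a region gets 1 if it
# overlaps at least one feature at the f/F/e thresholds.
labels = bed_intersection_labels("bins.bed", "peaks.bed", f=0.5, F=0.5)
print(labels.sum(), "positive bins")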
Example #6
    def comp_score(self, gap_penalty):
        '''compute_score_given_gap_penalty'''
        if gap_penalty in self.__cache:
            return self.__cache[gap_penalty]['score']

        gb = self.orig_bins.scale_neg_scores(gap_penalty)
        observed_result = gb.max_segments()
        mc_res = MonteCarlo.run_simulation(gb.chrom_scores,
                                           niter=self.mc_trials,
                                           nprocs=self.nprocs)
        tester = IntervalTest(observed_result, mc_res)
        segments = [
            segment for (segment, pval) in tester.pvalues()
            if pval < self.pval_lim
        ]
        if len(segments) == 0:
            # no potential peaks found
            log.notice('''Gap penalty of %.2f gives a score of 0.0 \
            (0 potential peaks with 0.00MB coverage)''' % gap_penalty)
            self.__cache[gap_penalty] = {'score': 0.00}
            return 0.0
        # TODO use bx.python instead of pybedtools
        peaks_sb = StringIO.StringIO()
        tester.segments_to_bedstream(segments, peaks_sb)
        peaks = BedTool(peaks_sb.getvalue(), from_string=True)
        d = self.count_stats(self.bins_bedtool.intersect(peaks))
        d['gap-penalty'] = gap_penalty
        try:
            d['peak_EIB_ratio'] = d['EIB'] / float(d['EIB'] + d['DIB'])
        except ZeroDivisionError:
            # no peaks found
            d['peak_EIB_ratio'] = 0.0
        d['global_EIB_coverage'] = d['EIB'] / float(
            self.genome_wide_stats['EIB'])
        d['score'] = d['peak_EIB_ratio']**5 * d['global_EIB_coverage']
        peak_cov = sum(x.end - x.start for x in peaks) / 1e6
        log.notice('''Gap penalty of %.2f gives a score of %.3f \
        (%d potential peaks with %.2fMB coverage)''' %
                   (gap_penalty, d['score'], peaks.count(), peak_cov))
        self.__cache[gap_penalty] = d
        return d['score']
Example #8
def main():
	
	error_site = BedTool('/home/kwoklab-user/Error_exome_all.bed')
#	protein_coding_site = BedTool('/home/kwoklab-user/Shared_resources/gemini/data/gencode15.protein_coding.20130131.hg19.bed')
	gemini_data_dir = "/home/kwoklab-user/Shared_resources/gemini/data"

	dbsnp_137_site = BedTool('/home/kwoklab-user/Shared_resources/gemini/data/dbsnp.137.vcf.gz')
	#https://code.google.com/p/discovering-cse/
	cse_site = BedTool('/home/kwoklab-user/Shared_resources/gemini/data/cse-hiseq-8_4-2013-02-20.bed.gz')
#	gms_site = BedTool( path.join(gemini_data_dir, 'GRCh37-gms-mappability.vcf.gz'))
	rmsk_site = BedTool(path.join(gemini_data_dir, 'hg19.rmsk.bed.gz'))
	segdup_site = BedTool(path.join(gemini_data_dir, 'hg19.segdup.bed.gz'))
	clinvar_site = BedTool(path.join(gemini_data_dir,'clinvar_20130118.vcf.gz'))
	dgv_site = BedTool(path.join(gemini_data_dir,'hg19.dgv.bed.gz'))
	CpG_site = BedTool(path.join(gemini_data_dir,'hg19.CpG.bed.gz'))
	
	print "total error sites %d" % error_site.count()
	print "error in cse site %d" % (error_site+cse_site).count()
	print "error in dbsnp137 site %d" %(error_site+dbsnp_137_site).count()
	print "errors in repeat mask region %d" % (error_site+rmsk_site).count()
	print "errors in segdup %d" % (error_site+segdup_site).count()
	print "errors in Clinvar %d" % (error_site+clinvar_site).count()
	print "errors in dgv %d" % (error_site+dgv_site).count()
	print "errors in CpG %d" % (error_site+CpG_site).count()
def mainfunc2(path1,
              path2,
              savebedpath,
              savecsvpath,
              tmpfilepath,
              RegInterval,
              cr_id,
              Thread,
              Length=4000):
    # path1: path for GEMs (e.g. ___ALL.region.PEanno)
    # path2: path for Region (e.g. ____PETcnt_G9.motifannot)
    # savebedpath: path for saving extracted GEMs in .bed
    # savecsvpath: path for saving summary table in .csv
    # tmpfilepath: directory for temp files produced by pybedtools
    # RegInterval: iterable of region indices to process, or 'All'
    # cr_id: id used in the output csv filename
    # Thread: suffix for naming intermediate files (e.g. '0')
    # Length: length of extension. Default = 4000 (int)
    pybedtools.helpers.cleanup()
    pybedtools.set_tempdir(tmpfilepath)
    # Specify for the path of ___ALL.region.PEanno and import it (GEMs)
    #     path1 = 'Minji_data/SHG0180-181-182NR_hg38_cohesin_FDR_0.1_ALL_motifext4kbboth.region.PEanno'
    ChIA_Drop = BedTool(path1)

    # Specify for the path of ____PETcnt_G9.motifannot and import it (anchors, regions)
    #     path2 = 'Minji_data/LHG0052H.e500.clusters.cis.bothanchint_G250.PETcnt_G9.motifannot.sorted.domains'
    Region_short = BedTool(path2)

    #     # Remove unnecessary entries
    #     Region_short = Region.groupby(g=[1,2,6,12,14,20,8,9,16,21], c=[12], o=['count'])
    #     Region_short.moveto('Region_short.bed')
    #     Region_short = BedTool('Region_short.bed')
    Max_iter = Region_short.count()
    if RegInterval == 'All':
        RegInterval = range(1, Max_iter)
#     Length = 4000

    List1 = []
    NowRegion = BedTool(Region_short[0:1]).saveas()
    # Find all fragments that intersect with Nowregion
    Intersection = ChIA_Drop.intersect(NowRegion, wa=True)
    # Append the original start/end; kept so the later groupby can collapse them.
    results = [(f[0], '0', '0', f[3], f[4], f[5], f[1], f[2])
               for f in Intersection]
    Intersection = BedTool(results)

    # Sort on the grouping key; otherwise the later groupby does not group as intended.
    Intersection = Intersection.sort(
        chrThenScoreA=True).saveas('dummyfiles/Intersection' + Thread)

    #     ipdb.set_trace()
    # Dict = {'Type/loopID': ['Left_0','Left_1','Right_0','Right_1','Both_0','Both_1','None_0','None_1','Total','Left Intensity', 'Right Intensity','Left motif strand', 'Right motif strand']}
    for i in RegInterval:
        TempRegion = BedTool(Region_short[i:i + 1]).saveas()
        GEMid = TempRegion[0][3]
        if GEMid[-1] == 'S':
            MidRegion = np.array([TempRegion[0][5],
                                  TempRegion[0][2]]).astype(int)
        else:
            continue

        Len = Find2side(Intersection, NowRegion[0], savebedpath, Thread,
                        MidRegion, GEMid[:-1])
        List1.append([GEMid[:-1], Len])


#         ipdb.set_trace()

    DF1 = pd.DataFrame(List1, columns=['crID_M:x', '# of complexes'])

    DF1.to_csv(savecsvpath + 'List3_' + cr_id + '.csv', index=False)
def mainfunc(path1,
             path2,
             savebedpath,
             savecsvpath,
             tmpfilepath,
             RegInterval,
             cr_id,
             Thread,
             Length=4000):
    # path1: path for GEMs (e.g. ___ALL.region.PEanno)
    # path2: path for Region (e.g. ____PETcnt_G9.motifannot)
    # savebedpath: path for saving extracted GEMs in .bed
    # savecsvpath: path for saving summary table in .csv
    # tmpfilepath: directory for temp files produced by pybedtools
    # RegInterval: iterable of region indices to process, or 'All'
    # cr_id: id used in the output csv filenames
    # Thread: suffix for naming intermediate files (e.g. '0')
    # Length: length of extension. Default = 4000 (int)
    pybedtools.helpers.cleanup()
    pybedtools.set_tempdir(tmpfilepath)
    # Specify for the path of ___ALL.region.PEanno and import it (GEMs)
    #     path1 = 'Minji_data/SHG0180-181-182NR_hg38_cohesin_FDR_0.1_ALL_motifext4kbboth.region.PEanno'
    ChIA_Drop = BedTool(path1)

    # Specify for the path of ____PETcnt_G9.motifannot and import it (anchors, regions)
    #     path2 = 'Minji_data/LHG0052H.e500.clusters.cis.bothanchint_G250.PETcnt_G9.motifannot.sorted.domains'
    Region_short = BedTool(path2)

    #     # Remove unnecessary entries
    #     Region_short = Region.groupby(g=[1,2,6,12,14,20,8,9,16,21], c=[12], o=['count'])
    #     Region_short.moveto('Region_short.bed')
    #     Region_short = BedTool('Region_short.bed')
    Max_iter = Region_short.count()
    if RegInterval == 'All':
        RegInterval = range(Max_iter)
#     Length = 4000

    List1 = []
    List2 = []
    #     ipdb.set_trace()
    # Dict = {'Type/loopID': ['Left_0','Left_1','Right_0','Right_1','Both_0','Both_1','None_0','None_1','Total','Left Intensity', 'Right Intensity','Left motif strand', 'Right motif strand']}
    for i in RegInterval:
        # NowRegion: chrom, start_min, end_max, loop id, ...
        # This line can be improved...
        #     NowRegion = NowRegion.saveas('NowRegion.bed')
        NowRegion = BedTool(Region_short[i:i + 1]).saveas()
        # Find all fragments that intersect with Nowregion
        Intersection = ChIA_Drop.intersect(NowRegion, wa=True)
        # Append the original start/end; kept so the later groupby can collapse them.
        results = [(f[0], '0', '0', f[3], f[4], f[5], f[1], f[2])
                   for f in Intersection]
        Intersection = BedTool(results)

        # Sort on the grouping key; otherwise the later groupby does not group as intended.
        Intersection = Intersection.sort(chrThenScoreA=True)
        # Extract the valid GEMs
        FinalGEMs = ProcessRegion(Intersection, Thread)
        #         ipdb.set_trace()
        # Classify+sort+save
        if NowRegion[0][3][-2:] == 'SE':
            Count_L0, Count_L1 = SortGEM(FinalGEMs, NowRegion[0], 'Left',
                                         Length, savebedpath)
            Count_R0, Count_R1 = SortGEM(FinalGEMs, NowRegion[0], 'Right',
                                         Length, savebedpath)
            Count_B0, Count_B1 = SortGEM(FinalGEMs, NowRegion[0], 'Both',
                                         Length, savebedpath)

            Count_L = Count_L0 + Count_L1
            Count_R = Count_R0 + Count_R1
            Count_B = Count_B0 + Count_B1

            CRID = NowRegion[0][3][:-3]
            List1.append([CRID, 'S_to_E', Count_L])
            List1.append([CRID, 'E_to_S', Count_R])
            List1.append([CRID, 'S_and_E', Count_B])

            TempList = [CRID, Count_L + Count_R + Count_B]
        elif NowRegion[0][3][-1] == 'S':
            Count_R0, Count_R1 = SortGEM(FinalGEMs, NowRegion[0], 'Right',
                                         Length, savebedpath)
            Count_R = Count_R0 + Count_R1

            MID = NowRegion[0][3][len(CRID) + 1:-1]
            List1.append([CRID, MID + '_to_S', Count_R])
            Count_mid = Count_R
        elif NowRegion[0][3][-1] == 'E':
            Count_L0, Count_L1 = SortGEM(FinalGEMs, NowRegion[0], 'Left',
                                         Length, savebedpath)
            Count_L = Count_L0 + Count_L1

            MID = NowRegion[0][3][len(CRID) + 1:-1]
            List1.append([CRID, MID + '_to_E', Count_L])
            Count_mid += Count_L
            #             ipdb.set_trace()
            NowList = TempList.copy()
            NowList.extend([MID, Count_mid])
            List2.append(NowList)
#         Count_N0,Count_N1 = SortGEM(FinalGEMs, NowRegion[0],'None',Length,savebedpath)
#         Total = Count_L0+Count_L1+Count_R0+Count_R1+Count_B0+Count_B1+Count_N0+Count_N1

#         # Write into dictionary
#         Dict[NowRegion[0][3]] = [NowRegion[0][3],Count_L0,Count_L1,Count_L0+Count_L1,(Count_L0+Count_L1)/Total*100,
#                                  Count_R0,Count_R1,Count_R0+Count_R1,(Count_R0+Count_R1)/Total*100,
#                                  Count_B0,Count_B1,Count_B0+Count_B1,(Count_B0+Count_B1)/Total*100,
#                                  Count_N0,Count_N1,Count_N0+Count_N1,(Count_N0+Count_N1)/Total*100,
#                                  Total,Total-(Count_N0+Count_N1),(Total-(Count_N0+Count_N1))/Total*100,
#                                  NowRegion[0][0]+':'+str(NowRegion[0][1])+'-'+str(NowRegion[0][2])]
#         # Clear all temp files for this session
#         pybedtools.helpers.cleanup()

#     RenameCol = {}
#     NewCol = ['LoopID','Left_0','Left_1','Left_Tol','Left_Tol %','Right_0','Right_1','Right_Tol','Right_Tol %',
#               'Both_0','Both_1','Both_Tol','Both_Tol %',
#               'None_0','None_1','None_Tol','None_Tol %','Total','Total-None','Total-None %',
#               'Region']
#     for i, name in enumerate(NewCol):
#         RenameCol[i] = NewCol[i]
    DF1 = pd.DataFrame(List1,
                       columns=['crID', 'orientation', '# of complexes'])
    DF2 = pd.DataFrame(List2,
                       columns=['crID', 'anchorcomp', 'middleID', 'loadcomp'])

    DF1.to_csv(savecsvpath + 'List1_' + cr_id + '.csv', index=False)
    DF2.to_csv(savecsvpath + 'List2_' + cr_id + '.csv', index=False)
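mainfunc and mainfunc2 share the same signature; a hypothetical driver call with placeholder paths (SortGEM, Find2side, and the output/temp directories are assumed to exist in this module's environment):

mainfunc('GEMs_ALL.region.PEanno', 'regions.motifannot',
         'out/bed/', 'out/csv/', 'tmp/', 'All', 'cr1', '0', Length=4000)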
Example #11
def py_peak_calling(bedgraph, threshold, min_length, inter_peak_distance,
                    merge_close_peaks, keep_highest_close_peak, max_length,
                    generate_ID, output_name, delete_overlap_bed):
    import pybedtools
    import glob
    import sys
    from pybedtools import BedTool
    import pandas as pd
    import csv

    if merge_close_peaks == keep_highest_close_peak:
        print 'Exiting... merge_close_peaks and keep_highest_close_peak set the same'
        sys.exit()

    #generate name for output
    bedgraph_name = glob.glob(bedgraph)
    filtered_name = bedgraph_name[0].replace('.bedgraph', 'filtered.bedgraph')

    if output_name != 'None':
        filename = output_name

    elif output_name == 'None':
        filename = bedgraph_name[0].replace('.bedgraph', '_peaks.bed')

    print 'input bedgraph file: ' + bedgraph_name[0]
    print 'output filename: ' + filename

    #import data as BedTool
    data = BedTool(bedgraph)
    print 'total sites read: ',
    print len(data)

    #retains intervals above threshold
    above_thresh = data.filter(
        lambda b: float(b.name) >= float(threshold)).saveas(filtered_name)
    print 'sites above threshold: ',
    print len(above_thresh)
    if len(above_thresh) == 0:
        print 'no regions are above the threshold\n'
        sys.exit()

    #merge adjacent above-threshold regions and sum bedgraph scores (assumes bedgraph score in col 4)
    #d: max distance between merged features; c: column to operate on
    merge_regions = above_thresh.merge(d=10, c=4, o='sum').saveas('temp.bed')

    #filter based on length criteria
    peaks = BedTool(
        merge_regions.filter(lambda x: len(x) >= min_length and len(x) <=
                             max_length)).saveas('temp2.bed')
    print 'number of regions identified: ' + str(peaks.count())

    if merge_close_peaks == 'True':
        if len(peaks) > 0:
            #merge the bona fide peaks if they are closer than the inter-peak distance; sum scores and sort
            print 'merging peaks that are closer than: ' + str(
                inter_peak_distance)
            merge_peaks = peaks.merge(d=inter_peak_distance, c=4,
                                      o='sum').sort().saveas('temp3.bed')
        if len(peaks) == 0:
            print 'no regions can be merged'
            merge_close_peaks = 'False'
            keep_highest_close_peak = 'True'

    if keep_highest_close_peak == 'True':
        #need to read each line to find close peaks and throw away the one with the lowest score out of the two
        print 'entering loop'
        #        if len(peaks) > 0:
        peaks.saveas('temp_input.bed')

        #print 'before keeping highest, number of regions identified: ' + str(BedTool('temp_input.bed').count())

        last_line = [
            str(item)
            for item in (BedTool('temp_input.bed').to_dataframe().tail(
                n=1).iloc[0, :].tolist())
        ]

        with open('temp_input.bed') as myfile:
            with open('test_output.bed', 'w') as output:
                file_output = csv.writer(output, delimiter='\t')

                prev_line = None

                for line in csv.reader(myfile, delimiter='\t'):
                    print 'testing line: ' + str(line)

                    if prev_line is None:
                        prev_line = line
                        print

                    elif float(prev_line[2]) + float(
                            inter_peak_distance) <= float(line[1]):
                        print 'prev_line: ' + str(prev_line)
                        print 'line: ' + str(line)
                        print 'features far apart, so adding'
                        print
                        file_output.writerow(prev_line)
                        prev_line = line

                    else:
                        print 'prev_line: ' + str(prev_line)
                        print 'line: ' + str(line)
                        print 'features must be close'
                        print
                        if float(prev_line[3]) < float(line[3]):
                            prev_line = line
                            print 'prev_line smaller, so new prev_line'
                            print 'prev_line: ' + str(prev_line)
                            print

                print 'finished reading lines'
                print line
                print last_line
                if line == last_line:
                    print 'must be last line'
                    file_output.writerow(prev_line)

        merge_peaks = BedTool('test_output.bed')
    print 'number of peaks found: ' + str(merge_peaks.count())

    if delete_overlap_bed != None:
        print 'delete_overlap_bed provided: ' + delete_overlap_bed
        merge_peaks = merge_peaks.intersect(b=delete_overlap_bed, v=True)
        print 'number of peaks retained: ' + str(merge_peaks.count())

    if not generate_ID:
        print 'saving sorted peak bed file with no ID'

        merge_peaks.saveas(filename)

    if generate_ID:
        print 'saving sorted peak bed file with ID names'

        #change to pandas dataframe
        DF_peaks = merge_peaks.to_dataframe()

        #insert new column with id: 1.... # of peaks
        DF_peaks.insert(
            3, 'id',
            ['id' + str(item) for item in range(1, (len(DF_peaks) + 1))])

        #save output
        DF_peaks.to_csv(filename, sep='\t', header=False, index=False)

    return 'Finished'
Example #12
def get_tf_predictive_setup(true_feature_bedtools, region_bedtool=None,
                            ambiguous_feature_bedtools=None,
                            bin_size=200, flank_size=400, stride=50,
                            n_jobs=1, genome='hg19',
                            min_bin_distance_to_chrom_edge=5000,
                            filter_flank_overlaps=False):
    """
    Implements the tf (and general) imputation data setup for a single sample.
    TODOs
        support chrom.sizes file for personal genomes

    Parameters
    ----------
    true_feature_bedtools : list of filenames, BedTools or None items
        None items are treated as missing data.
    region_bedtool : filename or BedTool, optional
        If not set, the union of true_feature_bedtools is used.
    filter_flank_overlaps : bool, default: False
        Labels negative bins whose flanks overlap target regions as ambiguous.
    ambiguous_feature_bedtools : list of filenames, BedTools or None items, optional
    genome : str, default: 'hg19'
        Can be any genome name supported by pybedtools.
    """
    # initialize feature bedtools
    true_feature_bedtools = [BedTool(bedtool) if bedtool is not None else None
                             for bedtool in true_feature_bedtools]
    # sanity checks
    if ambiguous_feature_bedtools is not None:
        assert len(ambiguous_feature_bedtools) == len(true_feature_bedtools)
        ambiguous_feature_bedtools = [BedTool(bedtool) if bedtool is not None else None
                                      for bedtool in ambiguous_feature_bedtools]
    # merge and bin region_bedtools
    if region_bedtool is not None:
        print(region_bedtool)
        region_bedtool = BedTool(region_bedtool).sort()
        print("Made Bedtool")
        region_bedtool = region_bedtool.merge()
        bins = bin_bed(region_bedtool, bin_size=bin_size, stride=stride)
    else:  # use union of true peak bedtools
        bedtools_to_merge = [
            bedtool for bedtool in true_feature_bedtools if bedtool is not None]
        region_bedtool = BedTool.cat(
            *bedtools_to_merge, postmerge=True, force_truncate=True)
        bins = bin_bed(region_bedtool, bin_size=bin_size, stride=stride)
    # throw out bins within 5kb of chromosome edge
    genome_chrom_sizes = getattr(genome_registry, genome)
    bins = bins.each(filter_by_chrom_sizes, genome_chrom_sizes,
                     min_bin_distance_to_chrom_edge)
    # filter bins to chr1-22,X,Y
    chrom_list = ["chr%i" % (i) for i in range(1, 23)]
    chrom_list += ["chrX", "chrY"]
    bins = BedTool(bins).each(filter_interval_by_chrom, chrom_list)
    bins = bins.saveas()  # save to temp file to enable counting
    num_bins = bins.count()
    # attach chromosome sizes for the requested genome
    bins = bins.set_chromsizes(genome)
    # intersect bins and tf_true_peaks for true labels
    if n_jobs == 1:
        true_labels_list = []
        for true_feature_bedtool in true_feature_bedtools:
            true_labels = bed_intersection_labels(bins, true_feature_bedtool)
            true_labels_list.append(true_labels)
    elif n_jobs > 1:  # multiprocess bed intersections
        # save feature bedtools in temp files. Note: not necessary when inputs
        # are filenames
        true_feature_fnames = [
            bedtool.fn if bedtool is not None else None for bedtool in true_feature_bedtools]
        true_labels_list = Parallel(n_jobs=n_jobs)(delayed(bed_intersection_labels)(bins.fn, fname)
                                                   for fname in true_feature_fnames)
    true_labels = np.concatenate(true_labels_list, axis=1)
    bins_and_flanks = bins.slop(b=flank_size)
    if filter_flank_overlaps:
        # intersect bins and flanks for any overlap  with true features
        if n_jobs == 1:
            flank_labels_list = []
            for true_feature_bedtool in true_feature_bedtools:
                flank_labels = bed_intersection_labels(
                    bins, true_feature_bedtool, f=10**-9, F=10**-9)
                flank_labels_list.append(flank_labels)
        elif n_jobs > 1:
            flank_labels_list = Parallel(n_jobs=n_jobs)(delayed(bed_intersection_labels)(bins.fn, bedtool.fn, f=10**-9, F=10**-9)
                                                        for bedtool in true_feature_bedtools)
        flank_labels = np.concatenate(flank_labels_list, axis=1)
        # we label negative bins with any flank overlap with true features as
        # ambiguous
        true_labels[(true_labels == 0) * (flank_labels == 1)] = AMBIG_LABEL
    if ambiguous_feature_bedtools is not None:
        # intersect bins and ambiguous tfs for ambiguous labels
        if n_jobs == 1:
            ambiguous_labels_list = []
            for ambiguous_feature_bedtool in ambiguous_feature_bedtools:
                ambiguous_labels = bed_intersection_labels(
                    bins, ambiguous_feature_bedtool)
                ambiguous_labels_list.append(ambiguous_labels)
        elif n_jobs > 1:
            ambiguous_feature_fnames = [
                bedtool.fn if bedtool is not None else None for bedtool in ambiguous_feature_bedtools]
            ambiguous_labels_list = Parallel(n_jobs=n_jobs)(delayed(bed_intersection_labels)(bins.fn, fname)
                                                            for fname in ambiguous_feature_fnames)
        ambiguous_labels = np.concatenate(ambiguous_labels_list, axis=1)
        # we label negative bins that overlap ambiguous feature as ambiguous
        true_labels[(true_labels == 0) * (ambiguous_labels == 1)] = AMBIG_LABEL
        # TODO: do we want to also filter based on any flank overlap with
        # ambiguous features??

    return bins_and_flanks, true_labels
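A hypothetical invocation with placeholder peak and region files:

bins_and_flanks, labels = get_tf_predictive_setup(
    ["tf_rep1.narrowPeak", "tf_rep2.narrowPeak"],
    region_bedtool="accessible_regions.bed",
    bin_size=200, flank_size=400, stride=50)
# labels: (num_bins, num_feature_files) array of 0/1/AMBIG_LABEL values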
Example #13
    line_count = 0
    for interval in bt:
        try:
            if filter_function(interval):
                intervals.append(interval)
            # if line_count % 10000 == 0 and verbose:
            #     print("Processed " + str(line_count) + " intervals ["+str(round(100*float(line_count)/float(initial_count), 2))+"%]")
        except ValueError, e:
            print(filter_name + " filtering failed for line #" +
                  str(line_count))
            print(str(e))
            print(traceback.format_exc())
            print(sys.exc_info()[0])
            print(str(interval))
            for i in range(1, 8):
                print("Field[" + str(i) + "]=" + str(interval[i]))
            sys.exit(1)

        line_count += 1

    # print("Building bedtool from " + str(len(intervals)) + " intervals..")
    rval = BedTool(fn=intervals)
    # print("Built bedtool.")
    if verbose:
        info_string = "Initial: " + str(initial_count) + ", removed " + \
                      str(initial_count-rval.count()) + ", " + str(rval.count()) + " left."
        if filter_name is not None:
            info_string = "[" + filter_name + "] " + info_string
        print(info_string)
    return rval
Example #14
def determine_sex(work_dir, bam_fpath, ave_depth, genome, target_bed=None):
    info()
    info('Determining sex')

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = male_bed.count()
    info('Male region total size: ' + str(male_area_size))

    if target_bed:
        male_bed = BedTool(target_bed).intersect(male_bed).merge()
        target_male_area_size = male_bed.count()
        if target_male_area_size < male_area_size * MALE_TARGET_REGIONS_FACTOR:
            info('Target male region total size is ' + str(target_male_area_size) + ', which is less than the ' +
                 'checked male regions size * ' + str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(male_area_size * MALE_TARGET_REGIONS_FACTOR) + ') - cannot determine sex')
            return None
        else:
            info('Target male region total size is ' + str(target_male_area_size) + ', which is higher than the ' +
                 'checked male regions size * ' + str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(male_area_size * MALE_TARGET_REGIONS_FACTOR) + '). ' +
                 'Determining sex based on coverage in those regions.')
    else:
        info('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_cov_output_fpath = sambamba_depth(work_dir, male_bed, bam_fpath, [])
    chry_mean_coverage = get_mean_cov(chry_cov_output_fpath)
    info('Y key regions average depth: ' + str(chry_mean_coverage))
    ave_depth = float(ave_depth)
    info('Sample average depth: ' + str(ave_depth))
    if ave_depth < AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        info('Sample average depth is too low (less than ' + str(AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        info('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = ave_depth / chry_mean_coverage
        info('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            info('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            info('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s male')
            sex = 'M'
    info('Sex is ' + sex)
    info()
    return sex
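A hypothetical call (paths and the average depth are placeholders; the sambamba and BAM-indexing helpers used above are assumed to be available):

sex = determine_sex('work_dir', 'sample.bam', 42.0, 'hg19', target_bed='targets.bed')
# returns 'M', 'F', or None when sex cannot be determined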
Example #15
split_num = 100

split_region_list = [[] for _ in range(5)]  # note: [[]*5] evaluates to just [[]]
print split_region_list
bed = BedTool('/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test.bed')


bed = BedTool(bed.sort().merge().window_maker(b=bed.fn, w=100))

bed.all_hits(bed[0])  # all_hits() requires an Interval argument

# x = BedTool().window_maker(genome='hg38', w=1000000)
bed.saveas('/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test_w100.bed')

split_num = bed.count() if bed.count() < split_num else split_num

print bed.count()/split_num

# print bed.split(10, 'out')

# print x

n = 0
for region in bed:
    # print region.length
    print str(region).strip()
    n += 1

print n
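For reference, the window_maker(b=..., w=...) call above wraps bedtools makewindows, tiling each interval into fixed-width windows. A minimal sketch on an in-memory interval (coordinates made up):

demo = BedTool('chr1 0 250', from_string=True)
for window in demo.window_maker(b=demo.fn, w=100):
    print str(window).strip()  # chr1 0 100, chr1 100 200, chr1 200 250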
Example #17
def py_peak_calling(bedgraph,
                    threshold,
                    min_length,
                    inter_peak_distance,
                    merge_close_peaks=True,
                    keep_highest_close_peak=False,
                    max_length=10000,
                    generate_ID=True,
                    output_name=None,
                    delete_overlap_bed=None):
    """
    - need to install a more up-to-date varsion of bedtools before invoking Jupyter
      type: module load bedtools/2.21.0
	(1) filters bedgraph based on threshold;
	
	(2) merges adjacent basepairs that are over threshold;
	
  (3) retains peaks that satisfy min/max length criteria; 
	
	(4) merges any peaks that are closer than the inter-peak distance cutoff -or-
  alternatively keeps just the highest peak (this is beta functionality)
	
    - max length is typically defaulted to be very large
    - outputs a bed file (default col4 is the sum of the bedgraph scores; sorted by chrom;start;stop)
    - generate ID: will auto generate a integer list as a ID number (1... number of peaks). This will 
    be reported as column 4 and the bedgraph scores will be shifted to column 5 as per standard bed format
    - note the peak score for merged peak is the *just* the sum of the two individual peaks not the 
    total score in the merged region (i.e. there could be some sub-threshold scores in the intervening 
    space that won't be included)
    -assumes bedgraph in standard format <chr> <start> <stop> <score>
    -output_name = option for user defined name (type with '...'), otherwise will generate name bedgraph_peaks.bed
    -delete_overlap_bed = option to add path to bedfile (as string), whereby any peaks that overlap this bed file will be discarded
    """

    import pybedtools
    import glob
    from pybedtools import BedTool
    import pandas as pd
    import csv

    if merge_close_peaks == keep_highest_close_peak:
        return 'Exiting... merge_close_peaks and keep_highest_close_peak set the same'

    #generate name for output
    bedgraph_name = glob.glob(bedgraph)

    if output_name is not None:
        filename = output_name

    else:
        filename = bedgraph_name[0].replace('.bg', '_peaks.bed')

    print 'input bedgraph file: ' + bedgraph_name[0]
    print 'output filename: ' + filename

    #import data as BedTool
    data = BedTool(bedgraph)

    #retains intervals above threshold
    above_thresh = data.filter(lambda b: float(b.name) >= threshold)

    #merge adjacent above threshold regions and sum bedgraph scores (assumes bedgraph score in col 4)
    #by increasing d value can allow for
    merge_regions = above_thresh.merge(d=0, c=4, o='sum')

    #filter based on length criteria
    peaks = BedTool(
        merge_regions.filter(
            lambda x: len(x) >= min_length and len(x) <= max_length))

    #     print 'number of regions identified before merging or filtering: ' + str(peaks.count())

    if merge_close_peaks == True:
        #merge the bona fide peaks if they are closer than the inter-peak distance; sum scores and sort
        print 'merging peaks that are closer than: ' + str(inter_peak_distance)
        merge_peaks = peaks.merge(d=inter_peak_distance, c=4, o='sum').sort()

    if keep_highest_close_peak == True:
        #need to read each line to find close peaks and throw away the one with the lowest score out of the two
        print 'entering loop'

        peaks.saveas('temp_input.bed')

        print 'before keeping highest, number of regions identified: ' + str(
            BedTool('temp_input.bed').count())

        last_line = [
            str(item)
            for item in (BedTool('temp_input.bed').to_dataframe().tail(
                n=1).iloc[0, :].tolist())
        ]

        with open('temp_input.bed') as myfile:
            with open('test_output.bed', 'w') as output:
                file_output = csv.writer(output, delimiter='\t')

                prev_line = None

                for line in csv.reader(myfile, delimiter='\t'):
                    #                     print 'testing line: ' +str(line)

                    if prev_line is None:
                        prev_line = line
#                         print

                    elif float(prev_line[2]) + float(
                            inter_peak_distance) <= float(line[1]):
                        #                         print 'prev_line: ' + str(prev_line)
                        #                         print 'line: ' + str(line)
                        #                         print 'features far apart, so adding'
                        #                         print
                        file_output.writerow(prev_line)
                        prev_line = line

                    else:
                        #                         print 'prev_line: ' + str(prev_line)
                        #                         print 'line: ' + str(line)
                        #                         print 'features must be close'
                        #                         print
                        if float(prev_line[3]) < float(line[3]):
                            prev_line = line
#                             print 'prev_line smaller, so new prev_line'
#                             print 'prev_line: ' + str(prev_line)
#                             print

#                 print 'finished reading lines'
#                 print line
#                 print last_line
                if line == last_line:
                    #                     print 'must be last line'
                    file_output.writerow(prev_line)

            merge_peaks = BedTool('test_output.bed')

    print 'number of peaks found: ' + str(merge_peaks.count())

    if delete_overlap_bed != None:
        print 'delete_overlap_bed provided: ' + delete_overlap_bed
        merge_peaks = merge_peaks.intersect(b=delete_overlap_bed, v=True)
        print 'number of peaks retained: ' + str(merge_peaks.count())

    if not generate_ID:
        print 'saving sorted peak bed file with no ID'

        merge_peaks.saveas(filename)

    if generate_ID:
        print 'saving sorted peak bed file with ID names'

        #change to pandas dataframe
        DF_peaks = merge_peaks.to_dataframe()

        #insert new column with id: 1.... # of peaks
        DF_peaks.insert(
            3, 'id',
            ['id' + str(item) for item in range(1, (len(DF_peaks) + 1))])

        #save output
        DF_peaks.to_csv(filename, sep='\t', header=False, index=False)

    return 'Finished'
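A hedged example call ('sample.bg' is a placeholder bedgraph path; the defaults merge close peaks and auto-generate IDs):

py_peak_calling('sample.bg', threshold=5.0, min_length=50,
                inter_peak_distance=100)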