# Shared imports for the snippets below.
import os
import sys
import tempfile
from subprocess import call

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import pybedtools
from pybedtools import BedTool


def ConstructTranscripts(exonAnnoFile):
    '''input: ExonAnnoFile (gene id) sorted by genomic coordinates
           col4: gene_id, col7: transcript_id
    returns: a dict of {gene_id: Transcript object}
    '''
    df_exons = BedTool(exonAnnoFile).to_dataframe()
    df_exons.rename(columns={'name': 'gene_id',
                             'thickStart': 'transcript_id'}, inplace=True)
    df_exons['length'] = df_exons['end'] - df_exons['start']
    # calculate transcript length
    df_transcripts_length = df_exons.groupby('gene_id', as_index=False)['length'].sum()
    df_transcripts_length.rename(columns={'length': 'transcript_length'}, inplace=True)
    # extract transcript start position
    df_transcripts_start = df_exons.groupby('gene_id', as_index=False)['start'].min()
    df_transcripts_start.rename(columns={'start': 'transcript_start'}, inplace=True)
    # merge transcript length and transcript start position into one data frame
    df_transcripts = pd.merge(pd.merge(df_exons, df_transcripts_start, on='gene_id'),
                              df_transcripts_length, on='gene_id')
    # calculate start position of exons relative to the transcripts
    df_transcripts_rel_start = df_transcripts.groupby('gene_id')['length'].cumsum()
    df_transcripts['rel_start'] = df_transcripts_rel_start - df_transcripts['length']
    # create a dict of gene_id: Transcript objects
    transcripts = {}
    for gene_id, group in df_transcripts.groupby('gene_id'):
        firstrow = group.iloc[0]  # .irow() was removed from pandas; use .iloc
        t = Transcript(chrom=firstrow['chrom'],
                       start=firstrow['transcript_start'],
                       length=firstrow['transcript_length'],
                       strand=firstrow['strand'],
                       exon_count=len(group),
                       gene_id=firstrow['gene_id'],
                       transcript_id=firstrow['transcript_id'])
        i = 0
        for _, row in group.iterrows():
            t.exon_length[i] = row['length']
            t.exon_int[i][0] = row['rel_start']
            t.exon_int[i][1] = row['rel_start'] + row['length']
            t.exon_int[i][2] = row['start']
            t.exon_int[i][3] = row['end']
            i += 1
        transcripts[gene_id] = t
    return transcripts
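# Hedged sketch: ConstructTranscripts assumes a Transcript container roughly
# like the one below (per-exon arrays sized by exon_count, filled in the loop
# above). The real class lives elsewhere in this codebase and may differ.
class Transcript(object):
    def __init__(self, chrom, start, length, strand, exon_count,
                 gene_id, transcript_id):
        self.chrom = chrom
        self.start = start                  # transcript start (genomic)
        self.length = length                # summed exon length
        self.strand = strand
        self.exon_count = exon_count
        self.gene_id = gene_id
        self.transcript_id = transcript_id
        # per-exon length, and [rel_start, rel_end, genomic_start, genomic_end]
        self.exon_length = np.zeros(exon_count, dtype=int)
        self.exon_int = np.zeros((exon_count, 4), dtype=int)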
def gene_regions(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    # first establish all the region types present in the annotation file (column 5)
    cols = set(f[4] for f in feats)
    results = {}
    intersection = v.intersect(feats, wb=True)
    if len(intersection) > 0:
        # reshuffle columns so groupby can key on the variant plus region type
        tempfile1 = tempfile.mktemp()
        awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s > %s' % (intersection.fn, tempfile1)
        call(awk_cmd, shell=True)
        intersection = BedTool(tempfile1)
        annots = intersection.groupby(g=[1, 2, 3, 4, 5], c=6, ops='collapse')
        for entry in annots:
            results[entry.name] = Series({entry[4]: entry[5]})
    df = DataFrame(results, index=cols)
    # transpose so region types become columns, and turn the NAs into 0s
    return df.T.fillna(0)
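# Sketch, not the authors' code: the awk column reshuffle in gene_regions (and
# the same pattern in repeats, motifs, and bound_motifs below) can be done in
# pandas once the -wb intersection is loaded as a DataFrame. The hypothetical
# gene_regions_pandas below keeps the same column assumptions (4 variant
# columns, then 5 annotation columns with the region type in the 9th field);
# unlike the dict in gene_regions, the unstack keeps every region type seen
# per variant name.
def gene_regions_pandas(vf, af):
    intersection = BedTool(vf).intersect(BedTool(af), wb=True)
    if len(intersection) == 0:
        return DataFrame()
    ix = pd.read_csv(intersection.fn, sep='\t', header=None)
    # collapse the five annotation fields into one "a_b_c_d_e" tag per hit
    annot = ix[[4, 5, 6, 7, 8]].astype(str).agg('_'.join, axis=1)
    out = DataFrame({'name': ix[3], 'region': ix[8], 'annot': annot})
    return (out.groupby(['name', 'region'])['annot']
               .agg(','.join).unstack(fill_value=0))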
def repeats(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    intersection = v.intersect(feats, wb=True)
    results = {}
    if len(intersection) > 0:
        tempfile1 = tempfile.mktemp()
        awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$5"_"$6"_"$7"_"$8"_"$9"_"$10}\' %s > %s' % (intersection.fn, tempfile1)
        call(awk_cmd, shell=True)
        intersection = BedTool(tempfile1)
        annots = intersection.groupby(g=[1, 2, 3, 4], c=5, ops='collapse')
        for entry in annots:
            results[entry.name] = entry[4]
    return Series(results, name='repeat')
def motifs(vf, af):
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    # sort in place, then reshuffle columns for groupby
    sort_cmd = 'sort -k1,1 -k2,2n -k3,3n -k4,4 %s -o %s' % (overlap.fn, overlap.fn)
    call(sort_cmd, shell=True)
    tempfile1 = tempfile.mktemp()
    awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$5"__"$6"__"$7"__"$8"__"$9"__"$10"__"$11"__"$12"__"$13}\' %s > %s' % (overlap.fn, tempfile1)
    call(awk_cmd, shell=True)
    intersection = BedTool(tempfile1)
    annots = intersection.groupby(g=[1, 2, 3, 4], c=5, ops='collapse')
    results = {}
    for entry in annots:
        results[entry.name] = entry[4]
    return Series(results, name="pwm")
def encode_feats(vf, af):
    results = {}
    with open(af + '.cols', 'r') as fh:
        cols = fh.readline().strip().split(',')
    tempfile1 = tempfile.mktemp()
    intersect_cmd = 'bedtools intersect -wb -a %s -b %s > %s' % (vf, af, tempfile1)
    call(intersect_cmd, shell=True)
    intersection = BedTool(tempfile1)
    # 'freqdesc' yields "value:count,value:count,..." per group; parse into a Series
    annots = intersection.groupby(g=[1, 2, 3, 4], c=10, ops='freqdesc')
    for entry in annots:
        fs = entry[4].strip(',').split(',')
        results[entry.name] = Series({e[0]: int(e[1]) for e in [f.split(':') for f in fs]})
    df = DataFrame(results, index=cols)
    # transpose to turn feature types into columns, and turn all the NAs into 0s
    return df.T.fillna(0)
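# Worked example (illustrative values) of the 'freqdesc' parsing above:
# `bedtools groupby -o freqdesc` emits "value:count" pairs, most frequent
# first, with a trailing comma.
freq = 'Enhancer:3,Promoter:1,'
parsed = Series({e[0]: int(e[1]) for e in [f.split(':') for f in freq.strip(',').split(',')]})
assert parsed['Enhancer'] == 3 and parsed['Promoter'] == 1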
def juncs(args):
    """
    %prog junctions junctions1.bed [junctions2.bed ...]

    Given a TopHat junctions.bed file, trim the read overhang to get the intron span.

    If more than one junction bed file is provided, uniq the junctions and
    calculate cumulative (sum) junction support.
    """
    from tempfile import mkstemp
    from pybedtools import BedTool

    # OptionParser, must_open, sh, sort, and Bed are helpers from the
    # surrounding (jcvi-style) codebase, not standard-library imports.
    p = OptionParser(juncs.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fh, trimbed = mkstemp(suffix=".bed")
    fw = must_open(trimbed, "w")
    for i, juncbed in enumerate(args):
        bed = Bed(juncbed, juncs=True)
        for b in bed:
            # the second-to-last extra column holds "left,right" read overhangs;
            # trimming them converts the junction span to the intron span
            ovh = [int(x) for x in b.extra[-2].split(",")]
            b.start += ovh[0]
            b.end -= ovh[1]
            b.accn = "{0}-{1}".format(b.accn, i)
            b.extra = None
            print(b, file=fw)
    fw.close()

    if len(args) > 1:
        sh("sort -k1,1 -k2,2n {0} -o {0}".format(trimbed))
        tbed = BedTool(trimbed)
        grouptbed = tbed.groupby(g=[1, 2, 3, 6], c=5, ops=['sum'])

        cmd = """awk -F $'\t' 'BEGIN { OFS = FS } { ID = sprintf("mJUNC%07d", NR); print $1,$2,$3,ID,$5,$4; }'"""
        infile = grouptbed.fn
        sh(cmd, infile=infile, outfile=opts.outfile)
    else:
        sort([trimbed, "-o", opts.outfile])
    os.unlink(trimbed)
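# Tiny worked example (made-up coordinates) of the overhang trimming above: a
# TopHat junction chr1:100-300 whose overhang column reads "20,30" spans the
# read anchors, so the intron itself is [100 + 20, 300 - 30] = chr1:120-270.
start, end = 100, 300
ovh = [int(x) for x in "20,30".split(",")]
assert (start + ovh[0], end - ovh[1]) == (120, 270)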
def bound_motifs(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    intersection = v.intersect(feats, wb=True)
    results = {}
    if len(intersection) > 0:
        sort_cmd = 'sort -k1,1 -k2,2n -k3,3n %s -o %s' % (intersection.fn, intersection.fn)
        call(sort_cmd, shell=True)
        tempfile1 = tempfile.mktemp()
        awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$5"__"$6"__"$7"__"$8"__"$9}\' %s > %s' % (intersection.fn, tempfile1)
        call(awk_cmd, shell=True)
        intersection = BedTool(tempfile1)
        annots = intersection.groupby(g=[1, 2, 3, 4], c=5, ops='collapse')
        for entry in annots:
            results[entry.name] = entry[4]
    return Series(results, name='bound_motifs')
def encode_feats(vf, af):
    # variant of the encode_feats above: groups on the feature value itself
    # (column 10) and collapses the full annotation strings, rather than
    # parsing 'freqdesc' counts
    results = {}
    with open(af + '.cols', 'r') as fh:
        cols = fh.readline().strip().split(',')
    tempfile1 = tempfile.mktemp()
    intersect_cmd = 'bedtools intersect -wb -a %s -b %s > %s' % (vf, af, tempfile1)
    call(intersect_cmd, shell=True)
    tempfile2 = tempfile.mktemp()
    awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$10"\t"$5"_"$6"_"$7"_"$8"_"$9"_"$10"_"$11"_"$12}\' %s > %s' % (tempfile1, tempfile2)
    call(awk_cmd, shell=True)
    intersection = BedTool(tempfile2)
    annots = intersection.groupby(g=[1, 2, 3, 4, 5], c=6, ops='collapse')
    for entry in annots:
        results[entry.name] = Series({entry[4]: entry[5]})
    df = DataFrame(results, index=cols)
    # transpose to turn feature types into columns, and turn all the NAs into 0s
    return df.T.fillna(0)
def mainfunc(path1, path2, savebedpath, savecsvpath, tmpfilepath, RegInterval,
             Thread, Length=4000):
    # path1: path for GEMs (i.e. ___ALL.region.PEanno)
    # path2: path for Region (i.e. ____PETcnt_G9.motifannot)
    # savebedpath: path for saving extracted GEMs in .bed
    # savecsvpath: path for saving summary table in .csv
    # tmpfilepath: directory for tmpfiles produced by pybedtools
    # RegInterval: range of region indices to process (i.e. range(start, end))
    # Thread: suffix for naming the csv file (i.e. '0')
    # Length: length of extension. Default = 4000 (int)
    pybedtools.helpers.cleanup()
    pybedtools.set_tempdir(tmpfilepath)

    # Import the GEMs (___ALL.region.PEanno)
    # path1 = 'Minji_data/SHG0180-181-182NR_hg38_cohesin_FDR_0.1_ALL_motifext4kbboth.region.PEanno'
    ChIA_Drop = BedTool(path1)
    # Import the anchors/regions (____PETcnt_G9.motifannot)
    # path2 = 'Minji_data/LHG0052H.e500.clusters.cis.bothanchint_G250.PETcnt_G9.motifannot.sorted.domains'
    Region = BedTool(path2)
    # Remove unnecessary entries
    Region_short = Region.groupby(g=[1, 2, 6, 12, 14, 20, 8, 9, 16, 21],
                                  c=[12], o=['count'])
    Max_iter = Region_short.count()
    Dict = {}
    for i in RegInterval:
        # NowRegion: chrom, start_min, end_max, loop id, ...
        NowRegion = BedTool(Region_short[i:i + 1]).saveas()
        # Find all fragments that intersect with NowRegion
        Intersection = ChIA_Drop.intersect(NowRegion, wa=True)
        # Append the original start/end columns so the later grouping step can
        # recover them
        results = [(f[0], '0', '0', f[3], f[4], f[5], f[1], f[2])
                   for f in Intersection]
        Intersection = BedTool(results)
        # Sort on the grouping key; otherwise the later grouping does not work
        # as intended
        Intersection = Intersection.sort(chrThenScoreA=True)
        # Extract the valid GEMs (ProcessRegion and SortGEM are helpers defined
        # elsewhere in this codebase)
        FinalGEMs = ProcessRegion(Intersection)
        # Classify + sort + save
        Count_L0, Count_L1 = SortGEM(FinalGEMs, NowRegion[0], 'Left', Length, savebedpath)
        Count_R0, Count_R1 = SortGEM(FinalGEMs, NowRegion[0], 'Right', Length, savebedpath)
        Count_B0, Count_B1 = SortGEM(FinalGEMs, NowRegion[0], 'Both', Length, savebedpath)
        Count_N0, Count_N1 = SortGEM(FinalGEMs, NowRegion[0], 'None', Length, savebedpath)
        Total = (Count_L0 + Count_L1 + Count_R0 + Count_R1 +
                 Count_B0 + Count_B1 + Count_N0 + Count_N1)
        # Write into dictionary
        Dict[NowRegion[0][3]] = [
            NowRegion[0][3],
            Count_L0, Count_L1, Count_L0 + Count_L1, (Count_L0 + Count_L1) / Total * 100,
            Count_R0, Count_R1, Count_R0 + Count_R1, (Count_R0 + Count_R1) / Total * 100,
            Count_B0, Count_B1, Count_B0 + Count_B1, (Count_B0 + Count_B1) / Total * 100,
            Count_N0, Count_N1, Count_N0 + Count_N1, (Count_N0 + Count_N1) / Total * 100,
            Total,
            Total - (Count_N0 + Count_N1),
            (Total - (Count_N0 + Count_N1)) / Total * 100,
            NowRegion[0][6], NowRegion[0][7], NowRegion[0][8], NowRegion[0][9],
            NowRegion[0][0] + ':' + str(NowRegion[0][1]) + '-' + str(NowRegion[0][2])
        ]
        # Clear all temp files for this session
        pybedtools.helpers.cleanup()

    NewCol = [
        'LoopID',
        'Left_0', 'Left_1', 'Left_Tol', 'Left_Tol %',
        'Right_0', 'Right_1', 'Right_Tol', 'Right_Tol %',
        'Both_0', 'Both_1', 'Both_Tol', 'Both_Tol %',
        'None_0', 'None_1', 'None_Tol', 'None_Tol %',
        'Total', 'Total-None', 'Total-None %',
        'Left Intensity', 'Right Intensity',
        'Left motif strand', 'Right motif strand', 'Region'
    ]
    RenameCol = dict(enumerate(NewCol))
    DF = pd.DataFrame.from_dict(Dict, orient='index').rename(columns=RenameCol)
    # savecsvpath = 'Minji_data/Cohesin_results/01ALL/4kbext_dm/'
    DF.to_csv(savecsvpath + 'LRBNstats_' + Thread + '.csv', index=False)
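# Hedged usage sketch (not the authors' driver): the header comments suggest
# running one worker per chunk of RegInterval and writing one CSV per Thread,
# then merging with MergeCSV.py. One way to do that with multiprocessing; all
# paths and chunk sizes below are placeholders.
def run_parallel_example():
    from multiprocessing import Process
    chunks = [range(0, 500), range(500, 1000)]  # split of region indices
    procs = [Process(target=mainfunc,
                     args=('ALL.region.PEanno',     # path1 (placeholder)
                           'PETcnt_G9.motifannot',  # path2 (placeholder)
                           'Bedfiles/',             # savebedpath
                           'Tables/',               # savecsvpath
                           'tmp/',                  # tmpfilepath
                           chunk,
                           str(t)))                 # Thread -> LRBNstats_<t>.csv
             for t, chunk in enumerate(chunks)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()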
def mainfunc(pathR, pathM, pathM2, RegInterval, pathB, saveBGratepath, saveMTratepath):
    # pathR: path for region file (i.e. Minji_data/LHG0052H.e500.clusters.cis.bothanchint_G250.PETcnt_G9.motifannot)
    # pathM: path for Motif file (i.e. Minji_data/CTCF_motifs_STORM_hg38_Ext4kbBoth_with_supermotif_domain_id_v3.bed)
    # pathM2: path for plain motif file (i.e. Minji_data/CTCF_motifs_STORM_hg38_Ext4kbBoth.sorted.id.bed)
    # RegInterval: range(Start_pos, End_pos)
    # pathB: path for directory of bedfiles extracted from data (i.e. Minji_data/Cohesin_results/01ALL/4kbext_dm/Bedfiles/)
    # saveBGratepath: path for saving the background_rate table (i.e. Minji_data/Cohesin_results/01ALL/4kbext_dm/Slope/Background_rate.csv)
    # saveMTratepath: path for saving the Motif_rate table (i.e. Minji_data/Cohesin_results/01ALL/4kbext_dm/Slope/Motif_rate.csv)
    # Note: to parallelise this process, give the .csv files of each Thread a
    # different name, then use MergeCSV.py to merge them (each directory should
    # contain only one type of .csv file).
    Region = BedTool(pathR)
    # Remove unnecessary entries
    Region_short = Region.groupby(g=[1, 2, 6, 12, 14, 20, 8, 9, 16, 21],
                                  c=[12], o=['count'])
    Mfile = pd.read_csv(pathM, sep='\t',
                        names=['chr', 'M_start', 'M_end', 'Sign', 'M_name',
                               'dmID', 'Side', 'CTCF_Pet_int', 'CTCF_Drop_int',
                               'Coh_Pet_int', 'Coh_Drop_int'])
    M2file = pd.read_csv(pathM2, sep='\t',
                         names=['chr', 'M_start', 'M_end', 'Sign', 'M_name'])
    List = []
    List_mt = []
    window_size = 8000
    for i in RegInterval:
        total_length = int(Region_short[i][2]) - int(Region_short[i][1])
        # Count how many windows the region needs
        Num_window = int(np.ceil(total_length / window_size))
        lpID = Region_short[i][3]
        R_start = int(Region_short[i][1])
        R_end = int(Region_short[i][2])
        for direction in ['Left', 'Right']:
            pathB_all = pathB + '{}_{}.bed'.format(lpID, direction)
            Bfile = pd.read_csv(pathB_all, sep='\t',
                                names=['chr', 'Lm_frag_start', 'Lm_frag_end',
                                       'GEMID', 'FragNum', '???', 'Mid_frags',
                                       'Rm_frag_start', 'Rm_frag_end'])
            # Bfile preprocessing: fragment midpoints per GEM
            # (get_list_of_fragsmp is a helper defined elsewhere)
            Bfile.insert(9, 'Fragmp', None)
            for k in range(len(Bfile)):
                Bfile.at[k, 'Fragmp'] = get_list_of_fragsmp(Bfile, k)

            # Motif_rate bookkeeping
            Lmp_start = int(Region_short[i][1])
            Rmp_end = int(Region_short[i][2])
            # Find the corresponding motifs for this region's chromosome/domain
            RegMfile = Mfile[Mfile['chr'] == Region_short[i][0]]
            RegM2file = M2file[M2file['chr'] == Region_short[i][0]]
            TMP = RegMfile[RegMfile['dmID'] == Region_short[i][3]]
            Lmt = TMP[TMP['Side'] == 'L']
            Rmt = TMP[TMP['Side'] == 'R']
            if len(Lmt) > 0:
                if len(Rmt) > 0:
                    TempMt = Mt_inInterval(RegMfile, int(Lmt['M_start'].iloc[0]),
                                           int(Rmt['M_start'].iloc[0])).sort_values(by=['M_start'])
                else:
                    TempMt = Mt_inInterval(RegMfile, int(Lmt['M_start'].iloc[0]),
                                           Rmp_end).sort_values(by=['M_start'])
            else:
                if len(Rmt) > 0:
                    TempMt = Mt_inInterval(RegMfile, Lmp_start,
                                           int(Rmt['M_start'].iloc[0])).sort_values(by=['M_start'])
                else:
                    TempMt = Mt_inInterval(RegMfile, Lmp_start,
                                           Rmp_end).sort_values(by=['M_start'])

            if direction == 'Left' and len(Lmt) > 0:
                # Background rate: bin fragment midpoints into windows (L2R)
                Table = np.zeros(Num_window)
                for k in range(len(Bfile)):
                    Fragmp = np.array(Bfile.loc[k, 'Fragmp'])
                    Idx = np.minimum(
                        np.maximum((Fragmp - R_start) / window_size, 0),
                        Num_window - 1).astype(int)
                    Table[Idx] += 1
                for j in range(Num_window):
                    W_start = R_start + window_size * j
                    W_end = R_start + window_size * (j + 1) - 1
                    if j == Num_window - 1:
                        # clamp the final window to the region boundary
                        W_end = R_end
                    TempMfile = Mt_inInterval(RegM2file, W_start, W_end, 1)
                    Count = int(Table[j])
                    if len(TempMfile) > 0:
                        for l in range(len(TempMfile)):
                            List.append([
                                Region_short[i][3], Region_short[i][0],
                                W_start, W_end,
                                TempMfile.iloc[l, 0] + ':' +
                                str(TempMfile.iloc[l, 1]) + '-' +
                                str(TempMfile.iloc[l, 2]) + ',' +
                                TempMfile.iloc[l, 3] + ',' + TempMfile.iloc[l, 4],
                                Count, 'L2R'
                            ])
                    else:
                        List.append([Region_short[i][3], Region_short[i][0],
                                     W_start, W_end, '.', Count, 'L2R'])
                # loop over all motifs between the left and right anchor motifs
                for j in range(1, len(TempMt)):
                    mt_start = TempMt.iloc[j]['M_start']
                    mt_end = TempMt.iloc[j]['M_end']
                    Count = 0
                    for k in range(len(Bfile)):
                        Fragmp = np.array(Bfile.loc[k, 'Fragmp'])
                        if any((Fragmp >= mt_start) & (Fragmp <= mt_end)):
                            Count += 1
                    List_mt.append([
                        Region_short[i][3],
                        Lmt.iloc[0, 0] + ':' + str(Lmt.iloc[0, 1]) + '-' +
                        str(Lmt.iloc[0, 2]) + ',' + Lmt.iloc[0, 3],
                        Lmt.iloc[0, 4], Lmt.iloc[0, 7], Lmt.iloc[0, 8],
                        Lmt.iloc[0, 9], Lmt.iloc[0, 10],
                        TempMt.iloc[j, 0] + ':' + str(TempMt.iloc[j, 1]) + '-' +
                        str(TempMt.iloc[j, 2]) + ',' + TempMt.iloc[j, 3],
                        TempMt.iloc[j, 4], TempMt.iloc[j, 7], TempMt.iloc[j, 8],
                        TempMt.iloc[j, 9], TempMt.iloc[j, 10],
                        Count, 'L2R'
                    ])
            elif direction == 'Right' and len(Rmt) > 0:
                # Background rate: bin fragment midpoints into windows (R2L)
                Table = np.zeros(Num_window)
                for k in range(len(Bfile)):
                    Fragmp = np.array(Bfile.loc[k, 'Fragmp'])
                    Idx = np.minimum(
                        np.maximum((R_end - Fragmp) / window_size, 0),
                        Num_window - 1).astype(int)
                    Table[Idx] += 1
                for j in range(Num_window):
                    W_start = R_end - window_size * (j + 1) + 1
                    W_end = R_end - window_size * j
                    if j == Num_window - 1:
                        # clamp the final window to the region boundary
                        W_start = R_start
                    TempMfile = Mt_inInterval(RegM2file, W_start, W_end, 1)
                    Count = int(Table[j])
                    if len(TempMfile) > 0:
                        for l in range(len(TempMfile)):
                            List.append([
                                Region_short[i][3], Region_short[i][0],
                                W_start, W_end,
                                TempMfile.iloc[l, 0] + ':' +
                                str(TempMfile.iloc[l, 1]) + '-' +
                                str(TempMfile.iloc[l, 2]) + ',' +
                                TempMfile.iloc[l, 3] + ',' + TempMfile.iloc[l, 4],
                                Count, 'R2L'
                            ])
                    else:
                        List.append([Region_short[i][3], Region_short[i][0],
                                     W_start, W_end, '.', Count, 'R2L'])
                # loop over all motifs between the left and right anchor motifs
                for j in range(0, len(TempMt) - 1):
                    mt = TempMt.iloc[j]
                    mt_start = mt['M_start']
                    mt_end = mt['M_end']
                    Count = 0
                    for k in range(len(Bfile)):
                        Fragmp = np.array(Bfile.loc[k, 'Fragmp'])
                        if any((Fragmp >= mt_start) & (Fragmp <= mt_end)):
                            Count += 1
                    List_mt.append([
                        Region_short[i][3],
                        TempMt.iloc[j, 0] + ':' + str(TempMt.iloc[j, 1]) + '-' +
                        str(TempMt.iloc[j, 2]) + ',' + TempMt.iloc[j, 3],
                        TempMt.iloc[j, 4], TempMt.iloc[j, 7], TempMt.iloc[j, 8],
                        TempMt.iloc[j, 9], TempMt.iloc[j, 10],
                        Rmt.iloc[0, 0] + ':' + str(Rmt.iloc[0, 1]) + '-' +
                        str(Rmt.iloc[0, 2]) + ',' + Rmt.iloc[0, 3],
                        Rmt.iloc[0, 4], Rmt.iloc[0, 7], Rmt.iloc[0, 8],
                        Rmt.iloc[0, 9], Rmt.iloc[0, 10],
                        Count, 'R2L'
                    ])
    Background_rate = pd.DataFrame(List, columns=[
        'LoopID', 'Chr', 'W_start', 'W_end', 'Motif overlap', 'GEM count',
        'Direction'
    ])
    # saveBGratepath = 'Minji_data/Cohesin_results/01ALL/4kbext_dm/Slope/Background_rate.csv'
    Background_rate.to_csv(saveBGratepath, index=False)
    Motif_rate = pd.DataFrame(List_mt, columns=[
        'LoopID', 'Left motif', 'LM ID',
        'LM CTCF ChIA_PET intensity', 'LM CTCF ChIA_Drop intensity',
        'LM Cohesin ChIA_PET intensity', 'LM Cohesin ChIA_Drop intensity',
        'Right motif', 'RM ID',
        'RM CTCF ChIA_PET intensity', 'RM CTCF ChIA_Drop intensity',
        'RM Cohesin ChIA_PET intensity', 'RM Cohesin ChIA_Drop intensity',
        'GEM count', 'Direction'
    ])
    # saveMTratepath = 'Minji_data/Cohesin_results/01ALL/4kbext_dm/Slope/Motif_rate.csv'
    Motif_rate.to_csv(saveMTratepath, index=False)
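# Tiny worked example (made-up numbers) of the window binning above: with
# R_start = 1000 and window_size = 8000, a fragment midpoint at 17500 lands in
# window floor((17500 - 1000) / 8000) = 2, clipped into [0, Num_window - 1].
R_start, window_size, Num_window = 1000, 8000, 5
Fragmp = np.array([900, 17500, 60000])
Idx = np.minimum(np.maximum((Fragmp - R_start) / window_size, 0),
                 Num_window - 1).astype(int)
assert Idx.tolist() == [0, 2, 4]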
def mainfunc(pathR, pathM, pathM2, RegInterval, pathB, saveFragannopath):
    # pathR: path for region file (i.e. Minji_data/LHG0052H.e500.clusters.cis.bothanchint_G250.PETcnt_G9.motifannot)
    # pathM: path for Motif file (i.e. Minji_data/CTCF_motifs_STORM_hg38_Ext4kbBoth_with_supermotif_domain_id_v3.bed)
    # pathM2: path for plain motif file (i.e. Minji_data/CTCF_motifs_STORM_hg38_Ext4kbBoth.sorted.id.bed)
    # RegInterval: range(Start_pos, End_pos)
    # pathB: path for directory of bedfiles extracted from data (i.e. Minji_data/Cohesin_results/01ALL/4kbext_dm/Bedfiles/)
    # saveFragannopath: path for saving the Frags_anno table (i.e. Minji_data/Cohesin_results/01ALL/4kbext_dm/Tables/Frags_anno.csv)
    # Note: to parallelise this process, give the .csv files of each Thread a
    # different name, then use MergeCSV.py to merge them (each directory should
    # contain only one type of .csv file).
    Region = BedTool(pathR)
    # Remove unnecessary entries
    Region_short = Region.groupby(g=[1, 2, 6, 12, 14, 20, 8, 9, 16, 21],
                                  c=[12], o=['count'])
    Mfile = pd.read_csv(pathM, sep='\t',
                        names=['chr', 'M_start', 'M_end', 'Sign', 'M_name',
                               'dmID', 'Side', 'CTCF_Pet_int', 'CTCF_Drop_int',
                               'Coh_Pet_int', 'Coh_Drop_int'])
    M2file = pd.read_csv(pathM2, sep='\t',
                         names=['chr', 'M_start', 'M_end', 'Sign', 'M_name'])
    Max_iter = len(Region_short)
    List_new = []
    window_size = 8000
    for i in RegInterval:
        total_length = int(Region_short[i][2]) - int(Region_short[i][1])
        # Count how many windows the region needs
        Num_window = int(np.ceil(total_length / window_size))
        lpID = Region_short[i][3]
        R_start = int(Region_short[i][1])
        R_end = int(Region_short[i][2])
        for direction in ['Left', 'Right', 'Both', 'None']:
            pathB_all = pathB + '{}_{}.bed'.format(lpID, direction)
            try:
                Bfile = pd.read_csv(pathB_all, sep='\t',
                                    names=['chr', 'Lm_frag_start', 'Lm_frag_end',
                                           'GEMID', 'FragNum', '???', 'Mid_frags',
                                           'Rm_frag_start', 'Rm_frag_end'])
            except (IOError, pd.errors.EmptyDataError):
                # no GEM bedfile was extracted for this loop/direction; skip it
                continue
            # Bfile preprocessing (get_list_of_frags is a helper defined elsewhere)
            Bfile.insert(9, 'Frags', None)
            Bfile.insert(10, 'Len', Bfile['Rm_frag_end'] - Bfile['Lm_frag_start'])
            Bfile = Bfile.sort_values(by=['Len']).reset_index(drop=True)
            for k in range(len(Bfile)):
                Bfile.at[k, 'Frags'] = get_list_of_frags(Bfile, k)

            Lmp_start = int(Region_short[i][1])
            Rmp_end = int(Region_short[i][2])
            # Find the corresponding motifs for this region's chromosome/domain
            RegMfile = Mfile[Mfile['chr'] == Region_short[i][0]]
            RegM2file = M2file[M2file['chr'] == Region_short[i][0]]
            TMP = RegMfile[RegMfile['dmID'] == Region_short[i][3]]
            Lmt = TMP[TMP['Side'] == 'L']
            Rmt = TMP[TMP['Side'] == 'R']
            if len(Lmt) > 0:
                if len(Rmt) > 0:
                    TempMt = Mt_inInterval(RegMfile, int(Lmt['M_start'].iloc[0]),
                                           int(Rmt['M_end'].iloc[0]), 1).sort_values(by=['M_start'])
                else:
                    TempMt = Mt_inInterval(RegMfile, int(Lmt['M_start'].iloc[0]),
                                           Rmp_end, 1).sort_values(by=['M_start'])
            else:
                if len(Rmt) > 0:
                    TempMt = Mt_inInterval(RegMfile, Lmp_start,
                                           int(Rmt['M_end'].iloc[0]), 1).sort_values(by=['M_start'])
                else:
                    TempMt = Mt_inInterval(RegMfile, Lmp_start,
                                           Rmp_end, 1).sort_values(by=['M_start'])
            for k in range(len(Bfile)):
                Frags = np.array(Bfile.loc[k, 'Frags'])
                # one annotation slot per fragment
                sublist = ['' for _ in range(len(Frags))]
                for j in range(len(TempMt)):
                    mt_start = TempMt.iloc[j]['M_start']
                    mt_end = TempMt.iloc[j]['M_end']
                    # Fg_inInterval is a helper defined elsewhere; it flags the
                    # fragments that fall in the motif interval
                    Condi = Fg_inInterval(Frags[:, 0], mt_start, Frags[:, 1], mt_end)
                    Res = [KK for KK, val in enumerate(Condi) if val]
                    for KK in Res:
                        sublist[KK] += TempMt.iloc[j]['M_name']
                # '0' marks fragments that overlap no motif
                SubStr = ','.join(s if s else '0' for s in sublist)
                List_new.append([Region_short[i][3], direction, SubStr,
                                 Bfile.loc[k, 'GEMID']])
    Frags_anno = pd.DataFrame(List_new,
                              columns=['Domain ID', 'Category',
                                       'Fragment_annotation', 'GEM_ID'])
    # saveFragannopath = 'Minji_data/Final_data_results/CTCF_NR_results/01PASS_dm/Tables/Frags_anno.csv'
    Frags_anno.to_csv(saveFragannopath, index=False)
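# Worked example (made-up motif names) of the fragment-annotation string built
# above: three fragments where fragment 0 hit no motif, fragment 1 hit M1, and
# fragment 2 hit both M2 and M3.
sublist = ['', 'M1', 'M2M3']
SubStr = ','.join(s if s else '0' for s in sublist)
assert SubStr == '0,M1,M2M3'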