# Shared imports for the snippets below.
import os
import sys
import tempfile
from subprocess import call

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import pybedtools
from pybedtools import BedTool


def ConstructTranscripts(exonAnnoFile):
    '''input: ExonAnnoFile (gene id) sorted by genomic coordinates
           col4: gene_id, col7: transcript_id
    returns: a dict of {gene_id: Transcript object}
    '''
    df_exons = BedTool(exonAnnoFile).to_dataframe()
    df_exons.rename(columns={'name': 'gene_id',
                             'thickStart': 'transcript_id'}, inplace=True)
    df_exons['length'] = df_exons['end'] - df_exons['start']
    # calculate transcript length
    df_transcripts_length = df_exons.groupby('gene_id', as_index=False)['length'].sum()
    df_transcripts_length.rename(columns={'length': 'transcript_length'}, inplace=True)
    # extract transcript start position
    df_transcripts_start = df_exons.groupby('gene_id', as_index=False)['start'].min()
    df_transcripts_start.rename(columns={'start': 'transcript_start'}, inplace=True)
    # merge transcript length and transcript start position into one data frame
    df_transcripts = pd.merge(pd.merge(df_exons, df_transcripts_start, on='gene_id'),
                              df_transcripts_length, on='gene_id')
    # calculate start position of exons relative to the transcripts
    df_transcripts_rel_start = df_transcripts.groupby('gene_id')['length'].cumsum()
    df_transcripts['rel_start'] = df_transcripts_rel_start - df_transcripts['length']
    # create a dict of gene_id: Transcript objects
    transcripts = {}
    for gene_id, group in df_transcripts.groupby('gene_id'):
        firstrow = group.iloc[0]  # .irow() was removed from pandas; use .iloc
        t = Transcript(chrom=firstrow['chrom'],
                       start=firstrow['transcript_start'],
                       length=firstrow['transcript_length'],
                       strand=firstrow['strand'],
                       exon_count=len(group),
                       gene_id=firstrow['gene_id'],
                       transcript_id=firstrow['transcript_id'])
        i = 0
        for _, row in group.iterrows():
            t.exon_length[i] = row['length']
            t.exon_int[i][0] = row['rel_start']
            t.exon_int[i][1] = row['rel_start'] + row['length']
            t.exon_int[i][2] = row['start']
            t.exon_int[i][3] = row['end']
            i += 1
        transcripts[gene_id] = t
    return transcripts
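# Hedged sketch: ConstructTranscripts assumes a Transcript container roughly
# like the one below (per-exon arrays sized by exon_count, filled in the loop
# above). The real class lives elsewhere in this codebase and may differ.
class Transcript(object):
    def __init__(self, chrom, start, length, strand, exon_count,
                 gene_id, transcript_id):
        self.chrom = chrom
        self.start = start                  # transcript start (genomic)
        self.length = length                # summed exon length
        self.strand = strand
        self.exon_count = exon_count
        self.gene_id = gene_id
        self.transcript_id = transcript_id
        # per-exon length, and [rel_start, rel_end, genomic_start, genomic_end]
        self.exon_length = np.zeros(exon_count, dtype=int)
        self.exon_int = np.zeros((exon_count, 4), dtype=int)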
def gene_regions(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    # first establish all the region types present in the annotation file (column 5)
    cols = set(f[4] for f in feats)
    results = {}
    intersection = v.intersect(feats, wb=True)
    if len(intersection) > 0:
        # reshuffle columns so groupby can key on the variant plus region type
        tempfile1 = tempfile.mktemp()
        awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s > %s' % (intersection.fn, tempfile1)
        call(awk_cmd, shell=True)
        intersection = BedTool(tempfile1)
        annots = intersection.groupby(g=[1, 2, 3, 4, 5], c=6, ops='collapse')
        for entry in annots:
            results[entry.name] = Series({entry[4]: entry[5]})
    df = DataFrame(results, index=cols)
    # transpose so region types become columns, and turn the NAs into 0s
    return df.T.fillna(0)
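# Sketch, not the authors' code: the awk column reshuffle in gene_regions (and
# the same pattern in repeats, motifs, and bound_motifs below) can be done in
# pandas once the -wb intersection is loaded as a DataFrame. The hypothetical
# gene_regions_pandas below keeps the same column assumptions (4 variant
# columns, then 5 annotation columns with the region type in the 9th field);
# unlike the dict in gene_regions, the unstack keeps every region type seen
# per variant name.
def gene_regions_pandas(vf, af):
    intersection = BedTool(vf).intersect(BedTool(af), wb=True)
    if len(intersection) == 0:
        return DataFrame()
    ix = pd.read_csv(intersection.fn, sep='\t', header=None)
    # collapse the five annotation fields into one "a_b_c_d_e" tag per hit
    annot = ix[[4, 5, 6, 7, 8]].astype(str).agg('_'.join, axis=1)
    out = DataFrame({'name': ix[3], 'region': ix[8], 'annot': annot})
    return (out.groupby(['name', 'region'])['annot']
               .agg(','.join).unstack(fill_value=0))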
def repeats(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    intersection = v.intersect(feats, wb=True)
    results = {}
    if len(intersection) > 0:
        tempfile1 = tempfile.mktemp()
        awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$5"_"$6"_"$7"_"$8"_"$9"_"$10}\' %s > %s' % (intersection.fn, tempfile1)
        call(awk_cmd, shell=True)
        intersection = BedTool(tempfile1)
        annots = intersection.groupby(g=[1, 2, 3, 4], c=5, ops='collapse')
        for entry in annots:
            results[entry.name] = entry[4]
    return Series(results, name='repeat')
def motifs(vf, af):
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    # sort in place, then reshuffle columns for groupby
    sort_cmd = 'sort -k1,1 -k2,2n -k3,3n -k4,4 %s -o %s' % (overlap.fn, overlap.fn)
    call(sort_cmd, shell=True)
    tempfile1 = tempfile.mktemp()
    awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$5"__"$6"__"$7"__"$8"__"$9"__"$10"__"$11"__"$12"__"$13}\' %s > %s' % (overlap.fn, tempfile1)
    call(awk_cmd, shell=True)
    intersection = BedTool(tempfile1)
    annots = intersection.groupby(g=[1, 2, 3, 4], c=5, ops='collapse')
    results = {}
    for entry in annots:
        results[entry.name] = entry[4]
    return Series(results, name="pwm")
def encode_feats(vf, af):
    results = {}
    with open(af + '.cols', 'r') as fh:
        cols = fh.readline().strip().split(',')
    tempfile1 = tempfile.mktemp()
    intersect_cmd = 'bedtools intersect -wb -a %s -b %s > %s' % (vf, af, tempfile1)
    call(intersect_cmd, shell=True)
    intersection = BedTool(tempfile1)
    # 'freqdesc' yields "value:count,value:count,..." per group; parse into a Series
    annots = intersection.groupby(g=[1, 2, 3, 4], c=10, ops='freqdesc')
    for entry in annots:
        fs = entry[4].strip(',').split(',')
        results[entry.name] = Series({e[0]: int(e[1]) for e in [f.split(':') for f in fs]})
    df = DataFrame(results, index=cols)
    # transpose to turn feature types into columns, and turn all the NAs into 0s
    return df.T.fillna(0)
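# Worked example (illustrative values) of the 'freqdesc' parsing above:
# `bedtools groupby -o freqdesc` emits "value:count" pairs, most frequent
# first, with a trailing comma.
freq = 'Enhancer:3,Promoter:1,'
parsed = Series({e[0]: int(e[1]) for e in [f.split(':') for f in freq.strip(',').split(',')]})
assert parsed['Enhancer'] == 3 and parsed['Promoter'] == 1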
def juncs(args):
    """
    %prog junctions junctions1.bed [junctions2.bed ...]

    Given a TopHat junctions.bed file, trim the read overhang to get the intron span.

    If more than one junction bed file is provided, uniq the junctions and
    calculate cumulative (sum) junction support.
    """
    from tempfile import mkstemp
    from pybedtools import BedTool

    # OptionParser, must_open, sh, sort, and Bed are helpers from the
    # surrounding (jcvi-style) codebase, not standard-library imports.
    p = OptionParser(juncs.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fh, trimbed = mkstemp(suffix=".bed")
    fw = must_open(trimbed, "w")
    for i, juncbed in enumerate(args):
        bed = Bed(juncbed, juncs=True)
        for b in bed:
            # the second-to-last extra column holds "left,right" read overhangs;
            # trimming them converts the junction span to the intron span
            ovh = [int(x) for x in b.extra[-2].split(",")]
            b.start += ovh[0]
            b.end -= ovh[1]
            b.accn = "{0}-{1}".format(b.accn, i)
            b.extra = None
            print(b, file=fw)
    fw.close()

    if len(args) > 1:
        sh("sort -k1,1 -k2,2n {0} -o {0}".format(trimbed))
        tbed = BedTool(trimbed)
        grouptbed = tbed.groupby(g=[1, 2, 3, 6], c=5, ops=['sum'])

        cmd = """awk -F $'\t' 'BEGIN { OFS = FS } { ID = sprintf("mJUNC%07d", NR); print $1,$2,$3,ID,$5,$4; }'"""
        infile = grouptbed.fn
        sh(cmd, infile=infile, outfile=opts.outfile)
    else:
        sort([trimbed, "-o", opts.outfile])
    os.unlink(trimbed)
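# Tiny worked example (made-up coordinates) of the overhang trimming above: a
# TopHat junction chr1:100-300 whose overhang column reads "20,30" spans the
# read anchors, so the intron itself is [100 + 20, 300 - 30] = chr1:120-270.
start, end = 100, 300
ovh = [int(x) for x in "20,30".split(",")]
assert (start + ovh[0], end - ovh[1]) == (120, 270)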
def bound_motifs(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    intersection = v.intersect(feats, wb=True)
    results = {}
    if len(intersection) > 0:
        sort_cmd = 'sort -k1,1 -k2,2n -k3,3n %s -o %s' % (intersection.fn, intersection.fn)
        call(sort_cmd, shell=True)
        tempfile1 = tempfile.mktemp()
        awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$5"__"$6"__"$7"__"$8"__"$9}\' %s > %s' % (intersection.fn, tempfile1)
        call(awk_cmd, shell=True)
        intersection = BedTool(tempfile1)
        annots = intersection.groupby(g=[1, 2, 3, 4], c=5, ops='collapse')
        for entry in annots:
            results[entry.name] = entry[4]
    return Series(results, name='bound_motifs')
def encode_feats(vf, af):
    # variant of the encode_feats above: groups on the feature value itself
    # (column 10) and collapses the full annotation strings, rather than
    # parsing 'freqdesc' counts
    results = {}
    with open(af + '.cols', 'r') as fh:
        cols = fh.readline().strip().split(',')
    tempfile1 = tempfile.mktemp()
    intersect_cmd = 'bedtools intersect -wb -a %s -b %s > %s' % (vf, af, tempfile1)
    call(intersect_cmd, shell=True)
    tempfile2 = tempfile.mktemp()
    awk_cmd = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$10"\t"$5"_"$6"_"$7"_"$8"_"$9"_"$10"_"$11"_"$12}\' %s > %s' % (tempfile1, tempfile2)
    call(awk_cmd, shell=True)
    intersection = BedTool(tempfile2)
    annots = intersection.groupby(g=[1, 2, 3, 4, 5], c=6, ops='collapse')
    for entry in annots:
        results[entry.name] = Series({entry[4]: entry[5]})
    df = DataFrame(results, index=cols)
    # transpose to turn feature types into columns, and turn all the NAs into 0s
    return df.T.fillna(0)
def mainfunc(path1, path2, savebedpath, savecsvpath, tmpfilepath, RegInterval,
             Thread, Length=4000):
    # path1: path for GEMs (i.e. ___ALL.region.PEanno)
    # path2: path for Region (i.e. ____PETcnt_G9.motifannot)
    # savebedpath: path for saving extracted GEMs in .bed
    # savecsvpath: path for saving summary table in .csv
    # tmpfilepath: directory for tmpfiles produced by pybedtools
    # RegInterval: range of region indices to process (i.e. range(start, end))
    # Thread: suffix for naming the csv file (i.e. '0')
    # Length: length of extension. Default = 4000 (int)
    pybedtools.helpers.cleanup()
    pybedtools.set_tempdir(tmpfilepath)

    # Import the GEMs (___ALL.region.PEanno)
    # path1 = 'Minji_data/SHG0180-181-182NR_hg38_cohesin_FDR_0.1_ALL_motifext4kbboth.region.PEanno'
    ChIA_Drop = BedTool(path1)
    # Import the anchors/regions (____PETcnt_G9.motifannot)
    # path2 = 'Minji_data/LHG0052H.e500.clusters.cis.bothanchint_G250.PETcnt_G9.motifannot.sorted.domains'
    Region = BedTool(path2)
    # Remove unnecessary entries
    Region_short = Region.groupby(g=[1, 2, 6, 12, 14, 20, 8, 9, 16, 21],
                                  c=[12], o=['count'])
    Max_iter = Region_short.count()
    Dict = {}
    for i in RegInterval:
        # NowRegion: chrom, start_min, end_max, loop id, ...
        NowRegion = BedTool(Region_short[i:i + 1]).saveas()
        # Find all fragments that intersect with NowRegion
        Intersection = ChIA_Drop.intersect(NowRegion, wa=True)
        # Append the original start/end columns so the later grouping step can
        # recover them
        results = [(f[0], '0', '0', f[3], f[4], f[5], f[1], f[2])
                   for f in Intersection]
        Intersection = BedTool(results)
        # Sort on the grouping key; otherwise the later grouping does not work
        # as intended
        Intersection = Intersection.sort(chrThenScoreA=True)
        # Extract the valid GEMs (ProcessRegion and SortGEM are helpers defined
        # elsewhere in this codebase)
        FinalGEMs = ProcessRegion(Intersection)
        # Classify + sort + save
        Count_L0, Count_L1 = SortGEM(FinalGEMs, NowRegion[0], 'Left', Length, savebedpath)
        Count_R0, Count_R1 = SortGEM(FinalGEMs, NowRegion[0], 'Right', Length, savebedpath)
        Count_B0, Count_B1 = SortGEM(FinalGEMs, NowRegion[0], 'Both', Length, savebedpath)
        Count_N0, Count_N1 = SortGEM(FinalGEMs, NowRegion[0], 'None', Length, savebedpath)
        Total = (Count_L0 + Count_L1 + Count_R0 + Count_R1 +
                 Count_B0 + Count_B1 + Count_N0 + Count_N1)
        # Write into dictionary
        Dict[NowRegion[0][3]] = [
            NowRegion[0][3],
            Count_L0, Count_L1, Count_L0 + Count_L1, (Count_L0 + Count_L1) / Total * 100,
            Count_R0, Count_R1, Count_R0 + Count_R1, (Count_R0 + Count_R1) / Total * 100,
            Count_B0, Count_B1, Count_B0 + Count_B1, (Count_B0 + Count_B1) / Total * 100,
            Count_N0, Count_N1, Count_N0 + Count_N1, (Count_N0 + Count_N1) / Total * 100,
            Total,
            Total - (Count_N0 + Count_N1),
            (Total - (Count_N0 + Count_N1)) / Total * 100,
            NowRegion[0][6], NowRegion[0][7], NowRegion[0][8], NowRegion[0][9],
            NowRegion[0][0] + ':' + str(NowRegion[0][1]) + '-' + str(NowRegion[0][2])
        ]
        # Clear all temp files for this session
        pybedtools.helpers.cleanup()

    NewCol = [
        'LoopID',
        'Left_0', 'Left_1', 'Left_Tol', 'Left_Tol %',
        'Right_0', 'Right_1', 'Right_Tol', 'Right_Tol %',
        'Both_0', 'Both_1', 'Both_Tol', 'Both_Tol %',
        'None_0', 'None_1', 'None_Tol', 'None_Tol %',
        'Total', 'Total-None', 'Total-None %',
        'Left Intensity', 'Right Intensity',
        'Left motif strand', 'Right motif strand', 'Region'
    ]
    RenameCol = dict(enumerate(NewCol))
    DF = pd.DataFrame.from_dict(Dict, orient='index').rename(columns=RenameCol)
    # savecsvpath = 'Minji_data/Cohesin_results/01ALL/4kbext_dm/'
    DF.to_csv(savecsvpath + 'LRBNstats_' + Thread + '.csv', index=False)
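# Hedged usage sketch (not the authors' driver): the header comments suggest
# running one worker per chunk of RegInterval and writing one CSV per Thread,
# then merging with MergeCSV.py. One way to do that with multiprocessing; all
# paths and chunk sizes below are placeholders.
def run_parallel_example():
    from multiprocessing import Process
    chunks = [range(0, 500), range(500, 1000)]  # split of region indices
    procs = [Process(target=mainfunc,
                     args=('ALL.region.PEanno',     # path1 (placeholder)
                           'PETcnt_G9.motifannot',  # path2 (placeholder)
                           'Bedfiles/',             # savebedpath
                           'Tables/',               # savecsvpath
                           'tmp/',                  # tmpfilepath
                           chunk,
                           str(t)))                 # Thread -> LRBNstats_<t>.csv
             for t, chunk in enumerate(chunks)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()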
def mainfunc(pathR, pathM, pathM2, RegInterval, pathB, saveBGratepath, saveMTratepath):
    # pathR: path for region file (i.e. Minji_data/LHG0052H.e500.clusters.cis.bothanchint_G250.PETcnt_G9.motifannot)
    # pathM: path for Motif file (i.e. Minji_data/CTCF_motifs_STORM_hg38_Ext4kbBoth_with_supermotif_domain_id_v3.bed)
    # pathM2: path for plain motif file (i.e. Minji_data/CTCF_motifs_STORM_hg38_Ext4kbBoth.sorted.id.bed)
    # RegInterval: range(Start_pos, End_pos)
    # pathB: path for directory of bedfiles extracted from data (i.e. Minji_data/Cohesin_results/01ALL/4kbext_dm/Bedfiles/)
    # saveBGratepath: path for saving the background_rate table (i.e. Minji_data/Cohesin_results/01ALL/4kbext_dm/Slope/Background_rate.csv)
    # saveMTratepath: path for saving the Motif_rate table (i.e. Minji_data/Cohesin_results/01ALL/4kbext_dm/Slope/Motif_rate.csv)
    # Note: to parallelise this process, give the .csv files of each Thread a
    # different name, then use MergeCSV.py to merge them (each directory should
    # contain only one type of .csv file).
    Region = BedTool(pathR)
    # Remove unnecessary entries
    Region_short = Region.groupby(g=[1, 2, 6, 12, 14, 20, 8, 9, 16, 21],
                                  c=[12], o=['count'])
    Mfile = pd.read_csv(pathM, sep='\t',
                        names=['chr', 'M_start', 'M_end', 'Sign', 'M_name',
                               'dmID', 'Side', 'CTCF_Pet_int', 'CTCF_Drop_int',
                               'Coh_Pet_int', 'Coh_Drop_int'])
    M2file = pd.read_csv(pathM2, sep='\t',
                         names=['chr', 'M_start', 'M_end', 'Sign', 'M_name'])
    List = []
    List_mt = []
    window_size = 8000
    for i in RegInterval:
        total_length = int(Region_short[i][2]) - int(Region_short[i][1])
        # Count how many windows the region needs
        Num_window = int(np.ceil(total_length / window_size))
        lpID = Region_short[i][3]
        R_start = int(Region_short[i][1])
        R_end = int(Region_short[i][2])
        for direction in ['Left', 'Right']:
            pathB_all = pathB + '{}_{}.bed'.format(lpID, direction)
            Bfile = pd.read_csv(pathB_all, sep='\t',
                                names=['chr', 'Lm_frag_start', 'Lm_frag_end',
                                       'GEMID', 'FragNum', '???', 'Mid_frags',
                                       'Rm_frag_start', 'Rm_frag_end'])
            # Bfile preprocessing: fragment midpoints per GEM
            # (get_list_of_fragsmp is a helper defined elsewhere)
            Bfile.insert(9, 'Fragmp', None)
            for k in range(len(Bfile)):
                Bfile.at[k, 'Fragmp'] = get_list_of_fragsmp(Bfile, k)

            # Motif_rate bookkeeping
            Lmp_start = int(Region_short[i][1])
            Rmp_end = int(Region_short[i][2])
            # Find the corresponding motifs for this region's chromosome/domain
            RegMfile = Mfile[Mfile['chr'] == Region_short[i][0]]
            RegM2file = M2file[M2file['chr'] == Region_short[i][0]]
            TMP = RegMfile[RegMfile['dmID'] == Region_short[i][3]]
            Lmt = TMP[TMP['Side'] == 'L']
            Rmt = TMP[TMP['Side'] == 'R']
            if len(Lmt) > 0:
                if len(Rmt) > 0:
                    TempMt = Mt_inInterval(RegMfile, int(Lmt['M_start'].iloc[0]),
                                           int(Rmt['M_start'].iloc[0])).sort_values(by=['M_start'])
                else:
                    TempMt = Mt_inInterval(RegMfile, int(Lmt['M_start'].iloc[0]),
                                           Rmp_end).sort_values(by=['M_start'])
            else:
                if len(Rmt) > 0:
                    TempMt = Mt_inInterval(RegMfile, Lmp_start,
                                           int(Rmt['M_start'].iloc[0])).sort_values(by=['M_start'])
                else:
                    TempMt = Mt_inInterval(RegMfile, Lmp_start,
                                           Rmp_end).sort_values(by=['M_start'])

            if direction == 'Left' and len(Lmt) > 0:
                # Background rate: bin fragment midpoints into windows (L2R)
                Table = np.zeros(Num_window)
                for k in range(len(Bfile)):
                    Fragmp = np.array(Bfile.loc[k, 'Fragmp'])
                    Idx = np.minimum(
                        np.maximum((Fragmp - R_start) / window_size, 0),
                        Num_window - 1).astype(int)
                    Table[Idx] += 1
                for j in range(Num_window):
                    W_start = R_start + window_size * j
                    W_end = R_start + window_size * (j + 1) - 1
                    if j == Num_window - 1:
                        # clamp the final window to the region boundary
                        W_end = R_end
                    TempMfile = Mt_inInterval(RegM2file, W_start, W_end, 1)
                    Count = int(Table[j])
                    if len(TempMfile) > 0:
                        for l in range(len(TempMfile)):
                            List.append([
                                Region_short[i][3], Region_short[i][0],
                                W_start, W_end,
                                TempMfile.iloc[l, 0] + ':' +
                                str(TempMfile.iloc[l, 1]) + '-' +
                                str(TempMfile.iloc[l, 2]) + ',' +
                                TempMfile.iloc[l, 3] + ',' + TempMfile.iloc[l, 4],
                                Count, 'L2R'
                            ])
                    else:
                        List.append([Region_short[i][3], Region_short[i][0],
                                     W_start, W_end, '.', Count, 'L2R'])
                # loop over all motifs between the left and right anchor motifs
                for j in range(1, len(TempMt)):
                    mt_start = TempMt.iloc[j]['M_start']
                    mt_end = TempMt.iloc[j]['M_end']
                    Count = 0
                    for k in range(len(Bfile)):
                        Fragmp = np.array(Bfile.loc[k, 'Fragmp'])
                        if any((Fragmp >= mt_start) & (Fragmp <= mt_end)):
                            Count += 1
                    List_mt.append([
                        Region_short[i][3],
                        Lmt.iloc[0, 0] + ':' + str(Lmt.iloc[0, 1]) + '-' +
                        str(Lmt.iloc[0, 2]) + ',' + Lmt.iloc[0, 3],
                        Lmt.iloc[0, 4], Lmt.iloc[0, 7], Lmt.iloc[0, 8],
                        Lmt.iloc[0, 9], Lmt.iloc[0, 10],
                        TempMt.iloc[j, 0] + ':' + str(TempMt.iloc[j, 1]) + '-' +
                        str(TempMt.iloc[j, 2]) + ',' + TempMt.iloc[j, 3],
                        TempMt.iloc[j, 4], TempMt.iloc[j, 7], TempMt.iloc[j, 8],
                        TempMt.iloc[j, 9], TempMt.iloc[j, 10],
                        Count, 'L2R'
                    ])
            elif direction == 'Right' and len(Rmt) > 0:
                # Background rate: bin fragment midpoints into windows (R2L)
                Table = np.zeros(Num_window)
                for k in range(len(Bfile)):
                    Fragmp = np.array(Bfile.loc[k, 'Fragmp'])
                    Idx = np.minimum(
                        np.maximum((R_end - Fragmp) / window_size, 0),
                        Num_window - 1).astype(int)
                    Table[Idx] += 1
                for j in range(Num_window):
                    W_start = R_end - window_size * (j + 1) + 1
                    W_end = R_end - window_size * j
                    if j == Num_window - 1:
                        # clamp the final window to the region boundary
                        W_start = R_start
                    TempMfile = Mt_inInterval(RegM2file, W_start, W_end, 1)
                    Count = int(Table[j])
                    if len(TempMfile) > 0:
                        for l in range(len(TempMfile)):
                            List.append([
                                Region_short[i][3], Region_short[i][0],
                                W_start, W_end,
                                TempMfile.iloc[l, 0] + ':' +
                                str(TempMfile.iloc[l, 1]) + '-' +
                                str(TempMfile.iloc[l, 2]) + ',' +
                                TempMfile.iloc[l, 3] + ',' + TempMfile.iloc[l, 4],
                                Count, 'R2L'
                            ])
                    else:
                        List.append([Region_short[i][3], Region_short[i][0],
                                     W_start, W_end, '.', Count, 'R2L'])
                # loop over all motifs between the left and right anchor motifs
                for j in range(0, len(TempMt) - 1):
                    mt = TempMt.iloc[j]
                    mt_start = mt['M_start']
                    mt_end = mt['M_end']
                    Count = 0
                    for k in range(len(Bfile)):
                        Fragmp = np.array(Bfile.loc[k, 'Fragmp'])
                        if any((Fragmp >= mt_start) & (Fragmp <= mt_end)):
                            Count += 1
                    List_mt.append([
                        Region_short[i][3],
                        TempMt.iloc[j, 0] + ':' + str(TempMt.iloc[j, 1]) + '-' +
                        str(TempMt.iloc[j, 2]) + ',' + TempMt.iloc[j, 3],
                        TempMt.iloc[j, 4], TempMt.iloc[j, 7], TempMt.iloc[j, 8],
                        TempMt.iloc[j, 9], TempMt.iloc[j, 10],
                        Rmt.iloc[0, 0] + ':' + str(Rmt.iloc[0, 1]) + '-' +
                        str(Rmt.iloc[0, 2]) + ',' + Rmt.iloc[0, 3],
                        Rmt.iloc[0, 4], Rmt.iloc[0, 7], Rmt.iloc[0, 8],
                        Rmt.iloc[0, 9], Rmt.iloc[0, 10],
                        Count, 'R2L'
                    ])
    Background_rate = pd.DataFrame(List, columns=[
        'LoopID', 'Chr', 'W_start', 'W_end', 'Motif overlap', 'GEM count',
        'Direction'
    ])
    # saveBGratepath = 'Minji_data/Cohesin_results/01ALL/4kbext_dm/Slope/Background_rate.csv'
    Background_rate.to_csv(saveBGratepath, index=False)
    Motif_rate = pd.DataFrame(List_mt, columns=[
        'LoopID', 'Left motif', 'LM ID',
        'LM CTCF ChIA_PET intensity', 'LM CTCF ChIA_Drop intensity',
        'LM Cohesin ChIA_PET intensity', 'LM Cohesin ChIA_Drop intensity',
        'Right motif', 'RM ID',
        'RM CTCF ChIA_PET intensity', 'RM CTCF ChIA_Drop intensity',
        'RM Cohesin ChIA_PET intensity', 'RM Cohesin ChIA_Drop intensity',
        'GEM count', 'Direction'
    ])
    # saveMTratepath = 'Minji_data/Cohesin_results/01ALL/4kbext_dm/Slope/Motif_rate.csv'
    Motif_rate.to_csv(saveMTratepath, index=False)
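# Tiny worked example (made-up numbers) of the window binning above: with
# R_start = 1000 and window_size = 8000, a fragment midpoint at 17500 lands in
# window floor((17500 - 1000) / 8000) = 2, clipped into [0, Num_window - 1].
R_start, window_size, Num_window = 1000, 8000, 5
Fragmp = np.array([900, 17500, 60000])
Idx = np.minimum(np.maximum((Fragmp - R_start) / window_size, 0),
                 Num_window - 1).astype(int)
assert Idx.tolist() == [0, 2, 4]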
def mainfunc(pathR, pathM, pathM2, RegInterval, pathB, saveFragannopath):
    # pathR: path for region file (i.e. Minji_data/LHG0052H.e500.clusters.cis.bothanchint_G250.PETcnt_G9.motifannot)
    # pathM: path for Motif file (i.e. Minji_data/CTCF_motifs_STORM_hg38_Ext4kbBoth_with_supermotif_domain_id_v3.bed)
    # pathM2: path for plain motif file (i.e. Minji_data/CTCF_motifs_STORM_hg38_Ext4kbBoth.sorted.id.bed)
    # RegInterval: range(Start_pos, End_pos)
    # pathB: path for directory of bedfiles extracted from data (i.e. Minji_data/Cohesin_results/01ALL/4kbext_dm/Bedfiles/)
    # saveFragannopath: path for saving the Frags_anno table (i.e. Minji_data/Cohesin_results/01ALL/4kbext_dm/Tables/Frags_anno.csv)
    # Note: to parallelise this process, give the .csv files of each Thread a
    # different name, then use MergeCSV.py to merge them (each directory should
    # contain only one type of .csv file).
    Region = BedTool(pathR)
    # Remove unnecessary entries
    Region_short = Region.groupby(g=[1, 2, 6, 12, 14, 20, 8, 9, 16, 21],
                                  c=[12], o=['count'])
    Mfile = pd.read_csv(pathM, sep='\t',
                        names=['chr', 'M_start', 'M_end', 'Sign', 'M_name',
                               'dmID', 'Side', 'CTCF_Pet_int', 'CTCF_Drop_int',
                               'Coh_Pet_int', 'Coh_Drop_int'])
    M2file = pd.read_csv(pathM2, sep='\t',
                         names=['chr', 'M_start', 'M_end', 'Sign', 'M_name'])
    Max_iter = len(Region_short)
    List_new = []
    window_size = 8000
    for i in RegInterval:
        total_length = int(Region_short[i][2]) - int(Region_short[i][1])
        # Count how many windows the region needs
        Num_window = int(np.ceil(total_length / window_size))
        lpID = Region_short[i][3]
        R_start = int(Region_short[i][1])
        R_end = int(Region_short[i][2])
        for direction in ['Left', 'Right', 'Both', 'None']:
            pathB_all = pathB + '{}_{}.bed'.format(lpID, direction)
            try:
                Bfile = pd.read_csv(pathB_all, sep='\t',
                                    names=['chr', 'Lm_frag_start', 'Lm_frag_end',
                                           'GEMID', 'FragNum', '???', 'Mid_frags',
                                           'Rm_frag_start', 'Rm_frag_end'])
            except (IOError, pd.errors.EmptyDataError):
                # no GEM bedfile was extracted for this loop/direction; skip it
                continue
            # Bfile preprocessing (get_list_of_frags is a helper defined elsewhere)
            Bfile.insert(9, 'Frags', None)
            Bfile.insert(10, 'Len', Bfile['Rm_frag_end'] - Bfile['Lm_frag_start'])
            Bfile = Bfile.sort_values(by=['Len']).reset_index(drop=True)
            for k in range(len(Bfile)):
                Bfile.at[k, 'Frags'] = get_list_of_frags(Bfile, k)

            Lmp_start = int(Region_short[i][1])
            Rmp_end = int(Region_short[i][2])
            # Find the corresponding motifs for this region's chromosome/domain
            RegMfile = Mfile[Mfile['chr'] == Region_short[i][0]]
            RegM2file = M2file[M2file['chr'] == Region_short[i][0]]
            TMP = RegMfile[RegMfile['dmID'] == Region_short[i][3]]
            Lmt = TMP[TMP['Side'] == 'L']
            Rmt = TMP[TMP['Side'] == 'R']
            if len(Lmt) > 0:
                if len(Rmt) > 0:
                    TempMt = Mt_inInterval(RegMfile, int(Lmt['M_start'].iloc[0]),
                                           int(Rmt['M_end'].iloc[0]), 1).sort_values(by=['M_start'])
                else:
                    TempMt = Mt_inInterval(RegMfile, int(Lmt['M_start'].iloc[0]),
                                           Rmp_end, 1).sort_values(by=['M_start'])
            else:
                if len(Rmt) > 0:
                    TempMt = Mt_inInterval(RegMfile, Lmp_start,
                                           int(Rmt['M_end'].iloc[0]), 1).sort_values(by=['M_start'])
                else:
                    TempMt = Mt_inInterval(RegMfile, Lmp_start,
                                           Rmp_end, 1).sort_values(by=['M_start'])
            for k in range(len(Bfile)):
                Frags = np.array(Bfile.loc[k, 'Frags'])
                # one annotation slot per fragment
                sublist = ['' for _ in range(len(Frags))]
                for j in range(len(TempMt)):
                    mt_start = TempMt.iloc[j]['M_start']
                    mt_end = TempMt.iloc[j]['M_end']
                    # Fg_inInterval is a helper defined elsewhere; it flags the
                    # fragments that fall in the motif interval
                    Condi = Fg_inInterval(Frags[:, 0], mt_start, Frags[:, 1], mt_end)
                    Res = [KK for KK, val in enumerate(Condi) if val]
                    for KK in Res:
                        sublist[KK] += TempMt.iloc[j]['M_name']
                # '0' marks fragments that overlap no motif
                SubStr = ','.join(s if s else '0' for s in sublist)
                List_new.append([Region_short[i][3], direction, SubStr,
                                 Bfile.loc[k, 'GEMID']])
    Frags_anno = pd.DataFrame(List_new,
                              columns=['Domain ID', 'Category',
                                       'Fragment_annotation', 'GEM_ID'])
    # saveFragannopath = 'Minji_data/Final_data_results/CTCF_NR_results/01PASS_dm/Tables/Frags_anno.csv'
    Frags_anno.to_csv(saveFragannopath, index=False)
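# Worked example (made-up motif names) of the fragment-annotation string built
# above: three fragments where fragment 0 hit no motif, fragment 1 hit M1, and
# fragment 2 hit both M2 and M3.
sublist = ['', 'M1', 'M2M3']
SubStr = ','.join(s if s else '0' for s in sublist)
assert SubStr == '0,M1,M2M3'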