def load_coliner(self, f_coliner):
    """Load collinear gene pairs from a tab-separated file.

    Every line must contain exactly four tab-separated fields: a group
    label holding two numbers written as ``<group>-<pair>``, two CDS
    names, and a score (kept as a stripped string).

    A pair is stored with ``name1`` being the gene found in
    ``self.genome1`` and ``name2`` the gene found in ``self.genome2``;
    the two columns are swapped when they appear in the opposite order.
    Lines whose second gene is not found in the other genome are skipped.

    Each kept record is appended to ``self.coliner_dict`` under the
    record's ``group_id`` and stored in ``self.pair_dict`` under the
    record's ``(name1, name2)``.

    Raises:
        AssertionError: if the first gene of a line belongs to neither
            genome.
        ValueError: if a line does not have exactly four fields.
        IndexError: if the group label does not match ``<int>-<int>``.
    """
    # Raw string: "\s" and "\d" are regex escapes, not Python string escapes.
    group_pattern = re.compile(r"\s*(\d+)-\s*(\d+)")
    suffix = "_CDS"

    def strip_cds_suffix(name):
        # str.rstrip("_CDS") removes any trailing run of the characters
        # '_', 'C', 'D', 'S' (e.g. "GENEC_CDS" -> "GENE"); only the literal
        # suffix must be dropped.
        if name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    for line in iterline(f_coliner):
        g_data, cds1, cds2, score = line.rstrip().split("\t")
        group_id, pair_id = group_pattern.findall(g_data)[0]
        score = score.strip()
        cds1 = strip_cds_suffix(cds1)
        cds2 = strip_cds_suffix(cds2)

        if self.genome1.is_in_genome(cds1):
            if not self.genome2.is_in_genome(cds2):
                continue
            name1, name2 = cds1, cds2
        else:
            # A gene that is not in genome1 is expected to be in genome2.
            assert self.genome2.is_in_genome(cds1)
            if not self.genome1.is_in_genome(cds2):
                continue
            name1, name2 = cds2, cds1

        coliner_rec = ColinerRec(int(group_id), int(pair_id), name1, name2, score)
        coliner_rec.find_uniq_id(self.genome1, self.genome2)
        # NOTE(review): assumes self.coliner_dict supplies a list for unseen
        # group ids (e.g. defaultdict(list)) -- confirm in the constructor.
        self.coliner_dict[coliner_rec.group_id].append(coliner_rec)
        self.pair_dict[(coliner_rec.name1, coliner_rec.name2)] = coliner_rec
def main(args):
    """Translate a protein-protein interaction table through collinear pairs.

    Reads two reference name lists, builds a mapping from each "string"
    name to the set of "plant" names paired with it in the collinearity
    file, then rewrites every interaction line of the PPI file (three
    space-separated fields, first line skipped as a header) for all
    combinations of mapped names. Self pairs are dropped and each output
    row is ordered so that ``prot1 < prot2``.

    Output columns (tab-separated):
    ``ori_prot1, ori_prot2, prot1, prot2, combined_score``.
    """
    args = parse_args(args)
    f_coliner = args.coliner
    f_ppi = args.ppi
    f_string_ref = args.string_ref
    f_plant_ref = args.plant_ref
    f_out = args.output

    # The name collections are only used for membership tests, so sets
    # avoid a linear scan per collinearity line.
    string_names = set(load_ref_name(f_string_ref))
    plant_names = set(load_ref_name(f_plant_ref))

    trans_map = defaultdict(set)
    for string_name, plant_name in iter_coliner_pair(f_coliner, string_names, plant_names):
        trans_map[string_name].add(plant_name)

    with open(f_out, "w") as f:
        header = "ori_prot1\tori_prot2\tprot1\tprot2\tcombined_score\n"
        f.write(header)
        for indx, line in enumerate(iterline(f_ppi)):
            if indx == 0:
                continue
            ori_prot1, ori_prot2, score = line.rstrip().split(" ")

            # .get() instead of indexing: indexing a defaultdict would insert
            # an empty set for every protein that has no mapping.
            targets1 = trans_map.get(ori_prot1)
            targets2 = trans_map.get(ori_prot2)
            if not targets1 or not targets2:
                continue

            # Sort each side once per line rather than once per outer item.
            sorted_targets1 = sorted(targets1)
            sorted_targets2 = sorted(targets2)
            for prot1 in sorted_targets1:
                for prot2 in sorted_targets2:
                    if prot1 == prot2:
                        continue
                    elif prot1 < prot2:
                        data = [ori_prot1, ori_prot2, prot1, prot2, score]
                    else:
                        data = [ori_prot2, ori_prot1, prot2, prot1, score]
                    f.write("\t".join(data) + "\n")
def load_blast(self, f_blast):
    """Register gene pairs from a tab-separated hit file.

    Only the first two columns of each line are read; they are taken as
    CDS names. A pair is keyed as ``(name1, name2)`` with ``name1`` found
    in ``self.genome1`` and ``name2`` in ``self.genome2``. Lines whose
    second gene is not found in the other genome are skipped.

    For a key not yet in ``self.pair_dict`` a new ``BlastRec`` is created
    and stored. For every accepted line, ``set_pos(fwd)`` is called on the
    stored record, where ``fwd`` is True when the first column was the
    genome1 gene and False when the columns were swapped.

    Raises:
        AssertionError: if the first gene of a line belongs to neither
            genome.
    """
    suffix = "_CDS"

    def strip_cds_suffix(name):
        # str.rstrip("_CDS") removes any trailing run of the characters
        # '_', 'C', 'D', 'S'; only the literal suffix must be dropped.
        if name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    for line in iterline(f_blast):
        data = line.rstrip().split("\t")
        cds1 = strip_cds_suffix(data[0])
        cds2 = strip_cds_suffix(data[1])

        if self.genome1.is_in_genome(cds1):
            if not self.genome2.is_in_genome(cds2):
                continue
            name1, name2, fwd = cds1, cds2, True
        else:
            # A gene that is not in genome1 is expected to be in genome2.
            assert self.genome2.is_in_genome(cds1)
            if not self.genome1.is_in_genome(cds2):
                continue
            name1, name2, fwd = cds2, cds1, False

        key = (name1, name2)
        if key not in self.pair_dict:
            blast_rec = BlastRec(name1, name2)
            blast_rec.find_uniq_id(self.genome1, self.genome2)
            self.pair_dict[key] = blast_rec
        self.pair_dict[key].set_pos(fwd)
def load_uniq_locus_map(self, f_uniq_locus_map):
    """Fill ``self.uniq_locus_map`` from a three-column tab-separated file.

    The first line is treated as a header and ignored. Every other line
    must hold ``locus_id``, ``assemble`` and ``tid``; rows whose
    ``assemble`` equals ``self.assemble`` are stored as
    ``tid -> locus_id``.
    """
    rows = iter(iterline(f_uniq_locus_map))
    next(rows, None)  # drop the header line without parsing it
    for row in rows:
        locus_id, assemble, tid = row.rstrip().split("\t")
        if assemble == self.assemble:
            self.uniq_locus_map[tid] = locus_id
def load_uniq_cds_map(self, f_uniq_cds_map):
    """Fill ``self.uniq_cds_map`` from a three-column tab-separated file.

    The first line is treated as a header and skipped. Every other line
    must hold ``hash_id``, ``assemble`` and ``cds_id``. Rows whose
    ``assemble`` differs from ``self.assemble`` are ignored; for the rest
    a trailing ``_CDS`` is removed from ``cds_id`` and the mapping
    ``cds_id -> hash_id`` is stored.

    Raises:
        ValueError: if a data line does not have exactly three fields.
    """
    suffix = "_CDS"
    for indx, line in enumerate(iterline(f_uniq_cds_map)):
        if indx == 0:
            continue
        hash_id, assemble, cds_id = line.rstrip().split("\t")
        if assemble != self.assemble:
            continue
        # str.rstrip("_CDS") would strip any trailing '_', 'C', 'D' or 'S'
        # characters (e.g. "GENEC_CDS" -> "GENE"); drop only the suffix.
        if cds_id.endswith(suffix):
            cds_id = cds_id[:-len(suffix)]
        self.uniq_cds_map[cds_id] = hash_id
def iter_coliner_pair(f, string_li, plant_li):
    """Yield ``(string_name, plant_name)`` pairs from a collinearity file.

    The second and third tab-separated columns of each line are taken as
    the two gene names. A pair is yielded when one name is in
    ``string_li`` and the other in ``plant_li``, always ordered with the
    ``string_li`` member first. Other lines produce nothing.

    Args:
        f: file handed to ``iterline``.
        string_li: collection of hashable names for the first side.
        plant_li: collection of hashable names for the second side.

    Raises:
        AssertionError: if the first name of a line is in neither
            collection.
    """
    # Membership is tested up to three times per line; build sets once so
    # each test is O(1) instead of a scan over the whole name list.
    if isinstance(string_li, (set, frozenset)):
        string_names = string_li
    else:
        string_names = set(string_li)
    if isinstance(plant_li, (set, frozenset)):
        plant_names = plant_li
    else:
        plant_names = set(plant_li)

    for line in iterline(f):
        data = line.rstrip().split("\t")
        name1 = data[1]
        name2 = data[2]
        if name1 in string_names:
            if name2 in plant_names:
                yield name1, name2
        else:
            assert name1 in plant_names
            if name2 in string_names:
                yield name2, name1
def main(args):
    """Average interaction scores per protein pair and per locus pair.

    For each ``(assemble, input file)`` pair the scores produced by
    ``iter_data`` are grouped by ``(assemble, prot1, prot2)`` after a
    trailing ``_CDS`` is removed from both protein names. The mean of each
    group is written to the protein output file. Proteins are then mapped
    to identifiers read from the ``uniq_locus`` file (three tab-separated
    columns, header skipped), and the mean of the per-protein means is
    written for each identifier pair to the locus output file.

    Raises:
        AssertionError: if the number of input files differs from the
            number of assembly names.
        KeyError: if a protein has no entry in the identifier map.
    """
    suffix = "_CDS"

    def strip_cds_suffix(name):
        # str.rstrip("_CDS") removes any trailing run of the characters
        # '_', 'C', 'D', 'S'; only the literal suffix must be dropped.
        if name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    args = parse_args(args)
    f_in = args.input
    f_uniq_locus = args.uniq_locus
    assemble_li = args.assemble
    f_out_prot = args.out_prot
    f_out_locus = args.out_locus
    assert len(f_in) == len(assemble_li)

    print("Loading score file ...")
    score_dict = defaultdict(list)
    for assemble, file in zip(assemble_li, f_in):
        for prot1, prot2, score in iter_data(file):
            key = (assemble, strip_cds_suffix(prot1), strip_cds_suffix(prot2))
            score_dict[key].append(score)

    print("Building locus map ...")
    locus_map = dict()
    for indx, line in enumerate(iterline(f_uniq_locus)):
        if indx == 0:
            continue
        hash_id, assemble, cds_id = line.rstrip().split("\t")
        locus_map[(assemble, strip_cds_suffix(cds_id))] = hash_id

    locus_score_dict = defaultdict(list)
    print("Writing prot interactrion ...")
    with open(f_out_prot, "w") as f:
        header = "Assemble\tProtein1\tProtein2\tcombined_score\n"
        f.write(header)
        for (assemble, prot1, prot2), score_li in sorted(score_dict.items()):
            score = np.mean(score_li)
            data = [assemble, prot1, prot2, "{0:.1f}".format(score)]
            f.write("\t".join(data) + "\n")
            locus1 = locus_map[(assemble, prot1)]
            locus2 = locus_map[(assemble, prot2)]
            # The locus-level score is a mean of these per-protein means.
            locus_score_dict[(locus1, locus2)].append(score)

    print("Writing locus interactrion ...")
    with open(f_out_locus, "w") as f:
        header = "Locus1\tLocus2\tcombined_score\n"
        f.write(header)
        for (locus1, locus2), score_li in sorted(locus_score_dict.items()):
            score = np.mean(score_li)
            data = [locus1, locus2, "{0:.1f}".format(score)]
            f.write("\t".join(data) + "\n")
def main(args):
    """Filter candidate records and write the surviving regions as BED12.

    Each input line carries nine tab-separated fields: a reference name,
    an index, four start positions, a run length, a score and a sequence.
    A line is kept when the run length is at least ``min_rg4_len`` and
    every gap between consecutive starts, minus the run length, is at
    least ``min_loop_len``. For each kept line the reference record of the
    same name is sliced from the first start to the fourth start plus the
    run length, named ``<name>_rg4_<counter>``, converted with
    ``trans("bed12")`` and collected; all results are written to the
    output BED file at the end.
    """
    opts = parse_args(args)
    in_path = opts.input
    ref_path = opts.ref
    out_path = opts.output
    loop_min = opts.min_loop_len
    run_min = opts.min_rg4_len

    ref_bed = BedFile(ref_path, "r")
    ref_by_name = {iso.name: iso for iso in ref_bed.load("isoform")}

    kept = []
    counter = 0
    for raw in iterline(in_path):
        fields = raw.rstrip("\n").split("\t")
        name, _rg4_indx, s1, s2, s3, s4, run_len, _score, _seq = fields
        starts = [int(s1), int(s2), int(s3), int(s4)]
        run_len = int(run_len)

        if run_len < run_min:
            continue
        # Reject when any two consecutive runs leave too little room
        # between them for a loop of the minimum length.
        too_tight = any(
            (nxt - cur - run_len - loop_min) < 0
            for cur, nxt in zip(starts, starts[1:])
        )
        if too_tight:
            continue

        counter += 1
        ref_rec = ref_by_name[name]
        region = ref_rec.slice(
            starts[0],
            starts[3] + run_len,
            name="{0}_rg4_{1}".format(name, counter),
            strand=True,
        )
        kept.append(region.trans("bed12"))

    BedFile(out_path, "w").write(kept)
def main(args):
    """Convert per-CDS region annotations into two BED outputs.

    The input is whitespace-separated with a header line that is skipped.
    From each data line the CDS name (column 1), a region label
    (column 3) and 1-based start / end positions (columns 4 and 5) are
    read. Two things are produced per line:

    * a ``Bed6Record`` on the CDS itself (start converted to 0-based),
      collected and written to ``out_cds`` at the end;
    * a BED12 line written to ``out_genome``, obtained by slicing the
      reference isoform at three times those coordinates, with score 0,
      a colour chosen by the region label and the name
      ``<cds_name>_<label>``.

    Raises:
        KeyError: if a CDS name is missing from the reference or a region
            label is not one of ``outside``, ``inside``, ``TMhelix``.
    """
    opts = parse_args(args)
    in_path = opts.input
    genome_out_path = opts.out_genome
    cds_out_path = opts.out_cds
    ref_path = opts.reference

    rgb_by_label = {
        "outside": "200,50,50",
        "inside": "50,200,50",
        "TMhelix": "50,50,200",
    }

    ref_bed = BedFile(ref_path, "r")
    iso_by_name = {iso.name: iso for iso in ref_bed.load("isoform")}

    cds_records = []
    with open(genome_out_path, "w") as out:
        for line_no, raw in enumerate(iterline(in_path)):
            if line_no == 0:
                continue  # header
            cols = raw.rstrip("\n").split()
            cds_name, label = cols[0], cols[2]
            start = int(cols[3]) - 1
            end = int(cols[4])
            region_name = "{0}_{1}".format(cds_name, label)

            on_cds = Bed6Record()
            on_cds.init_by_data(cds_name, start, end, label, 0, ".")
            cds_records.append(on_cds)

            # Coordinates are multiplied by 3 before slicing the isoform.
            on_genome = iso_by_name[cds_name].slice(start * 3, end * 3).trans("bed12")
            on_genome.score = 0
            on_genome.itemRgb = rgb_by_label[label]
            on_genome.name = region_name
            out.write(str(on_genome) + "\n")

    BedFile(cds_out_path, "w").write(cds_records)
def main(args):
    """Write a BED12 line for every region listed in the input file.

    Each non-empty input line is split on whitespace. Column 1 is the CDS
    name, columns 2 and 3 are 1-based start and end positions, and
    columns 6-8 are combined into the output name as
    ``<cds_name>_<col8>:<col7>(<col6>)``. The reference isoform of that
    name is sliced at three times the (0-based start, end) coordinates,
    converted with ``trans("bed12")``, given score 0 and written out.

    Raises:
        KeyError: if a CDS name is missing from the reference.
    """
    opts = parse_args(args)
    in_path = opts.input
    out_path = opts.output
    ref_path = opts.reference

    ref_bed = BedFile(ref_path, "r")
    iso_by_name = {iso.name: iso for iso in ref_bed.load("isoform")}

    with open(out_path, "w") as out:
        for raw in iterline(in_path):
            if not raw:
                continue
            cols = raw.rstrip("\n").split()
            cds_name = cols[0]
            start = int(cols[1]) - 1
            end = int(cols[2])
            region_name = "{0}_{1}:{2}({3})".format(cds_name, cols[7], cols[6], cols[5])

            # Coordinates are multiplied by 3 before slicing the isoform.
            region = iso_by_name[cds_name].slice(start * 3, end * 3).trans("bed12")
            region.score = 0
            region.name = region_name
            out.write(str(region) + "\n")
def iter_data(file):
    """Yield ``(prot1, prot2, score)`` from a five-column tab-separated file.

    The first line is skipped as a header. Every other line must have
    exactly five fields; the third and fourth are yielded unchanged and
    the fifth is converted to ``float``.
    """
    rows = iter(iterline(file))
    next(rows, None)  # header
    for row in rows:
        _ori1, _ori2, prot1, prot2, raw_score = row.rstrip().split("\t")
        yield prot1, prot2, float(raw_score)
def load_ref_name(f):
    """Return the second tab-separated column of every line of ``f`` as a list."""
    return [row.rstrip().split("\t")[1] for row in iterline(f)]