def realign(fasta_filename, pdb_filename, outfilename='', chain='*'):
    """Renumber the ATOM records of a PDB file after a FASTA sequence.

    The ATOM sequence of ``pdb_filename`` is globally aligned to the first
    entry returned by ``parse_fasta.read_fasta``.  Every residue then gets
    the 1-based position of the FASTA residue it is aligned to, and its
    ATOM records are handed to ``parse_pdb.write_pdb_atm_record``.
    Residues aligned to a gap in the FASTA sequence are not written.

    Args:
        fasta_filename: path of the FASTA file with the reference sequence.
        pdb_filename: path of the PDB file to renumber.
        outfilename: kept for interface compatibility; not used here.
        chain: chain identifier to keep.  ``'*'`` keeps every chain; an
            empty value selects the chain reported by ``get_first_chain``.

    Returns:
        None.
    """
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    with open(pdb_filename, 'r') as pdbfile:
        atom_seq = parse_pdb.get_atom_seq(pdbfile, chain)

    # 2: match, -1: mismatch, -0.5: open gap, -0.1: extend gap
    align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
    atom_seq_ali = align[-1][0]
    seq_ali = align[-1][1]

    # resno maps the running residue count in the PDB (1-based) to the
    # 1-based position in the FASTA sequence; -9999 marks residues that
    # have no counterpart in the FASTA sequence.
    resno = {}
    atompos = 0
    seqpos = 0
    for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
        if atom_res == '-':
            seqpos += 1
        elif seq_res == '-':
            atompos += 1
            resno[atompos] = -9999
        else:
            atompos += 1
            seqpos += 1
            resno[atompos] = seqpos

    # The context manager guarantees the PDB handle is released, which the
    # previous implementation never did.
    with open(pdb_filename, 'r') as pdbfile:
        if not chain:
            chain = get_first_chain(pdbfile)
            pdbfile.seek(0)

        res_i = -9999
        i = 0
        for line in pdbfile:
            if not line.startswith('ATOM'):
                continue
            atm_record = parse_pdb.parse_atm_record(line)
            record_chain = atm_record['chain']
            if record_chain != ' ' and record_chain != chain and chain != '*':
                continue
            # A change of residue number starts the next residue.
            if atm_record['res_no'] != res_i:
                i += 1
                res_i = atm_record['res_no']
            atm_record['res_no'] = resno[i]
            if resno[i] > 0:
                parse_pdb.write_pdb_atm_record(atm_record)
    return
def get_global_align_from_pdb(pdb_filename, chain, seq):
    """
    Globally align the ATOM sequence of one PDB chain against ``seq``.

    The result of ``pairwise2.align.globalms`` is returned unchanged.
    Per the original notes it is a list with 1 tuple of 5 elements:
    [(pdb_aligned_seq, fasta_seq, float, float, int)].
    """
    # Scoring scheme -- 2: match, -1: mismatch, -0.5: open gap,
    # -0.1: extend gap.
    # The ATOM sequence is generated for the input chain only; the defaults
    # are get_atom_seq(pdbfile, chain="", model=1, return_lines=False).
    # For a homo-oligomer the two chains should be used instead, e.g.
    # atom_seq_chain1 and atom_seq_chain2.
    return pairwise2.align.globalms(
        parse_pdb.get_atom_seq(pdb_filename, chain), seq, 2, -1, -0.5, -0.1)
def get_co_pdb(pdb_filename, chain, cb_cutoff=8):
    """Print and return the contact order of one chain of a PDB file.

    Two residues count as a contact when the matrix returned by
    ``get_cb_contacts`` holds a value below ``cb_cutoff`` for them; the
    diagonal is ignored.  The contact order is the summed sequence
    separation ``j - i`` of all contacts with ``i < j``, divided by
    ``N * L`` where ``N`` is half the number of non-zero entries of the
    contact map and ``L`` is the length of the ATOM sequence.

    Args:
        pdb_filename: path of the PDB file.
        chain: chain identifier passed on to the ``parse_pdb`` helpers.
        cb_cutoff: threshold applied to the matrix from ``get_cb_contacts``.

    Returns:
        The contact order as a float.

    Raises:
        ZeroDivisionError: if the chain has no contacts or no residues.
    """
    with open(pdb_filename, 'r') as pdb_file:
        cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)
    with open(pdb_filename, 'r') as pdb_file:
        atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

    ref_contact_map = get_cb_contacts(cb_lst) < cb_cutoff
    np.fill_diagonal(ref_contact_map, 0)

    L = len(atom_seq)
    # Every contact appears twice in the full matrix, hence the halving.
    # The explicit float divisor keeps the count exact on Python 2 as well.
    N = (ref_contact_map != 0).sum() / 2.0

    # Only the strict upper triangle (i < j) contributes to the sum.
    rows, cols = np.nonzero(np.triu(ref_contact_map, k=1))
    co = (cols - rows).sum()

    co = float(co) / float(N * L)
    print(co)
    return co
def plot_map(fasta_filename, c_filename, factor, c2_filename='', ss_fname='',
             psipred_horiz_fname='', psipred_vert_fname='', pdb_filename='',
             is_heavy=False, chain='', sep=',', outfilename=''):
    """Plot a predicted contact map, optionally against a reference PDB.

    Args:
        fasta_filename: FASTA file with the target sequence; its length
            defines the axes and the number of contacts that are drawn.
        c_filename: contact file read with ``parse_contacts.parse``.
        factor: the top ``factor * len(sequence)`` contacts are drawn.
        c2_filename: optional second contact file; when given, the first
            map is drawn as (x, y) and the second one as (y, x).
        ss_fname: optional secondary-structure file read with
            ``parse_ss.parse``.
        psipred_horiz_fname: optional PSIPRED file read with
            ``parse_psipred.horizontal``; takes precedence over the
            other secondary-structure inputs.
        psipred_vert_fname: optional PSIPRED file read with
            ``parse_psipred.vertical``.
        pdb_filename: optional reference structure.  When given, reference
            contacts are drawn in grey, predictions are coloured with the
            output of ``get_tp_colors`` and a PPV line is printed.
        is_heavy: use ``get_heavy_contacts`` with cutoff 5 instead of
            ``get_cb_contacts`` with cutoff 8 for the reference contacts.
        chain: chain identifier passed to the ``parse_pdb`` helpers.
        sep: column separator passed to ``parse_contacts.parse``.
        outfilename: output file.  ``.png``/``.jpg``/``.jpeg`` names are
            saved with ``plt.savefig``; any other name is written as a PDF
            (``.pdf`` is appended when missing).  Without a name the plot
            goes to ``<c_filename>_ContactMap.pdf``.
    """

    def _parse(parser, path, *args):
        """Run ``parser`` on the opened ``path`` and close the file again."""
        with open(path, 'r') as handle:
            return parser(handle, *args)

    acc = fasta_filename.split('.')[0][:4]

    # get sequence
    seq = list(_parse(parse_fasta.read_fasta, fasta_filename).values())[0][0]
    ref_len = len(seq)

    def _top_contacts(path):
        """Return 0-based x, y and score lists of the top contacts in ``path``.

        Pairs fewer than 5 positions apart are skipped, and reading stops
        as soon as ``ref_len * factor`` pairs have been kept.
        """
        xs, ys, kept_scores = [], [], []
        for contact in _parse(parse_contacts.parse, path, sep):
            c_x = contact[1] - 1
            c_y = contact[2] - 1
            # Also checking here, should remove in parse
            if abs(c_x - c_y) >= 5:
                xs.append(c_x)
                ys.append(c_y)
                kept_scores.append(contact[0])
            if len(xs) >= ref_len * factor:
                break
        return xs, ys, kept_scores

    # get top "factor" * "ref_len" predicted contacts
    contacts_x, contacts_y, scores = _top_contacts(c_filename)

    # start plotting
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # plot secondary structure on the diagonal if given
    if psipred_horiz_fname or psipred_vert_fname or ss_fname:
        if psipred_horiz_fname:
            ss = _parse(parse_psipred.horizontal, psipred_horiz_fname)
        elif psipred_vert_fname:
            ss = _parse(parse_psipred.vertical, psipred_vert_fname)
        else:
            ss = _parse(parse_ss.parse, ss_fname)
        assert len(ss) == ref_len
        for i, state in enumerate(ss):
            if state == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2)
            elif state == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2)

    # plot reference contacts in the background if given
    if pdb_filename:
        if is_heavy:
            coords = _parse(parse_pdb.get_coordinates, pdb_filename, chain)
        else:
            coords = _parse(parse_pdb.get_cb_coordinates, pdb_filename, chain)
        atom_seq = _parse(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Put the coordinates on the FASTA numbering: '-' marks FASTA
        # positions without a residue in the structure; residues aligned
        # to a gap in the FASTA sequence are dropped.
        gapped = []
        j = 0
        for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
            if atom_res == '-':
                gapped.append('-')
            elif seq_res == '-':
                j += 1
            else:
                gapped.append(coords[j])
                j += 1

        if is_heavy:
            ref_contact_map = get_heavy_contacts(gapped) < 5
        else:
            ref_contact_map = get_cb_contacts(gapped) < 8
        ref_contacts = np.where(ref_contact_map)

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)
        print('%s %s %s %s' % (pdb_filename, PPVs[-1], TPs[-1], FPs[-1]))

        ax.scatter(ref_contacts[0], ref_contacts[1], marker='o',
                   c='#CCCCCC', lw=0, edgecolor='#CCCCCC')

    if c2_filename:
        # plot predicted contacts from second contact map if given
        contacts2_x, contacts2_y, _ = _top_contacts(c2_filename)

        if pdb_filename:
            # use TP/FP color coding if reference contacts given
            PPVs2, TPs2, FPs2 = get_ppvs(contacts2_x, contacts2_y,
                                         ref_contact_map, atom_seq_ali,
                                         ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print('%s %s %s %s'
                  % (pdb_filename, PPVs2[-1], TPs2[-1], FPs2[-1]))
            # The previous format string had two placeholders but was
            # given a single value, which raised TypeError here.
            fig.suptitle(
                '%s\nPPV (upper left) = %.2f | PPV (lower right) = %.2f'
                % (acc, PPVs[-1], PPVs2[-1]))
            sc = ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                            c=tp2_colors[::-1], s=6, alpha=0.75,
                            linewidths=0.0)
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c=tp_colors[::-1], s=6, alpha=0.75,
                            linewidths=0.0)
        else:
            sc = ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                            c='#D70909', edgecolor='#D70909', s=4,
                            linewidths=0.5)
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c='#004F9D', edgecolor='#004F9D', s=4,
                            linewidths=0.5)
    elif pdb_filename:
        # plot predicted contacts from first contact map on both triangles
        # if no second contact map given
        fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
        sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                        c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                        c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
    else:
        sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                        c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                        linewidths=0.1)
        sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                        c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                        linewidths=0.1)
        plt.colorbar(sc)

    plt.gca().set_xlim([0, ref_len])
    plt.gca().set_ylim([0, ref_len])

    if outfilename.endswith(('.png', '.jpg', '.jpeg')):
        plt.savefig(outfilename)
    else:
        if not outfilename:
            pdf_name = '%s_ContactMap.pdf' % c_filename
        elif outfilename.endswith('.pdf'):
            pdf_name = outfilename
        else:
            pdf_name = '%s.pdf' % outfilename
        pp = PdfPages(pdf_name)
        pp.savefig(fig)
        pp.close()
    plt.show()
def get_ppv(fasta_filename, c_filename, pdb_filename, factor=1.0,
            min_score=-1.0, chain='', sep=' ', outfilename='', name='',
            noalign=False, min_dist=5, print_dist=False):
    """Score predicted contacts against the contacts of a PDB structure.

    The top-ranked predicted contacts are compared with a reference
    contact map (``get_cb_contacts`` values below 8) using
    ``get_ppv_helper``; the resulting PPV, TP and FP values are printed
    and returned.

    Args:
        fasta_filename: FASTA file with the target sequence.
        c_filename: contact file read with ``parse_contacts.parse``.
        pdb_filename: reference structure.
        factor: with the default ``min_score``, reading stops once
            ``factor * len(sequence)`` contacts have been collected.
        min_score: reading stops after the first contact scoring below
            this value (that contact is still kept).
        chain: chain identifier passed to the ``parse_pdb`` helpers.
        sep: column separator passed to ``parse_contacts.parse``.
        outfilename: passed to ``print_distances`` as ``outfile``.
        name: label for the printed result line; when empty, the FASTA and
            contact file names are printed instead.
        noalign: skip the sequence/structure alignment and use the
            structure coordinates as they are.
        min_dist: minimum sequence separation of a contact.
        print_dist: call ``print_distances`` (alignment mode only).

    Returns:
        Tuple ``(pdb_filename, PPV, TP, FP)``.
    """

    def _parse(parser, path, *args, **kwargs):
        """Run ``parser`` on the opened ``path`` and close the file again."""
        with open(path, 'r') as handle:
            return parser(handle, *args, **kwargs)

    # get sequence; list() keeps this working with Python 3 dict views
    seq = list(_parse(parse_fasta.read_fasta, fasta_filename).values())[0][0]
    ref_len = len(seq)

    # get top ranked predicted contacts
    contacts = _parse(parse_contacts.parse, c_filename, sep,
                      min_dist=min_dist)
    contacts_x = []
    contacts_y = []
    scores = []
    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_dist:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(score)
        if min_score == -1.0 and len(contacts_x) >= ref_len * factor:
            break
        if score < min_score:
            break

    cb_lst = _parse(parse_pdb.get_cb_coordinates, pdb_filename, chain)
    cb_cutoff = 8

    if noalign:
        ref_contact_map = get_cb_contacts(cb_lst) < cb_cutoff
        PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y,
                                     ref_contact_map, ref_len, factor)
    else:
        atom_seq = _parse(parse_pdb.get_atom_seq, pdb_filename, chain)
        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Put the CB coordinates on the FASTA numbering.  ali_lst records,
        # for every kept position, the index into cb_lst (-9999 where the
        # structure has no residue).
        gapped_cb_lst = []
        ali_lst = []
        j = 0
        for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
            if atom_res == '-':
                gapped_cb_lst.append(['-'])
                ali_lst.append(-9999)
            elif seq_res == '-':
                j += 1
            else:
                ali_lst.append(j)
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        dist_mat = get_cb_contacts(gapped_cb_lst)

        if print_dist:
            # Only print_distances needs these, so they are read on demand.
            area = _parse(parse_pdb.get_area, pdb_filename, chain)
            surf = _parse(parse_pdb.get_dist_to_surface, pdb_filename, chain)
            print_distances(contacts_x, contacts_y, scores, dist_mat, area,
                            surf, ref_len, ref_len, seq, ali_lst=ali_lst,
                            atom_seq=atom_seq, outfile=outfilename)

        ref_contact_map = dist_mat < cb_cutoff
        PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y,
                                     ref_contact_map, ref_len, factor,
                                     atom_seq_ali=atom_seq_ali)

    if name:
        print('%s %s %s %s' % (name, PPV, TP, FP))
    else:
        print('%s %s %s %s %s' % (fasta_filename, c_filename, PPV, TP, FP))
    return (pdb_filename, PPV, TP, FP)
def get_ppv(fasta_filenameA, c_filename, pdb_filenameA, fasta_filenameB,
            pdb_filenameB, factor=1.0, min_score=-1.0, chainA='', chainB='',
            sep=' ', outfilename='', name='', noalign=False, min_dist=5,
            interfacelen=10, print_dist=False, cutoff=0.25):
    """Score predicted contacts against a two-chain reference structure.

    PPV/TP/FP values and Z-scores are computed for chain A, chain B, both
    chains together and the interface, printed, and the values for both
    chains together are returned.

    Args:
        fasta_filenameA: FASTA file of chain A.
        c_filename: contact file read with ``parse_contacts.parse``.
        pdb_filenameA: structure file holding chain A.
        fasta_filenameB: FASTA file of chain B (only its length is used).
        pdb_filenameB: structure file holding chain B.
        factor: passed on to ``get_interface_contacts`` and
            ``get_ppv_helper``.
        min_score: kept for interface compatibility; not used here.
        chainA: chain identifier used with ``pdb_filenameA``.
        chainB: chain identifier used with ``pdb_filenameB``.
        sep: column separator passed to ``parse_contacts.parse``.
        outfilename: passed to ``print_distances`` as ``outfile``.
        name: label for the per-chain result lines; when empty, the FASTA
            and contact file names are printed instead.
        noalign: use the structure coordinates without aligning them to
            the sequence (see the review note in the body).
        min_dist: minimum sequence separation of a contact.
        interfacelen: passed to ``get_ppv_helper`` for the single chains
            and to ``get_ppv_helper_interface``.
        print_dist: call ``print_distances``.
        cutoff: passed to ``get_ppv_helper_interface``.

    Returns:
        Tuple ``(pdb_filenameA, PPV, TP, FP)`` for both chains together.
    """

    def _parse(parser, path, *args, **kwargs):
        """Run ``parser`` on the opened ``path`` and close the file again."""
        with open(path, 'r') as handle:
            return parser(handle, *args, **kwargs)

    def _gap_coordinates(atom_ali, seq_ali, coords):
        """Put ``coords`` on the numbering of the aligned sequence.

        Returns the gapped coordinate list (``['-']`` where the structure
        has no residue) and, for every kept position, the index into
        ``coords`` (-9999 where the structure has no residue).
        """
        gapped = []
        index_map = []
        j = 0
        for atom_res, seq_res in zip(atom_ali, seq_ali):
            if atom_res == '-':
                gapped.append(['-'])
                index_map.append(-9999)
            elif seq_res == '-':
                j += 1
            else:
                index_map.append(j)
                gapped.append(coords[j])
                j += 1
        return gapped, index_map

    # get sequence; list() keeps this working with Python 3 dict views
    seqA = list(_parse(parse_fasta.read_fasta, fasta_filenameA).values())[0][0]
    seqB = list(_parse(parse_fasta.read_fasta, fasta_filenameB).values())[0][0]
    # Actually the contact map sequence is just two copies of seqA
    seq = seqA + seqA
    ref_lenA = len(seqA)
    ref_lenB = len(seqB)
    ref_len = len(seq)

    # get predicted contacts (every contact at least min_dist apart)
    contacts = _parse(parse_contacts.parse, c_filename, sep,
                      min_dist=min_dist)
    contacts_x = []
    contacts_y = []
    scores = []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_dist:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(contact[0])

    # The per-chain lists start out as copies of the full list.  They are
    # taken before get_interface_contacts is called on the originals.
    contactsA_x, contactsA_y, scoresA = (
        list(contacts_x), list(contacts_y), list(scores))
    contactsB_x, contactsB_y, scoresB = (
        list(contacts_x), list(contacts_y), list(scores))

    cb_lstA = _parse(parse_pdb.get_cb_coordinates, pdb_filenameA, chainA)
    cb_lstB = _parse(parse_pdb.get_cb_coordinates, pdb_filenameB, chainB)
    cb_lst = cb_lstA + cb_lstB
    bfactor = (_parse(parse_pdb.get_area, pdb_filenameA, chainA)
               + _parse(parse_pdb.get_area, pdb_filenameB, chainB))

    if noalign:
        # NOTE(review): this branch assigns none of the alignment-derived
        # names (atom_seq_ali, atom_seq_aliA, atom_seq_aliB, ali_lst,
        # atom_seq) that the code below reads, so noalign=True currently
        # ends in a NameError.  The intended values need to be confirmed
        # against the helper functions before this can be fixed.
        dist_mat = get_cb_contacts(cb_lst)
        dist_matA = get_cb_contacts(cb_lstA)
        dist_matB = get_cb_contacts(cb_lstB)
    else:
        atom_seqA = _parse(parse_pdb.get_atom_seq, pdb_filenameA, chainA)
        atom_seqB = _parse(parse_pdb.get_atom_seq, pdb_filenameB, chainB)
        atom_seq = atom_seqA + atom_seqB

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        alignA = pairwise2.align.globalms(atom_seqA, seqA, 2, -1, -0.5, -0.1)
        # Align to seq A
        alignB = pairwise2.align.globalms(atom_seqB, seqA, 2, -1, -0.5, -0.1)

        atom_seq_ali, seq_ali = align[-1][0], align[-1][1]
        atom_seq_aliA, seq_aliA = alignA[-1][0], alignA[-1][1]
        atom_seq_aliB, seq_aliB = alignB[-1][0], alignB[-1][1]

        gapped_cb_lst, ali_lst = _gap_coordinates(
            atom_seq_ali, seq_ali, cb_lst)
        gapped_cb_lstA, _ = _gap_coordinates(
            atom_seq_aliA, seq_aliA, cb_lstA)
        gapped_cb_lstB, _ = _gap_coordinates(
            atom_seq_aliB, seq_aliB, cb_lstB)

        dist_mat = get_cb_contacts(gapped_cb_lst)
        dist_matA = get_cb_contacts(gapped_cb_lstA)
        dist_matB = get_cb_contacts(gapped_cb_lstB)

    cb_cutoff = 8

    # This routine adds all interface and B chain contacts
    contacts_x, contacts_y, scores = get_interface_contacts(
        contacts_x, contacts_y, scores, dist_mat, ref_lenA, factor,
        cb_cutoff + 4, atom_seq_ali=atom_seq_ali)

    ref_contact_map = dist_mat < cb_cutoff
    ref_contact_mapA = dist_matA < cb_cutoff
    ref_contact_mapB = dist_matB < cb_cutoff

    if print_dist:
        # Only print_distances needs the surface data, so it is read on
        # demand.
        surf = (_parse(parse_pdb.get_dist_to_surface, pdb_filenameA, chainA)
                + _parse(parse_pdb.get_dist_to_surface, pdb_filenameB,
                         chainB))
        print_distances(contacts_x, contacts_y, scores, dist_mat, bfactor,
                        surf, ref_lenA, ref_lenB, seq, ali_lst=ali_lst,
                        atom_seq=atom_seq, outfile=outfilename)

    Zscore = get_Zscore(contacts_x, contacts_y, ref_contact_map, scores,
                        atom_seq_ali=atom_seq_ali)
    ZscoreA = get_Zscore(contactsA_x, contactsA_y, ref_contact_mapA, scoresA,
                         atom_seq_ali=atom_seq_aliA)
    ZscoreB = get_Zscore(contactsB_x, contactsB_y, ref_contact_mapB, scoresB,
                         atom_seq_ali=atom_seq_aliB)
    ZscoreI = get_Zscore_interface(contacts_x, contacts_y, ref_contact_map,
                                   ref_lenA, ref_lenB, scores,
                                   atom_seq_ali=atom_seq_ali)

    PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map,
                                 ref_len, factor, atom_seq_ali=atom_seq_ali)
    PPVa, TPa, FPa = get_ppv_helper(contactsA_x, contactsA_y,
                                    ref_contact_mapA, interfacelen, factor,
                                    atom_seq_ali=atom_seq_aliA)
    PPVb, TPb, FPb = get_ppv_helper(contactsB_x, contactsB_y,
                                    ref_contact_mapB, interfacelen, factor,
                                    atom_seq_ali=atom_seq_aliB)
    PPVi, TPi, FPi, PPViE, TPiE, FPiE = get_ppv_helper_interface(
        contacts_x, contacts_y, ref_contact_map, bfactor, ref_lenA, ref_lenB,
        interfacelen, cutoff, atom_seq_ali=atom_seq_ali)

    if name:
        print('%s %s %s %s %s' % (name, PPVa, TPa, FPa, ZscoreA))
        print('%s %s %s %s %s' % (name, PPVb, TPb, FPb, ZscoreB))
        print('%s %s %s %s %s' % ("BOTH", PPV, TP, FP, Zscore))
        print('%s %s %s %s %s' % ("Interface", PPVi, TPi, FPi, ZscoreI))
        print('%s %s %s %s' % ("Interface-Exposed", PPViE, TPiE, FPiE))
    else:
        print('%s %s %s %s %s %s'
              % (fasta_filenameA, c_filename, PPVa, TPa, FPa, ZscoreA))
        print('%s %s %s %s %s %s'
              % (fasta_filenameB, c_filename, PPVb, TPb, FPb, ZscoreB))
        print('%s %s %s %s %s %s'
              % ("BOTH", c_filename, PPV, TP, FP, Zscore))
        print('%s %s %s %s %s %s'
              % ("Interface", c_filename, PPVi, TPi, FPi, ZscoreI))
        print('%s %s %s %s %s'
              % ("Interface-Exposed", c_filename, PPViE, TPiE, FPiE))
    print('PPV %s %s %s %s %s %s'
          % (c_filename, PPV, PPVa, PPVb, PPVi, PPViE))
    print('Zscore %s %s %s %s %s'
          % (c_filename, Zscore, ZscoreA, ZscoreB, ZscoreI))
    return (pdb_filenameA, PPV, TP, FP)
def get_dist(fasta_filename, c_filename, pdb_filename, chain='', sep='',
             outfilename='', noalign=False, dist_type='CB'):
    """Look up the structural distance of every predicted contact.

    Args:
        fasta_filename: FASTA file with the target sequence.
        c_filename: contact file read with ``parse_contacts.parse``
            (called with ``min_dist=5``).
        pdb_filename: reference structure.
        chain: chain identifier passed to the ``parse_pdb`` helpers.
        sep: column separator passed to ``parse_contacts.parse``.
        outfilename: when given, one line ``x y score distance`` per
            contact is written to this file.
        noalign: skip the sequence/structure alignment and use the
            structure coordinates as they are.
        dist_type: ``'CB'`` or ``'CA'`` selects the matching coordinates
            and ``get_dist_mat``; any other value uses the full residue
            coordinates with ``get_dist_mat_heavy``.

    Returns:
        Tuple ``(contacts_x, contacts_y, scores, contacts_dist)`` of
        equally long lists; the positions are 0-based.
    """

    def _parse(parser, path, *args, **kwargs):
        """Run ``parser`` on the opened ``path`` and close the file again."""
        with open(path, 'r') as handle:
            return parser(handle, *args, **kwargs)

    # get sequence; list() keeps this working with Python 3 dict views
    seq = list(_parse(parse_fasta.read_fasta, fasta_filename).values())[0][0]

    # get all predicted contacts, shifted to 0-based positions
    contacts = _parse(parse_contacts.parse, c_filename, sep, min_dist=5)
    contacts_x = [contact[1] - 1 for contact in contacts]
    contacts_y = [contact[2] - 1 for contact in contacts]
    scores = [contact[0] for contact in contacts]

    # Only the coordinate set selected by dist_type is read.
    if dist_type == 'CB':
        coords = _parse(parse_pdb.get_cb_coordinates, pdb_filename, chain)
        build_dist_mat = get_dist_mat
    elif dist_type == 'CA':
        coords = _parse(parse_pdb.get_ca_coordinates, pdb_filename, chain)
        build_dist_mat = get_dist_mat
    else:
        coords = _parse(parse_pdb.get_coordinates, pdb_filename, chain)
        build_dist_mat = get_dist_mat_heavy

    if noalign:
        dist_mat = build_dist_mat(coords)
        contacts_dist = get_dist_helper(contacts_x, contacts_y, dist_mat)
    else:
        atom_seq = _parse(parse_pdb.get_atom_seq, pdb_filename, chain)
        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Put the coordinates on the FASTA numbering: '-' marks FASTA
        # positions without a residue in the structure; residues aligned
        # to a gap in the FASTA sequence are dropped.
        gapped = []
        j = 0
        for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
            if atom_res == '-':
                gapped.append('-')
            elif seq_res == '-':
                j += 1
            else:
                gapped.append(coords[j])
                j += 1

        dist_mat = build_dist_mat(gapped)
        contacts_dist = get_dist_helper(contacts_x, contacts_y, dist_mat,
                                        atom_seq_ali=atom_seq_ali)

    assert (len(contacts_dist) == len(contacts_x) == len(contacts_y)
            == len(scores))

    if outfilename:
        with open(outfilename, 'w') as outfile:
            for c_x, c_y, score, dist in zip(contacts_x, contacts_y, scores,
                                             contacts_dist):
                outfile.write('%s %s %f %f\n' % (c_x, c_y, score, dist))

    return (contacts_x, contacts_y, scores, contacts_dist)
def plot_map(fasta_filename, c_filename, factor, c2_filename='',
             psipred_horiz_fname='', psipred_vert_fname='', pdb_filename='',
             is_heavy=False, chain='', sep=',', outfilename=''):
    """Plot a predicted contact map, optionally against a reference PDB.

    Args:
        fasta_filename: FASTA file with the target sequence; its length
            defines the axes and the number of contacts that are drawn.
        c_filename: contact file read with ``parse_contacts.parse``.
        factor: the top ``factor * len(sequence)`` contacts are drawn.
        c2_filename: optional second contact file; when given, the first
            map is drawn as (x, y) and the second one as (y, x).
        psipred_horiz_fname: optional PSIPRED file read with
            ``parse_psipred.horizontal``; takes precedence over
            ``psipred_vert_fname``.
        psipred_vert_fname: optional PSIPRED file read with
            ``parse_psipred.vertical``.
        pdb_filename: optional reference structure.  When given, reference
            contacts are drawn in grey, predictions are coloured with the
            output of ``get_tp_colors`` and a PPV line is printed.
        is_heavy: use ``get_heavy_contacts`` with cutoff 5 instead of
            ``get_cb_contacts`` with cutoff 8 for the reference contacts.
        chain: chain identifier passed to the ``parse_pdb`` helpers.
        sep: column separator passed to ``parse_contacts.parse``.
        outfilename: output file.  ``.png``/``.jpg``/``.jpeg`` names are
            saved with ``plt.savefig``; any other name is written as a PDF
            (``.pdf`` is appended when missing).  Without a name the plot
            goes to ``<c_filename>_ContactMap.pdf``.
    """

    def _parse(parser, path, *args):
        """Run ``parser`` on the opened ``path`` and close the file again."""
        with open(path, 'r') as handle:
            return parser(handle, *args)

    acc = fasta_filename.split('.')[0][:4]

    ### get sequence; list() keeps this working with Python 3 dict views
    seq = list(_parse(parse_fasta.read_fasta, fasta_filename).values())[0][0]
    ref_len = len(seq)

    def _top_contacts(path):
        """Return 0-based x, y and score lists of the top contacts in ``path``.

        Pairs fewer than 5 positions apart are skipped, and reading stops
        as soon as ``ref_len * factor`` pairs have been kept.
        """
        xs, ys, kept_scores = [], [], []
        for contact in _parse(parse_contacts.parse, path, sep):
            c_x = contact[1] - 1
            c_y = contact[2] - 1
            if abs(c_x - c_y) >= 5:
                xs.append(c_x)
                ys.append(c_y)
                kept_scores.append(contact[0])
            if len(xs) >= ref_len * factor:
                break
        return xs, ys, kept_scores

    ### get top "factor" * "ref_len" predicted contacts
    contacts_x, contacts_y, scores = _top_contacts(c_filename)

    ### start plotting
    fig = plt.figure()
    ax = fig.add_subplot(111)

    ### plot secondary structure on the diagonal if given
    if psipred_horiz_fname or psipred_vert_fname:
        if psipred_horiz_fname:
            ss = _parse(parse_psipred.horizontal, psipred_horiz_fname)
        else:
            ss = _parse(parse_psipred.vertical, psipred_vert_fname)
        assert len(ss) == ref_len
        for i, state in enumerate(ss):
            if state == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2)
            elif state == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2)

    ### plot reference contacts in the background if given
    if pdb_filename:
        if is_heavy:
            coords = _parse(parse_pdb.get_coordinates, pdb_filename, chain)
        else:
            coords = _parse(parse_pdb.get_cb_coordinates, pdb_filename, chain)
        atom_seq = _parse(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Put the coordinates on the FASTA numbering: '-' marks FASTA
        # positions without a residue in the structure; residues aligned
        # to a gap in the FASTA sequence are dropped.
        gapped = []
        j = 0
        for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
            if atom_res == '-':
                gapped.append('-')
            elif seq_res == '-':
                j += 1
            else:
                gapped.append(coords[j])
                j += 1

        if is_heavy:
            ref_contact_map = get_heavy_contacts(gapped) < 5
        else:
            ref_contact_map = get_cb_contacts(gapped) < 8
        ref_contacts = np.where(ref_contact_map)

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)
        print('%s %s %s %s' % (pdb_filename, PPVs[-1], TPs[-1], FPs[-1]))

        ax.scatter(ref_contacts[0], ref_contacts[1], marker='o',
                   c='#CCCCCC', lw=0, edgecolor='#CCCCCC')

    if c2_filename:
        ### plot predicted contacts from second contact map if given
        contacts2_x, contacts2_y, _ = _top_contacts(c2_filename)

        if pdb_filename:
            ### use TP/FP color coding if reference contacts given
            PPVs2, TPs2, FPs2 = get_ppvs(contacts2_x, contacts2_y,
                                         ref_contact_map, atom_seq_ali,
                                         ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print('%s %s %s %s'
                  % (pdb_filename, PPVs2[-1], TPs2[-1], FPs2[-1]))
            fig.suptitle(
                '%s\nPPV (upper left) = %.2f | PPV (lower right) = %.2f'
                % (acc, PPVs[-1], PPVs2[-1]))
            sc = ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                            c=tp2_colors[::-1], s=6, alpha=0.75,
                            linewidths=0.0)
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c=tp_colors[::-1], s=6, alpha=0.75,
                            linewidths=0.0)
        else:
            sc = ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                            c='#D70909', edgecolor='#D70909', s=4,
                            linewidths=0.5)
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c='#004F9D', edgecolor='#004F9D', s=4,
                            linewidths=0.5)
    elif pdb_filename:
        ### plot predicted contacts from first contact map on both triangles
        ### if no second contact map given
        fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
        sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                        c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                        c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
    else:
        sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                        c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                        linewidths=0.1)
        sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                        c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                        linewidths=0.1)
        plt.colorbar(sc)

    plt.gca().set_xlim([0, ref_len])
    plt.gca().set_ylim([0, ref_len])

    if outfilename.endswith(('.png', '.jpg', '.jpeg')):
        plt.savefig(outfilename)
    else:
        if not outfilename:
            pdf_name = '%s_ContactMap.pdf' % c_filename
        elif outfilename.endswith('.pdf'):
            pdf_name = outfilename
        else:
            pdf_name = '%s.pdf' % outfilename
        pp = PdfPages(pdf_name)
        pp.savefig(fig)
        pp.close()
def fix(pdb1_filename, pdb2_filename, out_filename, chain1='', chain2=''):
    """Renumber the residues of one PDB chain after a second PDB chain.

    The ATOM sequences of both chains are globally aligned and the
    residues of the second structure are renumbered with the indices
    derived from that alignment.  The result is written with
    ``parse_pdb.write``.

    Args:
        pdb1_filename: structure providing the reference sequence.
        pdb2_filename: structure that is renumbered.
        out_filename: output path; when empty, the extension of
            ``pdb2_filename`` is replaced by ``.aligned.pdb``.
        chain1: chain of the first structure; when empty, the chain
            reported by ``parse_pdb.get_first_chain`` is used.
        chain2: chain of the second structure; same default handling.

    Returns:
        None.
    """
    import os

    def _parse(parser, path, *args):
        """Run ``parser`` on the opened ``path`` and close the file again."""
        with open(path, 'r') as handle:
            return parser(handle, *args)

    if not chain1:
        chain1 = _parse(parse_pdb.get_first_chain, pdb1_filename)
    seq1 = _parse(parse_pdb.get_atom_seq, pdb1_filename, chain1)

    if not chain2:
        chain2 = _parse(parse_pdb.get_first_chain, pdb2_filename)
    pdb2 = _parse(parse_pdb.read, pdb2_filename, chain2)
    seq2 = _parse(parse_pdb.get_atom_seq, pdb2_filename, chain2)

    # 2: match, -1: mismatch, -0.5: open gap, -0.1: extend gap
    align = pairwise2.align.globalms(seq1, seq2, 2, -1, -0.5, -0.1)
    seq1_ali = align[-1][0]
    seq2_ali = align[-1][1]

    # One new index per residue of the second sequence: the 1-based
    # alignment column, shifted back by one for every column so far in
    # which the first sequence has a gap.
    pdb2_idx = []
    offset = 0
    for column, (res1, res2) in enumerate(zip(seq1_ali, seq2_ali), 1):
        if res1 == '-':
            offset -= 1
        elif res2 == '-':
            continue
        pdb2_idx.append(column + offset)

    pdb2_new = ['', [], pdb2[2]]
    prev_idx = -1
    for res, new_idx in zip(pdb2[1], pdb2_idx):
        # Index 0 and repeated indices are skipped.
        if new_idx == 0 or new_idx == prev_idx:
            continue
        # The index is placed right-aligned at string positions 22-25 of
        # every atom line of the residue.
        new_idx_str = str(new_idx).rjust(4)
        pdb2_new[1].append(
            [atm[:22] + new_idx_str + atm[26:] for atm in res])
        prev_idx = new_idx

    if not out_filename:
        # splitext only strips a real extension, so names without a dot
        # and directories containing a dot keep their full stem.
        out_filename = os.path.splitext(pdb2_filename)[0] + '.aligned.pdb'
    with open(out_filename, 'w') as pdb2_outfile:
        parse_pdb.write(pdb2_new, pdb2_outfile)
def plot_map(fasta_filename, c_filename, factor, c2_filename='', psipred_filename='', pdb_filename='', is_heavy=False, chain='', sep='', sep2='', outfilename=''): acc = c_filename.split('.')[0] ### get sequence #seq = parse_fasta.read_fasta(open(fasta_filename, 'r')).values()[0][0] #ref_len = len(seq) ### get id f = open(fasta_filename,"rU") seqs = SeqIO.parse(f,"fasta") # we assume there is only one record for record in seqs: seq = str(record.seq) protein_id = record.id ref_len = len(seq) # guessing separator of constraint file if sep == '': line = open(c_filename,'r').readline() if len(line.split(',')) != 1: sep = ',' elif len(line.split(' ')) != 1: sep = ' ' else: sep = '\t' ### get top "factor" * "ref_len" predicted contacts contacts = parse_contacts.parse(open(c_filename, 'r'), sep) contacts_x = [] contacts_y = [] scores = [] contact_dict = {} count = 0 for i in range(len(contacts)): score = contacts[i][0] c_x = contacts[i][1] - 1 c_y = contacts[i][2] - 1 pos_diff = abs(c_x - c_y) too_close = pos_diff < 5 if not too_close: contacts_x.append(c_x) contacts_y.append(c_y) scores.append(score) count += 1 if count >= ref_len * factor: break ### start plotting fig = plt.figure() plt.title('Contact map for ' + protein_id) ax = fig.add_subplot(111) ### plot secondary structure on the diagonal if given if psipred_filename: ss = parse_psipred.horizontal(open(psipred_filename, 'r')) for i in range(len(ss)): if ss[i] == 'H': plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2) if ss[i] == 'E': plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2) if ss[i] == 'C': continue ### plot reference contacts in the background if given if pdb_filename: res_lst = parse_pdb.get_coordinates(open(pdb_filename, 'r'), chain) cb_lst = parse_pdb.get_cb_coordinates(open(pdb_filename, 'r'), chain) atom_seq = parse_pdb.get_atom_seq(open(pdb_filename, 'r'), chain) align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1) if (len(res_lst)==0) or (len(cb_lst)==0): print 
"Could not parse the PDB file, res_list or cb_list is empty" return try: atom_seq_ali = align[-1][0] seq_ali = align[-1][1] except Exception,ex: print "Could not parse the PDB file:", ex return j = 0 gapped_res_lst = [] gapped_cb_lst = [] for i in xrange(len(atom_seq_ali)): if atom_seq_ali[i] == '-': gapped_res_lst.append('-') gapped_cb_lst.append('-') elif seq_ali[i] == '-': j += 1 continue else: gapped_res_lst.append(res_lst[j]) gapped_cb_lst.append(cb_lst[j]) j += 1 if is_heavy: dist_mat = get_heavy_contacts(gapped_res_lst) heavy_cutoff = 5 ref_contact_map = dist_mat < heavy_cutoff ref_contacts = np.where(dist_mat < heavy_cutoff) else: dist_mat = get_cb_contacts(gapped_cb_lst) cb_cutoff = 8 ref_contact_map = dist_mat < cb_cutoff ref_contacts = np.where(dist_mat < cb_cutoff) ref_contacts_x = ref_contacts[0] ref_contacts_y = ref_contacts[1] PPVs = get_ppvs(contacts_x, contacts_y, ref_contact_map, atom_seq_ali, ref_len, factor) tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map, atom_seq_ali) print '%s\t%s' % (acc, PPVs[-1]) ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#CCCCCC', lw=0, edgecolor='#CCCCCC')