def _parse_file(parser, path, *args):
    """Run ``parser(handle, *args)`` on *path* and close the file afterwards.

    The parsers used in this file return results that are indexed or measured
    with ``len`` right away, so the handle is closed once the parser returns.
    """
    with open(path, 'r') as handle:
        return parser(handle, *args)


def _select_top_contacts(contacts, limit, min_sep=5):
    """Collect the leading contacts whose residues are at least *min_sep* apart.

    Each entry of *contacts* is read as ``(score, i, j)`` with 1-based
    positions; the returned coordinates are 0-based.  Scanning stops as soon
    as *limit* contacts have been kept.

    Returns a ``(xs, ys, scores)`` tuple of parallel lists.
    """
    xs, ys, scores = [], [], []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_sep:
            xs.append(c_x)
            ys.append(c_y)
            scores.append(contact[0])
        if len(xs) >= limit:
            break
    return xs, ys, scores


def plot_map(fasta_filename, c_filename, factor, c2_filename='', psipred_filename='', pdb_filename='', is_heavy=False, chain='', sep=' '):
    """Plot predicted contacts as a contact map and save it as a PNG.

    The figure is written to ``'<c_filename>.cm.png'``.

    Parameters:
        fasta_filename: FASTA file; the sequence is taken from the first
            record returned by ``parse_fasta.read_fasta``.
        c_filename: contact file parsed with ``parse_contacts.parse``.
        factor: at most ``factor * len(sequence)`` contacts are plotted per
            contact file; pairs less than 5 positions apart are skipped.
        c2_filename: optional second contact file.  When given, it is drawn
            with x and y swapped so each file occupies one triangle.
        psipred_filename: optional file parsed with
            ``parse_psipred.horizontal``; states 'H', 'E' and 'C' are drawn
            on the diagonal.
        pdb_filename: optional PDB file.  When given, reference contacts are
            drawn in the background, predicted contacts are coloured by
            ``get_tp_colors`` and the last PPV value is printed.
        is_heavy: use ``get_heavy_contacts`` with cutoff 5 instead of
            ``get_cb_contacts`` with cutoff 8 for the reference contacts.
        chain: chain identifier handed to the ``parse_pdb`` functions.
        sep: column separator handed to ``parse_contacts.parse``.
    """
    acc = c_filename.split('.')[0]

    ### get sequence
    fasta_records = _parse_file(parse_fasta.read_fasta, fasta_filename)
    # list(...) keeps the lookup valid on both Python 2 and Python 3
    seq = list(fasta_records.values())[0][0]
    ref_len = len(seq)
    max_contacts = ref_len * factor

    ### get top "factor" * "ref_len" predicted contacts
    contacts = _parse_file(parse_contacts.parse, c_filename, sep)
    contacts_x, contacts_y, _ = _select_top_contacts(contacts, max_contacts)

    ### start plotting
    fig = plt.figure(figsize=(8, 8), dpi=100)
    ax = fig.add_subplot(111)
    ax.set_xlim(xmin=-1)
    ax.set_ylim(ymin=-1)

    ### plot secondary structure on the diagonal if given
    if psipred_filename:
        ss = _parse_file(parse_psipred.horizontal, psipred_filename)
        for i, state in enumerate(ss):
            if state == 'H':
                ax.plot(i, i, 'o', c='#8B0043', mec="#8B0043")
            elif state == 'E':
                ax.plot(i, i, 'D', c='#0080AD', mec="#0080AD")
            elif state == 'C':
                ax.plot(i, i, 'D', c='#CCCCCC', mec="#CCCCCC", markersize=4)

    ### plot reference contacts in the background if given
    if pdb_filename:
        res_lst = _parse_file(parse_pdb.get_coordinates, pdb_filename, chain)
        cb_lst = _parse_file(parse_pdb.get_cb_coordinates, pdb_filename, chain)
        atom_seq = _parse_file(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Project the PDB coordinates onto the FASTA sequence: positions
        # missing from the structure become '-', structure residues that are
        # missing from the sequence are skipped.
        gapped_res_lst = []
        gapped_cb_lst = []
        j = 0
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            cutoff = 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            cutoff = 8
        ref_contact_map = dist_mat < cutoff
        ref_contacts_x, ref_contacts_y = np.where(ref_contact_map)[:2]

        PPVs = get_ppvs(contacts_x, contacts_y, ref_contact_map, atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map, atom_seq_ali)

        print('%s\t%s' % (acc, PPVs[-1]))

        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#DDDDDD', lw=0)

    ### plot predicted contacts from second contact map if given
    if c2_filename:
        contacts2 = _parse_file(parse_contacts.parse, c2_filename, sep)
        contacts2_x, contacts2_y, _ = _select_top_contacts(contacts2, max_contacts)

        ### use TP/FP color coding if reference contacts given
        if pdb_filename:
            PPVs2 = get_ppvs(contacts2_x, contacts2_y, ref_contact_map, atom_seq_ali, ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y, ref_contact_map, atom_seq_ali)
            print('%s\t%s' % (acc, PPVs2[-1]))
            fig.suptitle('%s\n%s (upper left) PPV = %.2f | %s (lower right) PPV = %.2f' % (acc, c_filename, PPVs[-1], c2_filename, PPVs2[-1]))
            # reversed order: the first (top) entries are drawn last, on top
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o', c=tp2_colors[::-1], linewidths=0.0)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c=tp_colors[::-1], linewidths=0.0)
        else:
            fig.suptitle('%s\n%s (upper left) | %s (lower right)' % (acc, c_filename, c2_filename))
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o', c='#D70909', edgecolor='#D70909', s=8, linewidths=0.5)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c='#004F9D', edgecolor='#004F9D', s=8, linewidths=0.5)

    ### plot predicted contacts from first contact map on both triangles
    ### if no second contact map given
    else:
        if pdb_filename:
            fig.suptitle('%s (%s)\nPPV = %.2f' % (acc, c_filename, PPVs[-1]))
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c=tp_colors[::-1], linewidths=0.0)
            ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o', c=tp_colors[::-1], linewidths=0.0)
        else:
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c='#004F9D', edgecolor='#004F9D', s=8, linewidths=0.5)
            ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o', c='#004F9D', edgecolor='#004F9D', s=8, linewidths=0.5)

    ax.set_xlim([-1, ref_len])
    ax.set_ylim([-1, ref_len])

    fig.savefig('%s.cm.png' % c_filename, bbox_inches=0)
def _parse_file(parser, path, *args):
    """Run ``parser(handle, *args)`` on *path* and close the file afterwards.

    The parsers used in this file return results that are indexed or measured
    with ``len`` right away, so the handle is closed once the parser returns.
    """
    with open(path, 'r') as handle:
        return parser(handle, *args)


def _select_top_contacts(contacts, limit, min_sep=5):
    """Collect the leading contacts whose residues are at least *min_sep* apart.

    Each entry of *contacts* is read as ``(score, i, j)`` with 1-based
    positions; the returned coordinates are 0-based.  Scanning stops as soon
    as *limit* contacts have been kept.

    Returns a ``(xs, ys, scores)`` tuple of parallel lists.
    """
    xs, ys, scores = [], [], []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_sep:
            xs.append(c_x)
            ys.append(c_y)
            scores.append(contact[0])
        if len(xs) >= limit:
            break
    return xs, ys, scores


def plot_map(fasta_filename, c_filename, factor, c2_filename='', psipred_filename='', pdb_filename='', is_heavy=False, chain='', sep=' '):
    """Plot predicted contacts as a titled contact map and save it as a PNG.

    The figure is written to ``'<c_filename>.cm.png'``.  Every combination of
    inputs gets a figure title, including the case with a single contact file
    and no reference structure.

    Parameters:
        fasta_filename: FASTA file; the sequence is taken from the first
            record returned by ``parse_fasta.read_fasta``.
        c_filename: contact file parsed with ``parse_contacts.parse``.
        factor: at most ``factor * len(sequence)`` contacts are plotted per
            contact file; pairs less than 5 positions apart are skipped.
        c2_filename: optional second contact file.  When given, it is drawn
            with x and y swapped so each file occupies one triangle.
        psipred_filename: optional file parsed with
            ``parse_psipred.horizontal``; states 'H', 'E' and 'C' are drawn
            on the diagonal.
        pdb_filename: optional PDB file.  When given, reference contacts are
            drawn in the background, predicted contacts are coloured by
            ``get_tp_colors`` and the last PPV value is printed.
        is_heavy: use ``get_heavy_contacts`` with cutoff 5 instead of
            ``get_cb_contacts`` with cutoff 8 for the reference contacts.
        chain: chain identifier handed to the ``parse_pdb`` functions.
        sep: column separator handed to ``parse_contacts.parse``.
    """
    acc = c_filename.split('.')[0]

    ### get sequence
    fasta_records = _parse_file(parse_fasta.read_fasta, fasta_filename)
    # list(...) keeps the lookup valid on both Python 2 and Python 3
    seq = list(fasta_records.values())[0][0]
    ref_len = len(seq)
    max_contacts = ref_len * factor

    ### get top "factor" * "ref_len" predicted contacts
    contacts = _parse_file(parse_contacts.parse, c_filename, sep)
    contacts_x, contacts_y, _ = _select_top_contacts(contacts, max_contacts)

    ### start plotting
    fig = plt.figure(figsize=(8, 8), dpi=100)
    ax = fig.add_subplot(111)
    ax.set_xlim(xmin=-1)
    ax.set_ylim(ymin=-1)

    ### plot secondary structure on the diagonal if given
    if psipred_filename:
        ss = _parse_file(parse_psipred.horizontal, psipred_filename)
        for i, state in enumerate(ss):
            if state == 'H':
                ax.plot(i, i, 'o', c='#8B0043', mec="#8B0043")
            elif state == 'E':
                ax.plot(i, i, 'D', c='#0080AD', mec="#0080AD")
            elif state == 'C':
                ax.plot(i, i, 'D', c='#CCCCCC', mec="#CCCCCC", markersize=4)

    ### plot reference contacts in the background if given
    if pdb_filename:
        res_lst = _parse_file(parse_pdb.get_coordinates, pdb_filename, chain)
        cb_lst = _parse_file(parse_pdb.get_cb_coordinates, pdb_filename, chain)
        atom_seq = _parse_file(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Project the PDB coordinates onto the FASTA sequence: positions
        # missing from the structure become '-', structure residues that are
        # missing from the sequence are skipped.
        gapped_res_lst = []
        gapped_cb_lst = []
        j = 0
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            cutoff = 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            cutoff = 8
        ref_contact_map = dist_mat < cutoff
        ref_contacts_x, ref_contacts_y = np.where(ref_contact_map)[:2]

        PPVs = get_ppvs(contacts_x, contacts_y, ref_contact_map, atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map, atom_seq_ali)

        print('%s\t%s' % (acc, PPVs[-1]))

        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#DDDDDD', lw=0)

    ### plot predicted contacts from second contact map if given
    if c2_filename:
        contacts2 = _parse_file(parse_contacts.parse, c2_filename, sep)
        contacts2_x, contacts2_y, _ = _select_top_contacts(contacts2, max_contacts)

        ### use TP/FP color coding if reference contacts given
        if pdb_filename:
            PPVs2 = get_ppvs(contacts2_x, contacts2_y, ref_contact_map, atom_seq_ali, ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y, ref_contact_map, atom_seq_ali)
            print('%s\t%s' % (acc, PPVs2[-1]))
            fig.suptitle('%s\n%s (upper left) PPV = %.2f | %s (lower right) PPV = %.2f' % (acc, c_filename, PPVs[-1], c2_filename, PPVs2[-1]))
            # reversed order: the first (top) entries are drawn last, on top
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o', c=tp2_colors[::-1], linewidths=0.0)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c=tp_colors[::-1], linewidths=0.0)
        else:
            fig.suptitle('%s\n%s (upper left) | %s (lower right)' % (acc, c_filename, c2_filename))
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o', c='#D70909', edgecolor='#D70909', s=8, linewidths=0.5)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c='#004F9D', edgecolor='#004F9D', s=8, linewidths=0.5)

    ### plot predicted contacts from first contact map on both triangles
    ### if no second contact map given
    else:
        if pdb_filename:
            fig.suptitle('%s (%s)\nPPV = %.2f' % (acc, c_filename, PPVs[-1]))
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c=tp_colors[::-1], linewidths=0.0)
            ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o', c=tp_colors[::-1], linewidths=0.0)
        else:
            fig.suptitle('%s (%s)' % (acc, c_filename))
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c='#004F9D', edgecolor='#004F9D', s=8, linewidths=0.5)
            ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o', c='#004F9D', edgecolor='#004F9D', s=8, linewidths=0.5)

    ax.set_xlim([-1, ref_len])
    ax.set_ylim([-1, ref_len])

    fig.savefig('%s.cm.png' % c_filename, bbox_inches=0)
def _parse_file(parser, path, *args):
    """Run ``parser(handle, *args)`` on *path* and close the file afterwards.

    The parsers used in this file return results that are indexed or measured
    with ``len`` right away, so the handle is closed once the parser returns.
    """
    with open(path, 'r') as handle:
        return parser(handle, *args)


def _select_top_contacts(contacts, limit, min_sep=5):
    """Collect the leading contacts whose residues are at least *min_sep* apart.

    Each entry of *contacts* is read as ``(score, i, j)`` with 1-based
    positions; the returned coordinates are 0-based.  Scanning stops as soon
    as *limit* contacts have been kept.

    Returns a ``(xs, ys, scores)`` tuple of parallel lists.
    """
    xs, ys, scores = [], [], []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_sep:
            xs.append(c_x)
            ys.append(c_y)
            scores.append(contact[0])
        if len(xs) >= limit:
            break
    return xs, ys, scores


def _save_pdf(figure, path):
    """Write *figure* as a single-page PDF to *path*."""
    pdf = PdfPages(path)
    try:
        pdf.savefig(figure)
    finally:
        pdf.close()


def plot_map(fasta_filename, c_filename, factor=1.0, th=-1, c2_filename='', psipred_horiz_fname='', psipred_vert_fname='', iupred_fname='', pdb_filename='', is_heavy=False, chain='', sep=',', outfilename='', ali_filename='', meff_filename='', name='', start=0, end=-1):
    """Plot a contact map plus a score histogram and print summary statistics.

    Two figures are produced: the contact map itself and a histogram of the
    contact scores (pairs closer than 5 positions vs. all other pairs).  A
    ``STATs:`` line with counts of contacts in ordered / mixed / disordered
    regions is printed and also used in the figure titles.

    Parameters:
        fasta_filename: FASTA file; the sequence is taken from the first
            record returned by ``parse_fasta.read_fasta``.
        c_filename: contact file parsed with ``parse_contacts.parse``.
        factor: forwarded to ``get_ppvs`` / ``get_colors``; also caps the
            number of contacts taken from ``c2_filename``.
        th: only contacts with ``score > th`` are counted and plotted.
        c2_filename: optional second contact file, drawn with x/y swapped.
        psipred_horiz_fname, psipred_vert_fname: optional PSIPRED output
            (horizontal format is preferred when both are given); 'H' and
            'E' positions are drawn as bars along both axes.
        iupred_fname: optional file parsed with ``parse_iupred.pred``;
            values above 0.5 are counted as disordered and the profile is
            plotted above the map.
        pdb_filename: optional reference structure.  NOTE: ``chain`` is
            overwritten with ``'*'`` whenever a PDB file is given.
        is_heavy: use ``get_heavy_contacts`` with cutoff 5 instead of
            ``get_cb_contacts`` with cutoff 8 for the reference contacts.
        chain: see ``pdb_filename``; otherwise only used in the title.
        sep: column separator handed to ``parse_contacts.parse``.
        outfilename: output name.  ``*.pdf`` and extension-less names write
            the map and ``<outfilename>_statistics.pdf``; ``*.png`` writes
            only the map.  When empty, ``<c_filename>_ContactMap.pdf`` and
            ``<c_filename>_statistics.pdf`` are written.
        ali_filename: optional alignment passed to ``get_ali_coverage``;
            its coverage is plotted left of the map.
        meff_filename: currently unused (the code reading it is disabled).
        name: label used instead of the FASTA base name.
        start, end: slice of the sequence to plot; ``end == -1`` means up
            to the end of the sequence.
    """
    if name == '':
        acc = '.'.join(os.path.basename(fasta_filename).split('.')[:-1])
    else:
        acc = name

    ### get sequence
    fasta_records = _parse_file(parse_fasta.read_fasta, fasta_filename)
    # list(...) keeps the lookup valid on both Python 2 and Python 3
    seq = list(fasta_records.values())[0][0]
    ref_len = len(seq)

    ### trim sequence according to given positions
    ### default: take full sequence
    if end == -1:
        end = ref_len
    seq = seq[start:end]
    ref_len = len(seq)
    unit = (ref_len / 50.0)

    if ali_filename:
        coverage_lst = get_ali_coverage(ali_filename)
        max_cover = max(coverage_lst)
    else:
        coverage_lst = np.zeros(ref_len)
        max_cover = 0

    if iupred_fname:
        disorder = _parse_file(parse_iupred.pred, iupred_fname)
    else:
        disorder = np.zeros(ref_len)

    # NOTE(review): disorder and coverage are indexed with positions of the
    # untrimmed sequence below, while the fallback arrays have the trimmed
    # length -- looks inconsistent when start/end are used; confirm.
    fraction_disorder = 0.0
    num_disorder = 0
    num_order = 0
    cover_order = 0.
    cover_disorder = 0.
    for j, value in enumerate(disorder):
        if value > 0.5:
            # 1.0 forces true division; "1 / ref_len" is 0 on Python 2
            fraction_disorder += 1.0 / ref_len
            num_disorder += 1
            cover_disorder += coverage_lst[j]
        else:
            num_order += 1
            cover_order += coverage_lst[j]
    if num_disorder > 0:
        cover_disorder = cover_disorder / num_disorder
    if num_order > 0:
        cover_order = cover_order / num_order

    ### get predicted contacts above the score threshold
    contacts = _parse_file(parse_contacts.parse, c_filename, sep, 1)
    contacts_np = parse_contacts.get_numpy_cmap(contacts)
    contacts_np = contacts_np[start:end, start:end]

    contacts_x = []
    contacts_y = []
    scores = []
    tooclose = []
    # The tiny start values keep the ratios in the statistics line defined
    # when nothing is counted.
    count = 1.e-20
    disocount = 1.e-20
    ordercount = 1.e-20
    longcount = 1.e-20
    mixcount = 0
    separation = 0.0
    numbins = 20

    # We actually divide the analysis into three groups (ordered, disordered and mixed)
    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        # only look at contacts within given range
        # default: take full sequence range into account
        if c_x < start or c_x >= end:
            continue
        if c_y < start or c_y >= end:
            continue

        pos_diff = abs(c_x - c_y)
        if pos_diff < 5:
            tooclose.append(score)
            continue
        if not score > th:
            continue

        contacts_x.append(c_x - start)
        contacts_y.append(c_y - start)

        x_disordered = disorder[c_x] > 0.5
        y_disordered = disorder[c_y] > 0.5
        if x_disordered and y_disordered:
            disocount += 1
        elif x_disordered or y_disordered:
            mixcount += 1
        else:
            ordercount += 1

        count += 1
        scores.append(score)
        separation += pos_diff
        if pos_diff > 24:
            longcount += 1

    statline = "NumAli: %d %d %d Length: %d %d %d Counts: %d %d %d %d RelContacts: %.3f %.3f %.3f Disorder: %.3f Long: %.3f %.3f %.3f \n" % (
        max_cover, cover_order, cover_disorder,
        ref_len, num_order, num_disorder,
        count, ordercount, mixcount, disocount,
        count / (ref_len + 1.e-20),
        ordercount / (1.e-20 + num_order),
        disocount / (1.e-20 + num_disorder),
        fraction_disorder,
        longcount / count,
        longcount / (ref_len + 1.e-20),
        separation / count)

    ### score histogram
    statfig = plt.figure(figsize=(8, 8), dpi=96, facecolor='w')
    plt.hist((tooclose, scores), numbins, range=(0, 1), histtype='bar', normed=(numbins, numbins), alpha=0.75, label=['Too_Close', 'Contacts'])
    plt.xlabel('Score')
    plt.ylabel('Normalized count')
    # the statistics line is the second title row (was the undefined "line")
    statfig.suptitle('%s\n%s\n' % (c_filename, statline))

    ### start plotting
    fig = plt.figure(figsize=(8, 8), dpi=96, facecolor='w')
    ax = fig.add_subplot(111)
    ax.set_adjustable('box-forced')
    ax.tick_params(direction='out', right='off', top='off')
    ax.set_xlim([-unit, ref_len])
    ax.set_ylim([-unit, ref_len])

    positions = [0] + list(range(ref_len)) + [ref_len - 1]

    ### plot alignment coverage if alignment given (only on Y-axis)
    if ali_filename:
        # adjust overall canvas
        ax = plt.subplot2grid((8, 8), (1, 1), colspan=7, rowspan=7)
        ax.autoscale(False)
        ax.tick_params(direction='out', labelleft='off', right='off', top='off')
        ax.set_xlim([-unit, ref_len])
        ax.set_ylim([-unit, ref_len])

        ax2 = plt.subplot2grid((8, 8), (1, 0), rowspan=7, sharey=ax)
        ax2.autoscale(False)
        ax2.plot([0] + coverage_lst + [0], positions, 'k', lw=0)
        ax2.axvline(x=max_cover * 0.25, lw=0.5, c='black', ls=':')
        ax2.axvline(x=max_cover * 0.5, lw=0.5, c='black', ls=':')
        ax2.axvline(x=max_cover * 0.75, lw=0.5, c='black', ls=':')
        ax2.fill([0] + coverage_lst + [0], positions, facecolor='gray', lw=0, alpha=0.5)
        ax2.set_xticks([0, max_cover])
        ax2.tick_params(axis='x', top='off', direction='out')
        ax2.invert_xaxis()
        ax2.grid()
        ax2.set_ylim([-unit, ref_len])
        ax2.tick_params(axis='y', right='off', direction='out', left='on')

    ax.get_xaxis().tick_top()
    ax.get_yaxis().tick_left()

    ### plot disorder profile above the map if given
    if iupred_fname:
        ax = plt.subplot2grid((8, 8), (1, 1), colspan=7, rowspan=7)
        ax.autoscale(False)
        ax.tick_params(direction='out', labelleft='off', right='off', top='off')
        ax.set_xlim([-unit, ref_len])
        ax.set_ylim([-unit, ref_len])

        ax3 = plt.subplot2grid((8, 8), (0, 1), colspan=7, sharex=ax)
        ax3.set_adjustable('box-forced')
        ax3.set_autoscale_on(False)
        ax3.autoscale(False)
        ax3.plot(positions, [0] + disorder + [0], 'b', lw=2)
        ax3.axhline(y=0.5, lw=0.5, c='black', ls=':')
        ax3.xaxis.tick_top()
        ax3.set_yticks([0, 1])
        ax3.tick_params(labelbottom='off')
        ax3.spines['top'].set_visible(False)
        ax3.spines['right'].set_visible(False)
        ax3.grid()
        ax3.set_xlim([-unit, ref_len])

    print("STATs: %s %s" % (c_filename, statline))

    ### plot secondary structure along axis if given
    if psipred_horiz_fname or psipred_vert_fname:
        if psipred_horiz_fname:
            ss = _parse_file(parse_psipred.horizontal, psipred_horiz_fname)
        else:
            ss = _parse_file(parse_psipred.vertical, psipred_vert_fname)
        ss = ss[start:end]
        assert len(ss) == ref_len
        ax.axhline(y=0, lw=1, c='black')
        ax.axvline(x=0, lw=1, c='black')
        ss_colors = {'H': '#8B0043', 'E': '#0080AD'}
        for i, state in enumerate(ss):
            color = ss_colors.get(state)
            if color is None:
                continue
            ax.add_patch(plt.Rectangle((-unit, i - 0.5), unit, 1, edgecolor=color, facecolor=color))
            ax.add_patch(plt.Rectangle((i - 0.5, -unit), 1, unit, edgecolor=color, facecolor=color))

    ### plot reference contacts in the background if given
    if pdb_filename:
        chain = '*'  # We try to get all chains...
        res_lst = _parse_file(parse_pdb.get_coordinates, pdb_filename, chain)
        cb_lst = _parse_file(parse_pdb.get_cb_coordinates, pdb_filename, chain)
        atom_seq = _parse_file(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -10.5, -10.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]
        print(atom_seq_ali)
        print(seq_ali)

        # Project the PDB coordinates onto the FASTA sequence: positions
        # missing from the structure become '-', structure residues that are
        # missing from the sequence are skipped.
        gapped_res_lst = []
        gapped_cb_lst = []
        j = 0
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            cutoff = 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            cutoff = 8
        ref_contact_map = dist_mat < cutoff

        PPVs, TPs, FPs, orderPPVs, orderTPs, orderFPs, mixPPVs, mixTPs, mixFPs, disoPPVs, disoTPs, disoFPs = get_ppvs(
            contacts_x, contacts_y, ref_contact_map, atom_seq_ali, ref_len, factor, disorder)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map, atom_seq_ali)
        img = get_colors(contacts_np, ref_contact_map=dist_mat, atom_seq_ali=atom_seq_ali, th=th, factor=factor)
        ax.imshow(img, interpolation='none')

        # Kept from the original: changes how masked values are drawn for
        # the "binary" colormap; the colormap is not used again here.
        cmap = cm.get_cmap("binary")
        cmap.set_bad([1, 1, 1, 0])

    ### plot predicted contacts from second contact map if given
    if c2_filename:
        # NOTE(review): the second map is neither trimmed to start/end nor
        # filtered by th -- confirm that this is intended.
        contacts2 = _parse_file(parse_contacts.parse, c2_filename, sep)
        contacts2_x, contacts2_y, _ = _select_top_contacts(contacts2, ref_len * factor)

        ### use TP/FP color coding if reference contacts given
        if pdb_filename:
            # Evaluate the *second* map; only the overall values are needed.
            PPVs2, TPs2, FPs2 = get_ppvs(
                contacts2_x, contacts2_y, ref_contact_map, atom_seq_ali, ref_len, factor, disorder)[:3]
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y, ref_contact_map, atom_seq_ali)
            print('%s %s %s %s' % (acc, PPVs2[-1], TPs2[-1], FPs2[-1]))
            fig.suptitle('%s\nPPV (upper left) = %.2f | PPV (lower right) = %.2f' % (acc, PPVs[-1], PPVs2[-1]))
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o', c=tp2_colors[::-1], s=6, alpha=0.75, lw=0)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c=tp_colors[::-1], s=6, alpha=0.75, lw=0)
        else:
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o', c='#D70909', edgecolor='#D70909', s=6, linewidths=0.5)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c='#004F9D', edgecolor='#004F9D', s=6, linewidths=0.5)
    else:
        if pdb_filename:
            pdb_acc = _parse_file(parse_pdb.get_acc, pdb_filename)
            if pdb_acc:
                if chain:
                    fig.suptitle('%s (PDB: %s, chain %s)\nPPV = %.2f\n%s' % (c_filename, pdb_acc, chain, PPVs[-1], statline), fontsize=8)
                else:
                    fig.suptitle('%s (PDB: %s)\nPPV = %.2f \n%s' % (c_filename, pdb_acc, PPVs[-1], statline), fontsize=8)
            else:
                fig.suptitle('%s\nPPV = %.2f\n%s' % (c_filename, PPVs[-1], statline), fontsize=8)
        else:
            fig.suptitle('%s\n%s' % (c_filename, statline), fontsize=8)
            img = get_colors(contacts_np, th=th, factor=factor)
            ax.imshow(img, interpolation='none')

    ax.grid()
    ax.set_xlim([-unit, ref_len])
    ax.set_ylim([-unit, ref_len])
    ax.axis([-unit, ref_len, -unit, ref_len])
    ax.set_autoscale_on(False)

    ### write output
    if outfilename:
        if outfilename.endswith('.pdf'):
            _save_pdf(fig, outfilename)
            _save_pdf(statfig, outfilename + "_statistics.pdf")
        elif outfilename.endswith('.png'):
            # only the contact map is written in PNG mode
            fig.savefig(outfilename)
        else:
            _save_pdf(fig, '%s.pdf' % outfilename)
            _save_pdf(statfig, outfilename + "_statistics.pdf")
    else:
        _save_pdf(fig, '%s_ContactMap.pdf' % c_filename)
        _save_pdf(statfig, '%s_statistics.pdf' % c_filename)
def _parse_file(parser, path, *args):
    """Run ``parser(handle, *args)`` on *path* and close the file afterwards.

    The parsers used in this file return results that are indexed or measured
    with ``len`` right away, so the handle is closed once the parser returns.
    """
    with open(path, 'r') as handle:
        return parser(handle, *args)


def _select_top_contacts(contacts, limit, min_sep=5):
    """Collect the leading contacts whose residues are at least *min_sep* apart.

    Each entry of *contacts* is read as ``(score, i, j)`` with 1-based
    positions; the returned coordinates are 0-based.  Scanning stops as soon
    as *limit* contacts have been kept.

    Returns a ``(xs, ys, scores)`` tuple of parallel lists.
    """
    xs, ys, scores = [], [], []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_sep:
            xs.append(c_x)
            ys.append(c_y)
            scores.append(contact[0])
        if len(xs) >= limit:
            break
    return xs, ys, scores


def plot_map(acc, fasta_filename, contact_filename, psipred_filename, sep=',', pdb_filename='', chain='A', ppv_dir='/bubo/home/h9/mircomic/glob/2013-05-29_human_repeats/pconsc_predictions'):
    """Plot a score-coloured contact map over secondary-structure boxes.

    The map is written to ``'<contact_filename>_ContactMap.pdf'``.  When a
    PDB file is given, overall / intra-domain / inter-domain PPVs are
    computed, appended to ``PPV.txt``, ``intra_PPV.txt`` and
    ``inter_PPV.txt`` inside *ppv_dir*, and the PPV curve is written to
    ``'<contact_filename>_PPVs.pdf'``.

    Parameters:
        acc: label written to the PPV files and used in the title.
        fasta_filename: FASTA file; the sequence is taken from the first
            record returned by ``parse_fasta.read_fasta``.
        contact_filename: contact file parsed with ``parse_contacts.parse``.
        psipred_filename: NOTE(review): this argument is ignored -- the
            PSIPRED file name is always derived from *fasta_filename*
            (extension replaced by ``.horiz``); confirm whether intended.
        sep: column separator handed to ``parse_contacts.parse``.
        pdb_filename: optional reference structure (blank means none).
        chain: chain identifier handed to the ``parse_pdb`` functions.
        ppv_dir: directory receiving the PPV log files; defaults to the
            location that used to be hard-coded.
    """
    import os

    pdb_flag = pdb_filename.strip() != ''

    psipred_filename = '%s.horiz' % '.'.join(fasta_filename.split('.')[:-1])
    ss = _parse_file(parse_psipred.horizontal, psipred_filename)
    ss_lst = get_ss_pos(ss)
    print(ss)
    print(ss_lst)

    fasta_records = _parse_file(parse_fasta.read_fasta, fasta_filename)
    # list(...) keeps the lookup valid on both Python 2 and Python 3
    seq = list(fasta_records.values())[0][0]
    ref_len = len(seq)

    if pdb_flag:
        res_lst = _parse_file(parse_pdb.get_coordinates, pdb_filename, chain)
        atom_seq = _parse_file(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]
        print(atom_seq_ali)
        print(seq_ali)

        # Project the PDB coordinates onto the FASTA sequence: positions
        # missing from the structure become '-', structure residues that are
        # missing from the sequence are skipped.
        gapped_res_lst = []
        j = 0
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                j += 1

        print(len(gapped_res_lst))
        print(len(res_lst))
        print(len(atom_seq))
        print(len(seq))

        dist_mat = get_heavy_contacts(gapped_res_lst)
        heavy_cutoff = 5
        ref_contact_map = dist_mat < heavy_cutoff
        ref_contacts_x, ref_contacts_y = np.where(ref_contact_map)[:2]

    contacts = _parse_file(parse_contacts.parse, contact_filename, sep)
    contacts_x, contacts_y, scores = _select_top_contacts(contacts, ref_len * 1.0)

    if pdb_flag:
        PPVs = []
        inter_PPVs = []
        intra_PPVs = []
        TPs = []
        FPs = []

        TP = FP = 0.0
        inter_TP = inter_FP = 0.0
        intra_TP = intra_FP = 0.0

        # Running counts: after contact idx has been added, the values
        # describe the top (idx + 1) contacts.  As before, the curve covers
        # 1 .. n-1 contacts.
        num_eval = min(len(contacts_x), ref_len)
        for idx in range(num_eval - 1):
            c_x = contacts_x[idx]
            c_y = contacts_y[idx]

            # contacts touching residues missing from the structure are ignored
            if atom_seq_ali[c_x] != '-' and atom_seq_ali[c_y] != '-':
                # NOTE(review): dom_lst is not defined in this function; the
                # code that used to build it is disabled, so it has to exist
                # at module level for this branch to run -- confirm.
                c_x_dom = in_dom(c_x, dom_lst)
                c_y_dom = in_dom(c_y, dom_lst)
                both_in_dom = c_x_dom != 0 and c_y_dom != 0
                is_inter = both_in_dom and c_x_dom != c_y_dom
                is_intra = both_in_dom and c_x_dom == c_y_dom

                if ref_contact_map[c_x, c_y] > 0:
                    TP += 1.0
                    if is_inter:
                        inter_TP += 1.0
                    if is_intra:
                        intra_TP += 1.0
                else:
                    FP += 1.0
                    if is_inter:
                        inter_FP += 1.0
                    if is_intra:
                        intra_FP += 1.0

            # "or" (not "and"): a PPV of 0.0 or 1.0 has to be recorded, too
            if TP > 0 or FP > 0:
                PPVs.append(TP / (TP + FP))
            if inter_TP > 0 or inter_FP > 0:
                inter_PPVs.append(inter_TP / (inter_TP + inter_FP))
            if intra_TP > 0 or intra_FP > 0:
                intra_PPVs.append(intra_TP / (intra_TP + intra_FP))
            TPs.append(TP / ref_len)
            FPs.append(FP / ref_len)

        print(len(PPVs))
        for values in (PPVs, inter_PPVs, intra_PPVs):
            if not values:
                # nothing could be evaluated: report 0.0
                values.append(0.0)
            print(values[-1])

        for fname, values in (('PPV.txt', PPVs),
                              ('intra_PPV.txt', intra_PPVs),
                              ('inter_PPV.txt', inter_PPVs)):
            with open(os.path.join(ppv_dir, fname), 'a') as ppv_file:
                ppv_file.write('%s\t%s\n' % (acc, values[-1]))

    fig = plt.figure()
    ax = fig.add_subplot(111)

    ### secondary structure markers on the diagonal
    for i, state in enumerate(ss):
        if state == 'H':
            ax.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2)
        elif state == 'E':
            ax.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2)

    ### boxes for every pair of secondary structure elements
    ss_col_dict = {'H': '#8B0043', 'E': '#0080AD', 'HE': '#3F467D'}
    for ss_elem in ss_lst:
        start = ss_elem[0] - 1
        end = ss_elem[1] - 1
        elem_len = end - start
        curr_ss = ss[start]
        ax.add_patch(plt.Rectangle((start, start), elem_len, elem_len, facecolor=ss_col_dict[curr_ss], edgecolor='black', lw=0.5, zorder=0, alpha=0.5))
        for ss_elem2 in ss_lst:
            if ss_elem2 == ss_elem:
                continue
            start2 = ss_elem2[0] - 1
            end2 = ss_elem2[1] - 1
            elem_len2 = end2 - start2
            curr_ss2 = ss[start2]
            if curr_ss != curr_ss2:
                # interaction between sheet and helix
                curr_ss2 = 'HE'
            ax.add_patch(plt.Rectangle((start, start2), elem_len, elem_len2, facecolor=ss_col_dict[curr_ss2], edgecolor='black', lw=0.5, zorder=0, alpha=0.2))

    if pdb_flag:
        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#CCCCCC', lw=0)
        fig.suptitle('%s\nPPV = %.2f intra-PPV = %.2f inter-PPV = %.2f' % (contact_filename, PPVs[-1], intra_PPVs[-1], inter_PPVs[-1]))
    else:
        fig.suptitle('%s - %s' % (acc, contact_filename))

    # reversed order: the first (top) entries are drawn last, on top
    sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet, linewidths=0.5)
    ax.set_xlim([0, ref_len])
    ax.set_ylim([0, ref_len])
    plt.colorbar(sc)

    pp = PdfPages('%s_ContactMap.pdf' % contact_filename)
    pp.savefig(fig)
    pp.close()

    if pdb_flag:
        fig.clf()
        ax2 = fig.add_subplot(111)
        ax2.plot(PPVs)
        pp = PdfPages('%s_PPVs.pdf' % contact_filename)
        pp.savefig(fig)
        pp.close()
def plot_map(fasta_filename, c_filename, factor=1.0, th=-1, f_obs=-1,
             c2_filename='', psipred_horiz_fname='', psipred_vert_fname='',
             pdb_filename='', is_heavy=False, chain='', sep=',',
             outfilename='', ali_filename='', meff_filename='', name='',
             start=0, end=-1, pdb_start=0, pdb_end=-1, noalign=False,
             pdb_alignment='', pdb_id='', binary=False):
    """Plots a predicted contact map, optionally against a reference structure

    Contacts are read from c_filename in file order and pairs less than 5
    positions apart are ignored. The selection stops according to exactly one
    of three criteria: top "factor" * sequence length contacts (default), all
    contacts scoring at least "th", or the top "f_obs" fraction of the
    contacts observed in the reference structure.

    @param  fasta_filename      FASTA file; the first sequence entry is used
    @param  c_filename          predicted contact file, parsed with "sep"
    @param  factor              number of contacts as multiple of the
                                (trimmed) sequence length; only used if
                                th == -1 and f_obs == -1
    @param  th                  score threshold, -1 to disable
    @param  f_obs               fraction of observed reference contacts to
                                take, -1 to disable; only applied if
                                pdb_filename is given
    @param  c2_filename         optional second contact file, drawn with
                                swapped axes
    @param  psipred_horiz_fname PSIPRED horizontal file (takes precedence)
    @param  psipred_vert_fname  PSIPRED vertical file
    @param  pdb_filename        reference structure
    @param  is_heavy            use heavy atom distances (cutoff 5) instead
                                of CB distances (cutoff 8); ignored if
                                noalign is set
    @param  chain               chain identifier passed to the PDB parser
    @param  sep                 column separator of c_filename
    @param  outfilename         output file; ".pdf", ".eps" and ".png" are
                                recognised, anything else gets ".pdf"
                                appended; empty: "<c_filename>_ContactMap.pdf"
    @param  ali_filename        alignment used for the coverage panels
    @param  meff_filename       Meff file used for the coverage panels if no
                                alignment is given
    @param  name                title label; default: basename of
                                fasta_filename without its last extension
    @param  start, end          0-based, end-exclusive window on the
                                sequence; end == -1 means up to the end
    @param  pdb_start, pdb_end  same window on the PDB residue list
    @param  noalign             skip the sequence/structure alignment and use
                                the PDB residues as they are
    @param  pdb_alignment       a3m alignment holding the PDB sequence
    @param  pdb_id              identifier of the PDB entry in pdb_alignment
    @param  binary              passed through to get_colors
    Ensures: the figure is written to disk, nothing is returned
    """
    if name == '':
        acc = '.'.join(os.path.basename(fasta_filename).split('.')[:-1])
    else:
        acc = name

    ### get sequence
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    full_len = len(seq)

    ### trim sequence according to given positions
    ### default: take full sequence
    if end == -1:
        end = full_len
    seq = seq[start:end]
    ref_len = len(seq)
    unit = (ref_len / 50.0)

    ### get top "factor" * "ref_len" predicted contacts
    with open(c_filename, 'r') as c_file:
        contacts = parse_contacts.parse(c_file, sep)
    # The matrix is cut down to [start:end] right below, so it has to be
    # built for the untrimmed sequence length.
    contacts_np = parse_contacts.get_numpy_cmap(contacts, seq_len=full_len)
    contacts_np = contacts_np[start:end, start:end]

    contacts_x = []
    contacts_y = []
    scores = []
    count = 0
    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        # only look at contacts within given range
        # default: take full sequence range into account
        if c_x < start or c_x >= end:
            continue
        if c_y < start or c_y >= end:
            continue

        if abs(c_x - c_y) >= 5:
            contacts_x.append(c_x - start)
            contacts_y.append(c_y - start)
            scores.append(score)
            count += 1

        # cutoff by number of contacts: remember the score reached as "th"
        if count >= ref_len * factor and th == -1 and f_obs == -1:
            th = score
            break

        # cutoff by score: remember the resulting "factor"
        if score < th and not th == -1 and f_obs == -1:
            factor = count / float(ref_len)
            break

    # if cutoff by fraction of observed contacts:
    # all contacts are kept here and the list is cut after reading the pdb

    ### start plotting
    fig = plt.figure(figsize=(8, 8), dpi=96, facecolor='w')
    ax = fig.add_subplot(111)
    # NOTE(review): 'box-forced' and the 'on'/'off' strings below are legacy
    # matplotlib spellings -- confirm the targeted matplotlib version before
    # upgrading.
    ax.set_adjustable('box-forced')
    ax.tick_params(direction='out', right='off', top='off')
    ax.set_xlim([-unit, ref_len])
    ax.set_ylim([-unit, ref_len])

    ### plot alignment coverage if alignment given
    if ali_filename or meff_filename:
        # adjust overall canvas
        ax = plt.subplot2grid((8, 8), (1, 1), colspan=7, rowspan=7)
        ax.autoscale(False)
        ax.tick_params(direction='out', labelleft='off', right='off',
                       top='off')
        ax.set_xlim([-unit, ref_len])
        ax.set_ylim([-unit, ref_len])

        if ali_filename:
            coverage_lst, max_cover = get_ali_coverage(ali_filename)
        else:
            coverage_lst, max_cover = get_meff_coverage(meff_filename)
        coverage_lst = coverage_lst[start:end]

        # closed outline of the coverage profile, shared by both panels
        profile = [0] + coverage_lst + [0]
        positions = [0] + list(range(ref_len)) + [ref_len - 1]
        quartiles = (max_cover * 0.25, max_cover * 0.5, max_cover * 0.75)

        # left panel: coverage along the y axis
        ax2 = plt.subplot2grid((8, 8), (1, 0), rowspan=7, sharey=ax)
        ax2.autoscale(False)
        ax2.plot(profile, positions, 'k', lw=0)
        for level in quartiles:
            ax2.axvline(x=level, lw=0.5, c='black', ls=':')
        ax2.fill(profile, positions, facecolor='gray', lw=0, alpha=0.5)
        ax2.set_xticks([0, max_cover])
        ax2.tick_params(axis='x', top='off', direction='out')
        ax2.invert_xaxis()
        ax2.grid()
        ax2.set_ylim([-unit, ref_len])

        # top panel: coverage along the x axis
        ax3 = plt.subplot2grid((8, 8), (0, 1), colspan=7, sharex=ax)
        ax3.autoscale(False)
        ax3.plot(positions, profile, 'k', lw=0)
        for level in quartiles:
            ax3.axhline(y=level, lw=0.5, c='black', ls=':')
        ax3.fill(positions, profile, facecolor='gray', lw=0, alpha=0.5)
        ax3.set_yticks([0, max_cover])
        ax3.tick_params(labelbottom='off')
        ax2.tick_params(axis='y', right='off', direction='out', left='on')
        ax3.grid()
        ax3.set_xlim([-unit, ref_len])

    ### plot secondary structure along axis if given
    if psipred_horiz_fname or psipred_vert_fname:
        if psipred_horiz_fname:
            with open(psipred_horiz_fname, 'r') as ss_file:
                ss = parse_psipred.horizontal(ss_file)
        else:
            with open(psipred_vert_fname, 'r') as ss_file:
                ss = parse_psipred.vertical(ss_file)

        ss = ss[start:end]
        assert len(ss) == ref_len

        ax.axhline(y=0, lw=1, c='black')
        ax.axvline(x=0, lw=1, c='black')
        # only helix and strand are drawn, any other state is left blank
        ss_colors = {'H': '#8B0043', 'E': '#0080AD'}
        for i, state in enumerate(ss):
            color = ss_colors.get(state)
            if color is None:
                continue
            ax.add_patch(plt.Rectangle((-unit, i - 0.5), unit, 1,
                                       edgecolor=color, facecolor=color))
            ax.add_patch(plt.Rectangle((i - 0.5, -unit), 1, unit,
                                       edgecolor=color, facecolor=color))

    ### plot reference contacts in the background if given
    if pdb_filename:
        with open(pdb_filename, 'r') as pdb_file:
            res_lst = parse_pdb.get_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        # if not true there is some serious problem with the provided pdb file
        assert len(res_lst) == len(cb_lst) == len(atom_seq)

        ### trim PDB sequence according to given positions
        ### default: take full sequence
        if pdb_end == -1:
            pdb_end = len(res_lst)
        res_lst = res_lst[pdb_start:pdb_end]
        cb_lst = cb_lst[pdb_start:pdb_end]
        atom_seq = atom_seq[pdb_start:pdb_end]

        if noalign:
            dist_mat = get_cb_contacts(cb_lst)
            cutoff = 8
            # without an alignment the helpers are called with their defaults
            ali_kwargs = {}
        else:
            if pdb_alignment and pdb_id:
                seq_ali, atom_seq_ali = parse_a3m.get_pairwise(
                    pdb_alignment, pdb_id)
                seqres_seq = atom_seq_ali.replace('-', '')
                # align the ATOM record sequence to the sequence found in the
                # given alignment and merge both alignments
                align_seqres = pairwise2.align.globalms(
                    atom_seq, seqres_seq, 2, -1, -0.5, -0.1)
                atom_seq_ali0 = align_seqres[-1][0]
                seqres_seq_ali0 = align_seqres[-1][1]
                atom_seq_ali, seq_ali = embedd_alignment(
                    atom_seq_ali0, seqres_seq_ali0, atom_seq_ali, seq_ali)
            else:
                align = pairwise2.align.globalds(
                    atom_seq, seq, matlist.blosum62, -25, -1)
                atom_seq_ali = align[-1][0]
                seq_ali = align[-1][1]

            # bring the PDB residues into the coordinate system of "seq":
            # '-' marks positions without coordinates, residues that are
            # missing in "seq" are dropped
            j = 0
            gapped_res_lst = []
            gapped_cb_lst = []
            for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
                if atom_res == '-':
                    if seq_res == '-':
                        continue
                    gapped_res_lst.append('-')
                    gapped_cb_lst.append('-')
                elif seq_res == '-':
                    j += 1
                else:
                    gapped_res_lst.append(res_lst[j])
                    gapped_cb_lst.append(cb_lst[j])
                    j += 1

            if is_heavy:
                dist_mat = get_heavy_contacts(gapped_res_lst)
                cutoff = 5
            else:
                dist_mat = get_cb_contacts(gapped_cb_lst)
                cutoff = 8
            ali_kwargs = {'atom_seq_ali': atom_seq_ali}

        ref_contact_map = dist_mat < cutoff
        ref_contacts = np.where(ref_contact_map)
        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        # if f_obs given, take top f_obs * num_obs contacts:
        if f_obs != -1:
            num_obs = int(np.sum(ref_contacts_x - ref_contacts_y >= 5))
            num_top = int(ceil(f_obs * num_obs))
            contacts_x = contacts_x[:num_top]
            contacts_y = contacts_y[:num_top]
            scores = scores[:num_top]
            # nothing selected: fall back to the given threshold
            th_obs = scores[-1] if scores else th
        else:
            th_obs = th

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  ref_len, factor, **ali_kwargs)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  **ali_kwargs)

        if not c2_filename:
            img = get_colors(contacts_np, ref_contact_map=dist_mat,
                             th=th_obs, binary=binary)
        else:
            # plot native contacts in background
            img = get_ref_img(dist_mat)
        ax.imshow(img, interpolation='none')

        print('%s %s %s %s' % (acc, PPVs[-1], TPs[-1], FPs[-1]))

    ### plot predicted contacts from second contact map if given
    if c2_filename:
        # NOTE(review): the second map is read with the parser's default
        # separator, not with "sep" -- confirm that this is intended.
        with open(c2_filename, 'r') as c2_file:
            contacts2 = parse_contacts.parse(c2_file)
        contacts2_x = []
        contacts2_y = []
        count = 0

        for contact in contacts2:
            score = contact[0]
            c_x = contact[1] - 1
            c_y = contact[2] - 1

            # same window as for the first map, coordinates are shifted by
            # "start" below
            if c_x < start or c_x >= end:
                continue
            if c_y < start or c_y >= end:
                continue

            if abs(c_x - c_y) >= 5:
                contacts2_x.append(c_x - start)
                contacts2_y.append(c_y - start)
                count += 1

            # "th" and "factor" carry over from the first map
            if count >= ref_len * factor and th == -1 and f_obs == -1:
                th = score
                break

            if score < th and not th == -1 and f_obs == -1:
                factor = count / float(ref_len)
                break

            # if cutoff by fraction of observed contacts:
            # stop at num_top = f_obs * num_obs
            if pdb_filename and f_obs != -1:
                if count >= num_top:
                    factor = count / float(ref_len)
                    th = score
                    break

        ### use TP/FP color coding if reference contacts given
        if pdb_filename:
            if noalign:
                tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                           ref_contact_map)
            else:
                tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                           ref_contact_map, atom_seq_ali)
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='s',
                       c=tp2_colors[::-1], s=4, alpha=1, lw=0,
                       edgecolor=tp2_colors[::-1])
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='s',
                       c=tp_colors[::-1], s=4, alpha=1, lw=0,
                       edgecolor=tp_colors[::-1])
        else:
            # marker was the digit '0' before, which matplotlib rejects
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                       c='#D70909', edgecolor='#D70909', s=6, linewidths=0.5)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                       c='#004F9D', edgecolor='#004F9D', s=6, linewidths=0.5)

    ### plot predicted contacts from first contact map on both triangles
    ### if no second contact map given
    else:
        if pdb_filename:
            with open(pdb_filename) as pdb_file:
                pdb_acc = parse_pdb.get_acc(pdb_file)
            if pdb_acc:
                if chain:
                    fig.suptitle('%s (PDB: %s, chain %s)\nPPV = %.2f' %
                                 (acc, pdb_acc, chain, PPVs[-1]))
                else:
                    fig.suptitle('%s (PDB: %s)\nPPV = %.2f' %
                                 (acc, pdb_acc, PPVs[-1]))
            else:
                fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
        else:
            fig.suptitle('%s' % acc)
            img = get_colors(contacts_np, th=th)
            ax.imshow(img, interpolation='none')

    ax.grid()
    ax.set_xlim([-unit, ref_len])
    ax.set_ylim([-unit, ref_len])
    ax.axis([-unit, ref_len, -unit, ref_len])
    ax.set_autoscale_on(False)

    ### write figure, format is chosen by file extension
    if outfilename:
        if outfilename.endswith('.pdf'):
            pp = PdfPages(outfilename)
            pp.savefig(fig)
            pp.close()
        elif outfilename.endswith('.eps'):
            plt.savefig(outfilename, format='eps', dpi=300)
        elif outfilename.endswith('.png'):
            plt.savefig(outfilename)
        else:
            pp = PdfPages('%s.pdf' % outfilename)
            pp.savefig(fig)
            pp.close()
    else:
        pp = PdfPages('%s_ContactMap.pdf' % c_filename)
        pp.savefig(fig)
        pp.close()
return c_filt  # NOTE(review): presumably the last statement of the preceding filter function, whose start lies outside this chunk


def cysteine_filter(c_lst, seq):
    """Filters contacts from cysteines

    NOTE: not implemented yet. The body ignores both arguments and returns
    an empty list, so the "Ensures" clause below describes the intended
    contract and not the present behaviour.

    @param  c_lst   contact list (as given by parsing/parse_contacts.py)
    @param  seq     string of one-letter coded amino acid sequence
    Ensures: len(c_lst) == len(c_filt), only contact weights are changed
    @return [(score, residue a, residue b)]
    """
    c_filt = []
    return c_filt


if __name__ == "__main__":
    # usage: <contact file> <PSIPRED horizontal file> <filtered output file>
    c_filename = sys.argv[1]
    psipred_filename = sys.argv[2]
    cfilt_filename = sys.argv[3]
    #seq_filename = sys.argv[3]

    # read the predicted contacts and the secondary structure string
    c_lst = parse_contacts.parse(open(c_filename, 'r'))
    ss_seq = parse_psipred.horizontal(open(psipred_filename, 'r'))

    # only the secondary structure filter is applied here
    c_filt = secstruct_filter(c_lst, ss_seq)

    # write the filtered contacts to the output file
    cfilt_file = open(cfilt_filename, 'w')
    parse_contacts.write(c_filt, cfilt_file)
    cfilt_file.close()