Пример #1
0
def _select_top_contacts(contacts, max_count, min_separation=5):
    """Return zero-based (x, y) index lists for the leading contacts.

    Each entry of ``contacts`` is indexed as ``(score, i, j)`` with
    one-based residue numbers ``i`` and ``j``.  Entries are taken in the
    order given (presumably sorted by decreasing score -- confirm against
    ``parse_contacts.parse``).  Pairs closer than ``min_separation`` in
    sequence are skipped.  Scanning stops once ``max_count`` pairs are kept.
    """
    xs = []
    ys = []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_separation:
            xs.append(c_x)
            ys.append(c_y)
        # The limit is tested after every entry, kept or skipped, so a
        # non-positive limit still inspects exactly one entry.
        if len(xs) >= max_count:
            break
    return xs, ys


def _gap_structure_lists(atom_seq_ali, seq_ali, res_lst, cb_lst):
    """Project per-residue structure data onto the aligned target sequence.

    Walks the two aligned strings column by column.  A gap in the structure
    sequence yields a ``'-'`` placeholder; a gap in the target sequence
    consumes one structure residue without emitting anything; otherwise the
    current structure residue is emitted.
    """
    gapped_res_lst = []
    gapped_cb_lst = []
    j = 0  # index into the ungapped structure lists
    for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
        if atom_res == '-':
            gapped_res_lst.append('-')
            gapped_cb_lst.append('-')
        elif seq_res == '-':
            j += 1
        else:
            gapped_res_lst.append(res_lst[j])
            gapped_cb_lst.append(cb_lst[j])
            j += 1
    return gapped_res_lst, gapped_cb_lst


def plot_map(fasta_filename,
             c_filename,
             factor,
             c2_filename='',
             psipred_filename='',
             pdb_filename='',
             is_heavy=False,
             chain='',
             sep=' '):
    """Plot predicted contacts as a contact map and save it as a PNG.

    The figure is written to ``'<c_filename>.cm.png'``.

    Parameters
    ----------
    fasta_filename : path of the FASTA file; the first item of the first
        value returned by ``parse_fasta.read_fasta`` is used as sequence.
    c_filename : path of the predicted contact file.
    factor : at most ``factor * len(sequence)`` contacts are plotted.
    c2_filename : optional second contact file.  When given, the title
        labels the first map "upper left" and the second "lower right";
        otherwise the first map is mirrored onto both triangles.
    psipred_filename : optional PSIPRED file; states 'H', 'E' and 'C' are
        drawn on the diagonal.
    pdb_filename : optional reference structure.  When given, reference
        contacts are drawn in the background, predicted contacts are
        coloured via ``get_tp_colors`` and the last PPV value is printed.
    is_heavy : use heavy-atom distances (cutoff 5) instead of CB distances
        (cutoff 8) for the reference contacts.
    chain : chain identifier forwarded to the ``parse_pdb`` helpers.
    sep : column separator forwarded to ``parse_contacts.parse``.

    All input files are opened in ``with`` blocks so they are closed as
    soon as they have been parsed.
    """
    # Accession label: everything before the first '.' of the contact file.
    acc = c_filename.split('.')[0]

    ### get sequence
    with open(fasta_filename, 'r') as fasta_file:
        # list(...) keeps this working where dict.values() is a view.
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)
    max_contacts = ref_len * factor

    ### get top "factor" * "ref_len" predicted contacts
    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep)
    contacts_x, contacts_y = _select_top_contacts(contacts, max_contacts)

    ### start plotting
    fig = plt.figure(figsize=(8, 8), dpi=100)
    ax = fig.add_subplot(111)
    ax.set_xlim(xmin=-1)
    ax.set_ylim(ymin=-1)

    ### plot secondary structure on the diagonal if given
    if psipred_filename:
        with open(psipred_filename, 'r') as psipred_file:
            ss = parse_psipred.horizontal(psipred_file)
        for i, state in enumerate(ss):
            if state == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043")
            elif state == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD")
            elif state == 'C':
                plt.plot(i, i, 'D', c='#CCCCCC', mec="#CCCCCC", markersize=4)

    ### plot reference contacts in the background if given
    if pdb_filename:
        with open(pdb_filename, 'r') as pdb_file:
            res_lst = parse_pdb.get_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        # Align the structure sequence to the target; the last alignment
        # returned is the one used.
        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        gapped_res_lst, gapped_cb_lst = _gap_structure_lists(
            atom_seq_ali, seq_ali, res_lst, cb_lst)

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            dist_cutoff = 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            dist_cutoff = 8
        ref_contact_map = dist_mat < dist_cutoff
        ref_contacts = np.where(ref_contact_map)
        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        PPVs = get_ppvs(contacts_x, contacts_y, ref_contact_map, atom_seq_ali,
                        ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)

        print('%s\t%s' % (acc, PPVs[-1]))

        ax.scatter(ref_contacts_x,
                   ref_contacts_y,
                   marker='o',
                   c='#DDDDDD',
                   lw=0)

    # Point lists are reversed before plotting so the earliest (top) entries
    # are drawn last and end up on top.
    if c2_filename:
        ### plot predicted contacts from second contact map if given
        with open(c2_filename, 'r') as contact_file2:
            contacts2 = parse_contacts.parse(contact_file2, sep)
        contacts2_x, contacts2_y = _select_top_contacts(contacts2,
                                                        max_contacts)

        if pdb_filename:
            ### use TP/FP color coding if reference contacts given
            PPVs2 = get_ppvs(contacts2_x, contacts2_y, ref_contact_map,
                             atom_seq_ali, ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print('%s\t%s' % (acc, PPVs2[-1]))
            fig.suptitle(
                '%s\n%s (upper left) PPV = %.2f | %s (lower right) PPV = %.2f'
                % (acc, c_filename, PPVs[-1], c2_filename, PPVs2[-1]))
            ax.scatter(contacts2_y[::-1],
                       contacts2_x[::-1],
                       marker='o',
                       c=tp2_colors[::-1],
                       linewidths=0.0)
            ax.scatter(contacts_x[::-1],
                       contacts_y[::-1],
                       marker='o',
                       c=tp_colors[::-1],
                       linewidths=0.0)
        else:
            fig.suptitle('%s\n%s (upper left) | %s (lower right)' %
                         (acc, c_filename, c2_filename))
            ax.scatter(contacts2_y[::-1],
                       contacts2_x[::-1],
                       marker='o',
                       c='#D70909',
                       edgecolor='#D70909',
                       s=8,
                       linewidths=0.5)
            ax.scatter(contacts_x[::-1],
                       contacts_y[::-1],
                       marker='o',
                       c='#004F9D',
                       edgecolor='#004F9D',
                       s=8,
                       linewidths=0.5)
    elif pdb_filename:
        ### first contact map on both triangles, TP/FP colour coded
        fig.suptitle('%s (%s)\nPPV = %.2f' % (acc, c_filename, PPVs[-1]))
        ax.scatter(contacts_x[::-1],
                   contacts_y[::-1],
                   marker='o',
                   c=tp_colors[::-1],
                   linewidths=0.0)
        ax.scatter(contacts_y[::-1],
                   contacts_x[::-1],
                   marker='o',
                   c=tp_colors[::-1],
                   linewidths=0.0)
    else:
        ### first contact map on both triangles, single colour, no title
        ax.scatter(contacts_x[::-1],
                   contacts_y[::-1],
                   marker='o',
                   c='#004F9D',
                   edgecolor='#004F9D',
                   s=8,
                   linewidths=0.5)
        ax.scatter(contacts_y[::-1],
                   contacts_x[::-1],
                   marker='o',
                   c='#004F9D',
                   edgecolor='#004F9D',
                   s=8,
                   linewidths=0.5)

    plt.gca().set_xlim([-1, ref_len])
    plt.gca().set_ylim([-1, ref_len])

    plt.savefig('%s.cm.png' % c_filename, bbox_inches=0)
def contactanalysis(fasta_filename,
                    c_filename,
                    factor=1.0,
                    cutoff=9999.99,
                    th=-1,
                    c2_filename='',
                    psipred_horiz_fname='',
                    psipred_vert_fname='',
                    iupred_fname='',
                    pdb_filename='',
                    is_heavy=False,
                    chain='',
                    sep=',',
                    outfilename='',
                    ali_filename='',
                    meff_filename='',
                    name='',
                    start=0,
                    end=-1):
    """Print contact statistics and save a score histogram.

    Contacts at sequence separation >= 5 whose score exceeds ``cutoff`` are
    counted overall and per class: "diso" (both residues have disorder
    > 0.5), "mix" (exactly one does) and the remainder.  Scores of
    contacts at separation < 5 go into the "Too_Close" histogram series.
    A tab-separated ``STATs:`` line is printed and the histogram figure is
    saved.

    Parameters
    ----------
    fasta_filename : FASTA file providing the target sequence.
    c_filename : predicted contact file.
    factor : only the first ``factor * len(sequence)`` contacts of each
        class contribute to that class's average score.
    cutoff : score threshold a contact must exceed to be counted.
        NOTE(review): the default 9999.99 rejects every contact if scores
        lie in [0, 1] (the histogram range) -- confirm whether ``th`` was
        meant here.
    th, c2_filename, psipred_horiz_fname, psipred_vert_fname, name :
        accepted for interface compatibility; not used by this function.
    iupred_fname : optional IUPred file with per-residue disorder values;
        without it all residues count as ordered.
    pdb_filename : optional reference structure; when given, PPV values
        from ``get_ppvs`` are appended to the printed line.
    is_heavy : use heavy-atom distances (cutoff 5) instead of CB distances
        (cutoff 8) for the reference contacts.
    chain : ignored; the structure is always read with chain ``'*'``.
    sep : column separator forwarded to ``parse_contacts.parse``.
    outfilename : output base name.  A name ending in ``.png`` yields
        ``<outfilename>_statistics.png``; anything else yields
        ``<outfilename>_statistics.pdf``.  When empty,
        ``<c_filename>_statistics.pdf`` is written.
    ali_filename, meff_filename : optional coverage sources; the maximum
        coverage is reported in the statistics line.
    start, end : zero-based half-open residue range to analyse; ``end=-1``
        means "to the end of the sequence".

    Changes from the previous implementation: the ``.pdf`` output branch no
    longer uses an unassigned ``PdfPages`` object, ratios use float
    division regardless of the interpreter's ``/`` semantics, two
    discarded statistics lines (one of which divided by a zero contact
    count) are gone, and input files are closed after parsing.
    """
    ### get sequence
    with open(fasta_filename, 'r') as fasta_file:
        # list(...) keeps this working where dict.values() is a view.
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    ### trim sequence according to given positions
    ### default: take full sequence
    if end == -1:
        end = ref_len
    seq = seq[start:end]
    ref_len = len(seq)

    if ali_filename:
        max_cover = max(get_ali_coverage(ali_filename))
    elif meff_filename:
        max_cover = max(get_meff_coverage(meff_filename))
    else:
        max_cover = 0

    if iupred_fname:
        with open(iupred_fname, 'r') as iupred_file:
            disorder = parse_iupred.pred(iupred_file)
    else:
        disorder = np.zeros(ref_len)

    # Fraction of residues predicted disordered.  float() guards against
    # integer division, which previously truncated this to 0.
    num_disordered = 0
    for value in disorder:
        if value > 0.5:
            num_disordered += 1
    if num_disordered:
        fraction_disorder = float(num_disordered) / ref_len
    else:
        fraction_disorder = 0.0

    ### read predicted contacts
    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep, 1)
    # NOTE(review): the returned matrix is not used below; the call is kept
    # only because the helper's behaviour is not visible from here.
    parse_contacts.get_numpy_cmap(contacts)

    contacts_x = []
    contacts_y = []
    scores = []
    tooclose = []

    numbins = 20
    max_counted = ref_len * factor

    count = 0
    mixcount = 0
    disocount = 0
    score_sum = 0.0
    mixsum = 0.0
    disosum = 0.0
    average = 0.0
    mixaverage = 0.0
    disoaverage = 0.0

    # We actually divide the analysis into three groups (ordered,
    # disordered and mixed)
    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        # only look at contacts within given range
        # default: take full sequence range into account
        if c_x < start or c_x >= end:
            continue
        if c_y < start or c_y >= end:
            continue

        if abs(c_x - c_y) < 5:
            tooclose.append(score)
            continue
        if not score > cutoff:
            continue

        contacts_x.append(c_x - start)
        contacts_y.append(c_y - start)

        # NOTE(review): disorder is indexed with untrimmed positions; with
        # start > 0 and no IUPred file the zero array may be too short.
        x_disordered = disorder[c_x] > 0.5
        y_disordered = disorder[c_y] > 0.5
        if x_disordered and y_disordered:
            disocount += 1
            if disocount <= max_counted:
                disosum += score
                disoaverage = disosum / disocount
        elif x_disordered or y_disordered:
            mixcount += 1
            if mixcount <= max_counted:
                mixsum += score
                mixaverage = mixsum / mixcount

        count += 1
        scores.append(score)
        if count <= max_counted:
            score_sum += score
            average = score_sum / count

    ordered_count = count - mixcount - disocount
    # Clamp so rounding (or a fraction above 1) cannot push the product
    # below zero before taking the square root.
    mixed_norm = sqrt(
        max(ref_len * ref_len * (1 - fraction_disorder) * fraction_disorder,
            0.0))
    # NOTE(review): the fifth field is "%3f" (width 3, default precision),
    # unlike its "%.3f" neighbours; left as is so the output is unchanged.
    statline = "%d\t%.3f\t%.3f\t%.3f\t%3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % (
        max_cover,
        float(count) / ref_len,
        ordered_count / (1.e-20 + ref_len * (1 - fraction_disorder)),
        mixcount / (1.e-20 + mixed_norm),
        disocount / (1.e-20 + ref_len * fraction_disorder),
        average,
        mixaverage,
        disoaverage,
        fraction_disorder)

    fig = plt.figure(figsize=(8, 8), dpi=96, facecolor='w')
    plt.hist((tooclose, scores),
             numbins,
             range=(0, 1),
             histtype='bar',
             normed=(numbins, numbins),
             alpha=0.75,
             label=['Too_Close', 'Contacts'])
    plt.xlabel('Score')
    plt.ylabel('Normalized count')
    fig.suptitle('%s\n%s\n' % (c_filename, statline))

    ### Calculate reference contacts in the background if given
    if pdb_filename:
        # We try to get all chains; this overrides the ``chain`` argument.
        chain = '*'
        with open(pdb_filename, 'r') as pdb_file:
            res_lst = parse_pdb.get_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -10.5, -10.1)

        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        print(atom_seq_ali)
        print(seq_ali)

        # Map structure residues onto the aligned target sequence: a gap in
        # the structure gives a '-' placeholder, a gap in the target skips
        # one structure residue.
        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []
        for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
            if atom_res == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_res == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            dist_cutoff = 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            dist_cutoff = 8
        ref_contact_map = dist_mat < dist_cutoff

        (PPVs, _tps, _fps, mixPPVs, _mix_tps, _mix_fps, disoPPVs, _diso_tps,
         _diso_fps) = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                               atom_seq_ali, ref_len, factor, disorder)

        ppv = 'PPV: %.2f %d\t%.2f %d\t%.2f %d ' % (
            float(PPVs[-1]), len(PPVs),
            float(mixPPVs[-1]), len(mixPPVs),
            float(disoPPVs[-1]), len(disoPPVs))
        print("STATs: %s \t %s\t%s\t" % (fasta_filename, statline, ppv))
    else:
        print("STATs: %s \t %s\t" % (fasta_filename, statline))

    # We should print statistics for each residue..

    if outfilename and outfilename.endswith('.png'):
        plt.savefig(outfilename + "_statistics.png")
    else:
        # The '.pdf' case used to reference an unassigned PdfPages object;
        # it now follows the same "_statistics" naming as the other cases.
        pdf_name = '%s_statistics.pdf' % (outfilename or c_filename)
        pp = PdfPages(pdf_name)
        try:
            pp.savefig(fig)
        finally:
            pp.close()
def _select_top_contacts(contacts, max_count, min_separation=5):
    """Return zero-based (x, y) index lists for the leading contacts.

    Each entry of ``contacts`` is indexed as ``(score, i, j)`` with
    one-based residue numbers ``i`` and ``j``.  Entries are taken in the
    order given (presumably sorted by decreasing score -- confirm against
    ``parse_contacts.parse``).  Pairs closer than ``min_separation`` in
    sequence are skipped.  Scanning stops once ``max_count`` pairs are kept.
    """
    xs = []
    ys = []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_separation:
            xs.append(c_x)
            ys.append(c_y)
        # The limit is tested after every entry, kept or skipped, so a
        # non-positive limit still inspects exactly one entry.
        if len(xs) >= max_count:
            break
    return xs, ys


def _gap_structure_lists(atom_seq_ali, seq_ali, res_lst, cb_lst):
    """Project per-residue structure data onto the aligned target sequence.

    Walks the two aligned strings column by column.  A gap in the structure
    sequence yields a ``'-'`` placeholder; a gap in the target sequence
    consumes one structure residue without emitting anything; otherwise the
    current structure residue is emitted.
    """
    gapped_res_lst = []
    gapped_cb_lst = []
    j = 0  # index into the ungapped structure lists
    for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
        if atom_res == '-':
            gapped_res_lst.append('-')
            gapped_cb_lst.append('-')
        elif seq_res == '-':
            j += 1
        else:
            gapped_res_lst.append(res_lst[j])
            gapped_cb_lst.append(cb_lst[j])
            j += 1
    return gapped_res_lst, gapped_cb_lst


def plot_map(fasta_filename, c_filename, factor, c2_filename='', psipred_filename='', pdb_filename='', is_heavy=False, chain='', sep=' '):
    """Plot predicted contacts as a contact map and save it as a PNG.

    The figure is written to ``'<c_filename>.cm.png'``.

    Parameters
    ----------
    fasta_filename : path of the FASTA file; the first item of the first
        value returned by ``parse_fasta.read_fasta`` is used as sequence.
    c_filename : path of the predicted contact file.
    factor : at most ``factor * len(sequence)`` contacts are plotted.
    c2_filename : optional second contact file.  When given, the title
        labels the first map "upper left" and the second "lower right";
        otherwise the first map is mirrored onto both triangles.
    psipred_filename : optional PSIPRED file; states 'H', 'E' and 'C' are
        drawn on the diagonal.
    pdb_filename : optional reference structure.  When given, reference
        contacts are drawn in the background, predicted contacts are
        coloured via ``get_tp_colors`` and the last PPV value is printed.
    is_heavy : use heavy-atom distances (cutoff 5) instead of CB distances
        (cutoff 8) for the reference contacts.
    chain : chain identifier forwarded to the ``parse_pdb`` helpers.
    sep : column separator forwarded to ``parse_contacts.parse``.

    All input files are opened in ``with`` blocks so they are closed as
    soon as they have been parsed.
    """
    # Accession label: everything before the first '.' of the contact file.
    acc = c_filename.split('.')[0]

    ### get sequence
    with open(fasta_filename, 'r') as fasta_file:
        # list(...) keeps this working where dict.values() is a view.
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)
    max_contacts = ref_len * factor

    ### get top "factor" * "ref_len" predicted contacts
    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep)
    contacts_x, contacts_y = _select_top_contacts(contacts, max_contacts)

    ### start plotting
    fig = plt.figure(figsize=(8,8), dpi=100)
    ax = fig.add_subplot(111)
    ax.set_xlim(xmin=-1)
    ax.set_ylim(ymin=-1)

    ### plot secondary structure on the diagonal if given
    if psipred_filename:
        with open(psipred_filename, 'r') as psipred_file:
            ss = parse_psipred.horizontal(psipred_file)
        for i, state in enumerate(ss):
            if state == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043")
            elif state == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD")
            elif state == 'C':
                plt.plot(i, i, 'D', c='#CCCCCC', mec="#CCCCCC", markersize=4)

    ### plot reference contacts in the background if given
    if pdb_filename:
        with open(pdb_filename, 'r') as pdb_file:
            res_lst = parse_pdb.get_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        # Align the structure sequence to the target; the last alignment
        # returned is the one used.
        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        gapped_res_lst, gapped_cb_lst = _gap_structure_lists(
            atom_seq_ali, seq_ali, res_lst, cb_lst)

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            dist_cutoff = 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            dist_cutoff = 8
        ref_contact_map = dist_mat < dist_cutoff
        ref_contacts = np.where(ref_contact_map)
        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        PPVs = get_ppvs(contacts_x, contacts_y, ref_contact_map, atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map, atom_seq_ali)

        print('%s\t%s' % (acc, PPVs[-1]))

        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#DDDDDD', lw=0)

    # Point lists are reversed before plotting so the earliest (top) entries
    # are drawn last and end up on top.
    if c2_filename:
        ### plot predicted contacts from second contact map if given
        with open(c2_filename, 'r') as contact_file2:
            contacts2 = parse_contacts.parse(contact_file2, sep)
        contacts2_x, contacts2_y = _select_top_contacts(contacts2, max_contacts)

        if pdb_filename:
            ### use TP/FP color coding if reference contacts given
            PPVs2 = get_ppvs(contacts2_x, contacts2_y, ref_contact_map, atom_seq_ali, ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y, ref_contact_map, atom_seq_ali)
            print('%s\t%s' % (acc, PPVs2[-1]))
            fig.suptitle('%s\n%s (upper left) PPV = %.2f | %s (lower right) PPV = %.2f' % (acc, c_filename, PPVs[-1], c2_filename, PPVs2[-1]))
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o', c=tp2_colors[::-1], linewidths=0.0)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c=tp_colors[::-1], linewidths=0.0)
        else:
            fig.suptitle('%s\n%s (upper left) | %s (lower right)' % (acc, c_filename, c2_filename))
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o', c='#D70909', edgecolor='#D70909', s=8, linewidths=0.5)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c='#004F9D', edgecolor='#004F9D', s=8, linewidths=0.5)
    elif pdb_filename:
        ### first contact map on both triangles, TP/FP colour coded
        fig.suptitle('%s (%s)\nPPV = %.2f' % (acc, c_filename, PPVs[-1]))
        ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c=tp_colors[::-1], linewidths=0.0)
        ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o', c=tp_colors[::-1], linewidths=0.0)
    else:
        ### first contact map on both triangles, single colour
        fig.suptitle('%s (%s)' % (acc, c_filename))
        ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c='#004F9D', edgecolor='#004F9D', s=8, linewidths=0.5)
        ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o', c='#004F9D', edgecolor='#004F9D', s=8, linewidths=0.5)

    plt.gca().set_xlim([-1,ref_len])
    plt.gca().set_ylim([-1,ref_len])

    plt.savefig('%s.cm.png' % c_filename, bbox_inches=0)
Пример #4
0
def plot_map(fasta_filename,
             c_filename,
             factor=1.0,
             th=-1,
             c2_filename='',
             psipred_horiz_fname='',
             psipred_vert_fname='',
             iupred_fname='',
             pdb_filename='',
             is_heavy=False,
             chain='',
             sep=',',
             outfilename='',
             ali_filename='',
             meff_filename='',
             name='',
             start=0,
             end=-1):
    #acc = c_filename.split('.')[0]
    #acc = fasta_filename.split('.')[0][:4]
    if name == '':
        acc = '.'.join(os.path.basename(fasta_filename).split('.')[:-1])
    else:
        acc = name

    ### get sequence
    seq = parse_fasta.read_fasta(open(fasta_filename, 'r')).values()[0][0]
    ref_len = len(seq)

    ### trim sequence according to given positions
    ### default: take full sequence
    if end == -1:
        end = ref_len
    seq = seq[start:end]
    ref_len = len(seq)
    unit = (ref_len / 50.0)

    if ali_filename:
        coverage_lst = get_ali_coverage(ali_filename)
        max_cover = max(coverage_lst)
        # Current Meff format does not work
        #    elif meff_filename:
        #        coverage_lst = get_meff_coverage(meff_filename)
        #        max_cover = max(coverage_lst)
    else:
        coverage_lst = np.zeros(ref_len)
        max_cover = 0
    average_disorder = 0.
    average_order = 0.
    fraction_disorder = 0.
    cover_order = 0.
    cover_disorder = 0.
    if iupred_fname:
        disorder = parse_iupred.pred(open(iupred_fname, 'r'))
    else:
        disorder = np.zeros(ref_len)
    average_disorder = np.sum(disorder) / ref_len
    fraction_disorder = 0.0
    num_disorder = 0
    num_order = 0
    j = 0
    for i in disorder:
        if (i > 0.5):
            fraction_disorder += 1 / ref_len
            num_disorder += 1
            cover_disorder += coverage_lst[j]
        else:
            num_order += 1
            cover_order += coverage_lst[j]
        j += 1

    if (num_disorder > 0):
        cover_disorder = cover_disorder / num_disorder
    if (num_order > 0):
        cover_order = cover_order / num_order

    ### get top "factor" * "ref_len" predicted contacts
    contacts = parse_contacts.parse(open(c_filename, 'r'), sep, 1)
    contacts_np = parse_contacts.get_numpy_cmap(contacts)
    contacts_np = contacts_np[start:end, start:end]

    contacts_x = []
    contacts_y = []
    scores = []
    mixscores = []
    disoscores = []
    orderscores = []
    tooclose = []
    contact_dict = {}
    if iupred_fname:
        disorder = parse_iupred.pred(open(iupred_fname, 'r'))
    else:
        disorder = np.zeros(ref_len)

    count = 1.e-20
    mixcount = 1.e-20
    disocount = 1.e-20
    ordercount = 1.e-20
    longcount = 1.e-20
    highscore = 0
    numbins = 20
    sum = 0.0
    longsum = 0.0
    disosum = 0.0
    ordersum = 0.0
    mixsum = 0.0
    average = 0.0
    longaverage = 0.0
    mixaverage = 0.0
    disoaverage = 0.0
    orderaverage = 0.0
    histo = np.zeros(numbins)
    disotop = 0
    ordertop = 0
    doubletop = 0
    mixcount = 0
    mixtop = 0
    separation = 0.0

    # We actually divide the analysis into three groups (ordered,disordered and mixed)
    for i in range(len(contacts)):
        score = contacts[i][0]
        c_x = contacts[i][1] - 1
        c_y = contacts[i][2] - 1

        # only look at contacts within given range
        # default: take full sequence range into account
        if c_x < start or c_x >= end:
            continue
        if c_y < start or c_y >= end:
            continue

        pos_diff = abs(c_x - c_y)
        too_close = pos_diff < 5
        long_dist = pos_diff > 24
        if not too_close:
            if score > th:
                contacts_x.append(c_x - start)
                contacts_y.append(c_y - start)
                if (disorder[c_x] > 0.5 and disorder[c_y] > 0.5):
                    disocount += 1
                    disoscores.append(score)
                    if (disocount <= ref_len * factor):
                        disosum += score
                        disoaverage = disosum / disocount
                elif (disorder[c_x] > 0.5 or disorder[c_y] > 0.5):
                    mixcount += 1
                    mixscores.append(score)
                    if (mixcount <= ref_len * factor):
                        mixsum += score
                        mixaverage = mixsum / mixcount
                else:
                    ordercount += 1
                    orderscores.append(score)
                    if (ordercount <= ref_len * factor):
                        ordersum += score
                        orderaverage = ordersum / ordercount
                count += 1
                scores.append(score)
                if (count <= ref_len * factor):
                    sum += score
                    average = sum / count
                separation += pos_diff
                if long_dist:
                    longcount += 1
                    if (longcount <= ref_len * factor):
                        longsum += score
                        longaverage = longsum / longcount
        else:
            tooclose.append(score)

#    statline="Highs: %.1f (%.1f%%) (%.1f%%)  average:  %.2f (%.2f) (%.2f)  Meff: %.0f  Diso: %.1f%%  " % (count/ref_len,100*mixcount/count,100*disocount/count,average,mixaverage,disoaverage,max_cover,100*fraction_disorder)
#    statline="Highs: %.1f %.3f %.3f  average:  %.2f %.2f %.2f  Meff: %.0f  Diso: %.3f  " % (count/ref_len,mixcount/count,disocount/count,average,mixaverage,disoaverage,max_cover,fraction_disorder)
#    statline="Length: %d NumAli: %d Counts: %d %d %d %.3f %.3f %.3f %.3f\n"  % ( ref_len,max_cover,(count-mixcount-disocount),mixcount,disocount,sum,mixsum,disosum,fraction_disorder)
    statline = "NumAli: %d %d %d Length: %d %d %d Counts: %d %d %d %d  RelContacts: %.3f %.3f %.3f Disorder: %.3f Long: %.3f %.3f %.3f \n" % (
        max_cover, cover_order, cover_disorder, ref_len, num_order,
        num_disorder, count, ordercount, mixcount, disocount, count /
        (ref_len + 1.e-20), ordercount / (1.e-20 + num_order), disocount /
        (1.e-20 + num_disorder), fraction_disorder, longcount / count,
        longcount / (ref_len + 1.e-20), separation / count)
    statfig = plt.figure(figsize=(8, 8), dpi=96, facecolor='w')
    plt.hist((tooclose, scores),
             numbins,
             range=(0, 1),
             histtype='bar',
             normed=(numbins, numbins),
             alpha=0.75,
             label=['Too_Close', 'Contacts'])
    plt.xlabel('Score')
    plt.ylabel('Normalized count')
    statfig.suptitle('%s\n%s\n' % (c_filename, line))

    ### start plotting
    fig = plt.figure(figsize=(8, 8), dpi=96, facecolor='w')
    ax = fig.add_subplot(111)  #, aspect='auto')
    ax.set_adjustable('box-forced')
    ax.tick_params(direction='out', right='off', top='off')
    ax.set_xlim([-unit, ref_len])
    ax.set_ylim([-unit, ref_len])
    max_cover = 0
    ### plot alignment coverage if alignemnt given (only on Y-axis)
    if ali_filename:  # or meff_filename:
        # adjust overall canvas
        ax = plt.subplot2grid((8, 8), (1, 1), colspan=7,
                              rowspan=7)  #, aspect='auto')
        #ax.set_adjustable('box-forced')
        #ax.set_autoscale_on(False)
        ax.autoscale(False)
        ax.tick_params(direction='out',
                       labelleft='off',
                       right='off',
                       top='off')
        ax.set_xlim([-unit, ref_len])
        ax.set_ylim([-unit, ref_len])

        if ali_filename:
            coverage_lst = get_ali_coverage(ali_filename)
        #elif meff_filename:
        #    coverage_lst = get_meff_coverage(meff_filename)
        max_cover = max(coverage_lst)

        #lt = pow(10, max(1,floor(log10(max_cover)) - 1))
        #upper = int(ceil(max_cover/float(lt)) * lt)
        ax2 = plt.subplot2grid((8, 8), (1, 0), rowspan=7, sharey=ax)
        #ax2.set_adjustable('box-forced')
        #ax2.set_autoscale_on(False)
        ax2.autoscale(False)
        #print len([0]+coverage_lst+[0])
        #print len([0]+range(ref_len)+[ref_len-1])

        ax2.plot([0] + coverage_lst + [0],
                 [0] + range(ref_len) + [ref_len - 1],
                 'k',
                 lw=0)
        ax2.axvline(x=max_cover * 0.25, lw=0.5, c='black', ls=':')
        ax2.axvline(x=max_cover * 0.5, lw=0.5, c='black', ls=':')
        ax2.axvline(x=max_cover * 0.75, lw=0.5, c='black', ls=':')
        ax2.fill([0] + coverage_lst + [0],
                 [0] + range(ref_len) + [ref_len - 1],
                 facecolor='gray',
                 lw=0,
                 alpha=0.5)
        ax2.set_xticks([0, max_cover])
        ax2.tick_params(axis='x', top='off', direction='out')
        ax2.invert_xaxis()
        #ax2.spines['top'].set_visible(False)
        #ax2.spines['left'].set_visible(False)
        #ax.get_xaxis().tick_bottom()
        #ax.get_yaxis().tick_right()
        ax2.grid()
        ax2.set_ylim([-unit, ref_len])

        #ax3 = plt.subplot2grid((8,8), (0,1), colspan=7, sharex=ax)
        #ax3.set_adjustable('box-forced')
        #ax3.set_autoscale_on(False)
        #ax3.autoscale(False)
        #ax3.plot([0]+range(ref_len)+[ref_len-1], [0]+coverage_lst+[0], 'k', lw=0)
        #ax3.axhline(y=max_cover*0.25, lw=0.5, c='black', ls=':')
        #ax3.axhline(y=max_cover*0.5, lw=0.5, c='black', ls=':')
        #ax3.axhline(y=max_cover*0.75, lw=0.5, c='black', ls=':')
        #ax3.fill([0]+range(ref_len)+[ref_len-1], [0]+coverage_lst+[0], facecolor='gray', lw=0, alpha=0.5)
        #ax3.xaxis.tick_top()
        #ax3.set_yticks([0, max_cover])
        #ax3.tick_params(labelbottom='off')
        ax2.tick_params(axis='y', right='off', direction='out', left='on')
        #ax3.spines['top'].set_visible(False)
        #ax3.spines['right'].set_visible(False)
        #ax.get_xaxis().tick_top()
        #ax.get_yaxis().tick_left()
        #ax3.grid()
        #ax3.set_xlim([-unit,ref_len])

    ### plot secondary structure along axis if given
    average_disorder = 0.
    fraction_disorder = 0.
    #statline = "Highs: %.1f Aver: %.2f  Meff: %.0f" % (count/ref_len,average,max_cover)
    ax.get_xaxis().tick_top()
    ax.get_yaxis().tick_left()
    if iupred_fname:
        ax = plt.subplot2grid((8, 8), (1, 1), colspan=7,
                              rowspan=7)  #, aspect='auto')
        #ax.set_adjustable('box-forced')
        #ax.set_autoscale_on(False)
        ax.autoscale(False)
        ax.tick_params(direction='out',
                       labelleft='off',
                       right='off',
                       top='off')
        ax.set_xlim([-unit, ref_len])
        ax.set_ylim([-unit, ref_len])

        average_disorder = np.sum(disorder) / ref_len
        fraction_disorder = 0.0
        for d in disorder:
            if (d > 0.5):
                fraction_disorder += 1 / ref_len

        ax3 = plt.subplot2grid((8, 8), (0, 1), colspan=7, sharex=ax)
        ax3.set_adjustable('box-forced')
        ax3.set_autoscale_on(False)
        ax3.autoscale(False)
        ax3.plot([0] + range(ref_len) + [ref_len - 1], [0] + disorder + [0],
                 'b',
                 lw=2)
        ax3.axhline(y=0.5, lw=0.5, c='black', ls=':')
        #ax3.fill([0]+range(ref_len)+[ref_len-1], [0]+disorder+[0], facecolor='gray', lw=0, alpha=0.5)
        ax3.xaxis.tick_top()
        ax3.set_yticks([0, 1])
        ax3.tick_params(labelbottom='off')
        ax3.spines['top'].set_visible(False)
        ax3.spines['right'].set_visible(False)
        ax3.grid()
        ax3.set_xlim([-unit, ref_len])
        #statline = "Highs: %.3f  %.3f   %.3f    Aver: %.2f  Meff: %.0f  Diso: %.3f  " % (count/ref_len,disocount/count,doublecount/count,average,max_cover,fraction_disorder)

    print "STATs: %s %s" % (c_filename, statline)
    if psipred_horiz_fname or psipred_vert_fname:
        if psipred_horiz_fname:
            ss = parse_psipred.horizontal(open(psipred_horiz_fname, 'r'))
        else:
            ss = parse_psipred.vertical(open(psipred_vert_fname, 'r'))

        ss = ss[start:end]
        assert len(ss) == ref_len

        ax.axhline(y=0, lw=1, c='black')
        ax.axvline(x=0, lw=1, c='black')
        for i in range(len(ss)):
            if ss[i] == 'H':
                #ax.plot(-unit/2, i, 's', c='#8B0043', mec="#8B0043")#, markersize=2)
                #ax.plot(i, -unit/2, 's', c='#8B0043', mec="#8B0043")#, markersize=2)
                #ax.plot(i, -unit/2, 's', c='#8B0043', mec="#8B0043")#, markersize=2)
                ax.add_patch(
                    plt.Rectangle((-unit, i - 0.5),
                                  unit,
                                  1,
                                  edgecolor='#8B0043',
                                  facecolor="#8B0043"))
                ax.add_patch(
                    plt.Rectangle((i - 0.5, -unit),
                                  1,
                                  unit,
                                  edgecolor='#8B0043',
                                  facecolor="#8B0043"))
            if ss[i] == 'E':
                ax.add_patch(
                    plt.Rectangle((-unit, i - 0.5),
                                  unit,
                                  1,
                                  edgecolor='#0080AD',
                                  facecolor="#0080AD"))
                ax.add_patch(
                    plt.Rectangle((i - 0.5, -unit),
                                  1,
                                  unit,
                                  edgecolor='#0080AD',
                                  facecolor="#0080AD"))
                #ax.plot(-unit/2, i, 's', c='#0080AD', mec="#0080AD")#, markersize=2)
                #ax.plot(i, -unit/2, 's', c='#0080AD', mec="#0080AD")#, markersize=2)
            if ss[i] == 'C':
                continue

    ### plot reference contacts in the background if given
    if pdb_filename:
        chain = '*'
        # We try to get all chains...
        res_lst = parse_pdb.get_coordinates(open(pdb_filename, 'r'), chain)
        #        cb_lst = parse_pdb.get_ca_coordinates(open(pdb_filename, 'r'), chain)
        cb_lst = parse_pdb.get_cb_coordinates(open(pdb_filename, 'r'), chain)
        atom_seq = parse_pdb.get_atom_seq(open(pdb_filename, 'r'), chain)

        #        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -10.5, -10.1)

        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        print atom_seq_ali
        print seq_ali

        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []

        for i in xrange(len(atom_seq_ali)):
            #            print i,j
            if atom_seq_ali[i] == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_ali[i] == '-':
                j += 1
                continue
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            heavy_cutoff = 5
            ref_contact_map = dist_mat < heavy_cutoff
            ref_contacts = np.where(dist_mat < heavy_cutoff)
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            cb_cutoff = 8
            ref_contact_map = dist_mat < cb_cutoff
            ref_contacts = np.where(dist_mat < cb_cutoff)
            #ref_contacts = np.where(np.ma.array(dist_mat, mask=np.tri(dist_mat.shape[0]), fill_value=float("inf")) < cb_cutoff)

        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        PPVs, TPs, FPs, orderPPVs, orderTPs, orderFPs, mixPPVs, mixTPs, mixFPs, disoPPVs, disoTPs, disoFPs = get_ppvs(
            contacts_x, contacts_y, ref_contact_map, atom_seq_ali, ref_len,
            factor, disorder)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)
        img = get_colors(contacts_np,
                         ref_contact_map=dist_mat,
                         atom_seq_ali=atom_seq_ali,
                         th=th,
                         factor=factor)
        sc = ax.imshow(img, interpolation='none')

        #
        print 'PPV: %s %s %s %s %s' % (acc, PPVs[-1], orderPPVs[-1],
                                       mixPPVs[-1], disoPPVs[-1])

        cmap = cm.get_cmap("binary")
        cmap.set_bad([1, 1, 1, 0])
        dist_mat_masked = np.ma.array(dist_mat,
                                      mask=np.tri(dist_mat.shape[0], k=-1))
        #sc = ax.imshow(s_score_vec(dist_mat_masked, 5), cmap=cmap, interpolation='none')

        ref_contacts_diag_x = []
        ref_contacts_diag_y = []
        for i in range(len(ref_contacts_x)):
            x_i = ref_contacts_x[i]
            y_i = ref_contacts_y[i]
            if not dist_mat_masked.mask[x_i, y_i] and abs(x_i - y_i) >= 5:
                ref_contacts_diag_x.append(x_i)
                ref_contacts_diag_y.append(y_i)

    #ax.scatter(ref_contacts_diag_x, ref_contacts_diag_y, marker='+', c='#000000')

    ### plot predicted contacts from second contact map if given
    if c2_filename:
        contacts2 = parse_contacts.parse(open(c2_filename, 'r'), sep)
        contacts2_x = []
        contacts2_y = []
        scores2 = []
        contact_dict2 = {}

        count = 0

        for i in range(len(contacts2)):
            score = contacts2[i][0]
            c_x = contacts2[i][1] - 1
            c_y = contacts2[i][2] - 1

            pos_diff = abs(c_x - c_y)
            too_close = pos_diff < 5

            if not too_close:
                contacts2_x.append(c_x)
                contacts2_y.append(c_y)
                scores2.append(score)
                count += 1

            if count >= ref_len * factor:
                break

        ### use TP/FP color coding if reference contacts given
        if pdb_filename:
            PPVs, TPs, FPs, orderPPVs, orderTPs, orderFPs, mixPPVs, mixTPs, mixFPs, disoPPVs, disoTPs, disoFPs = get_ppvs(
                contacts_x, contacts_y, ref_contact_map, atom_seq_ali, ref_len,
                factor, disorder)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print '%s %s %s %s' % (acc, PPVs2[-1], TPs2[-1], FPs2[-1])
            fig.suptitle(
                '%s\nPPV (upper left) = %.2f | PPV (lower right) = %.2f' %
                (acc, PPVs[-1], PPVs2[-1]))
            sc = ax.scatter(contacts2_y[::-1],
                            contacts2_x[::-1],
                            marker='o',
                            c=tp2_colors[::-1],
                            s=6,
                            alpha=0.75,
                            lw=0)
            sc = ax.scatter(contacts_x[::-1],
                            contacts_y[::-1],
                            marker='o',
                            c=tp_colors[::-1],
                            s=6,
                            alpha=0.75,
                            lw=0)
        else:
            sc = ax.scatter(contacts2_y[::-1],
                            contacts2_x[::-1],
                            marker='0',
                            c='#D70909',
                            edgecolor='#D70909',
                            s=6,
                            linewidths=0.5)
            sc = ax.scatter(contacts_x[::-1],
                            contacts_y[::-1],
                            marker='o',
                            c='#004F9D',
                            edgecolor='#004F9D',
                            s=6,
                            linewidths=0.5)


#        ppv='PPV: %.2f %.2f %.2f' % (float(PPVs[-1]), float(TPs[-1]), float(FPs[-1]))
        ppv = 'PPV: %.2f (%d) %.2f (%d) %.2f (%d) ' % (float(
            PPVs[-1]), len(PPVs), float(
                mixPPVs[-1]), len(mixPPVs), float(disoPPVs[-1]), len(disoPPVs))
    else:
        if pdb_filename:
            pdb_acc = parse_pdb.get_acc(open(pdb_filename))
            if pdb_acc:
                if chain:
                    fig.suptitle(
                        '%s (PDB: %s, chain %s)\nPPV = %.2f\n%s' %
                        (c_filename, pdb_acc, chain, PPVs[-1], statline),
                        fontsize=8)
                else:
                    fig.suptitle('%s (PDB: %s)\nPPV = %.2f \n%s' %
                                 (c_filename, pdb_acc, PPVs[-1], statline),
                                 fontsize=8)
            else:
                fig.suptitle('%s\nPPV = %.2f\n%s' %
                             (c_filename, PPVs[-1], statline),
                             fontsize=8)
            #cmap = cm.get_cmap("binary")
            #cmap.set_bad([1,1,1,0])
            #contacts_np_masked = np.ma.array(contacts_np, mask=np.tri(contacts_np.shape[0], k=-1))
            #sc = ax.imshow(contacts_np_masked.T, cmap=cmap)
            #sc = ax.imshow(contacts_np, cmap=cmap)
            #sc = ax.imshow(contacts_np + contacts_np.T, cmap=cm.binary, vmin=0.2, vmax=1.0, interpolation='none')
            #sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
            #sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o', c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        else:
            #if c_filename.startswith('data'):
            #    acc = c_filename.split('/')[1]
            #else:
            #    acc = c_filename.split('/')[-1]
            fig.suptitle('%s\n%s' % (c_filename, statline), fontsize=8)
            #sc = ax.imshow(contacts_np + contacts_np.T, cmap=cm.hot_r)
            #sc = ax.imshow(contacts_np + contacts_np.T,
            #        cmap=cm.binary, vmin=th, vmax=1.0, interpolation='none')
            img = get_colors(contacts_np, th=th, factor=factor)
            sc = ax.imshow(img, interpolation='none')
            #divider1 = make_axes_locatable(ax)
            #cax1 = divider1.append_axes("right", size="2%", pad=0.05)
            #plt.colorbar(sc, cax=cax1)
            #plt.colorbar(sc, ax=ax)
            #sc = ax.scatter(contacts_x[::-1], contacts_y[::-1],
            #        marker='o', c="black", s=6, alpha=0.75,
            #        linewidths=0.1, edgecolors='none')
            #sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o', c=scores[::-1], s=10, alpha=0.75, cmap=cm.hot_r, linewidths=0.1, edgecolors='none')

    #plt.gca().set_xlim([0,ref_len])
    #plt.gca().set_ylim([0,ref_len])

    ax.grid()
    ax.set_xlim([-unit, ref_len])
    ax.set_ylim([-unit, ref_len])
    #print ax.axis()
    ax.axis([-unit, ref_len, -unit, ref_len])
    #ax.invert_yaxis()
    ax.set_autoscale_on(False)

    if outfilename:
        if outfilename.endswith('.pdf'):
            pp = PdfPages(outfilename)
            pp.savefig(fig)
            pp.close()
            ppstat = PdfPages(outfilename + "_statistics.pdf")
            ppstat.savefig(statfig)
            ppstat.close()
        elif outfilename.endswith('.png'):
            plt.savefig(outfilename)
        else:
            pp = PdfPages('%s.pdf' % outfilename)
            pp.savefig(fig)
            pp.close()
            ppstat = PdfPages(outfilename + "_statistics.pdf")
            ppstat.savefig(statfig)
            ppstat.close()
    else:
        pp = PdfPages('%s_ContactMap.pdf' % c_filename)
        pp.savefig(fig)
        pp.close()
        ppstat = PdfPages('%s_statistics.pdf' % c_filename)
        ppstat.savefig(statfig)
        ppstat.close()
Пример #5
0
def fix(pdb1_filename, pdb2_filename, out_filename, chain1='', chain2=''):
    """Renumber the residues of a second PDB chain after a reference chain.

    The ATOM sequences of both chains are globally aligned with
    ``pairwise2.align.globalms`` (last reported alignment is used).  Every
    residue of the second chain is then given the 1-based position of the
    reference residue it is aligned to, and the renumbered structure is
    written with ``parse_pdb.write``.

    Residues of the second chain that would receive index 0, or the same
    index as the previously kept residue, are left out of the output.

    Args:
        pdb1_filename: path of the PDB file that provides the reference
            sequence.
        pdb2_filename: path of the PDB file whose residues are renumbered.
        out_filename: path of the output file.  When empty, the output goes
            to ``<pdb2_filename without extension>.aligned.pdb``.
        chain1: chain of the first file; when empty,
            ``parse_pdb.get_first_chain`` picks it.
        chain2: chain of the second file; same fallback as ``chain1``.
    """
    # Function-scope import so the block does not rely on a module-level one.
    import os

    # Every parser gets its own freshly opened handle (as before), but the
    # handles are now closed deterministically instead of being leaked.
    if not chain1:
        with open(pdb1_filename, 'r') as pdb1_file:
            chain1 = parse_pdb.get_first_chain(pdb1_file)
    with open(pdb1_filename, 'r') as pdb1_file:
        seq1 = parse_pdb.get_atom_seq(pdb1_file, chain1)

    if not chain2:
        with open(pdb2_filename, 'r') as pdb2_file:
            chain2 = parse_pdb.get_first_chain(pdb2_file)
    with open(pdb2_filename, 'r') as pdb2_file:
        pdb2 = parse_pdb.read(pdb2_file, chain2)
    with open(pdb2_filename, 'r') as pdb2_file:
        seq2 = parse_pdb.get_atom_seq(pdb2_file, chain2)

    align = pairwise2.align.globalms(seq1, seq2, 2, -1, -0.5, -0.1)
    seq1_ali = align[-1][0]
    seq2_ali = align[-1][1]

    pdb2_idx = _reference_positions(seq1_ali, seq2_ali)

    # Only the residue list (element 1) is rebuilt; element 0 is reset to an
    # empty string and element 2 is carried over unchanged, as before.
    pdb2_new = ['', _renumber_residues(pdb2[1], pdb2_idx), pdb2[2]]

    if not out_filename:
        # splitext keeps extension-less names and dotted directory names
        # intact, which the former split('.') based construction did not.
        out_filename = os.path.splitext(pdb2_filename)[0] + '.aligned.pdb'
    with open(out_filename, 'w') as pdb2_outfile:
        parse_pdb.write(pdb2_new, pdb2_outfile)


def _reference_positions(seq1_ali, seq2_ali):
    """Return, per residue of the second sequence, its reference position.

    Walks the two gapped alignment strings column by column while counting
    the residues of the first sequence seen so far.  Columns where only the
    second sequence has a gap contribute no entry; all other columns append
    the current count (so an insertion repeats the preceding position, or
    yields 0 when it precedes the first reference residue).
    """
    positions = []
    seq1_pos = 0
    for char1, char2 in zip(seq1_ali, seq2_ali):
        if char1 != '-':
            seq1_pos += 1
            if char2 == '-':
                # Reference residue without a partner in the second chain.
                continue
        positions.append(seq1_pos)
    return positions


def _renumber_residues(residues, positions):
    """Return the residues with the number field of each line rewritten.

    ``residues`` is a sequence of residues, each a sequence of fixed-width
    text lines; ``positions`` supplies the new number per residue.  Pairs
    are consumed in lockstep and processing stops with the shorter input.
    Residues whose new number is 0 or equals the number of the previously
    kept residue are skipped.
    """
    renumbered = []
    prev_idx = -1
    for res, new_idx in zip(residues, positions):
        if new_idx == 0 or new_idx == prev_idx:
            continue
        # Characters 22-25 of each line hold the number, right-aligned in
        # four characters.
        idx_field = str(new_idx).rjust(4)
        renumbered.append([atm[:22] + idx_field + atm[26:] for atm in res])
        prev_idx = new_idx
    return renumbered
Пример #6
0
def plot_map(acc,
             fasta_filename,
             contact_filename,
             psipred_filename,
             sep=',',
             pdb_filename='',
             chain='A',
             ppv_dir='/bubo/home/h9/mircomic/glob/2013-05-29_human_repeats/pconsc_predictions'):
    """Plot a predicted contact map on a secondary-structure background.

    The top-ranked predicted contacts (at least five positions apart, at
    most one per residue of the query sequence) are drawn as a scatter plot
    coloured by score and saved to ``<contact_filename>_ContactMap.pdf``.
    Secondary-structure elements are drawn as markers on the diagonal and
    as shaded rectangles.

    When ``pdb_filename`` is given, heavy-atom contacts (distance below 5)
    of the structure are drawn in the background, positive predictive
    values (overall, intra-domain, inter-domain) are computed along the
    ranked contact list, appended to ``PPV.txt``, ``intra_PPV.txt`` and
    ``inter_PPV.txt`` inside ``ppv_dir``, and the PPV curve is saved to
    ``<contact_filename>_PPVs.pdf``.

    Args:
        acc: identifier written to the PPV files and used in the title when
            no structure is given.
        fasta_filename: FASTA file of the query sequence; the first entry
            is used.
        contact_filename: file with the predicted contacts; entries are
            read as (score, position, position) with positions shifted
            down by one.
        psipred_filename: currently ignored, see the note in the body.
        sep: separator passed on to ``parse_contacts.parse``.
        pdb_filename: optional reference structure; blank disables all
            structure-based output.
        chain: chain of the reference structure.
        ppv_dir: directory receiving the three PPV log files.  Defaults to
            the location that used to be hard-coded.
    """
    # Function-scope import so the block does not rely on a module-level one.
    import os

    pdb_flag = pdb_filename.strip() != ''

    # NOTE(review): the psipred_filename argument is overwritten here, so the
    # PSIPRED file is always '<fasta name without extension>.horiz'.  Kept
    # as is because callers may rely on it -- confirm before honouring the
    # argument.
    psipred_filename = '%s.horiz' % '.'.join(fasta_filename.split('.')[:-1])
    with open(psipred_filename, 'r') as psipred_file:
        ss = parse_psipred.horizontal(psipred_file)
    ss_lst = get_ss_pos(ss)
    # Single-argument print(...) produces the same output under Python 2
    # and Python 3.
    print(ss)
    print(ss_lst)
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    if pdb_flag:
        with open(pdb_filename, 'r') as pdb_file:
            res_lst = parse_pdb.get_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]
        print(atom_seq_ali)
        print(seq_ali)

        # Bring the structure's residues into the coordinate system of the
        # query sequence: '-' marks query positions without coordinates,
        # residues aligned to a gap in the query are dropped.
        j = 0
        gapped_res_lst = []
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                j += 1

        print(len(gapped_res_lst))
        print(len(res_lst))
        print(len(atom_seq))
        print(len(seq))
        dist_mat = get_heavy_contacts(gapped_res_lst)
        heavy_cutoff = 5

        ref_contact_map = dist_mat < heavy_cutoff
        ref_contacts = np.where(dist_mat < heavy_cutoff)

        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

    with open(contact_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep)

    # Select contacts in file order, skipping pairs fewer than five
    # positions apart, until one contact per residue has been collected.
    contacts_x = []
    contacts_y = []
    scores = []
    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        if abs(c_x - c_y) >= 5:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(score)

        if len(scores) >= ref_len:
            break

    if pdb_flag:
        PPVs = []
        inter_PPVs = []
        intra_PPVs = []

        # Running counters: one pass over the ranked contacts replaces the
        # former recount-from-scratch for every prefix length.
        TP = intra_TP = inter_TP = 0.0
        FP = intra_FP = inter_FP = 0.0

        # NOTE(review): as before, prefixes of length 1 .. n-1 are evaluated,
        # so the last selected contact never enters the PPV -- confirm
        # whether that is intended.
        num_ranked = min(len(contacts_x), ref_len)
        for i in range(num_ranked - 1):
            c_x = contacts_x[i]
            c_y = contacts_y[i]
            # Contacts touching a position without coordinates are ignored.
            if atom_seq_ali[c_x] != '-' and atom_seq_ali[c_y] != '-':
                # NOTE(review): dom_lst is not defined in this function (the
                # get_dom_seq call that produced it is disabled); unless a
                # module-level dom_lst exists this raises NameError.
                c_x_dom = in_dom(c_x, dom_lst)
                c_y_dom = in_dom(c_y, dom_lst)
                both_in_dom = c_x_dom != 0 and c_y_dom != 0
                if ref_contact_map[c_x, c_y] > 0:
                    TP += 1.0
                    if both_in_dom:
                        if c_x_dom != c_y_dom:
                            inter_TP += 1.0
                        else:
                            intra_TP += 1.0
                else:
                    FP += 1.0
                    if both_in_dom:
                        if c_x_dom != c_y_dom:
                            inter_FP += 1.0
                        else:
                            intra_FP += 1.0

            # A value is recorded as soon as anything has been counted.  The
            # former 'TP > 0 and FP > 0' guard skipped every prefix without a
            # false positive, so a perfect prediction ended up as PPV 0.0.
            if TP > 0 or FP > 0:
                PPVs.append(TP / (TP + FP))
                if inter_TP > 0 or inter_FP > 0:
                    inter_PPVs.append(inter_TP / (inter_TP + inter_FP))
                if intra_TP > 0 or intra_FP > 0:
                    intra_PPVs.append(intra_TP / (intra_TP + intra_FP))

        print(len(PPVs))
        # Fall back to 0.0 so that the [-1] look-ups below always succeed.
        if not PPVs:
            PPVs.append(0.0)
        print(PPVs[-1])

        if not inter_PPVs:
            inter_PPVs.append(0.0)
        print(inter_PPVs[-1])

        if not intra_PPVs:
            intra_PPVs.append(0.0)
        # Previously the whole list was printed in the fallback case.
        print(intra_PPVs[-1])

        ppv_logs = (('PPV.txt', PPVs[-1]),
                    ('intra_PPV.txt', intra_PPVs[-1]),
                    ('inter_PPV.txt', inter_PPVs[-1]))
        for log_name, value in ppv_logs:
            with open(os.path.join(ppv_dir, log_name), 'a') as ppv_file:
                ppv_file.write('%s\t%s\n' % (acc, value))

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Secondary structure on the diagonal: circles for 'H', diamonds for 'E';
    # every other state is left blank.
    diag_style = {'H': ('o', '#8B0043'), 'E': ('D', '#0080AD')}
    for i, ss_state in enumerate(ss):
        if ss_state in diag_style:
            marker, colour = diag_style[ss_state]
            plt.plot(i, i, marker, c=colour, mec=colour, markersize=2)

    # Shaded boxes for every element and every pair of elements.
    # Presumably ss_lst holds 1-based (start, end) pairs -- confirm against
    # get_ss_pos.
    ss_col_dict = {'H': '#8B0043', 'E': '#0080AD', 'HE': '#3F467D'}
    for ss_elem in ss_lst:
        start = ss_elem[0] - 1
        end = ss_elem[1] - 1
        elem_len = end - start
        curr_ss = ss[start]
        ax.add_patch(
            plt.Rectangle((start, start),
                          elem_len,
                          elem_len,
                          facecolor=ss_col_dict[curr_ss],
                          edgecolor='black',
                          lw=0.5,
                          zorder=0,
                          alpha=0.5))
        for ss_elem2 in ss_lst:
            if ss_elem2 == ss_elem:
                continue
            start2 = ss_elem2[0] - 1
            end2 = ss_elem2[1] - 1
            elem_len2 = end2 - start2
            curr_ss2 = ss[start2]
            if curr_ss != curr_ss2:  # interaction between sheet and helix
                curr_ss2 = 'HE'
            ax.add_patch(
                plt.Rectangle((start, start2),
                              elem_len,
                              elem_len2,
                              facecolor=ss_col_dict[curr_ss2],
                              edgecolor='black',
                              lw=0.5,
                              zorder=0,
                              alpha=0.2))

    if pdb_flag:
        ax.scatter(ref_contacts_x,
                   ref_contacts_y,
                   marker='o',
                   c='#CCCCCC',
                   lw=0)
        fig.suptitle(
            '%s\nPPV = %.2f   intra-PPV = %.2f   inter-PPV = %.2f' %
            (contact_filename, PPVs[-1], intra_PPVs[-1], inter_PPVs[-1]))
    else:
        fig.suptitle('%s - %s' % (acc, contact_filename))

    # Lists are reversed so that the first-selected contacts are drawn last.
    sc = ax.scatter(contacts_x[::-1],
                    contacts_y[::-1],
                    marker='o',
                    c=scores[::-1],
                    s=4,
                    alpha=0.75,
                    cmap=cm.jet,
                    linewidths=0.5)

    plt.gca().set_xlim([0, ref_len])
    plt.gca().set_ylim([0, ref_len])
    plt.colorbar(sc)

    pp = PdfPages('%s_ContactMap.pdf' % contact_filename)
    pp.savefig(fig)
    pp.close()

    if pdb_flag:
        # Reuse the figure for the PPV-versus-rank curve.
        fig.clf()
        ax2 = fig.add_subplot(111)
        ax2.plot(PPVs)
        pp = PdfPages('%s_PPVs.pdf' % contact_filename)
        pp.savefig(fig)
        pp.close()
Пример #7
0
def _plot_map_coverage(ax, coverage_lst, max_cover, ref_len, unit):
    """Draw the coverage profile left of and above the contact-map axes `ax`."""
    positions = [0] + list(range(ref_len)) + [ref_len - 1]
    profile = [0] + coverage_lst + [0]
    guide_levels = [max_cover * frac for frac in (0.25, 0.5, 0.75)]

    # left panel: coverage grows towards the left, residues run along y
    ax2 = plt.subplot2grid((8, 8), (1, 0), rowspan=7, sharey=ax)
    ax2.autoscale(False)
    ax2.plot(profile, positions, 'k', lw=0)
    for level in guide_levels:
        ax2.axvline(x=level, lw=0.5, c='black', ls=':')
    ax2.fill(profile, positions, facecolor='gray', lw=0, alpha=0.5)
    ax2.set_xticks([0, max_cover])
    ax2.tick_params(axis='x', top=False, direction='out')
    ax2.invert_xaxis()
    ax2.grid()
    ax2.set_ylim([-unit, ref_len])

    # top panel: residues run along x, coverage along y
    ax3 = plt.subplot2grid((8, 8), (0, 1), colspan=7, sharex=ax)
    ax3.autoscale(False)
    ax3.plot(positions, profile, 'k', lw=0)
    for level in guide_levels:
        ax3.axhline(y=level, lw=0.5, c='black', ls=':')
    ax3.fill(positions, profile, facecolor='gray', lw=0, alpha=0.5)
    ax3.set_yticks([0, max_cover])
    ax3.tick_params(labelbottom=False)
    # NOTE(review): the original applies this y-axis setting to ax2 (the left
    # panel) at this point, not to ax3 -- kept as is, confirm it is intended.
    ax2.tick_params(axis='y', right=False, direction='out', left=True)
    ax3.grid()
    ax3.set_xlim([-unit, ref_len])


def _plot_map_ss_bars(ax, ss, unit):
    """Draw helix ('H') and strand ('E') positions as bars along both axes."""
    ss_colors = {'H': '#8B0043', 'E': '#0080AD'}

    ax.axhline(y=0, lw=1, c='black')
    ax.axvline(x=0, lw=1, c='black')
    for i, state in enumerate(ss):
        color = ss_colors.get(state)
        if color is None:
            # any other state (e.g. 'C') is left blank
            continue
        ax.add_patch(
            plt.Rectangle((-unit, i - 0.5),
                          unit,
                          1,
                          edgecolor=color,
                          facecolor=color))
        ax.add_patch(
            plt.Rectangle((i - 0.5, -unit),
                          1,
                          unit,
                          edgecolor=color,
                          facecolor=color))


def _plot_map_gap_coordinates(atom_seq_ali, seq_ali, res_lst, cb_lst):
    """Map PDB coordinates onto sequence positions, '-' where PDB has a gap."""
    gapped_res_lst = []
    gapped_cb_lst = []
    j = 0  # index of the next unused PDB residue
    for atom_aa, seq_aa in zip(atom_seq_ali, seq_ali):
        if atom_aa == '-':
            if seq_aa != '-':
                # sequence position without structural counterpart
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
        elif seq_aa == '-':
            # PDB residue that is not part of the sequence: skip it
            j += 1
        else:
            gapped_res_lst.append(res_lst[j])
            gapped_cb_lst.append(cb_lst[j])
            j += 1
    return gapped_res_lst, gapped_cb_lst


def _plot_map_save(fig, outfilename, c_filename):
    """Write `fig` to disk; the format follows the extension of `outfilename`."""
    if outfilename.endswith('.eps'):
        fig.savefig(outfilename, format='eps', dpi=300)
        return
    if outfilename.endswith('.png'):
        fig.savefig(outfilename)
        return

    if not outfilename:
        pdf_name = '%s_ContactMap.pdf' % c_filename
    elif outfilename.endswith('.pdf'):
        pdf_name = outfilename
    else:
        pdf_name = '%s.pdf' % outfilename

    pp = PdfPages(pdf_name)
    try:
        pp.savefig(fig)
    finally:
        pp.close()


def plot_map(fasta_filename,
             c_filename,
             factor=1.0,
             th=-1,
             f_obs=-1,
             c2_filename='',
             psipred_horiz_fname='',
             psipred_vert_fname='',
             pdb_filename='',
             is_heavy=False,
             chain='',
             sep=',',
             outfilename='',
             ali_filename='',
             meff_filename='',
             name='',
             start=0,
             end=-1,
             pdb_start=0,
             pdb_end=-1,
             noalign=False,
             pdb_alignment='',
             pdb_id='',
             binary=False):
    """Plot a predicted contact map and save it as PDF, EPS or PNG.

    The predicted contacts are cut by exactly one of three criteria: the top
    `factor` * sequence-length contacts (default), all contacts down to score
    `th`, or the top `f_obs` * (number of reference contacts) when a PDB file
    is given. The cut-off loops stop at the first entry that satisfies the
    criterion, which presumably relies on the contact files being sorted by
    descending score -- verify against `parse_contacts.parse`.

    When a PDB file is given, one line '<acc> <PPV> <TP> <FP>' is printed to
    stdout.

    Parameters:
        fasta_filename: FASTA file; its first sequence is used.
        c_filename: contact file; each parsed entry is read as
            (score, i, j) with 1-based residue indices.
        factor: number of contacts per residue to keep (used only when
            neither `th` nor `f_obs` is set).
        th: score threshold, -1 meaning "not set".
        f_obs: fraction of observed contacts to keep, -1 meaning "not set".
        c2_filename: optional second contact file, drawn mirrored
            (x and y swapped) relative to the first one.
        psipred_horiz_fname, psipred_vert_fname: optional PSIPRED output;
            the horizontal file takes precedence.
        pdb_filename: optional reference structure.
        is_heavy: use heavy-atom distances (cut-off 5) instead of CB
            distances (cut-off 8) for the reference contacts.
        chain: PDB chain passed on to the PDB parser.
        sep: column separator of `c_filename`.
        outfilename: output file; '.pdf', '.eps' and '.png' select the
            format, any other name gets '.pdf' appended. If empty, the
            figure goes to '<c_filename>_ContactMap.pdf'.
        ali_filename, meff_filename: optional alignment / Meff file used to
            draw coverage profiles; the alignment takes precedence.
        name: title of the plot; defaults to the FASTA basename without
            its extension.
        start, end: 0-based half-open range of the sequence to plot;
            end == -1 means "to the end".
        pdb_start, pdb_end: same for the residues read from the PDB file.
        noalign: skip the sequence-to-structure alignment and treat the
            PDB residues as matching the sequence positions one to one.
        pdb_alignment, pdb_id: a3m alignment and entry id from which the
            sequence-to-structure alignment is taken instead of computing
            a pairwise alignment.
        binary: passed through to `get_colors`.
    """
    if name == '':
        acc = '.'.join(os.path.basename(fasta_filename).split('.')[:-1])
    else:
        acc = name

    ### get sequence
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    full_len = len(seq)

    ### trim sequence according to given positions
    ### default: take full sequence
    if end == -1:
        end = full_len
    seq = seq[start:end]
    ref_len = len(seq)
    unit = (ref_len / 50.0)

    ### get top "factor" * "ref_len" predicted contacts
    with open(c_filename, 'r') as c_file:
        contacts = parse_contacts.parse(c_file, sep)
    # The dense map has to span the untrimmed sequence, because the contact
    # indices and the [start:end] slice below both refer to it.
    contacts_np = parse_contacts.get_numpy_cmap(contacts, seq_len=full_len)
    contacts_np = contacts_np[start:end, start:end]

    contacts_x = []
    contacts_y = []
    scores = []

    count = 0
    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        # only look at contacts within given range
        # default: take full sequence range into account
        if c_x < start or c_x >= end:
            continue
        if c_y < start or c_y >= end:
            continue

        # skip contacts closer than 5 residues in sequence
        if abs(c_x - c_y) >= 5:
            contacts_x.append(c_x - start)
            contacts_y.append(c_y - start)
            scores.append(score)
            count += 1

        if count >= ref_len * factor and th == -1 and f_obs == -1:
            th = score
            break

        if score < th and not th == -1 and f_obs == -1:
            factor = count / float(ref_len)
            break

        # if cutoff by fraction of observed contacts (f_obs):
        # take all contacts here and cut the list after reading the pdb

    ### start plotting
    fig = plt.figure(figsize=(8, 8), dpi=96, facecolor='w')
    ax = fig.add_subplot(111)
    # 'box' is what 'box-forced' meant for an unshared axes; the latter was
    # removed from newer matplotlib releases.
    ax.set_adjustable('box')
    # booleans instead of 'on'/'off': newer matplotlib no longer interprets
    # the strings
    ax.tick_params(direction='out', right=False, top=False)
    ax.set_xlim([-unit, ref_len])
    ax.set_ylim([-unit, ref_len])

    ### plot alignment coverage if alignment given
    if ali_filename or meff_filename:
        # adjust overall canvas
        ax = plt.subplot2grid((8, 8), (1, 1), colspan=7, rowspan=7)
        ax.autoscale(False)
        ax.tick_params(direction='out',
                       labelleft=False,
                       right=False,
                       top=False)
        ax.set_xlim([-unit, ref_len])
        ax.set_ylim([-unit, ref_len])

        if ali_filename:
            coverage_lst, max_cover = get_ali_coverage(ali_filename)
        else:
            coverage_lst, max_cover = get_meff_coverage(meff_filename)

        _plot_map_coverage(ax, coverage_lst[start:end], max_cover, ref_len,
                           unit)

    ### plot secondary structure along axis if given
    if psipred_horiz_fname or psipred_vert_fname:
        if psipred_horiz_fname:
            with open(psipred_horiz_fname, 'r') as ss_file:
                ss = parse_psipred.horizontal(ss_file)
        else:
            with open(psipred_vert_fname, 'r') as ss_file:
                ss = parse_psipred.vertical(ss_file)

        ss = ss[start:end]
        assert len(ss) == ref_len

        _plot_map_ss_bars(ax, ss, unit)

    ### plot reference contacts in the background if given
    if pdb_filename:
        with open(pdb_filename, 'r') as pdb_file:
            res_lst = parse_pdb.get_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        # if not true there is some serious problem with the provided pdb file
        assert len(res_lst) == len(cb_lst) == len(atom_seq)

        ### trim PDB sequence according to given positions
        ### default: take full sequence
        if pdb_end == -1:
            pdb_end = len(res_lst)
        res_lst = res_lst[pdb_start:pdb_end]
        cb_lst = cb_lst[pdb_start:pdb_end]
        atom_seq = atom_seq[pdb_start:pdb_end]

        if noalign:
            # No alignment: use the PDB residues as they are. This path used
            # to bind differently named results (PPV/TP/FP) and left th_obs
            # and the reference contacts undefined, so it always failed
            # further down; it now shares the code below with the aligned
            # path.
            atom_seq_ali = None
            gapped_res_lst = res_lst
            gapped_cb_lst = cb_lst
        else:
            if pdb_alignment and pdb_id:
                seq_ali, atom_seq_ali = parse_a3m.get_pairwise(
                    pdb_alignment, pdb_id)
                seqres_seq = atom_seq_ali.replace('-', '')

                # align the residues present in the structure to the ungapped
                # PDB entry of the a3m file, then embed that alignment into
                # the a3m alignment
                align_seqres = pairwise2.align.globalms(
                    atom_seq, seqres_seq, 2, -1, -0.5, -0.1)
                atom_seq_ali0 = align_seqres[-1][0]
                seqres_seq_ali0 = align_seqres[-1][1]
                atom_seq_ali, seq_ali = embedd_alignment(
                    atom_seq_ali0, seqres_seq_ali0, atom_seq_ali, seq_ali)
            else:
                matrix = matlist.blosum62
                align = pairwise2.align.globalds(atom_seq, seq, matrix, -25,
                                                 -1)
                atom_seq_ali = align[-1][0]
                seq_ali = align[-1][1]

            gapped_res_lst, gapped_cb_lst = _plot_map_gap_coordinates(
                atom_seq_ali, seq_ali, res_lst, cb_lst)

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            dist_cutoff = 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            dist_cutoff = 8
        ref_contact_map = dist_mat < dist_cutoff
        ref_contacts_x, ref_contacts_y = np.where(ref_contact_map)

        # if f_obs given, take top f_obs * num_obs contacts:
        if f_obs != -1:
            # x - y >= 5 counts every sufficiently separated pair once
            num_obs = int(
                np.count_nonzero(ref_contacts_x - ref_contacts_y >= 5))
            num_top = int(ceil(f_obs * num_obs))
            contacts_x = contacts_x[:num_top]
            contacts_y = contacts_y[:num_top]
            scores = scores[:num_top]
            # nothing may be left after the cut; fall back to th then
            th_obs = scores[-1] if scores else th
        else:
            th_obs = th

        # the alignment is only handed on when one was actually made
        ali_kwargs = {} if noalign else {'atom_seq_ali': atom_seq_ali}
        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  ref_len, factor, **ali_kwargs)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  **ali_kwargs)

        if not c2_filename:
            img = get_colors(contacts_np,
                             ref_contact_map=dist_mat,
                             th=th_obs,
                             binary=binary)
        else:
            # plot native contacts in background
            img = get_ref_img(dist_mat)
        ax.imshow(img, interpolation='none')

        print('%s %s %s %s' % (acc, PPVs[-1], TPs[-1], FPs[-1]))

    ### plot predicted contacts from second contact map if given
    if c2_filename:
        # NOTE(review): unlike the first map, this file is parsed with the
        # parser's default separator rather than `sep` -- confirm intended.
        with open(c2_filename, 'r') as c2_file:
            contacts2 = parse_contacts.parse(c2_file)
        contacts2_x = []
        contacts2_y = []

        count = 0
        for contact in contacts2:
            score = contact[0]
            c_x = contact[1] - 1
            c_y = contact[2] - 1

            # same range restriction as for the first map; the coordinates
            # are shifted by `start` below and would otherwise fall outside
            # of the plotted region
            if c_x < start or c_x >= end:
                continue
            if c_y < start or c_y >= end:
                continue

            if abs(c_x - c_y) >= 5:
                contacts2_x.append(c_x - start)
                contacts2_y.append(c_y - start)
                count += 1

            if count >= ref_len * factor and th == -1 and f_obs == -1:
                th = score
                break

            if score < th and not th == -1 and f_obs == -1:
                factor = count / float(ref_len)
                break

            # if cutoff by fraction of observed contacts:
            # stop at num_top = f_obs * num_obs
            if pdb_filename and f_obs != -1 and count >= num_top:
                factor = count / float(ref_len)
                th = score
                break

        ### use TP/FP color coding if reference contacts given
        if pdb_filename:
            if noalign:
                tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                           ref_contact_map)
            else:
                tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                           ref_contact_map, atom_seq_ali)
            ax.scatter(contacts2_y[::-1],
                       contacts2_x[::-1],
                       marker='s',
                       c=tp2_colors[::-1],
                       s=4,
                       alpha=1,
                       lw=0,
                       edgecolor=tp2_colors[::-1])
            ax.scatter(contacts_x[::-1],
                       contacts_y[::-1],
                       marker='s',
                       c=tp_colors[::-1],
                       s=4,
                       alpha=1,
                       lw=0,
                       edgecolor=tp_colors[::-1])
        else:
            # marker was '0', which matplotlib does not know; 'o' matches
            # the scatter of the first map right below
            ax.scatter(contacts2_y[::-1],
                       contacts2_x[::-1],
                       marker='o',
                       c='#D70909',
                       edgecolor='#D70909',
                       s=6,
                       linewidths=0.5)
            ax.scatter(contacts_x[::-1],
                       contacts_y[::-1],
                       marker='o',
                       c='#004F9D',
                       edgecolor='#004F9D',
                       s=6,
                       linewidths=0.5)

    ### plot predicted contacts from first contact map on both triangles
    ### if no second contact map given
    else:
        if pdb_filename:
            # the map itself has already been drawn together with the
            # reference contacts above; only the title is missing
            with open(pdb_filename) as pdb_file:
                pdb_acc = parse_pdb.get_acc(pdb_file)
            if pdb_acc:
                if chain:
                    fig.suptitle('%s (PDB: %s, chain %s)\nPPV = %.2f' %
                                 (acc, pdb_acc, chain, PPVs[-1]))
                else:
                    fig.suptitle('%s (PDB: %s)\nPPV = %.2f' %
                                 (acc, pdb_acc, PPVs[-1]))
            else:
                fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
        else:
            fig.suptitle('%s' % acc)
            img = get_colors(contacts_np, th=th)
            ax.imshow(img, interpolation='none')

    ax.grid()
    ax.set_xlim([-unit, ref_len])
    ax.set_ylim([-unit, ref_len])
    ax.axis([-unit, ref_len, -unit, ref_len])
    ax.set_autoscale_on(False)

    _plot_map_save(fig, outfilename, c_filename)
def fix(pdb1_filename, pdb2_filename, out_filename, chain1='', chain2=''):
    """Renumber the residues of a PDB chain after a reference PDB chain.

    The ATOM sequences of both chains are aligned globally. Every residue of
    pdb2 gets the 1-based position of its aligned partner within the pdb1
    sequence written into columns 23-26 of its atom records. Residues of pdb2
    that would get number 0 (they precede the first pdb1 residue) or the same
    number as the residue kept before them are dropped.

    Parameters:
        pdb1_filename: PDB file providing the reference numbering.
        pdb2_filename: PDB file to renumber.
        out_filename: output file; if empty, the result is written to
            '<pdb2_filename without extension>.aligned.pdb'.
        chain1, chain2: chains to use; each defaults to the first chain of
            its file.
    """
    import os

    if not chain1:
        with open(pdb1_filename, 'r') as pdb1_file:
            chain1 = parse_pdb.get_first_chain(pdb1_file)
    with open(pdb1_filename, 'r') as pdb1_file:
        seq1 = parse_pdb.get_atom_seq(pdb1_file, chain1)

    if not chain2:
        with open(pdb2_filename, 'r') as pdb2_file:
            chain2 = parse_pdb.get_first_chain(pdb2_file)
    with open(pdb2_filename, 'r') as pdb2_file:
        pdb2 = parse_pdb.read(pdb2_file, chain2)
    with open(pdb2_filename, 'r') as pdb2_file:
        seq2 = parse_pdb.get_atom_seq(pdb2_file, chain2)

    align = pairwise2.align.globalms(seq1, seq2, 2, -1, -0.5, -0.1)
    seq1_ali = align[-1][0]
    seq2_ali = align[-1][1]

    # One new number per pdb2 residue: the count of pdb1 residues seen up to
    # and including the current alignment column.
    pdb2_idx = []
    seq1_pos = 0
    for aa1, aa2 in zip(seq1_ali, seq2_ali):
        if aa1 == '-':
            # pdb2 residue without partner: repeats the last pdb1 position
            pdb2_idx.append(seq1_pos)
        else:
            seq1_pos += 1
            if aa2 != '-':
                pdb2_idx.append(seq1_pos)

    # pdb2[1] is iterated as a list of residues, each a list of atom lines;
    # the first field is left empty and pdb2[2] is carried over unchanged.
    pdb2_new = ['', [], pdb2[2]]
    prev_idx = -1
    for res, new_idx in zip(pdb2[1], pdb2_idx):
        if new_idx == 0 or new_idx == prev_idx:
            continue
        # residue number field: columns 23-26, right-justified
        idx_field = str(new_idx).rjust(4)
        pdb2_new[1].append([atm[:22] + idx_field + atm[26:] for atm in res])
        prev_idx = new_idx

    if not out_filename:
        # splitext keeps dots in directory names and extension-less names
        # intact
        out_filename = os.path.splitext(pdb2_filename)[0] + '.aligned.pdb'
    with open(out_filename, 'w') as pdb2_outfile:
        parse_pdb.write(pdb2_new, pdb2_outfile)