示例#1
0
def observed_lengths(args):
    """
    Plot normalised histograms of the observed window lengths and of the gaps
    between consecutive windows, over every entry of the Rost lab database.

    :param args: run arguments; ``args['data_base']`` selects which field of
                 each database entry is analysed and is echoed in the x label
                 (the original docstring names PDBTM and OPM as the options)
    :return: None, the figure is displayed with ``plt.show()``
    """
    rost_db = parse_rostlab_db()
    lengths = []
    between_helices_lengths = []

    for k, v in rost_db.items():
        topc = spc_parser(k)
        # number of 'S'/'s' characters in the topcons string, handed on as the
        # signal peptide argument
        signal_peptide = topc['topcons'].count('S') + topc['topcons'].count(
            's')
        obs_loc_list = pdbtm_opm_loc_list(v[args['data_base']], signal_peptide)

        last_index = len(obs_loc_list) - 1
        for i, w in enumerate(obs_loc_list):
            lengths.append(w[1] - w[0])

            # Every window but the last has a successor. A plain index
            # comparison replaces the former ``i + 1 in range(...)`` test,
            # which built a new list for every window under Python 2.
            if i < last_index:
                between_helices_lengths.append(obs_loc_list[i + 1][0] - w[1])

    plt.hist(lengths, 30, normed=1, facecolor='green', alpha=0.75)
    plt.hist(between_helices_lengths,
             100,
             normed=1,
             facecolor='blue',
             alpha=0.5)
    plt.xlabel('Window lengths in %s dataset' % args['data_base'])
    plt.ylabel('Frequency')
    plt.xlim([0, 100])
    plt.grid(True)
    plt.show()
def observed_lengths(args):
    """
    Show two overlaid, normalised histograms: the length of every observed
    window and the distance from each window's end to the next window's start.

    :param args: run arguments, ``args['data_base']`` names the database field
                 to read from every entry and appears in the x-axis label
    :return: None (the plot is shown interactively)
    """
    window_sizes = []
    loop_sizes = []

    for entry_id, entry in parse_rostlab_db().items():
        topcons_ts = spc_parser(entry_id)['topcons']
        # count of 'S' and 's' characters, passed on as the signal peptide
        n_signal = topcons_ts.count('S') + topcons_ts.count('s')
        windows = pdbtm_opm_loc_list(entry[args['data_base']], n_signal)

        n_windows = len(windows)
        for idx, win in enumerate(windows):
            window_sizes.append(win[1] - win[0])

            following = idx + 1
            if 0 <= following < n_windows:
                loop_sizes.append(windows[following][0] - win[1])

    # (values, bin count, colour, opacity) for each histogram, drawn in order
    hist_specs = [
        (window_sizes, 30, 'green', 0.75),
        (loop_sizes, 100, 'blue', 0.5),
    ]
    for values, bins, colour, opacity in hist_specs:
        plt.hist(values, bins, normed=1, facecolor=colour, alpha=opacity)

    plt.xlabel('Window lengths in %s dataset' % args['data_base'])
    plt.ylabel('Frequency')
    plt.xlim([0, 100])
    plt.grid(True)
    plt.show()
示例#3
0
def check_all_aa_points_as_boxplot():
    """
    Box-plot the per-residue 'c', 'e' and 'h' scores returned by
    ``parse_psipred``, split by whether the residue is marked 'h'
    (case-insensitive) in the entry's ``'pdbtm'`` string.

    :return: None, the figure is displayed with ``plt.show()``
    """
    from TMpredict_WinGrade import parse_rostlab_db
    import matplotlib.pyplot as plt
    import numpy as np

    in_membrane = []
    out_of_membrane = []
    for entry_id, entry in parse_rostlab_db().items():
        scores = parse_psipred(entry_id)
        for idx, state in enumerate(entry['pdbtm']):
            # scores are looked up at idx + 1, i.e. one past the string index
            target = in_membrane if state.lower() == 'h' else out_of_membrane
            target.append(scores[idx + 1])

    positions = np.arange(6)
    plt.subplot(111)
    labels = [
        'MM Coil', 'MM sheet', 'MM Helix', 'no Coil', 'no sheet', 'no Helix'
    ]
    # one series per label, in label order: c/e/h for the 'h' residues first,
    # then c/e/h for the remaining residues
    series = []
    for group in (in_membrane, out_of_membrane):
        for key in ('c', 'e', 'h'):
            series.append([record[key] for record in group])
    plt.boxplot(series, labels=labels, positions=positions)
    plt.ylim([-0.5, 1.5])
    plt.show()
def check_all_aa_points_as_boxplot():
    """
    Draw six box plots of the 'c' / 'e' / 'h' values from ``parse_psipred``:
    three for residues whose ``'pdbtm'`` character is 'h' or 'H', three for
    all other residues.

    :return: None (the plot is shown interactively)
    """
    from TMpredict_WinGrade import parse_rostlab_db
    import matplotlib.pyplot as plt
    import numpy as np

    # keyed by "is this residue marked 'h'?"
    by_membership = {True: [], False: []}
    rostlab_dict = parse_rostlab_db()
    for prot_id, record in rostlab_dict.items():
        psipred = parse_psipred(prot_id)
        for offset, letter in enumerate(record['pdbtm']):
            # the score table is read at offset + 1
            by_membership[letter.lower() == 'h'].append(psipred[offset + 1])

    membranal = by_membership[True]
    non_membranal = by_membership[False]

    def column(rows, key):
        # pull one score out of every per-residue record
        return [row[key] for row in rows]

    positions = np.arange(6)
    plt.subplot(111)
    labels = ['MM Coil', 'MM sheet', 'MM Helix', 'no Coil', 'no sheet', 'no Helix']
    data = [
        column(membranal, 'c'),
        column(membranal, 'e'),
        column(membranal, 'h'),
        column(non_membranal, 'c'),
        column(non_membranal, 'e'),
        column(non_membranal, 'h'),
    ]
    plt.boxplot(data, labels=labels, positions=positions)
    plt.ylim([-0.5, 1.5])
    plt.show()
示例#5
0
def main():
    # check_all_aa_points_as_boxplot()
    from TMpredict_WinGrade import parse_rostlab_db
    import matplotlib.pyplot as plt
    import numpy as np
    IS_BETA_CUTOFF = 0.3
    BETA_NUM_CUTOFF = 5
    missed_h = 0
    rostlab_dict = parse_rostlab_db()
    for k, v in rostlab_dict.items():
        psipred = parse_psipred(k)
        # print v['seq']
        # print v['pdbtm']
        tms = ts2hp_seq(v['seq'], v['pdbtm'])
        for tm in tms:
            are_beta = 0
            for i in range(tm[0], tm[1] + 1):
                if psipred[i]['e'] >= IS_BETA_CUTOFF:
                    are_beta += 1
            if are_beta >= BETA_NUM_CUTOFF:
                missed_h += 1
                print "MISSED ME!!!!", k, tm, [
                    psipred[a]['e'] for a in range(tm[0], tm[1] + 1)
                ], are_beta
    print "total misses (helices)", missed_h
def compare_just_one():
    """
    mode: one
    path: path to .prd
    name: protein id
    :return: compares a single prediction to it's database input.
    """
    from TMpredict_WinGrade import parse_rostlab_db
    from topcons_result_parser import topcons2rostlab_ts_format
    M = 10
    rostlab_db_dict = parse_rostlab_db()
    pred = prd_parser(args['path'], args['name'])
    obse = rostlab_db_dict[pred['name']]
    topc = spc_parser(pred['name'])
    predictors = {k: topcons2rostlab_ts_format(v) for k, v in topc.items() if k not in ['name', 'seq']}
    predictors_results = {k: None for k in topc.keys() if k not in ['name', 'seq']}
    # print 'in one'
    # print 'obse', obse['pdbtm']
    # print 'topo', pred['pred_ts']
    # print 'topcons', predictors['topcons']
    for predictor in predictors:
        print 'predictor', predictor
        comp_pdbtm = comparer(obse['pdbtm'], predictors[predictor], M, predictors['topcons'], pred['seq'])
        comp_opm = comparer(obse['opm'], predictors[predictor], M, predictors['topcons'], pred['seq'])
        predictors_results[predictor] = comp_pdbtm['overlapM_ok'] or comp_opm['overlapM_ok']
    comp_pdbtm = comparer(obse['pdbtm'], pred['pred_ts'], M, predictors['topcons'], pred['seq'])
    comp_opm = comparer(obse['opm'], pred['pred_ts'], M, predictors['topcons'], pred['seq'])
    if comp_opm['overlapM_ok'] and comp_pdbtm['overlapM_ok']: print 'TopoGraph is correct by both'
    elif comp_opm['overlapM_ok']: print 'TopoGraph is correct ONLY by OPM'
    elif comp_pdbtm['overlapM_ok']: print 'TopoGraph is correct ONLY by PDBTM'
    print 'com pdbtm', comp_pdbtm
    print 'com opm', comp_opm
    print predictors_results
def blast2fasta():
    '''
    :return: takes one blast .xml result from rost_msa_prep/blast and makes a multiple fasta file of the same sequences
    in the same folder
    '''
    from TMpredict_WinGrade import parse_rostlab_db
    name = args['name']
    path_bl = '/home/labs/fleishman/jonathaw/membrane_prediciton/data_sets/rostlab_db/rost_msa_prep/blast/'
    output_bl = path_bl + name + '_blast.xml'
    seq_dict = ncbiXML_parser(output_bl)
    print name
    query = {k: v for k, v in parse_rostlab_db().items() if k == name.split('_')[0]}.values()[0]
    # print query
    # print seq_dict
    with open('/home/labs/fleishman/jonathaw/membrane_prediciton/data_sets/rostlab_db/rost_msa_prep/blast/'
                      +name+'_blast.fa', 'wr+') as o:
        o.writelines('>%s\n' % name)
        o.writelines('%s\n' % query['seq'])
        for k, v in seq_dict.items():

            o.writelines('>%s\n' % k)
            o.writelines('%s\n' % v['hit_seq'])
    with open('/home/labs/fleishman/jonathaw/membrane_prediciton/data_sets/rostlab_db/rost_msa_prep/names.txt',
              'a') as o:
        o.write(name+'\n')
示例#8
0
def blast2fasta():
    '''
    :return: takes one blast .xml result from rost_msa_prep/blast and makes a multiple fasta file of the same sequences
    in the same folder
    '''
    from TMpredict_WinGrade import parse_rostlab_db
    name = args['name']
    path_bl = '/home/labs/fleishman/jonathaw/membrane_prediciton/data_sets/rostlab_db/rost_msa_prep/blast/'
    output_bl = path_bl + name + '_blast.xml'
    seq_dict = ncbiXML_parser(output_bl)
    print name
    query = {
        k: v
        for k, v in parse_rostlab_db().items() if k == name.split('_')[0]
    }.values()[0]
    # print query
    # print seq_dict
    with open(
            '/home/labs/fleishman/jonathaw/membrane_prediciton/data_sets/rostlab_db/rost_msa_prep/blast/'
            + name + '_blast.fa', 'wr+') as o:
        o.writelines('>%s\n' % name)
        o.writelines('%s\n' % query['seq'])
        for k, v in seq_dict.items():

            o.writelines('>%s\n' % k)
            o.writelines('%s\n' % v['hit_seq'])
    with open(
            '/home/labs/fleishman/jonathaw/membrane_prediciton/data_sets/rostlab_db/rost_msa_prep/names.txt',
            'a') as o:
        o.write(name + '\n')
def main_rost():
    prd_files = [a for a in os.listdir('./') if '.prd' in a and '_msa' not in a]
    rost_db = parse_rostlab_db()
    new_old = rost_new_old()
    topgraph_none = []

    follow = 'q8dkp6'

    old_new_totals = {'new': 0, 'old': 0}
    results = {}
    for prd_file in prd_files:
        name = prd_file.split('.')[0].lower()
        best_wgp, sec_wgp = parse_prd(prd_file)

        if best_wgp is None:
            topgraph_none.append(name)
            continue

        topc = spc_parser(name)

        signal_peptide = topc['topcons'].count('S') + topc['topcons'].count('s')

        best_wgp_loc_list = wgp_to_loc_list(best_wgp, signal_peptide)
        sec_wgp_loc_list = wgp_to_loc_list(sec_wgp, signal_peptide)

        old_new_totals[new_old[name]] += 1

        if name == follow:
            print 'at %s found loc list %r' % (name, best_wgp_loc_list)

        best_tgr_qok, best_tgr_ovm = qok_pdbtm_opm(rost_db[name], best_wgp_loc_list, signal_peptide, verbose=name==follow)
        sec_tgr_qok, sec_tgr_ovm = qok_pdbtm_opm(rost_db[name], sec_wgp_loc_list, signal_peptide)

        best_or_sec_qok = best_tgr_qok or sec_tgr_qok
        best_or_sec_ovm = best_tgr_ovm or sec_tgr_ovm

        results[name] = {'old_new': new_old[name],
                         'tm_num': len(pdbtm_opm_loc_list(rost_db[name]['pdbtm'], signal_peptide)),
                         'topgraph': {'qok': best_tgr_qok, 'ovm': best_tgr_ovm},
                         'best_or_sec': {'qok': best_or_sec_qok, 'ovm': best_or_sec_ovm}}

        for predictor in predictors:
            prd_qok, prd_ovm = qok_pdbtm_opm(rost_db[name], ts_loc_list(topc[predictor], signal_peptide), signal_peptide)
            results[name][predictor] = {'qok': prd_qok, 'ovm': prd_ovm}

    # prints resutls sliced by old/new
    print_results_by_old_new(results, predictors, old_new_totals)

    # prints results sliced by 1, 2-4 >4 TMHs
    print_results_by_tm_num(results)

    # print names TopGraph got wrong
    print_names_topgraph_got_wrong(results)

    # prints namse TopGraph got wrong by both best and sec best
    print_names_topgraph_got_wrong_best_and_sec(results)

    # print total percentage correct for TopGraph, TopGraph best or sec, and TOPCONS
    print_total_results(results)
示例#10
0
def main():
    """
    Write a two-line ``.prd`` file next to every TOPCONS result.

    Every file name in ``topcons_path`` that matches ``.*\\.txt`` is parsed
    with ``topcons_parser``; the lower-cased entry name and the entry's
    ``'rost_format_scampi'`` string are written to ``<name>.prd`` in the
    same directory.

    :return: None
    """
    import os
    import re
    from TMpredict_WinGrade import result_comparer, results_writer, parse_rostlab_db, result_comparer_10overlap
    topcons_path = '/home/labs/fleishman/jonathaw/membrane_topcons/topo_VH_topcons/all_results/'
    # result is not used below; the call is kept so behaviour is unchanged
    rostlab_db_dict = parse_rostlab_db()
    # raw string: same pattern, without the invalid '\.' string escape
    file_list = [x for x in os.listdir(topcons_path) if re.match(r'.*\.txt', x)]
    for file_i in file_list:
        entry = topcons_parser(topcons_path, file_i)
        topo_string = entry['rost_format_scampi']
        # 'w' replaces the malformed mode 'wr+' (ValueError on Python 3,
        # platform-dependent on Python 2); the file is only written
        with open(topcons_path + entry['name'].lower() + '.prd', 'w') as o:
            # write(), not writelines(): each call emits one string
            o.write('name %s\n' % entry['name'].lower())
            o.write('top %s\n' % topo_string)
def main():
    """
    Convert every TOPCONS ``.txt`` result into a minimal ``.prd`` file.

    For each directory entry of ``topcons_path`` matching ``.*\\.txt`` the
    file is parsed with ``topcons_parser`` and ``<name>.prd`` (lower-cased
    entry name) is written in the same directory with a ``name`` line and a
    ``top`` line holding the entry's ``'rost_format_scampi'`` string.

    :return: None
    """
    import os
    import re
    from TMpredict_WinGrade import result_comparer, results_writer, parse_rostlab_db, result_comparer_10overlap
    topcons_path = '/home/labs/fleishman/jonathaw/membrane_topcons/topo_VH_topcons/all_results/'
    # the parsed database is unused here; the call is retained as-is
    rostlab_db_dict = parse_rostlab_db()
    # raw string keeps the pattern identical while avoiding the invalid
    # '\.' escape inside a normal string literal
    txt_pattern = r'.*\.txt'
    file_list = [x for x in os.listdir(topcons_path)
                 if re.match(txt_pattern, x)]
    for file_i in file_list:
        entry = topcons_parser(topcons_path, file_i)
        topo_string = entry['rost_format_scampi']
        prd_path = topcons_path + entry['name'].lower() + '.prd'
        # The original mode 'wr+' is not a valid mode string: Python 3
        # rejects it and Python 2 defers to the C library. Write-only 'w'
        # is sufficient.
        with open(prd_path, 'w') as o:
            o.write('name %s\n' % entry['name'].lower())
            o.write('top %s\n' % topo_string)
def download_pdbs():
    """
    :return: downloads all PDBs for the rostlab database. only the ones actually available...
             print the names of those it failed
    """
    from Bio.PDB import PDBParser, PDBIO, PDBList
    from TMpredict_WinGrade import parse_rostlab_db
    rost_db = parse_rostlab_db()
    pdbl = PDBList()
    failed = []
    for k, v in rost_db.items():
        print k, v
        try:
            pdbl.retrieve_pdb_file(v['pdb'], pdir='PDB')
        except:
            failed.append(v['pdb'])
    print failed
def main():
    # check_all_aa_points_as_boxplot()
    from TMpredict_WinGrade import parse_rostlab_db
    import matplotlib.pyplot as plt
    import numpy as np
    IS_BETA_CUTOFF = 0.3
    BETA_NUM_CUTOFF = 5
    missed_h = 0
    rostlab_dict = parse_rostlab_db()
    for k, v in rostlab_dict.items():
        psipred = parse_psipred(k)
        # print v['seq']
        # print v['pdbtm']
        tms = ts2hp_seq(v['seq'], v['pdbtm'])
        for tm in tms:
            are_beta = 0
            for i in range(tm[0], tm[1]+1):
                if psipred[i]['e'] >= IS_BETA_CUTOFF:
                    are_beta += 1
            if are_beta >= BETA_NUM_CUTOFF:
                missed_h += 1
                print "MISSED ME!!!!", k, tm, [psipred[a]['e'] for a in range(tm[0], tm[1]+1)], are_beta
    print "total misses (helices)", missed_h
def main():
    from TMpredict_WinGrade import parse_rostlab_db
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    import operator
    psi_helix = [0.001, 0.005, 0.01, 0.1, 0.2, 0.3, 0.4]
    psi_res_num = [1, 2, 3, 4]
    total = 0
    results = {}
    for ph in psi_helix:
        for prn in psi_res_num:
            results[(ph, prn)] = 0

    rostlab_dict = parse_rostlab_db()
    for name, dict in rostlab_dict.items():
        psipred = PsiReaderHelix('/home/labs/fleishman/jonathaw/membrane_prediciton/data_sets/rostlab_db/psipred/'
                                 +name+'.ss2')
        assert len(psipred) == len(dict['seq']) == len(dict['pdbtm']) == len(dict['opm']), 'length unequal %s' % name
        for typ in ['pdbtm', 'opm']:
            helices = split_to_helices(dict['seq'], dict[typ], psipred)
            for h_seq, h_ss2 in helices:
                total += 1
                for ph in psi_helix:
                    for prn in psi_res_num:
                        if pass_helix(h_ss2, ph, prn):
                            results[(ph, prn)] += 1
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # for par, res in results.items():
    for par, res in sorted(results.items(), key=operator.itemgetter(1)):
        print 'psi_helix: %f psi_res_num %i result: %f' % (par[0], par[1], float(res)/float(total))
        ax.scatter(par[0], par[1], float(res)/float(total))
    ax.set_xlabel('psi_helix')
    ax.set_ylabel('psi_res_num')
    ax.set_zlabel('percent')
    plt.show()
def main():
    """
    Sort every segment returned by ``ts2hp_seq`` into "passed" (not flagged
    by ``is_not_helical``) or "didn't pass", and for the latter record
    whether ``try_to_pass`` succeeds. Failures that cannot be rescued are
    printed with their mean and per-position 'e'/'c'/'h' scores, and the
    three tallies are printed at the end.

    NOTE(review): the last five statements of this body (from ``res = 0``)
    use ``ind`` and ``ts``, which are not defined anywhere in this function.
    They look like the body of a different function whose ``def`` line was
    lost -- confirm before relying on the return value.
    """
    # check_all_aa_points_as_boxplot()
    from TMpredict_WinGrade import parse_rostlab_db
    import matplotlib.pyplot as plt
    import numpy as np
    rostlab_dict = parse_rostlab_db()
    passed = []
    didnt_pass = []
    could_pass = []
    for k, v in rostlab_dict.items():
        # print k, v
        psipred = parse_psipred(k, v['seq'])
        tms = ts2hp_seq(v['seq'], v['pdbtm'])
        # print v['seq']
        # print v['pdbtm']
        # print ''.join([str(a) for a in range(10)]*50)
        # print tms
        for tm in tms:
            # print 'testing', tm
            if is_not_helical(v['seq'], tm, psipred):
                # print tm, 'didnt pass'
                didnt_pass.append(tm)
                if try_to_pass(tm, v['seq'], psipred):
                    could_pass.append(tm)
                else:
                    # could not be rescued: print entry name, segment and the
                    # mean 'e', 'c' and 'h' scores over the inclusive segment
                    print v['name'], tm, np.mean([psipred[a]['e'] for a in range(tm[0], tm[1]+1)]), np.mean([psipred[a]['c'] for a in range(tm[0], tm[1]+1)]), np.mean([psipred[a]['h'] for a in range(tm[0], tm[1]+1)])
                    # followed by the individual scores of every position
                    for i in range(tm[0], tm[1]+1):
                        print 'c: %f h: %f e: %f' % (psipred[i]['c'], psipred[i]['h'], psipred[i]['e'])
            else:
                # print tm, 'has passed'
                passed.append(tm)

        # break
    print 'failed %i times' % len(didnt_pass)
    print 'could have passed %i' % len(could_pass)
    print 'succeeded %i times' % len(passed)
    # NOTE(review): orphaned fragment, see the docstring. It counts 'h'/'H'
    # characters of ``ts`` in the 20 positions starting at ``ind``.
    res = 0
    for i in range(ind, ind+20):
        if ts[i].lower() == 'h':
            res += 1
    # NOTE(review): ``res`` is an int, so under Python 2 (this file uses print
    # statements) ``res / 20`` is integer division and the test only holds
    # when all 20 positions matched -- confirm this is intended.
    return res / 20 >= 0.9

def is_not_helical(pos, psi):
    """
    Tell whether the span ``pos[0] .. pos[1] - 1`` fails the helix criteria.

    The span counts as helical only when its mean 'e' score is at most 0.3,
    its mean 'c' score is at most 0.48 and its mean 'h' score is at least
    0.3; the three checks are evaluated in that order and stop at the first
    failure.

    :param pos: indexable pair; ``pos[0]`` is the first position read and
                ``pos[1]`` is excluded
    :param psi: mapping from position to a mapping with 'e', 'c', 'h' scores
    :return: False when all three criteria hold, True otherwise
    """
    import numpy as np
    span = range(pos[0], pos[1])

    def mean_of(state):
        # average one score type over the whole span
        return np.mean([psi[a][state] for a in span])

    looks_helical = (mean_of('e') <= 0.3
                     and mean_of('c') <= 0.48
                     and mean_of('h') >= 0.3)
    return not looks_helical

from psipred_vs_mm_nomm import parse_psipred, psipred_avg
from TMpredict_WinGrade import parse_rostlab_db
import matplotlib.pyplot as plt
# Script section: for the single entry 'p02722', collect the 20-position
# sliding-window averages of the 'e', 'c' and 'h' scores via psipred_avg.
rost_db = parse_rostlab_db()
pdb_name = 'p02722'
psi = parse_psipred(pdb_name)
avg_b = []  # window averages of the 'e' score
avg_c = []  # window averages of the 'c' score
avg_h = []  # window averages of the 'h' score
indices = []  # not filled in the visible part of the script
tmh = []  # -0.1 where is_tmh() is true for the window start, else None
passed = []  # not filled in the visible part of the script
for i in range(len(psi)-20):

    # each window covers positions i+1 .. i+20
    avg_b.append(psipred_avg(range(i+1, i+21), psi, 'e'))
    avg_c.append(psipred_avg(range(i+1, i+21), psi, 'c'))
    avg_h.append(psipred_avg(range(i+1, i+21), psi, 'h'))

    # NOTE(review): is_tmh receives i, not i+1, unlike the windows above, and
    # no ``def is_tmh`` is visible in this part of the file -- confirm both.
    tmh.append(-0.1 if is_tmh(i, rost_db[pdb_name]['pdbtm']) else None)
def prd_directory(dir_path):
    """
    Compare every ``.prd`` prediction in a directory (names containing '_msa'
    are skipped) and the predictors returned by ``spc_parser`` against the
    'pdbtm' and 'opm' annotations of the Rost lab database.

    Reads the module-level ``args``: ``args['num_prd']`` is the minimum number
    of .prd files required and ``args['mode']`` selects the output.

    :param dir_path: path to directory to analyse
    :return: a ``(results, totals)`` pair of dicts keyed 'tm_1', 'tm_2_5',
             'tm_5'. Both are all zeros when fewer than ``args['num_prd']``
             files are found; in 'ROC' mode ``results`` holds the 'pred_ts'
             counts. In any other mode the statistics are printed, a bar
             chart is shown and nothing is returned.
    """
    import re, os
    # from TMpredict_WinGrade import parse_rostlab_db
    from topcons_result_parser import topcons2rostlab_ts_format
    import matplotlib.pyplot as plt
    import matplotlib
    import numpy as np
    # passed to every comparer() call
    M = 10
    file_list = [x for x in os.listdir(dir_path) if re.match('.*\.prd', x) and '_msa' not in x]
    # too few predictions: report zero for every bucket
    if len(file_list) < args['num_prd']: return {'tm_1': 0, 'tm_2_5': 0, 'tm_5': 0}, {'tm_1': 0, 'tm_2_5': 0, 'tm_5': 0}
    rostlab_db_dict = parse_rostlab_db()
    # print rostlab_db_dict
    # this list only seeds ``results``; ``predictors`` is rebound to a dict of
    # topology strings for every file inside the loop below
    predictors = ['polyphobius', 'topcons', 'spoctopus', 'philius', 'octopus', 'scampi', 'pred_ts']
    results = {a: {'tm_1': 0, 'tm_2_5': 0, 'tm_5': 0} for a in predictors}
    totals = {'tm_1': 0, 'tm_2_5': 0, 'tm_5': 0}

    errors = {'over': 0, 'miss': 0, 'exact': 0, 'total': 0}
    we_got_wrong = []
    we_got_right = []
    topcons_got_right = []
    topgraph_c_term, topcons_c_term, c_term_total = 0, 0, 0
    for file_name in file_list:
        pred = prd_parser(dir_path, file_name)
        # if pred['name'] != 'p0c7b7': continue
        # retry with the lower-cased name when the first lookup fails
        # NOTE(review): the bare except catches every exception, not only a
        # missing key -- confirm that is intended.
        try:
            obse = rostlab_db_dict[pred['name']]
            topc = spc_parser(pred['name'])
        except:
            obse = rostlab_db_dict[pred['name'].lower()]
            topc = spc_parser(pred['name'].lower())
        predictors = {k: topcons2rostlab_ts_format(v) for k, v in topc.items() if k not in ['name', 'seq']}
        predictors['pred_ts'] = pred['pred_ts']
        # cleared at the end of the first predictor iteration that is not
        # skipped, so ``totals`` grows at most once per file
        first_passage = True

        topgraph_c_term += 1 if test_c_term(obse['pdbtm'], obse['opm'], predictors['pred_ts']) else 0
        topcons_c_term += 1 if test_c_term(obse['pdbtm'], obse['opm'], predictors['topcons']) else 0
        c_term_total += 1

        for predictor in predictors:
            # print "predictor", predictor, pred['name']
            comp_pdbtm = comparer(obse['pdbtm'], predictors[predictor], M, predictors['topcons'], pred['seq'])
            comp_opm = comparer(obse['opm'], predictors[predictor], M, predictors['topcons'], pred['seq'])
            # correct if the prediction agrees with either annotation
            overM = comp_pdbtm['overlapM_ok'] or comp_opm['overlapM_ok']

            if predictor == 'pred_ts' and overM:
                we_got_right.append(pred['name'])

            if predictor == 'pred_ts' and not overM:
                # print 'AAAAAAAHHHHHHH !!!!!! :('
                # print 'obse_tm_num', comp_pdbtm['obse_tm_num']
                # print 'pred_tm_num', comp_pdbtm['pred_tm_num']
                # print 'ok', comp_pdbtm['overlapM_ok_helices']
                # print '\n'
                we_got_wrong.append(pred['name'])
                # classify the error by comparing observed and predicted
                # counts from the pdbtm comparison
                if comp_pdbtm['obse_tm_num'] > comp_pdbtm['pred_tm_num']:
                    print 'MISS', pred['name'], comp_pdbtm['obse_tm_num']
                    errors['miss'] += 1
                elif comp_pdbtm['obse_tm_num'] < comp_pdbtm['pred_tm_num']:
                    print 'OVER', pred['name'], comp_pdbtm['obse_tm_num']
                    errors['over'] += 1
                else:
                    errors['exact'] += 1
                errors['total'] += 1
                print pred['name'], obse['pdb']
                print 'pred_ts', predictors['pred_ts']
                print 'AA seq ', pred['seq']
                print 'pdbtm  ', obse['pdbtm']

            if predictor == 'topcons' and overM:
                topcons_got_right.append(pred['name'])

            # entries with no observed TM in either annotation are not tallied
            if comp_pdbtm['obse_tm_num'] == 0 or comp_opm['obse_tm_num'] == 0: continue

            if comp_pdbtm['obse_tm_num'] != comp_opm['obse_tm_num']:
                # the annotations disagree on the count: bucket by whichever
                # annotation the prediction matched (pdbtm checked first)
                # NOTE(review): when the first predictor matches neither,
                # ``totals`` is not incremented for this file at all.
                if comp_pdbtm['overlapM_ok']:
                    results[predictor][tm_num2range(comp_pdbtm['obse_tm_num'])] += 1
                    if first_passage:
                        totals[tm_num2range(comp_pdbtm['obse_tm_num'])] += 1
                elif comp_opm['overlapM_ok']:
                    results[predictor][tm_num2range(comp_opm['obse_tm_num'])] += 1
                    if first_passage:
                        totals[tm_num2range(comp_opm['obse_tm_num'])] += 1
            else:
                results[predictor][tm_num2range(comp_opm['obse_tm_num'])] += 1 \
                    if (comp_pdbtm['overlapM_ok'] or comp_opm['overlapM_ok']) else 0
                if first_passage: totals[tm_num2range(comp_pdbtm['obse_tm_num'])] += 1
            first_passage = False
    if args['mode'] == 'ROC':
        data = {k: v for k, v in results['pred_ts'].items()}
        return data, totals
    else:
        print 'these are the names we got right:', we_got_right
        print 'results', results
        print 'totals', totals
        print 'errors', errors

        print 'at total topgrph got right', float(sum(results['pred_ts'].values())) / float(sum(totals.values()))
        print 'at total topcons got right', float(sum(results['topcons'].values())) / float(sum(totals.values()))

        print 'topcons got c_term right', topcons_c_term, 100.*topcons_c_term/c_term_total
        print 'topgraph got c_term right', topgraph_c_term, 100.*topgraph_c_term/c_term_total
        print 'total c_term tested', c_term_total

        print 'ASSAF!!!! TOPCONS GOT THESE RIGHT:', topcons_got_right

        plt.figure()
        # per-predictor accuracy in percent for every bucket
        data = {}
        for predictor, results_d in results.items():
            data[predictor] = {k: 100*float(v)/float(totals[k]) for k, v in results_d.items()}
        print 'pps', results['polyphobius']
        # font = {'family': 'normal', 'size': 22}
        # matplotlib.rc('font', **font)
        print data
        print 'range', np.arange(0, 1./3., 1./(7.*3.)), len(np.arange(0, 1./3., 1./(7.*3.)))
        # three bar groups (one per bucket), seven bars per group
        ind = np.arange(3)
        width = 1./3. * (1./7.)
        incs = np.arange(0, 1./3., 1./(7.*3.))
        colors = ['red', 'blue', 'green', 'black', 'orange', 'pink', 'grey']
        print ind
        plots = {}
        for predictor, details, inc, col in zip(data.keys(), data.values(), incs, colors):
            # print predictor, details, inc
            # NOTE(review): bar heights follow details.values() order, which
            # is assumed to line up with the '1', '2-5', '5<' tick labels --
            # confirm.
            plots[predictor] = plt.bar(ind + inc, details.values(), width, color=col)
        plt.ylim((0, 105))
        plt.xlim((-0.15, 3.7))
        plt.xticks(np.arange(3)+0.15, ['1', '2-5', '5<'])
        plt.xlabel('Number of TMH')
        plt.ylabel('Overlap 10 Accuracy (%)')
        plt.title('TMH prediction comparison')
        names = [k for k in plots.keys()]
        # NOTE(review): renames whichever key happens to come first in the
        # dict; presumably meant to relabel 'pred_ts' -- confirm.
        names[0] = 'TopoGraph'
        plt.legend(plots.values(), names, loc='upper right')
        plt.show()
def check_beta_average():
    '''
    main function here. tests every TM in the Rost data base for its average sheet propensity, also every non-TM.
    outputs the number of windows that will be discarded for each.
    :return:
    '''
    from TMpredict_WinGrade import parse_rostlab_db
    import matplotlib.pyplot as plt
    import numpy as np
    global IS_BETA_CUTOFF, IS_COIL_CUTOFF, IS_HELIX_CUTOFF
    IS_BETA_CUTOFF = 0.3
    IS_COIL_CUTOFF = 0.48
    IS_HELIX_CUTOFF = 0.3

    tm_missed_h = 0
    non_ym_missed_h = 0
    how_many_non_tms = 0
    tot_passed_of_tmh = 0
    tot_NOT_passed_of_tmh = 0
    tot_passed_of_NON_tmh = 0
    tot_NOT_passed_of_NON_tmh = 0
    rostlab_dict = parse_rostlab_db()
    for k, v in rostlab_dict.items():
        psipred = parse_psipred(k)
        tms = ts2hp_seq(v['seq'], v['pdbtm'])
        for tm in tms:
            # avg = sum([psipred[a]['e'] for a in range(tm[0], tm[1]+1)]) / len(range(tm[0], tm[1]+1))
            # avg = psipred_avg(range(tm[0], tm[1]+1), psipred, 'e')
            # avg_c = psipred_avg(range(tm[0], tm[1]+1), psipred, 'c')
            # avg_h = psipred_avg(range(tm[0], tm[1]+1), psipred, 'h')
            # med = psipred_median(range(tm[0], tm[1]+1), psipred)
            # avgs.append(avg)
        # if avg >= IS_BETA_CUTOFF or avg_c >= IS_COIL_CUTOFF or avg_h <= IS_HELIX_CUTOFF:
        #     print avg,avg_c,avg_h
            if not pass_thresholds(psipred, tm[0], tm[1]):
                tm_missed_h += 1

        ## check how many non_tms will be canceled thanks to threshold
        non_tms = ts2non_tms(v['seq'], v['pdbtm'])
        for non_tm in non_tms:
            rng = range(non_tm[0], non_tm[1]+1)
            for i in rng:
                if i+20 in rng:
                    # avg = psipred_avg(range(i, i+20), psipred, 'e')
                    # avg_c = psipred_avg(range(i, i+20), psipred, 'c')
                    # avg_h = psipred_avg(range(i, i+20), psipred, 'h')
                    # med = psipred_median(range(i, i+20), psipred)
                    how_many_non_tms += 1
                    # if (avg >= IS_BETA_CUTOFF or avg_c >= IS_COIL_CUTOFF or avg_h >= IS_HELIX_CUTOFF):
                    if not pass_thresholds(psipred, i, i+20):
                        non_ym_missed_h += 1

        for i in range(len(v['seq'])-20):
            if do_range_overlap_ranges(range(i+1, i+21), tms):
                if pass_thresholds(psipred, i+1, i+21):
                    tot_passed_of_tmh += 1
                else:
                    tot_NOT_passed_of_tmh += 1
            else:
                if pass_thresholds(psipred, i+1, i+21):
                    tot_passed_of_NON_tmh += 1
                else:
                    tot_NOT_passed_of_NON_tmh += 1
        # break

    print 'TM totalt misses (helices)', tm_missed_h
    print 'NON TM total misses (helices)', non_ym_missed_h
    print "overall %i non tms examined" % how_many_non_tms
    print "\nTotal helices passed and are TMHs %i, Total helices not pass and are TMHs %i" % (tot_passed_of_tmh, tot_NOT_passed_of_tmh)
    print "Total helices passed and are not TMHs %i, Total helices not pass and not TMHs %i" % (tot_passed_of_NON_tmh, tot_NOT_passed_of_NON_tmh)
示例#19
0
def check_beta_average():
    '''
    main function here. tests every TM in the Rost data base for its average sheet propensity, also every non-TM.
    outputs the number of windows that will be discarded for each.
    :return:
    '''
    from TMpredict_WinGrade import parse_rostlab_db
    import matplotlib.pyplot as plt
    import numpy as np
    global IS_BETA_CUTOFF, IS_COIL_CUTOFF, IS_HELIX_CUTOFF
    IS_BETA_CUTOFF = 0.3
    IS_COIL_CUTOFF = 0.48
    IS_HELIX_CUTOFF = 0.3

    tm_missed_h = 0
    non_ym_missed_h = 0
    how_many_non_tms = 0
    tot_passed_of_tmh = 0
    tot_NOT_passed_of_tmh = 0
    tot_passed_of_NON_tmh = 0
    tot_NOT_passed_of_NON_tmh = 0
    rostlab_dict = parse_rostlab_db()
    for k, v in rostlab_dict.items():
        psipred = parse_psipred(k)
        tms = ts2hp_seq(v['seq'], v['pdbtm'])
        for tm in tms:
            # avg = sum([psipred[a]['e'] for a in range(tm[0], tm[1]+1)]) / len(range(tm[0], tm[1]+1))
            # avg = psipred_avg(range(tm[0], tm[1]+1), psipred, 'e')
            # avg_c = psipred_avg(range(tm[0], tm[1]+1), psipred, 'c')
            # avg_h = psipred_avg(range(tm[0], tm[1]+1), psipred, 'h')
            # med = psipred_median(range(tm[0], tm[1]+1), psipred)
            # avgs.append(avg)
            # if avg >= IS_BETA_CUTOFF or avg_c >= IS_COIL_CUTOFF or avg_h <= IS_HELIX_CUTOFF:
            #     print avg,avg_c,avg_h
            if not pass_thresholds(psipred, tm[0], tm[1]):
                tm_missed_h += 1

        ## check how many non_tms will be canceled thanks to threshold
        non_tms = ts2non_tms(v['seq'], v['pdbtm'])
        for non_tm in non_tms:
            rng = range(non_tm[0], non_tm[1] + 1)
            for i in rng:
                if i + 20 in rng:
                    # avg = psipred_avg(range(i, i+20), psipred, 'e')
                    # avg_c = psipred_avg(range(i, i+20), psipred, 'c')
                    # avg_h = psipred_avg(range(i, i+20), psipred, 'h')
                    # med = psipred_median(range(i, i+20), psipred)
                    how_many_non_tms += 1
                    # if (avg >= IS_BETA_CUTOFF or avg_c >= IS_COIL_CUTOFF or avg_h >= IS_HELIX_CUTOFF):
                    if not pass_thresholds(psipred, i, i + 20):
                        non_ym_missed_h += 1

        for i in range(len(v['seq']) - 20):
            if do_range_overlap_ranges(range(i + 1, i + 21), tms):
                if pass_thresholds(psipred, i + 1, i + 21):
                    tot_passed_of_tmh += 1
                else:
                    tot_NOT_passed_of_tmh += 1
            else:
                if pass_thresholds(psipred, i + 1, i + 21):
                    tot_passed_of_NON_tmh += 1
                else:
                    tot_NOT_passed_of_NON_tmh += 1
        # break

    print 'TM totalt misses (helices)', tm_missed_h
    print 'NON TM total misses (helices)', non_ym_missed_h
    print "overall %i non tms examined" % how_many_non_tms
    print "\nTotal helices passed and are TMHs %i, Total helices not pass and are TMHs %i" % (
        tot_passed_of_tmh, tot_NOT_passed_of_tmh)
    print "Total helices passed and are not TMHs %i, Total helices not pass and not TMHs %i" % (
        tot_passed_of_NON_tmh, tot_NOT_passed_of_NON_tmh)

def is_not_helical(pos, psi):
    """
    Return True unless positions ``pos[0]`` up to (not including) ``pos[1]``
    satisfy all three criteria: mean 'e' <= 0.3, mean 'c' <= 0.48 and
    mean 'h' >= 0.3. The criteria are checked in that order and the first
    failure decides the result.

    :param pos: indexable pair of start (inclusive) and end (exclusive)
    :param psi: mapping of position to a mapping holding 'e', 'c', 'h' scores
    :return: bool, True when the span is not considered helical
    """
    import numpy as np
    # the negated form keeps the outcome for a NaN mean the same as a failed
    # comparison, i.e. "not helical"
    if not (np.mean([psi[a]['e'] for a in range(pos[0], pos[1])]) <= 0.3):
        return True
    if not (np.mean([psi[a]['c'] for a in range(pos[0], pos[1])]) <= 0.48):
        return True
    if not (np.mean([psi[a]['h'] for a in range(pos[0], pos[1])]) >= 0.3):
        return True
    return False


from psipred_vs_mm_nomm import parse_psipred, psipred_avg
from TMpredict_WinGrade import parse_rostlab_db
import matplotlib.pyplot as plt

# Script section: for the single entry 'p02722', collect the 20-position
# sliding-window averages of the 'e', 'c' and 'h' scores via psipred_avg.
rost_db = parse_rostlab_db()
pdb_name = 'p02722'
psi = parse_psipred(pdb_name)
avg_b = []  # window averages of the 'e' score
avg_c = []  # window averages of the 'c' score
avg_h = []  # window averages of the 'h' score
indices = []  # not filled in the visible part of the script
tmh = []  # -0.1 where is_tmh() is true for the window start, else None
passed = []  # not filled in the visible part of the script
for i in range(len(psi) - 20):

    # each window covers positions i + 1 .. i + 20
    avg_b.append(psipred_avg(range(i + 1, i + 21), psi, 'e'))
    avg_c.append(psipred_avg(range(i + 1, i + 21), psi, 'c'))
    avg_h.append(psipred_avg(range(i + 1, i + 21), psi, 'h'))

    # NOTE(review): is_tmh receives i, not i + 1, unlike the windows above,
    # and no ``def is_tmh`` is visible in this part of the file -- confirm.
    tmh.append(-0.1 if is_tmh(i, rost_db[pdb_name]['pdbtm']) else None)
def analyse():
    import pickle
    import os
    import sys
    import operator
    import random
    import matplotlib.pyplot as plt
    from TMpredict_WinGrade import parse_rostlab_db
    from WinGrade import topo_string_to_WGP
    from topo_strings_comparer import prd_parser, spc_parser
    total_sasa_dict = parse_standard_data()
    rost_db = parse_rostlab_db()
    neighbors = 0
    single = 0
    without_neighbours = 0
    accessible_vs_ddg = []
    for k, v in rost_db.items():
        if v['name'] in ['p01730', 'p19054', 'e1c9k9', 'q9qug3', 'p07471']:
            continue
        is_single = os.path.isfile('single_chains/%s_1.pdb' % v['pdb'])
        is_neighbor = os.path.isfile(
            'with_neighbours/%s_%s_with_neighbors.pdb' %
            (v['pdb'], v['chain'].upper()))
        is_no_neighbour = os.path.isfile(
            'without_neighbours/%s_%s_without_neighbours.pdb' %
            (v['pdb'], v['chain'].upper()))
        neighbors += 1 if is_neighbor else 0
        without_neighbours += 1 if is_no_neighbour else 0
        single += 1 if is_single else 0

        # before cutting out the spring:
        # prediction = prd_parser('/home/labs/fleishman/elazara/benchmark_paper_new/Mean/Plain', v['name']+'.prd')
        # after cutting out spring, with MSA:
        # prediction = prd_parser('/home/labs/fleishman/elazara/length_21/w_0_with_MSA/', v['name']+'.prd')
        # after cutting out spring, without MSA:
        prediction = prd_parser('/home/labs/fleishman/elazara/length_21/',
                                v['name'] + '.prd')
        wgp_pred = topo_string_to_WGP(prediction['best_path_ts'], v['seq'])

        spoc = spc_parser(v['name'])['spoctopus']
        signal = [0, spoc.count('s') + spoc.count('S')]

        if is_single:
            naccess = parse_rsa('single_chains/%s_1.rsa' % v['pdb'])
        else:
            naccess = parse_rsa('with_neighbours/%s_%s_with_neighbors.rsa' %
                                (v['pdb'], v['chain'].upper()))

        wgp_pdbtm = topo_string_to_WGP(v['pdbtm'], v['seq'])
        rost_aln, naccess_aln, score, beg, end = \
            pair_wise_aln_from_seqs(v['seq'], ''.join([a['type'] for a in naccess[v['chain']].values()]))
        for w in wgp_pdbtm.path:
            if w.begin <= signal[1]:
                print 'signes', w.begin, w.end, signal
                continue
            naccess_win = nacces_for_win(naccess, w, naccess_aln, rost_aln,
                                         total_sasa_dict, v['chain'])
            predicted = observed_found_in_prediction(w, wgp_pred)
            accessible_vs_ddg.append({
                'access': naccess_win,
                'predicted': predicted,
                'grade': w.grade
            })
    with open('pickled.obj', 'wb') as pkl:
        pickle.dump(accessible_vs_ddg, pkl)
    print 'its pickled'
    '''
    result = {}
    print file_name
    with open(file_name, 'r') as f:
        cont = f.read().split('\n')
    for item in cont:
        split = item.split()
        if len(split) > 1:
            result[split[0]] = split[1]
    return result

if __name__ == '__main__':
    # Script entry point: reads the prediction file <path>/<name>.prd with TMpredict_reader,
    # looks <name> up in the Rostlab data set and hands the predicted topology string
    # (entry['pred_ts']) together with the data set's -tech entry to pymol_mark_segments.
    # Example: -path /home/labs/fleishman/jonathaw/membrane_prediction_DBs/ROC_6.4.2015/ROC_-3.0_18_0.2_2 -name p00423
    import argparse
    import os
    from TMpredict_WinGrade import parse_rostlab_db

    parser = argparse.ArgumentParser()
    # -name is mandatory: both the .prd file name and the data set key are built from it
    parser.add_argument('-name', type=str, required=True)
    parser.add_argument('-path', default=os.getcwd(), type=str)
    parser.add_argument('-tech', default='opm', type=str)
    args = vars(parser.parse_args())

    entry = TMpredict_reader(os.path.join(args['path'], args['name'] + '.prd'))
    rostlab_data = parse_rostlab_db()[args['name']]
    pymol_mark_segments(rostlab_data['pdb'], rostlab_data['chain'], entry['pred_ts'], rostlab_data[args['tech']],
                        rostlab_data['seq'], args['tech'])
示例#23
0
def main_rost():
    """
    Scores the predictions found in the .prd files of the current directory (files with '_msa'
    in their name are ignored) against the Rostlab data set, side by side with every predictor
    listed in the module level `predictors`, and prints the summaries.

    For each .prd file the best and second best paths are scored with qok_pdbtm_opm, and a
    "best or second best" result is derived from the two.
    """
    prd_files = [
        a for a in os.listdir('./') if '.prd' in a and '_msa' not in a
    ]
    rost_db = parse_rostlab_db()
    new_old = rost_new_old()
    # names for which parse_prd returned no best path
    topgraph_none = []

    # entry to trace: its loc list is printed and its scoring runs with verbose=True
    follow = 'q8dkp6'

    # number of scored entries per new_old label
    old_new_totals = {'new': 0, 'old': 0}
    results = {}
    for prd_file in prd_files:
        # the entry name is the file name up to its first '.', lower cased
        name = prd_file.split('.')[0].lower()
        best_wgp, sec_wgp = parse_prd(prd_file)

        if best_wgp is None:
            topgraph_none.append(name)
            continue

        topc = spc_parser(name)

        # the number of 'S'/'s' characters in the topcons string is used as the signal peptide length
        signal_peptide = topc['topcons'].count('S') + topc['topcons'].count(
            's')

        best_wgp_loc_list = wgp_to_loc_list(best_wgp, signal_peptide)
        sec_wgp_loc_list = wgp_to_loc_list(sec_wgp, signal_peptide)

        old_new_totals[new_old[name]] += 1

        if name == follow:
            print 'at %s found loc list %r' % (name, best_wgp_loc_list)

        best_tgr_qok, best_tgr_ovm = qok_pdbtm_opm(rost_db[name],
                                                   best_wgp_loc_list,
                                                   signal_peptide,
                                                   verbose=name == follow)
        sec_tgr_qok, sec_tgr_ovm = qok_pdbtm_opm(rost_db[name],
                                                 sec_wgp_loc_list,
                                                 signal_peptide)

        # counted as correct if either the best or the second best path is
        best_or_sec_qok = best_tgr_qok or sec_tgr_qok
        best_or_sec_ovm = best_tgr_ovm or sec_tgr_ovm

        results[name] = {
            'old_new':
            new_old[name],
            'tm_num':
            len(pdbtm_opm_loc_list(rost_db[name]['pdbtm'], signal_peptide)),
            'topgraph': {
                'qok': best_tgr_qok,
                'ovm': best_tgr_ovm
            },
            'best_or_sec': {
                'qok': best_or_sec_qok,
                'ovm': best_or_sec_ovm
            }
        }

        # score every other predictor's topology string for the same entry
        for predictor in predictors:
            prd_qok, prd_ovm = qok_pdbtm_opm(
                rost_db[name], ts_loc_list(topc[predictor], signal_peptide),
                signal_peptide)
            results[name][predictor] = {'qok': prd_qok, 'ovm': prd_ovm}

    # prints results sliced by old/new
    print_results_by_old_new(results, predictors, old_new_totals)

    # prints results sliced by 1, 2-4 >4 TMHs
    print_results_by_tm_num(results)

    # print names TopGraph got wrong
    print_names_topgraph_got_wrong(results)

    # prints names TopGraph got wrong by both best and sec best
    print_names_topgraph_got_wrong_best_and_sec(results)

    # print total percentage correct for TopGraph, TopGraph best or sec, and TOPCONS
    print_total_results(results)