예제 #1
0
def get_out_search():
    """Iteratively pick the best patch, then write every pick out.

    Each round builds a ``PatchSearchSpecific`` over the proteins that are
    still left, records the patch returned by ``get_best()`` under its shape,
    and removes the proteins of that patch from the working containers. The
    loop stops once ``get_best()`` reports a protein count that is not above
    ``CUTOFF``.

    The input file (last command-line argument) is parsed a second time
    before writing, because the loop removes entries from the containers
    obtained from the first parse.
    """
    CUTOFF = 20

    with open(sys.argv[-1]) as wdsp_f:
        parsed = Wdsp(wdsp_f)
        pros = parsed.pros
        seqs = parsed.seqs
        wdsps = parsed.wdsps
        hotspots = parsed.hotspots

        best = {}
        pro_num = 1000000  # sentinel so the first round always runs
        while pro_num > CUTOFF:
            searcher = PatchSearchSpecific(pros, seqs, wdsps, hotspots, CUTOFF)
            searcher.get_patches()
            searcher.classify_patches()
            shape, patch, pro_list, pro_num = searcher.get_best()

            # best maps shape -> {patch: proteins carrying that patch}
            best.setdefault(shape, {})[patch] = pro_list

            # Drop the proteins just assigned so the next round ignores them.
            for pro in pro_list:
                if pro not in seqs:
                    continue
                pros.remove(pro)
                seqs.pop(pro)
                wdsps.pop(pro)
                hotspots.pop(pro)

    with open(sys.argv[-1]) as wdsp_f:
        full = Wdsp(wdsp_f)
        write_results(best, full.pros, full.seqs, full.wdsps, full.hotspots,
                      CUTOFF)
예제 #2
0
def main():
    """Match template hotspots against all hotspots and write the result.

    The second-to-last command-line argument is the template WDSP file and
    the last one is the WDSP file holding all proteins.
    """
    with open(sys.argv[-2]) as template_f:
        tem_hots = Wdsp(template_f).hotspots
    with open(sys.argv[-1]) as everything_f:
        all_hots = Wdsp(everything_f).hotspots

    matches = get_similar_hots(tem_hots, all_hots)
    write_result(matches, tem_hots, all_hots)
예제 #3
0
def main():
    """Cluster hotspots and, per large cluster, plot and regress similarities.

    Reads the WDSP file named by the last command-line argument, clusters its
    hotspots with ``cluster_topface`` and, for every cluster with more than
    10 members, draws a scatter plot and a logo and stores the result of
    ``linregress(seqs_score, hots_score)``. All regression results are
    written to ``regressions.txt``, one ``;``-joined line each.
    """
    with open(sys.argv[-1]) as wdsp_f:
        all_wdsp = Wdsp(wdsp_f)
        all_hots = all_wdsp.hotspots
        all_seqs = all_wdsp.seqs

    regressions = []
    for members in cluster_topface(all_hots):
        size = len(members)
        if size <= 10:
            continue

        # NOTE(review): `cutoff` is not assigned anywhere in this function;
        # presumably it is a module-level name - confirm.
        filename = str(size) + '_' + members[0] + '_' + str(cutoff)

        hots = [[pro, all_hots[pro]] for pro in members]
        seqs = [[pro, all_seqs[pro]] for pro in members]
        hots_score = align_hots([hot for _, hot in hots])
        seqs_score = align_seqs([seq for _, seq in seqs])
        regressions.append(linregress(seqs_score, hots_score))
        plot_scatter(seqs_score, hots_score, filename + '_scatter')

        adjusted = adjust_hots(hots)
        logo_input = [(pro, ''.join(hot)) for pro, hot in adjusted]
        plotlogo(logo_input, filename + '_logo')

    with open('regressions.txt', 'w') as w_f:
        # fields per line: slop,intercept,r-value,p-value,stderr
        for regression in regressions:
            w_f.write(';'.join(str(field) for field in regression) + '\n')
예제 #4
0
def main():
    """Run a ``PatchSearch`` over ``test.wdsp`` and write its results.

    The search is constructed with ``1`` as its fifth argument (presumably
    the cutoff - confirm against ``PatchSearch``).
    """
    with open('test.wdsp') as wdsp_f:
        parsed = Wdsp(wdsp_f)
        search = PatchSearch(parsed.pros, parsed.seqs, parsed.wdsps,
                             parsed.hotspots, 1)
        search.get_patches()
        search.classify_patches()
        search.write_results()
예제 #5
0
def main():
    """Search patches in the WDSP file given as first argument and write them.

    The patches are found and classified by ``PatchSearch``; the resulting
    ``shape_patch_pros`` mapping is handed to ``write_results`` together with
    the parsed data and the cutoff.
    """
    CUTOFF = 20
    with open(sys.argv[1]) as wdsp_f:
        parsed = Wdsp(wdsp_f)
        search = PatchSearch(parsed.pros, parsed.seqs, parsed.wdsps,
                             parsed.hotspots, CUTOFF)
        search.get_patches()
        search.classify_patches()
        write_results(search.shape_patch_pros, parsed.pros, parsed.seqs,
                      parsed.wdsps, parsed.hotspots, CUTOFF)
예제 #6
0
def main():
    """Plot topface vs. sequence similarity for template-matched proteins.

    The second-to-last command-line argument is the template WDSP file, the
    last one the WDSP file with all proteins. For every cutoff in
    0.3 .. 0.9 and every template protein returned by ``get_similar_hots``,
    this draws a scatter plot, a logo and a density plot of both similarity
    score lists.

    Each density figure is closed after it has been saved. The previous
    version additionally created an unused figure via ``plt.subplots()`` on
    every iteration and never closed either one, so figures accumulated for
    the whole run.
    """
    with open(sys.argv[-2]) as tem_f:
        tem_wdsp = Wdsp(tem_f)
        tem_hots = tem_wdsp.hotspots
    with open(sys.argv[-1]) as all_f:
        all_wdsp = Wdsp(all_f)
        all_hots = all_wdsp.hotspots

    for cutoff in [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        # for cutoff in [30,40,50,60,70,80,90]:
        tem_all_hots = get_similar_hots(tem_hots, all_hots, cutoff)
        for tem_pro, all_pros in tem_all_hots.iteritems():
            tag = tem_pro + '_' + str(cutoff)

            hots = [[pro, all_hots[pro]] for pro, _ in all_pros]
            seqs = [[pro, all_wdsp.seqs[pro]] for pro, _ in all_pros]
            hots_score = get_hot_similarity(hots)
            seqs_score = get_seq_similarity(seqs)
            plot_scatter(seqs_score, hots_score, tag)

            hots = adjust_hots(hots)
            hots_len = len(hots)
            hots = [(pro, ''.join(hot)) for pro, hot in hots]
            plotlogo(hots, str(hots_len) + '_' + tag)

            fig = plt.figure(figsize=(5, 4))
            ax = fig.add_subplot(111)
            # Draw explicitly on `ax` instead of relying on the current axes.
            sns.distplot(hots_score,
                         hist=False,
                         label='Topface',
                         kde_kws={'marker': ' '},
                         ax=ax)
            sns.distplot(seqs_score,
                         hist=False,
                         label='Sequence',
                         kde_kws={'marker': '*'},
                         ax=ax)
            ax.set(xlabel='Similarity',
                   ylabel='Frequency',
                   title='WD40 Protein Topface and Sequence Similarity')
            fig.savefig(tag + 'hot_seq_similarity_dist.png', dpi=300)
            # Release the figure; pyplot keeps it alive until closed.
            plt.close(fig)
예제 #7
0
def main():
    """Run ``PatchSearchSpecific`` on the first CLI argument and write results."""
    CUTOFF = 10
    with open(sys.argv[1]) as wdsp_f:
        parsed = Wdsp(wdsp_f)
        search = PatchSearchSpecific(parsed.pros, parsed.seqs, parsed.wdsps,
                                     parsed.hotspots, CUTOFF)
        search.get_patches()
        search.classify_patches()
        # deredundant_patches() is intentionally not called here
        search.write_results()
예제 #8
0
def main():
    """Write the repeat similarity of every protein to ``sims.txt``.

    Reads the WDSP file named by the last command-line argument, applies
    ``repeat_similarity`` to each protein's repeats and writes one line per
    protein: the name left-aligned in 20 columns followed by the first
    element of the similarity result.
    """
    with open(sys.argv[-1]) as o_f:
        parsed = Wdsp(o_f)
        sims = OrderedDict(
            (pro, repeat_similarity(repeats))
            for pro, repeats in parsed.repeats.iteritems())

        with open('sims.txt', 'w') as w_f:
            for pro, similarity in sims.iteritems():
                w_f.write('{0:<20}{1:<}'.format(pro, similarity[0]) + '\n')
예제 #9
0
def main():
    """Compare template hotspots with all hotspots and write a report.

    The second-to-last command-line argument is the template WDSP file, the
    last one the WDSP file with all proteins. Repeat similarities are
    computed for both files and sequence similarity between them; the
    hotspot match is then run with cutoff 10 and passed to ``write_result``.
    """
    with open(sys.argv[-2]) as tem_f:
        tem_wdsp = Wdsp(tem_f)
        tem_hots = tem_wdsp.hotspots
        tem_repeats_similarity = wdsp_repeat_similarity(tem_wdsp.repeats)
        with open(sys.argv[-1]) as all_f:
            all_wdsp = Wdsp(all_f)
            all_hots = all_wdsp.hotspots
            all_repeats_similarity = wdsp_repeat_similarity(all_wdsp.repeats)
            tem_all_seq_similarity = seq_similarity(tem_wdsp, all_wdsp)

    # other cutoffs tried previously: [30,40,50,60,70,80,90]
    for cutoff in (10,):
        matches = get_similar_hots(tem_hots, all_hots, cutoff)
        write_result(matches, tem_hots, tem_wdsp, all_hots, all_wdsp,
                     tem_repeats_similarity, all_repeats_similarity,
                     tem_all_seq_similarity, cutoff)
예제 #10
0
def main():
    """Align every template sequence against every sequence and dump results.

    The second-to-last command-line argument is the template WDSP file, the
    last one the WDSP file with all proteins. For each template protein one
    output is opened through ``lt.open_file(<template name>)`` and receives
    one line per compared protein: its name left-aligned in 15 columns
    followed by the value returned by ``align``.
    """
    with open(sys.argv[-2]) as tem_f:
        tem_seq = Wdsp(tem_f).seqs
    with open(sys.argv[-1]) as all_f:
        all_seq = Wdsp(all_f).seqs

    similarity = OrderedDict()
    for t_name, t_seq in tem_seq.iteritems():
        # results stay in the iteration order of all_seq (not sorted)
        similarity[t_name] = [(a_name, align(t_seq, a_seq))
                              for a_name, a_seq in all_seq.iteritems()]

    for t_name, hits in similarity.iteritems():
        with lt.open_file(t_name) as w_f:
            for a_name, a_identity in hits:
                w_f.write('{0:<15}{1:<}'.format(a_name, a_identity) + '\n')
예제 #11
0
def main():
    """Run ``PatchSearchSpecific`` (cutoff 1) on the last CLI argument."""
    with open(sys.argv[-1]) as wdsp_f:
        parsed = Wdsp(wdsp_f)
        search = PatchSearchSpecific(parsed.pros, parsed.seqs, parsed.wdsps,
                                     parsed.hotspots, cutoff=1)
        search.get_patches()
        search.classify_patches()
        search.write_results()
예제 #12
0
def main():
    """Search patches in ``test.wdsp``, write them and print both mappings.

    After writing the results, the ``shape_pro_patches`` and
    ``shape_patch_pros`` attributes of the search object are printed to
    standard output, each preceded by its name.
    """
    with open('test.wdsp') as wdsp_f:
        parsed = Wdsp(wdsp_f)
        search = PatchSearch(parsed.pros, parsed.seqs, parsed.wdsps,
                             parsed.hotspots, 2)
        search.get_patches()
        search.classify_patches()
        write_results(search.shape_patch_pros, parsed.pros, parsed.seqs,
                      parsed.wdsps, parsed.hotspots)

        # Single-argument parenthesised form prints the same text on Python 2.
        print('shape_pro_patches')
        print(search.shape_pro_patches)
        print('shape_patch_pros')
        print(search.shape_patch_pros)
예제 #13
0
def classify_blade(wdsp_f):
    """Count how often each three-letter hotspot occurs in a WDSP file.

    ``wdsp_f`` is the path of the WDSP file. Every three-letter combination
    over the 23-symbol alphabet below starts with a count of zero, so the
    result also lists combinations that never occur.

    Returns a list of ``(count, triplet)`` tuples sorted in descending order.

    Raises ``KeyError`` if a hotspot token is not one of the pre-built
    three-letter combinations.
    """
    alphabet = 'KRHDEFWYSTNQVLIMACPG*XB'

    with open(wdsp_f) as o_f:
        parsed = Wdsp(o_f)
        tokens = []
        for pro_hots in parsed.hotspots.itervalues():
            tokens.extend(' '.join(pro_hots).split())

    counts = dict.fromkeys(
        (''.join(combo) for combo in itertools.product(alphabet, repeat=3)),
        0)
    for token in tokens:
        counts[token] += 1

    ranked = [(count, triplet) for triplet, count in counts.iteritems()]
    ranked.sort(reverse=True)
    return ranked
예제 #14
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
select wd40s with repeats 6n,7n,8n
"""
from wdsp import Wdsp
import numpy as np
from numpy.random import randint

# Write 100 FASTA files (random_0.fa .. random_99.fa), each holding two
# proteins drawn at random from the WDSP file; the same protein may be drawn
# twice. Sequences are wrapped at 80 characters per line.
with open('wd648_uniprot_select_cd-hit_90_7.wdsp') as wdsp_f:
    wdsp = Wdsp(wdsp_f)
    pro_num = len(wdsp.pros)
    for i in range(100):
        with open('random_' + str(i) + '.fa', 'w') as w_f:
            for j in range(2):
                pro = wdsp.pros[randint(0, pro_num)]
                seq = wdsp.seqs[pro]
                print >> w_f, '> ', pro
                for start in range(0, len(seq), 80):
                    print >> w_f, seq[start:start + 80]
예제 #15
0
def read_hots(wdsp_f):
    """Return the hotspots parsed from the WDSP file at path ``wdsp_f``.

    The previous version ignored its argument: it always opened
    ``sys.argv[-2]`` and rebound ``wdsp_f`` to that file handle, so every
    call returned the hotspots of the same file regardless of what was
    passed in.
    """
    with open(wdsp_f) as handle:
        return Wdsp(handle).hotspots
예제 #16
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
usage: python get_hotspot.py *.wdsp
output hotspots in following format
pro xxx xxx xxx xxx xxx xxx
"""

import os
import sys
import lt
from wdsp import Wdsp

# Parse the WDSP file given as last argument and write one line per protein:
# the name left-aligned in 25 columns, then its hotspots separated by spaces.
with open(sys.argv[-1]) as wdsp_f:
    w = Wdsp(wdsp_f)
    with lt.open_file(file_suffix='hotspots') as w_f:
        for pro, hots in w.hotspots.iteritems():
            w_f.write('{0:<25}{1:<}'.format(pro, ' '.join(hots)) + '\n')
예제 #17
0
    # visual_style['vertex_label'] = labels
    # visual_style['vertex_label_size'] = 2
    visual_style['layout'] = graph.layout('kk')
    filename = 'sim_top_seq_' + str(cluster_num) + '_' + str(label)
    igraph.plot(graph, filename + '_graph.png', **visual_style)
    # slope,intercept,rvalue,pvalue,stderr = linregress(seqs_score,hots_score)
    # return [slope,intercept,rvalue,pvalue,stderr]
    # plot_scatter(seqs_score,hots_score,filename+'_scatter')

    # hots = adjust_hots(nr_hots)
    # hots = [(pro,''.join(hot)) for pro,hot in hots]
    # plotlogo(hots,filename+'_logo')


# Parse the WDSP file named by the last command-line argument once at import
# time and expose its contents as module-level names.
with open(sys.argv[-1]) as wdsp_f:
    all_wdsp = Wdsp(wdsp_f)
    pros = all_wdsp.pros
    all_seqs = all_wdsp.seqs
    all_hots = all_wdsp.hotspots

import lt


@lt.run_time
def main():
    """Build random protein clusters of several sizes.

    For each cluster size in ``range(10, 100, 50)`` ten clusters are
    created; each is stored as ``[members, label]`` where ``label`` is the
    index (0-9) of the cluster within its size group.

    On Python 2 a list-comprehension variable leaks into the enclosing
    scope. The previous version reused ``i`` inside the comprehension, so
    the label stored next to the members was always ``cluster_num - 1``
    instead of the loop index.
    """
    for cluster_num in range(10, 100, 50):
        clusters = []
        for label in range(10):
            # NOTE(review): assumes randint excludes the upper bound (as
            # numpy.random.randint does) - confirm which randint is imported.
            members = [
                pros[randint(0, len(pros))] for _ in range(cluster_num)
            ]
            clusters.append([members, label])
예제 #18
0
def main():
    """Score all protein pairs and plot topface vs. sequence similarity.

    Reads the WDSP file named by the last command-line argument, runs
    ``top_seq_align`` on every unordered protein pair in a pool of 6 worker
    processes, and from the results produces:

    * ``hots_seqs_score.pickle`` with both score lists,
    * a scatter plot and a logo (via ``plot_scatter`` / ``plotlogo``),
    * ``<name>_regression.txt`` with the ``linregress`` result,
    * ``<name>hot_seq_similarity_dist.png`` with both score densities,

    where ``<name>`` is the input file's base name up to its first dot.

    Changes from the previous version: the pickle file is written and read
    through ``with`` blocks in binary mode instead of unclosed text-mode
    handles, the worker pool is joined after closing, the unused extra
    figure from ``plt.subplots()`` is no longer created, and the density
    figure is closed after saving.
    """
    import itertools  # local import: the file's import block is not visible

    fname = os.path.split(sys.argv[-1])[1].split('.')[0]

    with open(sys.argv[-1]) as wdsp_f:
        wdsp = Wdsp(wdsp_f)
        pros = wdsp.pros
        hots = wdsp.hotspots
        seqs = wdsp.seqs

    # Every pair (pro1, pro2) with pro1 before pro2 in `pros`, in the same
    # order the nested index loops used to produce.
    parameters = [
        [pro1, pro2, hots[pro1], hots[pro2], seqs[pro1], seqs[pro2]]
        for pro1, pro2 in itertools.combinations(pros, 2)
    ]

    pool = Pool(6)
    try:
        result = pool.map(top_seq_align, parameters)
    finally:
        pool.close()
        pool.join()

    hots_score = [r[2][1] for r in result]
    seqs_score = [r[3][1] for r in result]

    # Persist the scores, then read them back (the file is produced by this
    # very run, so loading it is not a trust issue).
    with open('hots_seqs_score.pickle', 'wb') as pickle_f:
        pickle.dump([hots_score, seqs_score], pickle_f)
    with open('hots_seqs_score.pickle', 'rb') as pickle_f:
        hots_score, seqs_score = pickle.load(pickle_f)

    plot_scatter(seqs_score, hots_score, fname + '_scatter')

    logo_hots = [[pro, hot] for pro, hot in hots.iteritems()]
    logo_hots = adjust_hots(logo_hots)
    logo_hots = [(pro, ''.join(hot)) for pro, hot in logo_hots]
    plotlogo(logo_hots, fname + '_logo')

    regression = linregress(seqs_score, hots_score)
    with open(fname + '_regression.txt', 'w') as w_f:
        # fields: slop,intercept,r-value,p-value,stderr
        w_f.write(';'.join(map(str, regression)) + '\n')

    fig = plt.figure(figsize=(5, 4))
    ax = fig.add_subplot(111)
    # Draw explicitly on `ax` instead of relying on the current axes.
    sns.distplot(hots_score,
                 hist=False,
                 label='Topface',
                 kde_kws={'marker': ' '},
                 ax=ax)
    sns.distplot(seqs_score,
                 hist=False,
                 label='Sequence',
                 kde_kws={'marker': '*'},
                 ax=ax)
    ax.set(xlabel='Similarity',
           ylabel='Frequency',
           title='WD40 Protein Topface and Sequence Similarity')
    fig.savefig(fname + 'hot_seq_similarity_dist.png', dpi=300)
    plt.close(fig)
예제 #19
0
파일: wdsp_sta.py 프로젝트: lituan/Topface
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
calculate statistics for WDSP output file
usage: python wdsp_sta.py wdsp_f
"""
import lt
import sys
import os

from wdsp import Wdsp

with open(sys.argv[-1]) as o_f:
    wdsp = Wdsp(o_f)

    scores_sta = lt.lis_sta(wdsp.scores.values())
    with lt.open_file(file_suffix='total_score_sta') as w_f:
        for num, freq in scores_sta:
            print >> w_f, '{0:<10}{1}'.format(num, freq)

    tetrad_sta = [
        len([vi for vi in v if vi >= 44.0])
        for k, v in wdsp.blade_scores.iteritems()
    ]
    tetrad_sta = lt.lis_sta(tetrad_sta)
    with lt.open_file(file_suffix='tetrad_num_sta') as w_f:
        for num, freq in tetrad_sta:
            print >> w_f, '{0:<5}{1}'.format(num, freq)

    blades_sta = [len(blades) for pro, blades in wdsp.blades.iteritems()]
    blades_sta = lt.lis_sta(blades_sta)