예제 #1
0
import sys,os, pickle
import seqtools as st
from shared import *
from glob import *
import networkx as nx
from IPython import embed

# Lookup from protein id to amino-acid sequence, collected from every
# FASTA file matched by dat_nuccore/*.faa.
np_2_aa_seq = {}
for fname in glob('dat_nuccore/*.faa'):
	# File path without its extension; not used again in this section.
	nc = os.path.splitext(fname)[0]
	headers, seqs = st.read_fasta(fname)
	for h, s in zip(headers, seqs):
		# Drop the first and last header characters (presumably the leading
		# '>' and a trailing newline -- confirm against st.read_fasta).
		# NOTE(review): the name 'np' would shadow a numpy alias if
		# 'from shared import *' provides one -- verify.
		np = h[1:-1]
		np_2_aa_seq[np] = s

# Undirected graph: nodes are protein ids, each edge carries the RSD value
# read from an 'OR' record as its 'weight' attribute.
gene_graph = nx.Graph()
ncnc2rsds = {}  # maps (nc,nc) -> {(np,np)->rsd, ...}
for fname in glob('dat_rsd/*-*'):
	# File names have the form '<nc1>-<nc2>'.
	nc1,nc2 = os.path.basename(fname).split('-')
	# Keep each pair in sorted order so it has a single canonical key.
	# The previous form 'nc1,nc2 if nc1<nc2 else nc2,nc1' parsed as the
	# 3-tuple (nc1, <conditional>, nc1) and raised ValueError on unpacking.
	nc1,nc2 = sorted((nc1,nc2))
	rsds = {}
	ncnc2rsds[(nc1,nc2)] = rsds
	# Accumulators initialised per file; they are not updated in this section.
	cum_rsd = 0
	cum_cnt = 1
	# Context manager so the file is closed even if a line fails to parse.
	with open(fname) as rsd_file:
		for line in rsd_file:
			fields = line.split('\t')
			# Only tab-separated records whose first field is 'OR' are used.
			if fields[0] != 'OR': continue
			np1,np2,rsd = fields[1:4]
			rsd = float(rsd)
			# Same canonical ordering (and same precedence fix) as above.
			np1,np2 = sorted((np1,np2))
			rsds[(np1,np2)] = rsd
			gene_graph.add_edge(np1,np2,weight=rsd)
import seqtools as st
from shared import *

# Read the sequences and reduce every header to its first space-delimited
# token, with surrounding whitespace and the first character removed.
raw_hdrs, es = st.read_fasta('eugene_seqs.faa')
eh = [raw.strip()[1:].split(' ')[0] for raw in raw_hdrs]

# eh2s maps a header (minus its 4-character suffix) to its sequence.
# reh/res collect the '_RED' headers/sequences, beh/bes the '_BLK' ones.
eh2s = {}
reh,res = [],[]
beh,bes = [],[]
for h,s in zip(eh,es):
    base, suffix = h[:-4], h[-4:]
    if suffix == '_RED':
        hdr_bin, seq_bin = reh, res
    elif suffix == '_BLK':
        hdr_bin, seq_bin = beh, bes
    else:
        print("Couldn't classify seq "+repr(h))
        continue
    hdr_bin.append(base)
    seq_bin.append(s)
    eh2s[base] = s

# Report headers present on only one side. rah and bah are not defined in
# this section (presumably supplied by 'from shared import *' -- confirm).
red_only = list(set(reh).symmetric_difference(set(rah)))
black_only = list(set(beh).symmetric_difference(set(bah)))
print('Red symmetric difference: %r' % (red_only,))
print('Black symmetric difference: %r' % (black_only,))

for h in rah+bah:
    try:
        aln_seq = ''.join([c for c in asmap[h] if c!='-'])
        e_seq = ''.join([c for c in eh2s[h] if c!='-'])
    except KeyError:
        print "Couldn't find eugene seq corresponding to "+h
        continue
    if aln_seq!=e_seq:
예제 #3
0
YP_003262789
YP_003518975
ZP_01544314
ZP_02464292
ZP_05320398
ZP_06350196""".splitlines()

# One-letter residue symbols; '-' (index 10) is the gap symbol.
alphabet = list('ACEDGFIMKV-LNQPSRTWHY')
# Three-letter codes, listed in the same order as `alphabet`.
alphabet3 = ('ALA CYS GLU ASP GLY PHE ILE MET LYS VAL --- '
             'LEU ASN GLN PRO SER ARG THR TRP HIS TYR').split()
la = len(alphabet)
# Table indexed by character code: position of the symbol in `alphabet`,
# or -1 for any character code that is not in the alphabet.
chr2idx = -np.ones(256)
for i, symbol in enumerate(alphabet):
	chr2idx[ord(symbol)] = i
# The reverse mapping (index -> symbol) is the alphabet list itself.
idx2chr = alphabet

# Read the alignment (the open file object is handed straight to read_fasta)
# and drop the first and last character of every header.
aligned_hdrs, aligned_seqs = st.read_fasta(open('pdb_seqs_aln.faa'))
aligned_hdrs = [hdr[1:-1] for hdr in aligned_hdrs]

# Output bins, each a (headers, sequences) pair of parallel lists.
rah, ras = [],[]  # Red Aligned Headers, Red Aligned Sequences
bah, bas = [],[]  # Black counterparts, filled from bnames matches
rahp, rasp = [],[] # Red Aligned Headers from PDB
bahp, basp = [],[]  # NOTE(review): never filled in this section -- confirm

for h,s in zip(aligned_hdrs,aligned_seqs):
    # The first matching rule wins; unmatched entries are ignored.
    if h in rnames:
        hdr_bin, seq_bin = rah, ras
    elif h in bnames:
        hdr_bin, seq_bin = bah, bas
    elif h.startswith('RED'):
        hdr_bin, seq_bin = rahp, rasp
    else:
        continue
    hdr_bin.append(h.strip())
    seq_bin.append(s.strip())
import sys,os
from Bio.PDB.PDBParser import PDBParser
from glob import glob
import seqtools as st
import numpy as np
from IPython import embed

# Load the alignment and index the aligned sequences by header id.
aln_h, aln_s = st.read_fasta('pdb_seqs_aln.faa')
aln_h = [h[1:-1] for h in aln_h]  # Strip > and \n from '>SEQID_STR\n'
id2aln = dict(zip(aln_h,aln_s))

pdbs = glob('/mnt/hgfs/D/jon/Documents/2012/summer/pdbs/o_*')
pdbparser = PDBParser(PERMISSIVE=1)
ASA_ids = []
ASA_vals = []  # Array of (ASA array of len(aligned sequence))
# pdb_seqs.faa headers look like           BLK_NP_24567
# ORDERED.fasta (Adrian) headers look like NP_24567
for fpath in pdbs:
	name = os.path.basename(fpath)
	# Protein id = the '_'-separated pieces of the file name, minus the first
	# piece and the last two, upper-cased.
	protid = '_'.join(name.split('_')[1:-2]).upper()  # Like BLK_NP_12345
	structure = pdbparser.get_structure('struct',fpath)
	models = structure.get_list()    # Hardcoded to get model 0, chain 0
	chain = models[0].get_list()[0]
	residues = chain.get_list()
	# One value per residue, read from the B-factor of its CA atom
	# (presumably these PDB files store ASA in that column -- confirm).
	ASAs = [r['CA'].get_bfactor() for r in residues]
	try:
		aa_alignment = id2aln[protid]
	# Only a missing id is expected here; a bare 'except:' would also
	# swallow KeyboardInterrupt and unrelated errors.
	except KeyError:
		print("FAILED TO EXTRACT ASA FROM PDB")
		print("Couldn't maatch protein id %s to an alignment"%protid)
		# Drop into an interactive IPython shell for inspection.
		embed()
Gln Q
Gly G
His H
Ile I
Leu L
Lys K
Met M
Phe F
Pro P
Ser S
Thr T
Trp W
Tyr Y
Val V""".upper().splitlines()])

# Load the alignment and index the aligned sequences by header id.
raw_hdrs, aln_s = st.read_fasta('PROTEIN/ALIGNED.fasta')
# Drop the first and last character of each header
# (per the original note: the '>' and '\n' of '>SEQID_STR\n').
aln_h = [hdr[1:-1] for hdr in raw_hdrs]
id2aln = dict(zip(aln_h,aln_s))

pdbs = glob('/mnt/hgfs/D/jon/Documents/2012/summer/pdbs/o_*')
pdbparser = PDBParser(PERMISSIVE=1)
# Neither list is filled in this section; presumably they collect the ids
# and one-letter sequences produced by the loop below -- confirm.
AA_hdrs = []
AA_seqs = []
for fpath in pdbs:
	name = os.path.basename(fpath)
	# Protein id = the '_'-separated pieces of the file name, minus the first
	# piece and the last two, upper-cased.
	name_parts = name.split('_')
	protid = '_'.join(name_parts[1:-2]).upper()
	structure = pdbparser.get_structure('struct',fpath)
	# Only the first chain of the first model is read.
	models = structure.get_list()
	chain = models[0].get_list()[0]
	residues = chain.get_list()
	# Translate each residue name through three2one and join the results
	# into a single string.
	AAs = ''.join(three2one[r.get_resname()] for r in residues)