示例#1
0
def LoadTrainData4Properties(tplFile, angFile):
    """Merge a TPL file and an angle file into one training/validation record.

    Args:
        tplFile: path handed to LoadTPL.
        angFile: path handed to LoadAngleFile.

    Returns:
        A dict holding
          * 'name', 'sequence', 'length', 'ACC', 'pACC', 'CNa', 'CNb'
            copied from the TPL record,
          * 'SS', 'DISO', 'CLE', 'Phi', 'Psi', 'Theta', 'Tau', 'Omg',
            'Missing' copied from the angle record,
          * 'PhiPsi' and 'ThetaTau', the transposed stacks of the two
            corresponding angle entries,
          * 'SS8' (same object as 'SS'), 'SS3' (each 'SS8' letter mapped
            through PropertyUtils.SS8Letter2SS3Letter) and
            'ForTrain' set to True.

    Raises:
        AssertionError: if the two records disagree on the sequence, on the
            missing-residue flags, or on the secondary structure of any
            residue whose TPL missing flag is not 1.
    """
    tpl = LoadTPL(tplFile)
    Ang = LoadAngleFile(angFile)

    ## both files must describe the same protein
    assert tpl['sequence'] == Ang['sequence']
    assert all(tpl['missing'] == Ang['Missing'])
    ## all() is required here: asserting a bare generator expression is
    ## always true, so the per-residue comparison would never be evaluated
    assert all(tSS == aSS
               for tSS, aSS, m in zip(tpl['SS_str'], Ang['SS'], tpl['missing'])
               if m != 1)

    ## merge tpl and ang to generate a new training and validation file
    protein = dict()

    ## NEFF, PSFM, PSSM, Ca and Cb are not copied from the TPL record
    for key in ('name', 'sequence', 'length', 'ACC', 'pACC', 'CNa', 'CNb'):
        protein[key] = tpl[key]

    for key in ('SS', 'DISO', 'CLE', 'Phi', 'Psi', 'Theta', 'Tau', 'Omg'):
        protein[key] = Ang[key]

    ## merge Phi and Psi
    protein['PhiPsi'] = np.transpose(
        np.array([protein['Phi'], protein['Psi']]))

    ## merge Theta and Tau
    protein['ThetaTau'] = np.transpose(
        np.array([protein['Theta'], protein['Tau']]))

    ## the missing residues have no 3D coordinates and thus, angles and solvent accessibility
    protein['Missing'] = Ang['Missing']

    protein['SS8'] = protein['SS']
    protein['SS3'] = ''.join(
        [PropertyUtils.SS8Letter2SS3Letter[c] for c in protein['SS8']])

    protein['ForTrain'] = True

    return protein
def Usage():
    """Print the command-line help of CheckIfSequenceHasXs.py to stdout."""
    helpLines = (
        'Usage: python CheckIfSequenceHasXs.py inputFile',
        '	this script checks if the following types of files have Xs in sequence: .hhm, .hhm.pkl, .tgt, .tgt.pkl, .tpl, .tpl.pkl',
        '	if Xs are contained in the sequence, the file name will be printed out as well as the sequence',
    )
    for helpLine in helpLines:
        # a single parenthesised string prints the same under Python 2 and 3
        print(helpLine)


if len(sys.argv) < 2:
    Usage()
    exit(1)

infile = sys.argv[1]

if infile.endswith('.pkl'):
    with open(infile, 'rb') as fh:
        seq = cPickle.load(fh)['sequence']
elif infile.endswith('.hhm'):
    hhm = LoadHHM(infile)
    seq = hhm['sequence']
elif infile.endswith('.tgt'):
    tgt = LoadTGT(infile)
    seq = tgt['sequence']
elif infile.endswith('.tpl'):
    tpl = LoadTPL(infile)
    seq = tpl['sequence']
else:
    Usage()
    exit(1)

if 'X' in seq:
    print infile, seq
示例#3
0
"""

## NOTE(review): this fragment relies on `name`, `structure` and `tplfile`
## being defined earlier in the script (not visible here).

## the template name must be exactly 5 characters long; per the message below
## that is the PDB ID followed by the chain letter
if len(name) != 5:
    print 'the template name is incorrect. It must be composed of PDB ID and chain letter'
    exit(-1)

## the fifth character of the name selects the chain
## NOTE(review): `structure` is presumably a Bio.PDB structure, which would
## make structure[0] its first model -- confirm against the code that builds it
chain = name[4]
residues = structure[0][chain].get_residues()
## keep only the residues accepted by is_aa()
residueList = [r for r in residues if is_aa(r)]
#numResidues = len(residueList)
## sequence of the kept residues, each residue name converted by three_to_one()
pdbseq = ''.join([three_to_one(r.get_resname()) for r in residueList])

#print pdbseq

#from Bio import SeqIO
## sequence stored in the TPL file
tpl = LoadTPL(tplfile)
tplseq = tpl['sequence']

#record = SeqIO.read(seqfile, "fasta")
##print(record.seq)

##align two sequences

from Bio import pairwise2
from Bio.SubsMat.MatrixInfo import blosum80
## blosum80 is only referenced by the disabled localds call below
##alignments = pairwise2.align.localds(pdbseq, tplseq, blosum80, -5, -1)
## local alignment of the PDB sequence against the TPL sequence; going by
## Biopython's pairwise2 naming scheme the four numbers are the match score,
## mismatch score, gap-open penalty and gap-extend penalty (verify against
## the installed Biopython version)
alignments = pairwise2.align.localms(pdbseq, tplseq, 3, -1, -0.5, -0.0)

#print '#alignments:', len(alignments)

##find the alignment with the minimum residue number difference
## Command line: <script> tplfile pdbfile [ResDir]
## Loads the TPL file, extracts coordinates and DSSP information for its
## sequence from the PDB file, and exits with status 1 when the two do not
## agree well enough.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        Usage()
        # sys.exit instead of the site-provided exit(), which is meant for
        # the interactive shell and is absent when site is not loaded
        sys.exit(1)

    tplfile = sys.argv[1]
    pdbfile = sys.argv[2]

    ## result directory: the current working directory unless a third
    ## argument is given, in which case it is created when it does not exist
    ResDir = os.getcwd()
    if len(sys.argv) >= 4:
        ResDir = sys.argv[3]
        if not os.path.isdir(ResDir):
            os.mkdir(ResDir)

    protein = LoadTPL(tplfile)

    result, pdbseq, numMisMatches, numMatches = PDBUtils.ExtractCoordinatesNDSSPBySeq(
        protein['sequence'], pdbfile)

    ## tolerate at most 5 mismatches between the TPL sequence and the PDB file
    if numMisMatches > 5:
        print 'ERROR: too many mismatches between TPL sequence and ATOM record in ', pdbfile
        sys.exit(1)

    ## require at least min(30, half of the TPL sequence length) matches
    if numMatches < min(30, 0.5 * len(protein['sequence'])):
        print 'ERROR: more than half of TPL sequence not covered by ATOM record in ', pdbfile
        sys.exit(1)

    ## result is a pair (coordInfo, dssp); coordInfo is itself a pair
    coordInfo, dssp = result
    coordinates, numInvalidAtoms = coordInfo
def ExtractCoordinatesFromTPLPDB(tplfile, pdbfile, atoms=['CB', 'CA', 'N', 'O']):
	tpl = LoadTPL(tplfile)