示例#1
0
def ligandfilter(pdb):
    """
    Reduce a structure to the amino acids of one chain of one model.

    Residues that are not amino acids (water, ligands, ...) are detached
    first; chains and models left empty by that are detached too.  Of what
    remains only the first model and, inside it, the first chain are kept.
    The structure is modified in place.

    :param pdb: PDB.Structure.Structure
    :return: None
    :raises ValueError: if no amino acid residue is left after filtering
    """
    # Detaching mutates child_list, so every loop walks over a copy of it.
    for model in pdb.child_list[:]:
        for chain in model.child_list[:]:
            for res in chain.child_list[:]:
                if not PDB.is_aa(res):
                    chain.detach_child(res.id)
            if len(chain) == 0:
                # detach_child() is keyed by the child's id, not the child
                model.detach_child(chain.id)
        if len(model) == 0:
            pdb.detach_child(model.id)

    if len(pdb) == 0:
        raise ValueError("structure contains no amino acid residues")

    # More than one model left: probably an NMR structure, keep the first.
    for model in pdb.child_list[1:]:
        pdb.detach_child(model.id)
    # Same for the chains of the remaining model.
    model = pdb.child_list[0]
    for chain in model.child_list[1:]:
        model.detach_child(chain.id)

    # There is only one model left ...
    assert len(pdb) == 1
    # ... and this model has only one chain
    assert len(model) == 1
示例#2
0
def pdb2cd(name):
    """
    Return the secondary-structure composition of the file ``<name>.pdb``.

    Every key returned by dssp_dict_from_pdb_file is assigned a class from the
    second field of its DSSP entry: 'H' -> 0, 'E' -> 1, '-' -> 2.

    :param name: path of the PDB file without the ".pdb" extension
    :return: list [alpha, beta, coil] holding the fraction of entries that
             ended up in class 0, 1 and 2 respectively
    """
    pdb_file = name + ".pdb"
    dssp_tuple = dssp_dict_from_pdb_file(pdb_file)
    dssp_dict = dssp_tuple[0]
    structure = PDBParser(QUIET=True).get_structure("file", pdb_file)

    # Ids of every chain of every model.
    chain_ids = []
    for model in structure:
        for chain in model:
            chain_ids.append(chain.get_id())

    # Amino acids found under those ids in the first model ...
    n_entries = 0
    for chain_id in chain_ids:
        n_entries += sum(
            1 for res in structure[0][chain_id].get_residues() if PDB.is_aa(res))
    # ... but the number of DSSP keys is what is used on disagreement.
    dssp_keys = dssp_tuple[1]
    if n_entries != len(dssp_keys):
        n_entries = len(dssp_keys)
    ss = np.arange(1, n_entries + 1)

    class_of_code = {'H': 0, 'E': 1, '-': 2}
    current = 0
    for position in range(1, n_entries + 1):
        # Keys are read with an offset of -3, so the first two positions wrap
        # around to the end of the key list.
        code = dssp_dict[dssp_keys[position - 3]][1]
        # NOTE(review): a code other than 'H', 'E' or '-' keeps the class of
        # the previous position (0 at the start) - confirm this is intended.
        current = class_of_code.get(code, current)
        ss[position - 1] = current

    # Fraction of positions per class.
    total = len(ss)
    return [(ss == label).sum() / total for label in (0, 1, 2)]
示例#3
0
def assign_sensitivity(structure, md_df, chain, pdb_path, go):
    """
    Attach sensitivities from a data frame to the residues of one chain.

    The 'AA' column of md_df (first entry skipped) is aligned with water()
    against the one-letter amino acids of structure[0][chain].  The alignment
    columns are then walked together with the residues of the chain; for every
    column that is not a gap on either side the residue gets a ``sensitivity``
    dict {column name: md_df.loc[column index, column name]} over all columns
    whose name starts with 'GO:'.

    :param structure: structure whose first model contains the chain
    :param md_df: data frame with an 'AA' column and 'GO:' columns
    :param chain: id of the chain to annotate
    :param pdb_path: not read by this function
    :param go: not read by this function
    :return: the structure that was passed in
    """
    residues = structure[0][chain]

    # One-letter codes of the amino acids along the chain.
    seq_pdb = []
    for res in residues:
        if pdb.is_aa(res):
            seq_pdb.append(three2single[res.get_resname()])

    # Sequence from the data frame; this first copy is never read afterwards.
    aas = ''.join(md_df['AA'].values[1:].tolist())
    seq_md = ''.join(md_df['AA'][1:])

    # NOTE(review): seq_pdb is handed over as a list, seq_md as a string -
    # confirm water() accepts both.
    aligned_md, aligned_pdb, identity = water(seq_md, seq_pdb)

    go_columns = [col for col in md_df.columns if col.startswith('GO:')]

    # NOTE(review): residues advance once per alignment column, gaps and
    # non-amino-acid residues included - verify this pairing is the intended one.
    for pos, (aa_md, aa_pdb, res) in enumerate(
            zip(aligned_md, aligned_pdb, residues)):
        if aa_md != '-' and aa_pdb != '-':
            res.sensitivity = {term: md_df.loc[pos, term] for term in go_columns}
    return structure
示例#4
0
	def parse_structure(self):
		"""Record residue numbers and contact atoms of the structure's amino acids.

		Every amino acid residue whose number is not yet in self.residues has
		its number appended there, and the result of
		atoms_method(self.contact_defn, residue) is added to self.atoms.
		"""
		for residue in self.structure.get_residues():
			if not PDB.is_aa(residue):
				continue
			number = residue.id[1]
			if number in self.residues:
				# number already seen (e.g. mutated residue): count it once
				continue
			self.residues.append(number)
			self.atoms.extend(atoms_method(self.contact_defn, residue))
示例#5
0
 def parse_structure(self):
     """Record number and one-letter code of every standard amino acid.

     A residue number that is already in self.residues is skipped; otherwise
     it is appended and self.d_sequence maps it to the one-letter code of the
     residue name.
     """
     for residue in self.structure.get_residues():
         # only the 20 standard residues are considered
         if not PDB.is_aa(residue, standard=True):
             continue
         number = residue.id[1]
         if number in self.residues:
             # do not count mutated residues twice (ex. 1ORC)
             continue
         self.residues.append(number)
         three_letter = Residue.Residue.get_resname(residue)
         self.d_sequence[number] = Polypeptide.three_to_one(three_letter)
示例#6
0
def extract_residues(model):
    """Collect the standard amino acid residues of a PDB model.

    Chains are visited in the order the model yields them and residues in the
    order each chain yields them.
    """
    return [
        residue
        for chain in model
        for residue in chain
        if PDB.is_aa(residue, standard=True)
    ]
示例#7
0
def get_AAlist(aa_name, prot):
    """Return the residues of ``prot`` whose residue name equals ``aa_name``.

    ``aa_name`` has to be accepted by PDB.is_aa, otherwise the assertion
    fails.  Residues are returned in the order prot.get_residues() yields them.
    """
    assert PDB.is_aa(aa_name)
    return [res for res in prot.get_residues() if res.get_resname() == aa_name]
示例#8
0
def load_model(modelfile, debug, complex_state=None):
    '''Loads the model file and generates a fasta
       with dashes for any skipped residue #'s

       Only the first chain of the first model is read, and residues that
       PDB.is_aa rejects (hetatoms) are skipped.

       modelfile: filename of the model
       debug: print progress messages when true
       complex_state: looked up with model_information(modelfile) when None

       Returns a dict with
       filename: model filename
       fasta: fastaseq (of chain, with - for skipped res #)
       chain: chainID (only uses first chain if multiple)
       icodes: sequence of icodes (' ' for none)
       resnums: list of residue numbers
       complex_state: the complex state that was passed in or looked up

       Raises ValueError when the first model contains no chain.'''
    debug_head = "DEBUG: IO: load_model: "
    if debug:
        print(debug_head + "Loading model {}".format(modelfile))
    structure = parser.get_structure("Model", modelfile)
    if complex_state is None:
        complex_state = model_information(modelfile)
    if debug:
        print(debug_head + "model has complex_state: {}".format(complex_state))
    # I'm assuming that all chains in a model are of the target protein
    # I can't find any models that are hetero even if they come from a heteroolig template
    # So, for now I need to stick with this assumption because I can't see any way to determine
    # which chain is the target protein in any cases of a heterooligomer model
    # which is why I think that never happens
    # For now, only the first chain in the model is taken
    chain = next(iter(structure[0]), None)
    if chain is None:
        raise ValueError(
            "{} has no chain in its first model".format(modelfile))
    resnum = 0
    fasta_parts = []
    icode_parts = []
    resnums = []
    for residue in chain:
        # Skip hetatoms
        if not PDB.is_aa(residue):
            continue
        hetflag, number, icode = residue.get_id()
        # Fill in gaps with -: one per residue number skipped since the
        # previous residue (nothing when the number did not advance)
        fasta_parts.append('-' * max(0, number - resnum - 1))
        resnum = number
        fasta_parts.append(AA[residue.get_resname()])
        icode_parts.append(icode)
        resnums.append(resnum)
    if debug:
        # the format string used to lack the placeholder for resnums
        print(debug_head + "{} has resnums {}".format(modelfile, resnums))
    return {
        'filename': modelfile,
        'fasta': ''.join(fasta_parts),
        'chain': chain.get_id(),
        'icodes': ''.join(icode_parts),
        'resnums': resnums,
        'complex_state': complex_state
    }
示例#9
0
def find_contact(fname):
    """
    List pairs of standard amino acids of a PDB file whose CA atoms are close.

    Only the first model is read, and only the 20 standard amino acids (not
    modified ones) that have a CA atom take part.  Two residues form a contact
    when the difference of their CA atoms is at most 7 and their residue
    numbers differ by more than 2.  Every pair is reported once, in the order
    the residues are met in the model.

    :param fname: path of the PDB file
    :return: list of (resnum_1, chain_1, resnum_2, chain_2) tuples; empty when
             fname is not an existing file
    """
    if not os.path.isfile(fname):
        # this case used to reach "return contacts" with contacts unbound
        return []
    print(fname)
    structure = PDBParser(PERMISSIVE=1).get_structure(fname[0:-4], fname)

    # (residue number, chain id, CA atom) per residue.  The atom is taken from
    # the residue itself instead of being looked up again by its bare number,
    # which misses residues carrying an insertion code.
    ca_entries = []
    for res in structure[0].get_residues():
        if not PDB.is_aa(res, standard=True) or not res.has_id('CA'):
            continue
        ca = res['CA']
        if ca.is_disordered():
            # NOTE(review): carried over from the previous implementation,
            # which blanked the altloc of disordered CA atoms - confirm it is
            # still needed.
            ca.set_altloc(' ')
        ca_entries.append((res.get_id()[1], res.get_parent().get_id(), ca))

    contacts = []
    seen = set()  # every reported contact in both orientations
    for i, (num_i, chain_i, ca_i) in enumerate(ca_entries):
        # the test is symmetric, so each unordered pair is visited once
        for num_j, chain_j, ca_j in ca_entries[i + 1:]:
            if abs(num_i - num_j) <= 2 or ca_i - ca_j > 7:
                continue
            contact = (num_i, chain_i, num_j, chain_j)
            if contact in seen:
                continue
            seen.add(contact)
            seen.add((num_j, chain_j, num_i, chain_i))
            contacts.append(contact)
    return contacts
示例#10
0
def get_sidechain(res):
    '''Return the atoms of an amino acid that are neither N, C, O, OXT nor hydrogen.

    The residue has to pass PDB.is_aa, otherwise the assertion fails.  Atoms
    keep the order of res.get_atoms(); the list is empty when every atom is
    filtered out.
    '''
    assert PDB.is_aa(res)
    # nitrogen, carbonyl carbon, oxygen and the terminal oxygen special case
    excluded_ids = ("N", "C", "O", "OXT")
    return [
        atom for atom in res.get_atoms()
        if not (atom.get_id() in excluded_ids or atom.element == "H")
    ]
示例#11
0
    def _get_residues_from_structure(self, pdb_structure):
        """
            _get_residues_from_structure: Collect the ids of all amino acid residues of a
                                          pdb_structure object and return them as
                                          (number of ids, list of ids)
        """
        res_ids = [
            residue.get_id()
            for residue in pdb_structure.get_residues()
            if PDB.is_aa(residue)
        ]
        return (len(res_ids), res_ids)
示例#12
0
文件: __init__.py 项目: gieses/CLQC
    def __get_residues__(self, structure):
        """
        Collect the amino acid residues of a structure.

        parameters:
        ----------------
        structure: PDB structure obj,
                   opened PDB structure file object

        Returns:
        ---------------------------------------
        array: np-arr,
               np.array built from the list of residue objects that
               PDB.is_aa accepts, in the order get_residues() yields them
        """
        amino_acids = [res for res in structure.get_residues() if PDB.is_aa(res)]
        return np.array(amino_acids)
示例#13
0
    def _file_to_data(self, file_path):
        """Summarise a PDB file: chain, residue and atom counts plus sequence.

        Chains are counted over every model of the structure.  Residues and
        atoms are counted in the model that the model loop visited last;
        residues only when PDB.is_aa accepts them, atoms for every residue.
        The sequence is the concatenation of all peptides PPBuilder finds.

        Returns (data, pp_no): the summary dict and the number of peptides.
        """
        parser = PDB.PDBParser(PERMISSIVE=1)
        ppb = PPBuilder()
        structure = parser.get_structure("test", file_path)

        # NOTE(review): chains are summed over all models while residues and
        # atoms come from a single model - confirm that is intended.
        last_model = structure[0]
        num_chains = 0
        for last_model in structure:
            num_chains += sum(1 for _ in last_model)

        num_residues = 0
        num_atoms = 0
        for residue in last_model.get_residues():
            if PDB.is_aa(residue):
                num_residues += 1
            num_atoms += sum(1 for _ in residue.get_atoms())

        fragments = []
        pp_no = 0
        for peptide in ppb.build_peptides(structure):
            pp_no += 1
            fragments.append(str(peptide.get_sequence()))
        seq = ''.join(fragments)

        protein = {
            'id': os.path.basename(file_path),
            'sequence': seq,
            'md5': hashlib.md5(seq.encode()).hexdigest()
        }
        data = {
            'name': os.path.basename(file_path),
            'num_chains': num_chains,
            'num_residues': num_residues,
            'num_atoms': num_atoms,
            'protein': protein,
        }
        return data, pp_no
示例#14
0
def find_contact(fname):
    """
    List pairs of amino acids of a PDB file whose CA atoms are close.

    Only the first model is read, and only residues that PDB.is_aa accepts
    and that have a CA atom take part.  Two residues form a contact when the
    difference of their CA atoms is at most 7 and their residue numbers differ
    by more than 2.  Every pair is reported once, in the order the residues
    are met in the model.

    :param fname: path of the PDB file
    :return: list of (resnum_1, chain_1, resnum_2, chain_2) tuples with the
             residue numbers as strings; empty when fname is not an existing
             file
    """
    if not os.path.isfile(fname):
        # this case used to reach "return contacts" with contacts unbound
        return []
    print(fname)
    structure = PDBParser(PERMISSIVE=1).get_structure(fname[0:-4], fname)

    # (residue number, chain id, CA atom) per residue.  The atom is taken from
    # the residue itself instead of being looked up again by its bare number,
    # which misses residues with a hetero flag or an insertion code.
    ca_entries = []
    for res in structure[0].get_residues():
        if PDB.is_aa(res) and res.has_id('CA'):
            ca_entries.append(
                (res.get_id()[1], res.get_parent().get_id(), res['CA']))

    contacts = []
    seen = set()  # every reported contact in both orientations
    for i, (num_i, chain_i, ca_i) in enumerate(ca_entries):
        # the test is symmetric, so each unordered pair is visited once
        for num_j, chain_j, ca_j in ca_entries[i + 1:]:
            if abs(num_i - num_j) <= 2 or ca_i - ca_j > 7:
                continue
            contact = (str(num_i), chain_i, str(num_j), chain_j)
            if contact in seen:
                continue
            seen.add(contact)
            seen.add((str(num_j), chain_j, str(num_i), chain_i))
            contacts.append(contact)
    return contacts
示例#15
0
def get_dis(name):
    """Compute per-position structural features for the first model of a PDB file.

    The amino acids of the model are laid out on consecutive residue numbers
    (``None`` where a number is missing); every feature below has one entry
    per such position.

    :param name: path of the PDB file
    :return: tuple (seq_list, num_cs, mask, ca_dist_cs, angle_cs, dps, hse_a,
             hse_b):
             seq_list   - string with one letter per position taken from
                          t_dic, 'X' where there is no residue or no mapping
             num_cs     - per position, entries 1..16 of the argsort of its
                          row of CA distances
             mask       - per position, 1 if a CA atom exists, else 0
             ca_dist_cs - per position, the CA distances at those indices
             angle_cs   - per position, the rows of the six-angle table at
                          those indices
             dps, hse_a, hse_b - arrays with the ResidueDepth, HSExposureCA
                          and HSExposureCB entries per position (None-filled
                          where there is no entry)
             A tuple of eight ``None`` is returned when the file cannot be
             read, when a residue number is lower than the position expected
             for it, or when one of the three property calculations raises.
    """
    p = PDBParser(PERMISSIVE=1)
    pdb_name = name
    try:
        s = p.get_structure("X", pdb_name)
        s = s[0]  # from here on s is the first model
    except:
        return None, None, None, None, None, None, None, None

    # All residues of the model, then only those PDB.is_aa accepts.
    res_list = PDB.Selection.unfold_entities(s, 'R')
    aa_list = []
    for a in res_list:
        if PDB.is_aa(a):
            aa_list.append(a)

    # Lay the residues out on consecutive numbers starting at the number of
    # the first one; t is the number expected next.
    # NOTE(review): aa_list[0] raises IndexError when no amino acid was found.
    t = aa_list[0].get_id()[1]
    aa_list_full = []
    error = 0
    for a in aa_list:
        while 1:
            if a.get_id()[1] < t:
                # number lower than expected: flag it; this only leaves the
                # inner while loop, the remaining residues are still visited
                error = 1
                break
            if a.get_id()[1] == t:
                aa_list_full.append(a)
                t += 1
                break
            else:
                # number missing in the file: keep an empty position
                aa_list_full.append(None)
                t += 1
    if error == 1:
        return None, None, None, None, None, None, None, None
    try:
        depth = PDB.ResidueDepth(s)
    except:
        return None, None, None, None, None, None, None, None

    dep_dict = depth.property_dict
    dep_keys = depth.property_keys  # not used below
    dep_list = depth.property_list  # not used below
    dps = []
    for a in aa_list_full:
        # For a None position a.get_parent() raises, which the bare except
        # turns into the same placeholder as a missing or falsy entry.
        try:
            aa_id = (a.get_parent().get_id(), a.get_id())
            if dep_dict.get(aa_id):
                dps.append(dep_dict[aa_id])
            else:
                dps.append([None, None])
        except:
            dps.append([None, None])
    dps = np.array(dps)

    try:
        HSEA = PDB.HSExposureCA(s)
    except:
        return None, None, None, None, None, None, None, None

    HSEA_dict = HSEA.property_dict
    HSEA_keys = HSEA.property_keys  # not used below
    HSEA_list = HSEA.property_list  # not used below
    hse_a = []
    for a in aa_list_full:
        # same lookup pattern as for dps, with three placeholders
        try:
            aa_id = (a.get_parent().get_id(), a.get_id())
            if HSEA_dict.get(aa_id):
                hse_a.append(HSEA_dict[aa_id])
            else:
                hse_a.append([None, None, None])
        except:
            hse_a.append([None, None, None])
    hse_a = np.array(hse_a)

    try:
        HSEB = PDB.HSExposureCB(s)
    except:
        return None, None, None, None, None, None, None, None

    HSEB_dict = HSEB.property_dict
    HSEB_keys = HSEB.property_keys  # not used below
    HSEB_list = HSEB.property_list  # not used below

    hse_b = []
    for a in aa_list_full:
        # same lookup pattern as for hse_a
        try:
            aa_id = (a.get_parent().get_id(), a.get_id())
            if HSEB_dict.get(aa_id):
                hse_b.append(HSEB_dict[aa_id])
            else:
                hse_b.append([None, None, None])
        except:
            hse_b.append([None, None, None])

    hse_b = np.array(hse_b)

    # One letter per position; 'X' for names missing from t_dic and for
    # None positions (their get_resname() call raises).
    seq_list = ''
    for a in aa_list_full:
        try:
            t = a.get_resname()
            if t in t_dic:
                seq_list += t_dic[t]
            else:
                seq_list += 'X'
        except:
            seq_list += 'X'

    # CA, CB, N and C atom of every position; None where the lookup raises
    # (None position or atom not present).
    ca_list = []
    for a in aa_list_full:
        try:
            t = a['CA']
            ca_list.append(t)
        except:
            t = None
            ca_list.append(t)

    cb_list = []
    for a in aa_list_full:
        try:
            t = a['CB']
            cb_list.append(t)
        except:
            t = None
            cb_list.append(t)

    n_list = []
    for a in aa_list_full:
        try:
            t = a['N']
            n_list.append(t)
        except:
            t = None
            n_list.append(t)
    c_list = []
    for a in aa_list_full:
        try:
            t = a['C']
            c_list.append(t)
        except:
            t = None
            c_list.append(t)

    # angle[j][k] = [t1, t2, t3]: calc_angle(x, CA_j, CA_k) with x being the
    # CB, N and C atom of position j; [None, None, None] if either CA is missing.
    angle = []
    for j in range(len(ca_list)):
        angle_t = []
        for k in range(len(ca_list)):
            if ca_list[j] != None and ca_list[k] != None:
                ca1 = ca_list[j].get_vector()
                ca2 = ca_list[k].get_vector()
                if cb_list[j] != None:
                    cb = cb_list[j].get_vector()
                    t1 = PDB.vectors.calc_angle(cb, ca1, ca2)
                else:
                    # No CB atom: use the point calha1 derives from N, C and CA.
                    # NOTE(review): presumably a virtual CB position - confirm
                    # against calha1.
                    if c_list[j] != None and n_list[j] != None and ca_list[
                            j] != None:
                        ca_v = ca_list[j].get_vector().get_array()
                        c_v = c_list[j].get_vector().get_array()
                        n_v = n_list[j].get_vector().get_array()
                        cb = calha1(n_v, c_v, ca_v)
                        cb = PDB.vectors.Vector(cb)
                        t1 = PDB.vectors.calc_angle(cb, ca1, ca2)
                    else:
                        t1 = None
                if n_list[j] != None:
                    n_ = n_list[j].get_vector()
                    t2 = PDB.vectors.calc_angle(n_, ca1, ca2)
                else:
                    t2 = None
                if c_list[j] != None:
                    c_ = c_list[j].get_vector()
                    t3 = PDB.vectors.calc_angle(c_, ca1, ca2)
                else:
                    t3 = None
                angle_t.append([t1, t2, t3])
            else:
                angle_t.append([None, None, None])
        angle.append(angle_t)

    # Six values per pair: the three of (j, k) followed by the three of (k, j)
    # (list concatenation, not addition).
    angle_d = []
    for j in range(len(angle)):
        angle_dt = []
        for k in range(len(angle[j])):
            angle_dt.append(angle[j][k] + angle[k][j])
        angle_d.append(angle_dt)
    angle_d = np.array(angle_d)

    # Flat list of pairwise CA differences, reshaped to a square table below.
    ca_num = len(ca_list)
    ca_dist = []
    for j in range(len(ca_list)):
        for k in range(len(ca_list)):
            if ca_list[j] != None and ca_list[k] != None:
                ca_dist.append(ca_list[j] - ca_list[k])
            else:
                ca_dist.append(None)

    ca_dist = np.array(ca_dist)
    ca_dist = ca_dist.reshape(ca_num, ca_num)

    mask = []
    for j in range(len(ca_list)):
        if ca_list[j] != None:
            mask.append(1)
        else:
            mask.append(0)

    # Entries that could not be computed are set to 100.
    ids = ca_dist == None
    ca_dist[ids] = 100
    ca_dist_cs = []
    angle_cs = []
    num_cs = []
    for j in range(len(ca_dist)):
        t = ca_dist[j]
        s = t.argsort()  # rebinds s; the model is not needed any more
        # Entries 1..16 of the sort order; entry 0 is skipped.
        # NOTE(review): presumably entry 0 is the position itself - confirm.
        ca_dist_cs.append(t[s[1:17]])
        angle_cs.append(angle_d[j][s[1:17]])
        num_cs.append(s[1:17])

    return seq_list, num_cs, mask, ca_dist_cs, angle_cs, dps, hse_a, hse_b
示例#16
0
def cal(i):
	"""Compute per-position structural features for one chain and save them.

	The chain is pdbchain[i] of the first model of 'pdb_/pdb<id>.ent', with
	<id> being pdbid[i] in lower case.  Its amino acids are laid out on
	consecutive residue numbers (None where a number is missing) and the
	following entries are stored with np.save in
	'pdb_other_cb/<id><chain>_all.npy':
	'dis', 'angle', 'ids' (CA distances, six-angle rows and indices taken
	from entries 1..16 of each position's distance sort order), 'mask'
	(1/0 for CA present), 'seq' (one letter per position), 'dps', 'hsea'
	and 'hseb' (ResidueDepth, HSExposureCA and HSExposureCB entries).

	:param i: index into the module-level lists pdbid and pdbchain
	:return: 0 when the chain cannot be read, when a residue number is lower
	         than the position expected for it, or when a property
	         calculation raises; None after the file has been saved
	"""
	print(pdbid[i],pdbchain[i])
	pdb_name='pdb_/pdb'+pdbid[i].lower()+'.ent'     # path of the PDB entry
	try:
		s = p.get_structure("1",pdb_name)       # read the PDB structure
		s = s[0][pdbchain[i]]                   # select the chain of the first model
		res_list = PDB.Selection.unfold_entities(s, 'R')   # all residues of the chain
	except:
		return 0
	
	aa_list = []
	for a in res_list:
		if PDB.is_aa(a):
			aa_list.append(a)  # keep amino acids only

	# Lay the residues out on consecutive numbers starting at the number of
	# the first one; t is the number expected next.
	# NOTE(review): aa_list[0] raises IndexError when no amino acid was found.
	error=0
	t=aa_list[0].get_id()[1]
	aa_list_full=[]
	for a in aa_list:
		while 1:
			if a.get_id()[1]<t:
				# number lower than expected: flag it; this only leaves the
				# inner while loop, the remaining residues are still visited
				error=1
				break
			if a.get_id()[1]==t:
				aa_list_full.append(a)
				t+=1
				break
			else:
				# number missing in the file: keep an empty position
				aa_list_full.append(None)
				t+=1
	if error==1:                 
		return 0
	
	try:
		depth=PDB.ResidueDepth(s)   # distance from each amino acid to the protein surface
	except:
		return 0

	dep_dict=depth.property_dict
	dep_keys=depth.property_keys  # not used below
	dep_list=depth.property_list  # not used below
	dps=[]
	for a in aa_list_full:
		# For a None position a.get_parent() raises, which the bare except
		# turns into the same placeholder as a missing or falsy entry.
		try:
			aa_id=(a.get_parent().get_id(),a.get_id())
			if dep_dict.get(aa_id):
				dps.append(dep_dict[aa_id])
			else:
				dps.append([None,None])
		except:
			dps.append([None,None])
	dps=np.array(dps)

	try:
		HSEA=PDB.HSExposureCA(s)
	except:
		return 0

	HSEA_dict=HSEA.property_dict
	HSEA_keys=HSEA.property_keys  # not used below
	HSEA_list=HSEA.property_list  # not used below
	hse_a=[]
	for a in aa_list_full:
		# same lookup pattern as for dps, with three placeholders
		try:
			aa_id=(a.get_parent().get_id(),a.get_id())
			if HSEA_dict.get(aa_id):
				hse_a.append(HSEA_dict[aa_id])
			else:
				hse_a.append([None,None,None])
		except:
			hse_a.append([None,None,None])
	hse_a=np.array(hse_a)

	try:
		HSEB=PDB.HSExposureCB(s)
	except:
		return 0

	HSEB_dict=HSEB.property_dict
	HSEB_keys=HSEB.property_keys  # not used below
	HSEB_list=HSEB.property_list  # not used below

	hse_b=[]
	for a in aa_list_full:
		# same lookup pattern as for hse_a
		try:
			aa_id=(a.get_parent().get_id(),a.get_id())
			if HSEB_dict.get(aa_id):
				hse_b.append(HSEB_dict[aa_id])
			else:
				hse_b.append([None,None,None])
		except:
			hse_b.append([None,None,None])

	hse_b=np.array(hse_b)

	# One letter per position; 'X' for names missing from t_dic and for
	# None positions (their get_resname() call raises).
	seq_list=''
	for a in aa_list_full:
		try:
			t=a.get_resname()
			if t in t_dic:
				seq_list+=t_dic[t]
			else:
				seq_list+='X'
		except:
			seq_list+='X'

	# CA, CB, N and C atom of every position; None where the lookup raises
	# (None position or atom not present).
	ca_list=[]
	for a in aa_list_full:
		try:
			t=a['CA']
			ca_list.append(t)
		except:
			t=None
			ca_list.append(t)

	cb_list=[]
	for a in aa_list_full:
		try:
			t=a['CB']
			cb_list.append(t)
		except:
			t=None
			cb_list.append(t)

	n_list=[]
	for a in aa_list_full:
		try:
			t=a['N']
			n_list.append(t)
		except:
			t=None
			n_list.append(t)
	c_list=[]
	for a in aa_list_full:
		try:
			t=a['C']
			c_list.append(t)
		except:
			t=None
			c_list.append(t)



	# angle[j][k] = [t1, t2, t3]: calc_angle(x, CA_j, CA_k) with x being the
	# CB, N and C atom of position j; [None, None, None] if either CA is missing.
	angle=[]                             # three angles: relative position of two amino acids
	for j in range(len(ca_list)):
		angle_t=[]
		for k in range(len(ca_list)):
			if ca_list[j]!=None and ca_list[k]!=None:
				ca1=ca_list[j].get_vector()
				ca2=ca_list[k].get_vector()
				if cb_list[j]!=None:
					cb=cb_list[j].get_vector()
					t1=PDB.vectors.calc_angle(cb,ca1,ca2)
				else:
					# No CB atom: use the point calha1 derives from N, C and CA.
					# NOTE(review): presumably a virtual CB position - confirm
					# against calha1.
					if c_list[j]!=None and n_list[j]!=None and ca_list[j]!=None:
						ca_v=ca_list[j].get_vector().get_array()
						c_v=c_list[j].get_vector().get_array()
						n_v=n_list[j].get_vector().get_array()
						cb=calha1(n_v,c_v,ca_v)
						cb=PDB.vectors.Vector(cb)
						t1=PDB.vectors.calc_angle(cb,ca1,ca2)
					else:
						t1=None
				if n_list[j]!=None:
					n_=n_list[j].get_vector()
					t2=PDB.vectors.calc_angle(n_,ca1,ca2)
				else:
					t2=None
				if c_list[j]!=None:
					c_=c_list[j].get_vector()
					t3=PDB.vectors.calc_angle(c_,ca1,ca2)
				else:
					t3=None
				angle_t.append([t1,t2,t3])
			else:
				angle_t.append([None,None,None])
		angle.append(angle_t)

	# the three values of (j, k) followed by the three of (k, j)
	# (list concatenation, not addition)
	angle_d=[]              # six angles
	for j in range(len(angle)):
		angle_dt=[]
		for k in range(len(angle[j])):
			angle_dt.append(angle[j][k]+angle[k][j])
		angle_d.append(angle_dt)
	angle_d=np.array(angle_d)

	ca_num=len(ca_list)
	ca_dist=[]             # CA distances, flat; reshaped to a square table below
	for j in range(len(ca_list)):
		for k in range(len(ca_list)):
			if ca_list[j]!=None and ca_list[k]!=None:
				ca_dist.append(ca_list[j]-ca_list[k])
			else:
				ca_dist.append(None)
	
	ca_dist=np.array(ca_dist)
	ca_dist=ca_dist.reshape(ca_num,ca_num)

	mask=[]    # whether a CA atom is present
	for j in range(len(ca_list)):
		if ca_list[j]!=None:
			mask.append(1)
		else:
			mask.append(0)
	
	ids=ca_dist==None
	ca_dist[ids]=100   # distances that cannot be computed are set to 100
	ca_dist_cs=[]
	angle_cs=[]
	num_cs=[]
	for j in range(len(ca_dist)):
		t=ca_dist[j]
		s=t.argsort()  # rebinds s; the chain is not needed any more
		# Entries 1..16 of the sort order; entry 0 is skipped.
		# NOTE(review): presumably entry 0 is the position itself - confirm.
		ca_dist_cs.append(t[s[1:17]])
		angle_cs.append(angle_d[j][s[1:17]])
		num_cs.append(s[1:17])

	# Bundle everything under the keys the saved file exposes.
	dic_r={}
	dic_r['dis']=ca_dist_cs
	dic_r['angle']=angle_cs
	dic_r['mask']=mask
	dic_r['ids']=num_cs
	dic_r['seq']=seq_list
	dic_r['dps']=dps
	dic_r['hsea']=hse_a
	dic_r['hseb']=hse_b

	out_name='pdb_other_cb/'+pdbid[i].lower()+pdbchain[i]+'_all.npy'
	np.save(out_name,dic_r)
示例#17
0
    line=line.split('\t')
    if line[3]=="Chl":
        chl_list.append(line[0])
        
# For every chlorophyll microenvironment: count the amino acid residues and
# the residues whose name is listed in `chl`, then write both counts and
# their ratio to out_file.
for microen in chl_list:
    # directory: second dot-separated field of the part before the first '_'
    dirname = microen.split('_')[0].split('.')[1]
    # raw string, because '\m' is not a valid escape sequence in a plain literal
    pdb_path = r'F:\microfolds_8_2018/new_all/' + dirname + '/' + microen + '.pdb'
    structure = parser.get_structure('pdb', pdb_path)
    res_no = 0
    non_resi = 0
    for model in structure:
        for residue in model.get_residues():
            if PDB.is_aa(residue):
                res_no += 1
            elif residue.resname in chl:
                non_resi += 1
        # One line per model; the counters are not reset between models, so
        # for multi-model files each line covers all models read so far.
        # NOTE(review): raises ZeroDivisionError when no residue from `chl`
        # has been counted - decide what should be written in that case.
        ratio = res_no / non_resi
        out_file.write(microen + '\t' + str(res_no) + '\t' + str(non_resi) + '\t' + str(ratio) + '\n')
out_file.close()
print('end')
示例#18
0
def main():
    """Extract the sequences of selected chains of a PDB file into a fasta file.

    For every requested chain the sequence of the residues with resolved
    coordinates is built and compared with the SEQRES record of that chain.
    The SEQRES sequence is written unless it is missing or, with
    --interactive, the user picks the coordinate sequence.  With --resseq the
    residue numbers of the amino acids of every chain are written as well.
    """
    # parse command line arguments
    parser = ArgumentParser()
    parser.add_argument('-p',
                        '--pdb',
                        dest='pdb',
                        required=True,
                        help='input PDB file')
    parser.add_argument('-c',
                        '--chains',
                        dest='chains',
                        required=True,
                        help='chains to extract sequence for')
    parser.add_argument('-o',
                        '--output',
                        dest='output',
                        required=True,
                        help='output fasta file')
    parser.add_argument('-r',
                        '--resseq',
                        dest='resseq',
                        help='residue seq number in the PDB file')
    parser.add_argument('-i',
                        '--interactive',
                        dest='interactive',
                        action='store_true',
                        help='select sequenceinteractively')
    args = parser.parse_args()

    print("input file:          " + args.pdb)
    print("output file:         " + args.output)
    print("chains:              " + args.chains)

    # extract sequences from SEQRES records
    pdb_id = basename(args.pdb).split('.')[0]
    with open(args.pdb, 'rt') as f:
        seqres_sequences = list(SeqIO.parse(f, 'pdb-seqres'))

    # extract sequences from residues with resolved coordinates
    structure = PDB.PDBParser().get_structure(id=pdb_id, file=args.pdb)
    model = structure[0]
    peptide_builder = PDB.Polypeptide.PPBuilder()
    sequences = []
    resseq_ids = []
    # args.chains is walked character by character: one chain id per letter
    for c in args.chains:
        residues = [r for r in model[c].get_residues() if PDB.is_aa(r)]
        resseq_ids.append([r.get_id()[1] for r in residues])
        peptides = peptide_builder.build_peptides(model[c])
        # NOTE(review): only the first peptide of the chain is used - confirm
        # that chains with breaks should be cut at the first break.
        coord_sequence = peptides[0].get_sequence()
        print('Sequence for chain ' + c + ' extracted from coordinates:')
        print(coord_sequence, '\n')

        # SEQRES record of this very chain.  The record id is the bare chain
        # id or carries a prefix, so the chain annotation is checked too.
        seqres_record = None
        for record in seqres_sequences:
            if c in (record.id, record.annotations.get('chain')):
                seqres_record = record
                break

        if len(seqres_sequences) == 0:
            # if the SEQRES records are missing from the PDB file, use sequence from coordinates
            print(
                'SEQRES records in the given PDB file are missing! Using sequence '
                'extracted from coordinates.')
            sequence = coord_sequence
        elif seqres_record is None:
            print('No SEQRES record found for chain ' + c +
                  '! Using sequence extracted from coordinates.')
            sequence = coord_sequence
        else:
            print('Sequence for chain ' + c + ' in SEQRES:')
            print(seqres_record.seq, '\n')
            # pairwise alignment between the two sequences
            print('Here is an alignment of the two sequences:')
            alignment = pairwise2.align.globalms(coord_sequence,
                                                 seqres_record.seq, 1, -0.5,
                                                 -10, 0)
            print(pairwise2.format_alignment(*alignment[0]))

            sequence = seqres_record.seq
            if args.interactive:
                # ask for which sequence to choose
                choice = input(
                    'Which sequence are you interested? 1 for sequence from coordinates '
                    '2 for sequence from SEQRES: ')
                if int(choice) == 1:
                    sequence = coord_sequence

        # append sequence for the current chain
        sequences.append(
            SeqRecord.SeqRecord(seq=sequence,
                                id=pdb_id.upper() + ':' + c,
                                description=''))

    # write sequences to a fasta file
    with open(args.output, 'wt') as f:
        SeqIO.write(sequences, f, 'fasta')

    # write resseq ids, one header line and one id line per chain
    if args.resseq is not None:
        with open(args.resseq, 'wt') as f:
            for chain_id, resseq in zip(args.chains, resseq_ids):
                f.write('> ' + chain_id + '\n')
                f.write(','.join(str(j) for j in resseq) + '\n')
    io.set_structure(s)

    # Remove disordered atoms,
    io.save(folder_name + file_name + "_ordered2.pdb", select=NotDisordered())
    s = parser.get_structure("my_pdb",
                             folder_name + file_name + "_ordered.pdb")
    io = PDBIO()
    io.set_structure(s)

    #Remove heteroatoms
    io.save(folder_name + file_name + "_ordered1.pdb", NonHetSelect())
    s = parser.get_structure("my_pdb",
                             folder_name + file_name + "_ordered.pdb")
    model = s[0]
    chain = model
    atoms = [a for a in chain.get_atoms() if pdb.is_aa(a.parent)]

    # Renumber residues to be sequential
    parents = []
    counter = 0
    for a in chain.get_atoms():
        if pdb.is_aa(a.parent):
            parents.append(a.parent)
            counter = counter + 1
    xyzs = [(a.coord) for a in atoms]
    xyzarr = np.array(xyzs)
    f = open(folder_name + file_name + '_ordered.pdb', 'w')
    id_counter = 0

    # Write to PDB file
    for i in range(0, len(atoms)):
示例#20
0
文件: BMOD.py 项目: drou0302/CapiPy
 def align_for_modeller(option):
     """Align the query with the template and build one model with Modeller.

     Steps, all on files in the working directory: append the peptide
     sequences of the last chain of ``structure`` to model.fasta, give
     model.fasta and query.fasta a single header each, join them into
     alignment.fasta, align them with salign (written to salign.ali), turn the
     template entry of a copy (salign1.ali) into a structure entry, run
     automodel for one model and finally warn when more than half of either
     aligned sequence consists of gaps.

     :param option: passed on as ``aa_only`` to ppb.build_peptides
     """
     # Id of the last chain and the number of amino acids in it.
     for chain in structure.get_chains():
         last_chain = chain.id
         tnum_res = sum(1 for res in chain.get_residues() if PDB.is_aa(res))

     # Append the sequence of every peptide of that chain to model.fasta.
     for pp in ppb.build_peptides(structure[0][str(last_chain)], aa_only=option):
         with open("model.fasta", "a") as fasta_out:
             fasta_out.write(str(pp.get_sequence()))

     # Number of the first residue and of the residue at index tnum_res - 1.
     template_chain = structure[0][last_chain]
     res_list = PDB.Selection.unfold_entities(template_chain, "R")
     if res_list:
         nid_1aa = res_list[0].get_id()[1]
         nid_lastaa = res_list[tnum_res - 1].get_id()[1]

     # Modify files so they have only 1 identifier + seq (id_1 or query)
     with open('model.fasta', 'r') as source:
         model_sequence = source.read()
     with open('model.fasta', 'w') as target:
         target.write(">" + file_name + "\n" + model_sequence)
     with open("query.fasta") as source:
         query_lines = source.readlines()
     query_lines[0] = ">query\n"
     with open("query.fasta", "w") as target:
         target.writelines(query_lines)

     # Combine original sequence and first hit into a fasta input file for the alignment
     with open('alignment.fasta', 'w+') as combined:
         for part in ("query.fasta", "model.fasta"):
             with open(part) as source:
                 combined.write(source.read())
             combined.write("\n")

     # Profile-profile alignment using salign from modeller
     log.none()
     aln = alignment(env, file='alignment.fasta', alignment_format='FASTA')
     aln.salign(rr_file='${LIB}/blosum62.sim.mat',
                gap_penalties_1d=(-500, 0),
                output='',
                align_block=15,
                align_what='PROFILE',
                alignment_type='PAIRWISE',
                comparison_type='PSSM',
                similarity_flag=True,
                substitution=True,
                smooth_prof_weight=10.0)
     aln.write(file='salign.ali', alignment_format='PIR')
     print("Alignment of template and query for modelling successfull.")

     # Fix formatting of the .ali file to specify the 1st and last aminoacid
     # and the structure of the model protein
     shutil.copyfile("salign.ali", "salign1.ali")
     with open("salign1.ali", "r") as source:
         ali_text = source.read()
     entry_head = ">P1;" + str(file_name)
     sequence_entry = entry_head + "\nsequence::     : :     : :::-1.00:-1.00"
     structure_entry = (entry_head + "\nstructureX" + ":" + str(file_name) +
                        ":" + str(nid_1aa) + ":" + str(last_chain) + ":" +
                        str(nid_lastaa) + ": ::::")
     with open("salign1.ali", "w+") as target:
         target.write(ali_text.replace(sequence_entry, structure_entry))

     # Make a single model of the query sequence using: salign1.ali and model.pdb
     a = automodel(env, alnfile="salign1.ali", knowns=id_1.casefold(), sequence="query")
     a.starting_model = 1
     a.ending_model = 1
     a.make()

     # Check how good the alignment is
     pir_alignment = AlignIO.read("salign1.ali", "pir")
     total_length = len(pir_alignment[0])
     gaps_first = sum(1 for symbol in pir_alignment[0].seq if symbol == "-")
     gaps_second = sum(1 for symbol in pir_alignment[1].seq if symbol == "-")
     mostly_gaps = (gaps_first / total_length > 0.5
                    or gaps_second / total_length > 0.5)
     if mostly_gaps:
         print(
             "\nYour model protein covers less than half of the query. The created model could be inaccurate, " +
             "please check the result before continuing with anything else.\n")
         time.sleep(3)
示例#21
0
    def atoms(self):
        """Append the atoms of every standard amino acid to self.atom_list"""
        standard_residues = (
            residue for residue in self.structure.get_residues()
            if PDB.is_aa(residue, standard=True)
        )
        for residue in standard_residues:
            self.atom_list.extend(residue.get_atoms())
示例#22
0
		break
	t=s.split()[0][:4]
	t2=s.split()[0][4:]
	pdbid.append(t)
	pdbchain.append(t2)

for i in range(len(pdbid)):
	print(pdbid[i],pdbchain[i])
	pdb_name='pdb_/pdb'+pdbid[i].lower()+'.ent'
	s = p.get_structure("1",pdb_name)
	s = s[0][pdbchain[i]]
	res_list = PDB.Selection.unfold_entities(s, 'R')
	
	aa_list = []
	for a in res_list:
		if PDB.is_aa(a):
			aa_list.append(a)

	error=0
	t=aa_list[0].get_id()[1]
	aa_list_full=[]
	for a in aa_list:
		while 1:
			if a.get_id()[1]<t:
				error=1
				break
			if a.get_id()[1]==t:
				aa_list_full.append(a)
				t+=1
				break
			else:
示例#23
0
def is_no_aa_chain(chain):
	"""
	Test if a chain contains no amino acids.

	:param chain: iterable of residues; each one is passed to ``PDB.is_aa``
	:return: True when ``PDB.is_aa`` is false for every residue (an empty
		chain therefore also yields True), False otherwise
	"""
	# A generator lets any() stop at the first amino acid instead of
	# classifying every residue into a throwaway list first.
	return not any(PDB.is_aa(r) for r in chain)
示例#24
0
def get_aa_list(res_list):
    """Return, in their original order, the items of ``res_list`` accepted by ``PDB.is_aa``."""
    amino_acids = []
    for residue in res_list:
        if not PDB.is_aa(residue):
            continue
        amino_acids.append(residue)
    return amino_acids
示例#25
0
def getPolygonalChain(PDBfilename,
                      outNumpy_b=0,
                      optionOut_b=False,
                      outputFile=''):
    '''Derive the polygonal chain representation of a protein from its PDB file.

    Only model 0 of the structure is read. For every chain, the C-alpha
    position of each residue that ``PDB.is_aa`` accepts and whose hetero-flag
    (``residue.id[0]``) is blank is recorded, and consecutive positions are
    paired into line segments ordered along the chain.

    PDBfilename -- path of the PDB file; also used as the structure id.
    outNumpy_b  -- when equal to 1 each C-alpha vector is wrapped with
                   ``np.array``; otherwise the object returned by
                   ``get_vector()`` is stored unchanged.
    optionOut_b -- when true the segments are also written to ``outputFile``.
    outputFile  -- destination path; only opened when ``optionOut_b`` is true.

    Returns ``(CaChain, polyCaChain)``: two dicts keyed by chain id holding,
    respectively, the list of C-alpha positions and the list of
    ``[previous, current]`` segments. A chain without any qualifying residue
    gets no key.

    Side effects: prints progress messages and replaces a blank chain id by
    ``'>'`` on the parsed structure.
    '''
    CaChain = {}
    polyCaChain = {}
    structure = parser.get_structure(PDBfilename,
                                     PDBfilename)  #file id = PDBfilename
    #we trust that the top100 files all have crystallographic content for modelId = 0:
    model = structure[0]
    # The two output modes differ only in how the C-alpha vector is stored, so
    # a single traversal serves both.
    as_numpy = outNumpy_b == 1
    cntCA = 0
    vPrev = None
    #loop through the chains; for each residue we store the CA info:
    for chain in model:
        print("Reading chain: %s" % chain)
        if chain.id.isspace():
            chain.id = '>'
            print("Chain id is blank and gets subst with: %s" % chain.id)
        for residue in chain:
            #residue.id[0] is the hetero-flag; if not blank the residue will
            #contain hetero-atoms (HETATM in pdb format)
            if not PDB.is_aa(residue) or residue.id[0] != ' ':
                continue
            #we only add chain as key if there is an aa in the chain:
            if chain.id not in CaChain:
                print("Recording a new chain id in the structure: %s" % chain.id)
                CaChain[chain.id] = []
                polyCaChain[chain.id] = []
                # Restart the counter so no segment links two different chains.
                cntCA = 0
            vCA = residue['CA'].get_vector()
            if as_numpy:
                vCA = np.array(vCA)
            CaChain[chain.id].append(vCA)
            if cntCA > 0:
                polyCaChain[chain.id].append([vPrev, vCA])
            vPrev = vCA
            cntCA += 1
    if optionOut_b:
        with open(outputFile, 'w') as of:
            for k in polyCaChain:
                # NOTE(review): the header is written without a trailing
                # newline, so the first segment shares its line -- kept as is
                # to leave the file format unchanged; confirm with consumers.
                of.write('file: ' + PDBfilename + ';' + 'Chain: ' + k)
                for v in polyCaChain[k]:
                    x0, y0, z0 = v[0]
                    x1, y1, z1 = v[1]
                    s = ';'.join(str(c)
                                 for c in (x0, y0, z0, x1, y1, z1)) + '\n'
                    of.write(s)

    return CaChain, polyCaChain
示例#26
0
def _distances_on_reference(aligned_md, aligned_pdb, distances):
    """Project per-residue PDB distances onto the reference sequence.

    Walks the two gapped alignment strings column by column and returns one
    value per non-gap character of ``aligned_md``: the matching entry of
    ``distances``, or NaN where the PDB sequence has a gap.
    """
    projected = []
    pdb_pos = 0  # index of the next unconsumed entry of ``distances``
    for aa_md, aa_pdb in zip(aligned_md, aligned_pdb):
        if aa_pdb == '-':
            if aa_md != '-':
                projected.append(float('nan'))
            continue
        # A PDB residue occupies this column, so its distance is consumed even
        # when the reference has a gap here; otherwise every later reference
        # residue would receive the distance of an earlier PDB residue.
        if aa_md != '-':
            projected.append(distances[pdb_pos])
        pdb_pos += 1
    return projected


def one_entry(entry, radius, out_path, sen_path, plot_path, prefix):
    """Add ligand distances to one entry's sensitivity table and test them.

    The sensitivity table ``masked_<PDB>_<Chain>_1.txt`` is read from
    ``sen_path``, the minimal atom-to-ligand distance of every amino-acid
    residue of the chain is computed from the PDB file found under the
    module-level ``pdb_path``, the values are mapped onto the table through a
    global sequence alignment, and the table (with the new ``d_ligand``
    column) is written to ``out_path``. The result is then handed to
    ``separate_values`` and ``plot_and_compare``.

    :param entry: record indexed by label; the keys 'PDB', 'Chain', 'GO',
        'Name', 'Ligand', 'LigID' and 'Organism' are read, and 'p', 'stat',
        'nc' and 'nd' are written on success
    :param radius: forwarded to ``separate_values``
    :param out_path: directory receiving the annotated table
    :param sen_path: directory holding the sensitivity tables
    :param plot_path: forwarded to ``plot_and_compare``
    :param prefix: forwarded to ``separate_values``
    :return: ``(head, entry, zero_distance_below_threshold)`` on success, or
        ``(tag, entry, None)`` where ``tag`` is one of '!LigID',
        '!Senstivity file', '!PDB file' or '!GO' when a step fails
    """
    print(entry)
    pdbid = entry.loc['PDB']
    chain = entry.loc['Chain']
    go = entry.loc['GO']
    name = entry.loc['Name']
    ligand = entry.loc['Ligand']

    try:
        lig_ids = entry.loc['LigID'].split(',')
    except AttributeError:
        # A missing LigID is not a string and therefore has no ``split``.
        print('LigID not given for {}_{}'.format(pdbid, chain))
        return '!LigID', entry, None
    # call parser:
    parser = pdb.PDBParser()

    # get data
    table_name = 'masked_{}_{}_1.txt'.format(pdbid, chain)
    sensitivity_file = os.path.join(sen_path, table_name)
    print('Reading {}'.format(sensitivity_file))

    try:
        md_df = pd.read_csv(sensitivity_file, sep='\t', index_col=0)
    except FileNotFoundError:
        print('File not found: {}'.format(sensitivity_file))
        return '!Senstivity file', entry, None

    pdb_file = os.path.join(pdb_path, '{}.pdb'.format(pdbid))
    print('Using PDB-File {}'.format(pdb_file))

    try:
        struc = parser.get_structure(id='{}_{}'.format(pdbid, chain),
                                     file=pdb_file)
    except FileNotFoundError:
        print('File not found: {}'.format(pdb_file))
        return '!PDB file', entry, None

    # select ligands; each LigID has the form "<chain>/<name>/<residue number>"
    ligs = []
    for lig_id in lig_ids:
        lig_chain, lig_name, lig_resid = lig_id.split('/')
        try:
            lig = struc[0][lig_chain][' ', int(lig_resid), ' ']
        except KeyError:
            # Not stored with a blank hetero-flag: retry as a hetero residue.
            lig_name = 'H_{}'.format(lig_name)
            lig = struc[0][lig_chain][lig_name, int(lig_resid), ' ']

        print(lig)
        ligs.append(lig)

    # The first table row is excluded from the sequence; it receives the
    # fixed distance 0.0 further down.
    seq = ''.join(md_df['AA'][1:])
    distances = []
    seq_matched = ''
    for res in struc[0][chain].get_residues():  # move along the protein chain
        if not pdb.is_aa(res):
            continue
        try:
            aa = three2single[res.get_resname()]
        except KeyError:
            # Residues without a one-letter code are left out entirely.
            continue

        # Min instead of mean: closest atom pair over all selected ligands.
        atom_distances = []
        for lig in ligs:
            for lig_at in lig.get_atoms():
                for res_at in res.get_atoms():
                    atom_distances.append(lig_at - res_at)

        seq_matched += aa
        distances.append(min(atom_distances))

    alignment_obj = pairwise2.align.globalxx(seq,
                                             seq_matched,
                                             one_alignment_only=True)[0]
    aligned_md, aligned_pdb = alignment_obj[:2]

    aligned_distances = _distances_on_reference(aligned_md, aligned_pdb,
                                                distances)

    md_df['d_ligand'] = [0.0] + aligned_distances

    md_df.to_csv(os.path.join(out_path, table_name), sep='\t')
    # this is saved now

    # rest of the calculation
    try:
        c, d, zero_distance_idxes = separate_values(md_df, go, radius, prefix)
    except IndexError:  # (KeyError, IndexError):
        print('GO not in data for {}: {} not in {}'.format(
            go, md_df.columns, pdbid))
        return '!GO', entry, None

    stat, p, zero_distance_below_threshold = plot_and_compare(
        c, d, zero_distance_idxes, plot_path, pdbid, chain, go, name, ligand)

    print(
        '{} {} - {:.1e}, {:.2f} \t-> {}, zero distance values below percentile: {}'
        .format(pdbid, chain, p, stat, ligand, zero_distance_below_threshold))

    entry.loc['p'] = p
    entry.loc['stat'] = stat
    entry.loc['nc'] = len(c)
    entry.loc['nd'] = len(d)
    org = entry['Organism']

    head = '{} {} - {}\n{} {}, {}, p = {:.3f}, t = {:.2f}\n\n'.format(
        org, name, ligand, pdbid, chain, go, p, stat)
    return head, entry, zero_distance_below_threshold
示例#27
0
# Script section: parse "<pdbid>.pdb" and report, for every chain, how many of
# its residues PDB.is_aa accepts.
# `pdbid` is not assigned in this section; it must already be bound.
warnings.simplefilter('ignore', BiopythonWarning)  # silence Biopython warnings
parser = PDBParser(PERMISSIVE=1)
structure_id = pdbid
filename = pdbid + ".pdb"
structure = parser.get_structure(structure_id, filename)
# The bare string below is disabled code kept by the author; it is never run.
'''
model=structure[0]
chain=model["A"]
print(len(model.get_list()))
'''

print(structure.get_full_id())
for chain in structure.get_chains():
    # i counts the residues of this chain that PDB.is_aa accepts.
    i = 0
    for temp in chain.get_residues():
        if PDB.is_aa(temp):
            i += 1
    print(chain, " length : ", i)  # amino-acid residue count of this chain
'''
print("\nfor the chain : ",end='')
print(chain)
residues=chain.get_residues()	#print all residues of a chain
for res in residues:
	print("residue name : ",end='')
	print(res.get_resname(),end='')
	if PDB.is_aa(res):
		print(" amino acid")
	else:
		print(" not amino acid")
	print("atoms and their coordinates in the residue : ")
	atoms=res.get_atom()
示例#28
0
def parse_pdb_length(name):
	"""Return how many residues of the first chain of ``name``'s PDB file are amino acids.

	The file is read from the directory named by the module-level ``organism``
	under ``2-get_pdb_chain``. Only residues present in the parsed structure
	are counted, so residues missing from the file are not included.
	"""
	reader = PDBParser()
	pdb_file = "../../../0-identify_structure/2-get_pdb_chain/{0}/{1}.pdb".format(organism, name)
	structure = reader.get_structure(name, pdb_file)
	# The file is expected to hold a single chain, so the first one is taken.
	first_chain = list(structure.get_chains())[0]
	return sum(1 for residue in first_chain.get_residues() if PDB.is_aa(residue))