def fetchmol(qnmrfinal, qxrayfinal, qartfinal):
    """Tally hits per molecular-weight bin for the NMR, X-ray and artifact ID lists.

    For every ID the full record is fetched with ``pd.get_all_info`` and handed
    to the matching ``molcounter*`` helper together with the running tally.
    Only the per-bin counts are kept; which ID fell into which bin is not
    recorded.  The tallies feed the bar graph built later.

    Args:
        qnmrfinal: iterable of IDs for the NMR tally.
        qxrayfinal: iterable of IDs for the X-ray tally.
        qartfinal: iterable of IDs for the artifact tally.

    Returns:
        Tuple ``(nmr, xray, art)`` of dicts mapping a bin value (kDa, per the
        original author's description) to a count.
    """
    # NMR bins: 15 evenly spaced values from 2.5 to 37.5.
    nmrmolrange = np.linspace(2.5, 37.5, 15, endpoint=True)
    nmr = dict.fromkeys(nmrmolrange, 0)

    # X-ray and artifact share the same bins: 14 values from 20 to 280,
    # plus one extra bin keyed 1000.
    xrayartmolrange = np.linspace(20, 280, 14, endpoint=True)
    xray = dict.fromkeys(xrayartmolrange, 0)
    xray[1000] = 0
    art = dict.fromkeys(xrayartmolrange, 0)
    art[1000] = 0

    # Best effort: an ID whose lookup or counting fails is skipped.
    # ``except Exception`` (instead of a bare ``except``) keeps that behaviour
    # while still letting Ctrl-C / SystemExit interrupt the long-running loops.
    for hit in qnmrfinal:
        try:
            nmr = molcounternmr(pd.get_all_info(hit), nmr)
        except Exception:
            continue

    for hit in qxrayfinal:
        try:
            xray = molcounterxray(pd.get_all_info(hit), xray)
        except Exception:
            continue

    for hit in qartfinal:
        try:
            art = molcounterart(pd.get_all_info(hit), art)
        except Exception:
            continue

    print("NMR Mol Weight Tally:" + str(nmr))
    print("Xray Mol Weight Tally:" + str(xray))
    print("Artifact Mol Weight Tally:" + str(art))
    return nmr, xray, art
def extractData(maxDepositionDate, foundStructures):
    """Collect name, title and deposition date for sufficiently recent structures.

    Args:
        maxDepositionDate: date threshold; only entries whose deposition date
            is greater than or equal to it are kept.
        foundStructures: iterable of structure IDs to look up.

    Returns:
        Dict mapping each kept ID to a tuple
        ``(moleculeName, entityTitle, depositionDate_as_str)``.
        ``moleculeName`` is a list of names when the entry reports more than
        one entity, a single name when it reports exactly one, and a
        placeholder string otherwise.
    """
    results = {}
    for entry in foundStructures:
        # One describe_pdb call per entry; the result is reused for the
        # entity count instead of querying the service again.
        entityInfo = describe_pdb(entry)
        # NOTE(review): this call form presumably relies on
        # ``from datetime import datetime`` so that ``datetime.date`` is the
        # method extracting the date part - confirm against the file imports.
        depositionDate = datetime.date(
            parser.parse(entityInfo['deposition_date']))
        if depositionDate < maxDepositionDate:
            continue

        entityTitle = entityInfo['title']
        entityExtraInfo = get_all_info(entry)
        entityCount = int(entityInfo['nr_entities'])

        # moleculeName is rebuilt for every entry.  It used to be a single
        # list created before the loop, so names accumulated across entries
        # and every multi-entity result shared the same list object.
        if entityCount > 1:
            moleculeName = [
                mol.get('macroMolecule').get('@name')
                for mol in entityExtraInfo['polymer']
            ]
        elif entityCount == 1:
            moleculeName = entityExtraInfo.get('polymer').get(
                'macroMolecule').get('@name')
        else:
            moleculeName = 'No molecule name give'

        results[entry] = (moleculeName, entityTitle, str(depositionDate))
    return results
def fetch_info(self, pdb_ids):
    """Look up each PDB ID and mark it as checked.

    Args:
        pdb_ids: iterable of PDB IDs to process.

    For every ID the full record is fetched and the enzyme class and polymer
    length are read from it.  IDs for which that fails are skipped silently;
    every ID is passed to ``add_to_checked`` regardless of the outcome.
    """
    # ``desc`` is tqdm's progress-bar label; the previous spelling ``disc``
    # is not a tqdm parameter and is rejected as an unknown argument.
    for pdb_id in tqdm(pdb_ids, desc="Checking PDB for new structures",
                       unit="Structures "):
        try:
            info = get_all_info(pdb_id)
            enzyClass = info["polymer"]["enzClass"]["@ec"]
            length = int(info["polymer"]["@length"])
            # TODO: ADD TO DATA FILE (enzyClass / length are not stored yet)
        except Exception:
            # Deliberate best effort: entries without an enzyme class or
            # with an unexpected record layout are simply not recorded.
            pass
        add_to_checked(pdb_id)
def get_organism(id):
    """Return the organism name(s) recorded for the polymers of a PDB entry.

    Args:
        id: PDB identifier passed to ``pypdb.get_all_info``.

    Returns:
        The taxonomy name of the single polymer, or - when the entry lists
        several polymers - the distinct names in first-seen order joined by
        single spaces.
    """
    record = pypdb.get_all_info(id)
    polymers = record.get('polymer')

    # Single polymer: the record is a mapping, not a list.
    if not isinstance(polymers, list):
        return polymers.get('Taxonomy').get('@name')

    # Several polymers: an entry can have more than one associated organism.
    organisms = [
        entity.get('Taxonomy').get('@name') for entity in polymers
    ]
    # dict.fromkeys drops duplicates while keeping the original order.
    distinct = list(dict.fromkeys(organisms))
    return ' '.join(distinct)
def MakeTemplateList(fasta):
    """Merge the ``*.table`` hit tables in the current directory into CSV files.

    Every file ending in ``.table`` is parsed: line 2 supplies the column
    header, lines from the 4th onward that do not start with ``#`` are hits.
    Each hit is extended with a UniProt accession (looked up through
    ``pypdb.get_all_info``) and with the source tag taken from the file name.

    Args:
        fasta: base name used for the output CSV files and for the two table
            files that are moved away afterwards.

    Returns:
        List of the ``target-name`` values that remain after sorting by
        source and E-value and dropping duplicate UniProt IDs.

    Side effects:
        Writes ``HmmerInfo/CSV/<fasta>_fullhmmer.csv`` and
        ``HmmerInfo/CSV/<fasta>_noduphmmer.csv``; moves
        ``<fasta>.MCSA.table`` and ``<fasta>.PDB.table`` into ``HmmerInfo``.
    """
    # Function-scope import so the block does not depend on a file-level one.
    import subprocess

    DataList = []
    for file in os.listdir('.'):
        if not file.endswith('.table'):
            continue
        # The three characters before '.table', e.g. 'CSA' for
        # '<name>.MCSA.table' and 'PDB' for '<name>.PDB.table'.
        source = file[-9:-6]
        # ``with`` closes the handle; line numbers restart for every file so
        # the header / data positions hold for each table, not only the first.
        with open(file) as table:
            for counter, line in enumerate(table, start=1):
                if counter == 2:
                    # Multi-word column titles are hyphenated so that a plain
                    # whitespace split yields one token per column.
                    headerstring = line
                    headerstring = headerstring.replace(
                        'target name', 'target-name')
                    headerstring = headerstring.replace(
                        'query name', 'query-name')
                    headerstring = headerstring.replace(
                        'description of target', 'description-of-target')
                    # [1:] drops the first character of the header line.
                    headerstring = headerstring[1:].split()
                    headerstring.append('UniProtID')
                    headerstring.append('Source')
                    headerstring = list(MakeUniq(headerstring))
                elif counter >= 4:
                    if line.startswith('#'):
                        continue
                    line2 = line.split()
                    if not line2:
                        # Blank line: nothing to parse.
                        continue
                    # Columns 0-17 are fixed; everything after is the
                    # free-text description, re-joined into one field.
                    description = ' '.join(line2[18:])
                    line3 = line2[:18]
                    line3.append(description)
                    if source == 'CSA':
                        line3[0] = line3[0][:6].lower().replace(':', '_')
                    # The first four characters of the target are used as
                    # the PDB ID for the lookup.
                    PDBinfo = pypdb.get_all_info(line3[0][:4])['polymer']
                    if isinstance(PDBinfo, list):
                        UniProtID = PDBinfo[0]['macroMolecule'][
                            'accession']['@id']
                    else:
                        UniProtID = PDBinfo['macroMolecule']['accession'][
                            '@id']
                    line3.append(UniProtID)
                    line3.append(source)
                    DataList.append(line3)

    df = pd.DataFrame(DataList, columns=headerstring)
    df['E-value'] = df['E-value'].astype(float)
    df.sort_values(by=['Source', 'E-value'], inplace=True)
    df.to_csv('HmmerInfo/CSV/' + fasta + '_fullhmmer.csv')
    df.drop_duplicates(subset='UniProtID', inplace=True)
    df.to_csv('HmmerInfo/CSV/' + fasta + '_noduphmmer.csv')
    targetlist = df['target-name'].tolist()

    # Argument lists instead of a shell string: names containing spaces or
    # shell metacharacters are passed to ``mv`` verbatim.  ``check=False``
    # keeps the old best-effort behaviour when a table file is absent.
    subprocess.run(['mv', fasta + '.MCSA.table', 'HmmerInfo/MCSAhm'],
                   check=False)
    subprocess.run(['mv', fasta + '.PDB.table', 'HmmerInfo/PDBhm'],
                   check=False)
    return targetlist
def fetch_and_name_chains(cif_id):
    '''
    DESCRIPTION

    1. Fetch a cif structure from the PDB.
    2. Rename the cif file according to its 4-letter PDB code followed by
       its Genus_species name.
    3. Create a PyMOL object for each of its chains and name them according
       to the polymer description found on the PDB.

    Nothing is fetched when the entry contains polymers from more than one
    species; a message is printed and the function returns.
    '''
    polymers = pypdb.get_all_info(cif_id)['polymer']
    # A single-entity entry yields one mapping instead of a list.
    if not isinstance(polymers, list):
        polymers = [polymers]

    taxonomy = set()
    chain_to_name = {}
    for polymer in polymers:
        taxonomy.add(polymer['Taxonomy']['@name'])
        # Keep only the text before the first comma and strip parentheses.
        polymer_description = polymer['polymerDescription'][
            '@description'].split(',')[0]
        entry_name = re.sub(r'\(|\)', '', polymer_description)
        chain = polymer['chain']['@id']
        # Drop generic words so only the distinguishing part of the name
        # remains (raw strings: same patterns, no invalid-escape warnings).
        name = re.sub(
            r'ribosomal\s|protein\s|60S\s|40S\s|subunit\s|\sprotein',
            '', entry_name, flags=re.IGNORECASE)
        chain_to_name[name] = chain

    if len(taxonomy) > 1:
        print('multiple species in structure')
        # The original had a bare ``exit`` here, which is a no-op
        # expression; returning actually stops the processing without
        # terminating the host interpreter.
        return

    for organism in taxonomy:
        Genus_species = organism.replace(' ', '_')
        ge_sp = organism.split()
        # Short tag: first two letters of the genus + first two letters of
        # the species with the first one capitalised.
        GeSp = ge_sp[0][:2] + ge_sp[1][0].capitalize() + ge_sp[1][1]
        cmd.fetch(cif_id, cif_id + '_' + GeSp)
        os.rename(cif_id + '.cif', cif_id + '_' + Genus_species + '.cif')
        for obj_name in chain_to_name:
            cmd.create(
                GeSp + '_' + cif_id + '_' + obj_name,
                cif_id + '_' + GeSp + ' and chain ' + chain_to_name[obj_name])
    cmd.show_as('cartoon')
def _get_edge_cluster(pdb, pdb_names, percent):
    """Return edges linking ``pdb`` to known entries with similar chains.

    Args:
        pdb: PDB ID whose chains are examined.
        pdb_names: collection of PDB IDs that are allowed as neighbours.
        percent: value forwarded to ``pypdb.get_seq_cluster_percent``.

    Returns:
        List of ``(pdb, neighbour_pdb_id)`` tuples, one per cluster member
        whose PDB ID is in ``pdb_names``.
    """
    edges = []

    # get info and chain labels
    polymer = pypdb.get_all_info(pdb)['polymer']
    # A single-entity entry yields one mapping instead of a list.
    if not isinstance(polymer, list):
        polymer = [polymer]

    chain_labels = []
    for p in polymer:
        chain = p['chain']
        if not isinstance(chain, list):
            chain = [chain]
        chain_labels.extend(c['@id'] for c in chain)

    # get the neighbours of every chain
    for chain in chain_labels:
        # Start from an empty cluster so a chain whose requests all fail is
        # skipped instead of reusing the previous chain's result (or raising
        # NameError on the first chain).
        cluster = []
        # At most 10 attempts, counted whether the call raises or merely
        # reports failure, so the loop always terminates.
        for _attempt in range(10):
            try:
                response, ok = pypdb.get_seq_cluster_percent(
                    pdb + '.' + chain, percent=percent)
                if ok:
                    cluster = response['pdbChain']
                    break
            except Exception as e:
                print(str(e))
                print('Request failed for %s.%s -> Trying again'
                      % (pdb, chain))

        if not cluster:
            continue
        # A single neighbour comes back as one mapping instead of a list.
        if not isinstance(cluster, list):
            cluster = [cluster]

        # add the (pdb, pdbneighbour) pairs to the edge list
        for n in cluster:
            pdbid = n['@name'].split('.')[0]
            # make sure the neighbour is in pdb_names
            if pdbid in pdb_names:
                edges.append((pdb, pdbid))
    return edges
def screen_pdb(pdb, dict_cond=None):
    """Check a PDB entry against method, resolution, type and length limits.

    Args:
        pdb: PDB ID to look up through ``pypdb.get_all_info``.
        dict_cond: optional dict of conditions with the keys ``method``,
            ``resolution``, ``type``, ``len_min`` and ``len_max``.  When
            omitted, defaults are used (x-ray, no resolution limit, protein,
            50 to 5000 monomers).  The default also carries
            ``number_of_entity``, which no active check reads.

    Returns:
        Tuple ``(accepted, reason)``: ``accepted`` is a bool (the previous
        implementation returned the equivalent ints 1 / 0) and ``reason`` is
        the rejection message or ``'Entries accepted'``.
    """
    if dict_cond is None:
        dict_cond = {
            'method': 'xray',
            # float('inf') replaces np.float('Inf'); the np.float alias no
            # longer exists in current NumPy releases.
            'resolution': float('inf'),
            'number_of_entity': 2,
            'type': ['protein'],
            'len_min': 50,
            'len_max': 5000
        }

    info = pypdb.get_all_info(pdb)['rcsb_entry_info']

    # method
    method = info['experimental_method']
    if method != dict_cond['method']:
        return False, 'Incorrect Method : %s' % method

    # resolution: an entry without a recorded value is treated as infinitely
    # coarse, so it only passes when no finite limit is requested.
    if 'diffrn_resolution_high' in info:
        resolution = float(info['diffrn_resolution_high']['value'])
    else:
        resolution = float('inf')
    if not resolution <= dict_cond['resolution']:
        # Report the value that was actually compared; the old message read
        # info['resolution'], a key this function never otherwise relies on.
        return False, 'Low Resolution (%1.2f)' % resolution

    # type of chain (the entity-count and chain-count checks are disabled)
    types = info['selected_polymer_entity_types']
    if types not in dict_cond['type']:
        return False, 'Incorrect chain Type %s' % types

    # length
    l = info['deposited_polymer_monomer_count']
    if not (dict_cond['len_min'] <= l <= dict_cond['len_max']):
        return False, 'Incorrect chain length %d' % l

    return True, 'Entries accepted'
"""this code is showing two methods to establish the relations between uniprotID and pdb chainID"""
import os  # used to change the working directory
import pypdb
import pprint

# First method: fetch the full record of one entry with pypdb and
# pretty-print it for inspection.
# NOTE(review): the working directory is a hard-coded, machine-specific path.
os.chdir('/Users/luho/PycharmProjects/pdb/code')
all_info = pypdb.get_all_info('6fai')
pprint.pprint(all_info)

# Second method:
# download all the Structure-chainID-UniprotID mapping from
# http://www.ebi.ac.uk/pdbe/docs/sifts/
# Drop IDs that were already checked from the work list.
# list.remove deletes only the first occurrence of each ID.
for i in tqdm(checked_list):
    if i in pdb_ids:
        pdb_ids.remove(i)

# Scan the remaining IDs: entries with more than 200 and at most 400
# residues are looked up in full, and their ID is written to ``ec`` when the
# record exposes an enzyme class.  Every scanned ID is written to ``checked``.
for pdb_id in tqdm(pdb_ids):
    pdb_info = describe_pdb(pdb_id)
    AAs = int(pdb_info["nr_residues"])
    if AAs <= 400:
        if AAs > 200:
            info = get_all_info(pdb_id)
            try:
                # Raises when the record has no enzyme class; such entries
                # are skipped by the handler below.
                enzyClass = info["polymer"]["enzClass"]["@ec"]
                ec.write("{}, ".format(pdb_id))
            except:
                pass
        else:
            pass
    else:
        pass
    checked.write("{}, ".format(pdb_id))


# NOTE(review): this definition appears to continue beyond the visible
# excerpt; only the lines shown here are reproduced.
def read_list(path):
    "Reads list stored in txt file"
    list_enzymes = open(path,'r')
def get_protein_cluster_graph(cluster, percent):
    """Build a graph of protein names from a collection of PDB entries.

    Each distinct protein name becomes a node and each PDB entry contributes
    one edge keyed by the sorted names of its chains.  Chains that share a
    sequence cluster (as reported by ``pypdb.get_seq_cluster_percent``) are
    mapped to the same name.

    Args:
        cluster: iterable of PDB IDs to process.
        percent: value forwarded to ``pypdb.get_seq_cluster_percent``.

    Returns:
        ``networkx.Graph`` whose nodes carry ``number`` and ``txt`` attributes
        and whose edges carry ``weight`` and ``txt`` attributes.  ``txt``
        values are ``<br>``-separated ID lists.
    """
    edges, nodes, dict_chains = {}, {}, {}
    Edge = namedtuple('Edge', ['weight', 'txt'])
    Node = namedtuple('Node', ['number', 'txt'])
    pdbid = None

    for pdb in tqdm(cluster):
        print_id(pdb, pdb, pdbid)

        # get the polymer infos: up to 10 attempts, 5 s apart.
        # ``except Exception`` keeps the retry behaviour but lets Ctrl-C
        # interrupt the sleep/retry cycle.
        for _attempt in range(10):
            try:
                polymer = pypdb.get_all_info(pdb)['polymer']
                break
            except Exception:
                print('PDBUnique -> Issue getting info for :', pdb)
                print('PDBUnique -> Trying again in 5 sec')
                time.sleep(5)
        else:
            print('PDBUnique -> Entry %s ignored' % pdb)
            continue

        # A single-entity entry yields one mapping instead of a list.
        if not isinstance(polymer, list):
            polymer = [polymer]

        # get the chain labels
        chain_labels, chain_entity = [], []
        chain = None
        for ip, p in enumerate(polymer):
            chain = p['chain']
            # only keep the first chain of each entity
            if isinstance(chain, list):
                chain = chain[0]
            chain_labels.append(chain['@id'])
            chain_entity.append(ip)

        # init the names
        names = [None] * len(chain_labels)
        print_id(chain, pdb, pdbid)
        nup = 0

        # enumerate chains
        for ic, (chain, ip) in enumerate(zip(chain_labels, chain_entity)):
            # pdb.chain ID
            id_chain = pdb + '.' + chain

            # add the pdb.chain ID to the general dict {pdb.chain: prot name}
            if id_chain not in dict_chains:
                # use the polymer description or, failing that, the
                # macromolecule name
                for name_option, tag in zip(
                        ['polymerDescription', 'macroMolecule'],
                        ['@description', '@name']):
                    if name_option in polymer[ip]:
                        if isinstance(polymer[ip][name_option], list):
                            names[ic] = polymer[ip][name_option][0][tag]
                        else:
                            names[ic] = polymer[ip][name_option][tag]
                        break

                # give uncharacterized proteins a per-entry numbered name
                if names[ic] == 'Uncharacterized Protein':
                    names[ic] = 'UP_%03d' % nup
                    nup += 1

                # add the pdb.chain to the dict
                dict_chains[id_chain] = names[ic]

                # get the seq similarity of the chain (same retry policy).
                # A separate local name is used so the ``cluster`` parameter
                # is no longer overwritten while it is being iterated.
                for _attempt in range(10):
                    try:
                        response, _ = pypdb.get_seq_cluster_percent(
                            id_chain, percent=percent)
                    except Exception:
                        print('PDBUnique -> Issue getting cluster for :',
                              id_chain)
                        print('PDBUnique -> Trying again in 5 sec')
                        time.sleep(5)
                    else:
                        seq_cluster = response['pdbChain']
                        break
                else:
                    print('PDBUnique -> Entry %s ignored' % id_chain)
                    seq_cluster = []
                print_id(seq_cluster, pdb, pdbid)

                # add all the chains with similar seq
                # to the dict_chain {pdb.chain: prot_name}
                if len(seq_cluster) > 0:
                    if not isinstance(seq_cluster, list):
                        seq_cluster = [seq_cluster]
                    for n in seq_cluster:
                        dict_chains[n['@name']] = names[ic]

            # reuse the previously defined entry
            else:
                names[ic] = dict_chains[id_chain]

            # add the node to the dict of Node namedtuples
            key = names[ic]
            print_id(key, pdb, pdbid)
            if key not in nodes:
                nodes[key] = Node(number=1, txt=id_chain)
            else:
                nodes[key] = nodes[key]._replace(number=nodes[key].number + 1)
                # List the first 34 chains explicitly, then a single '...'.
                if nodes[key].number < 35:
                    nodes[key] = nodes[key]._replace(
                        txt=nodes[key].txt + '<br>' + id_chain)
                # Compare the counter, not the namedtuple itself: the old
                # ``nodes[key] == 35`` was never true, so the ellipsis was
                # never appended.
                elif nodes[key].number == 35:
                    nodes[key] = nodes[key]._replace(
                        txt=nodes[key].txt + '<br>' + '...')

        # add the edge to the dict of Edge namedtuples
        names.sort()
        key = tuple(names)
        print_id(key, pdb, pdbid)
        if key not in edges:
            edges[key] = Edge(weight=1, txt=pdb)
        else:
            edges[key] = edges[key]._replace(weight=edges[key].weight + 1)
            edges[key] = edges[key]._replace(
                txt=edges[key].txt + '<br>' + pdb)

    # Create the graph
    g = nx.Graph()
    for node_key, node_val in nodes.items():
        g.add_node(node_key, number=node_val.number, txt=node_val.txt)
    # NOTE(review): only the first two names of each key are used, so entries
    # are assumed to have exactly two chains - confirm with the caller.
    for edge_key, edge_val in edges.items():
        g.add_edge(edge_key[0], edge_key[1],
                   weight=edge_val.weight, txt=edge_val.txt)
    return g
def _get_pdb_length(pdb):
    """Return the length of every polymer entity of a PDB entry.

    Args:
        pdb: PDB ID passed to ``pypdb.get_all_info``.

    Returns:
        List of floats, one ``@length`` value per polymer entity.
    """
    polymer = pypdb.get_all_info(pdb)['polymer']
    # A single-entity entry yields one mapping instead of a list; iterating
    # it directly would walk its keys and fail on the '@length' lookup.
    if not isinstance(polymer, list):
        polymer = [polymer]
    return [float(p['@length']) for p in polymer]