def fetchmol(qnmrfinal, qxrayfinal, qartfinal):
    """Tally hits per molecular-weight bin for three lists of IDs.

    For every ID the full record is fetched with ``pd.get_all_info`` and handed
    to the matching counter (``molcounternmr``, ``molcounterxray`` or
    ``molcounterart``) together with the running tally; whatever the counter
    returns becomes the new tally. The weight of an individual ID is not
    stored, only the per-bin counts (bins are in kDa) that feed the bar graph.

    Args:
        qnmrfinal: iterable of IDs tallied into the NMR bins.
        qxrayfinal: iterable of IDs tallied into the X-ray bins.
        qartfinal: iterable of IDs tallied into the artifact bins.

    Returns:
        Tuple ``(nmr, xray, art)`` of dicts mapping a bin key to a count.
        NMR keys are 15 evenly spaced values from 2.5 to 37.5; X-ray and
        artifact keys are 14 evenly spaced values from 20 to 280 plus an
        extra ``1000`` key.
    """
    nmr = {edge: 0 for edge in np.linspace(2.5, 37.5, 15, endpoint=True)}

    xray_art_edges = np.linspace(20, 280, 14, endpoint=True)
    xray = {edge: 0 for edge in xray_art_edges}
    xray[1000] = 0
    art = {edge: 0 for edge in xray_art_edges}
    art[1000] = 0

    # Best effort: an ID whose lookup or counting fails is skipped. Only
    # ``Exception`` is caught so Ctrl-C / SystemExit still stop the run.
    for hit in qnmrfinal:
        try:
            nmr = molcounternmr(pd.get_all_info(hit), nmr)
        except Exception:
            continue
    for hit in qxrayfinal:
        try:
            xray = molcounterxray(pd.get_all_info(hit), xray)
        except Exception:
            continue
    for hit in qartfinal:
        try:
            art = molcounterart(pd.get_all_info(hit), art)
        except Exception:
            continue

    print("NMR Mol Weight Tally:" + str(nmr))
    print("Xray Mol Weight Tally:" + str(xray))
    print("Artifact Mol Weight Tally:" + str(art))
    return nmr, xray, art
예제 #2
0
def extractData(maxDepositionDate, foundStructures):
    """Collect name, title and deposition date for recently deposited entries.

    Each entry is described with ``describe_pdb``; entries whose deposition
    date is on or after ``maxDepositionDate`` are kept (despite its name the
    argument acts as a lower bound).

    Args:
        maxDepositionDate: date compared against each entry's deposition date.
        foundStructures: iterable of entry IDs to inspect.

    Returns:
        Dict mapping entry ID to ``(moleculeName, title, deposition_date_str)``.
        ``moleculeName`` is a list of names when the entry reports more than
        one entity, a single name when it reports exactly one, and a fallback
        message otherwise.
    """
    results = {}

    for entry in foundStructures:
        entityInfo = describe_pdb(entry)
        depositionDate = datetime.date(
            parser.parse(entityInfo['deposition_date']))

        if depositionDate < maxDepositionDate:
            continue

        entityTitle = entityInfo['title']
        entityExtraInfo = get_all_info(entry)
        # Reuse the description fetched above instead of querying again.
        nrEntities = int(entityInfo['nr_entities'])

        # moleculeName is rebuilt for every entry so names never leak from one
        # entry into the next and results do not share one list object.
        if nrEntities > 1:
            moleculeName = [
                mol.get('macroMolecule').get('@name')
                for mol in entityExtraInfo['polymer']
            ]
        elif nrEntities == 1:
            moleculeName = entityExtraInfo.get('polymer').get(
                'macroMolecule').get('@name')
        else:
            moleculeName = 'No molecule name give'

        results[entry] = (moleculeName, entityTitle, str(depositionDate))

    return results
예제 #3
0
	def fetch_info(self, pdb_ids):
		"""Look up every ID in ``pdb_ids`` and mark it as checked.

		For each ID the full record is fetched and the enzyme class and
		polymer length are read from it. Any failure along the way is ignored
		(best effort); the ID is passed to ``add_to_checked`` either way.
		"""
		# ``desc`` is tqdm's keyword for the progress-bar label; the former
		# ``disc`` spelling is not a tqdm argument.
		for pdb_id in tqdm(pdb_ids, desc="Checking PDB for new structures", unit="Structures "):
			try:
				info = get_all_info(pdb_id)
				enzyClass = info["polymer"]["enzClass"]["@ec"]
				length = int(info["polymer"]["@length"])
				# TODO: add enzyClass / length to the data file
			except Exception:
				# Record missing, not an enzyme, or multi-polymer entry: skip.
				pass
			add_to_checked(pdb_id)
def get_organism(id):
    """Return the organism name(s) recorded for an entry.

    When the entry's ``polymer`` field is a list, the taxonomy name of each
    polymer is collected, duplicates are dropped (first occurrence wins) and
    the names are joined with single spaces. Otherwise the taxonomy name of
    the single polymer is returned as is.
    """
    record = pypdb.get_all_info(id)
    polymers = record.get('polymer')

    # Single polymer: nothing to merge.
    if not isinstance(polymers, list):
        return polymers.get('Taxonomy').get('@name')

    # An entry can be associated with more than one organism.
    organisms = [polymer.get('Taxonomy').get('@name') for polymer in polymers]
    return ' '.join(dict.fromkeys(organisms))
예제 #5
0
def MakeTemplateList(fasta):
    """Merge the ``*.table`` files in the working directory into CSV reports.

    Every file ending in ``.table`` in the current directory is read. The
    three characters in front of the ``.table`` suffix are used as the row's
    source tag. The second line read becomes the column header; from the
    fourth line on, every line that does not start with ``#`` becomes a row
    made of its first 18 whitespace-separated fields plus the remaining fields
    joined into one description. Each row is extended with a UniProt accession
    looked up through ``pypdb`` and with the source tag.

    Side effects: writes ``HmmerInfo/CSV/<fasta>_fullhmmer.csv`` and
    ``HmmerInfo/CSV/<fasta>_noduphmmer.csv`` and moves ``<fasta>.MCSA.table``
    and ``<fasta>.PDB.table`` into ``HmmerInfo``.

    Args:
        fasta: base name used for the output CSVs and the moved table files.

    Returns:
        List of ``target-name`` values left after dropping duplicate UniProt
        IDs.
    """
    # NOTE: the counter runs across all files, so the header is taken from
    # the first table file only.
    counter = 0
    DataList = []
    for file in os.listdir('.'):
        if not file.endswith('.table'):
            continue
        source = file[-9:-6]
        # ``with`` closes the handle; it used to be left open.
        with open(file) as table:
            for line in table:
                counter += 1
                if counter == 2:
                    # Glue multi-word column titles so split() keeps them whole.
                    headerstring = line
                    headerstring = headerstring.replace(
                        'target name', 'target-name')
                    headerstring = headerstring.replace(
                        'query name', 'query-name')
                    headerstring = headerstring.replace(
                        'description of target', 'description-of-target')
                    # [1:] drops the first character of the header line.
                    headerstring = headerstring[1:].split()
                    headerstring.append('UniProtID')
                    headerstring.append('Source')
                    headerstring = list(MakeUniq(headerstring))

                elif counter >= 4 and line[0] != '#':
                    line2 = line.split()
                    description = ' '.join(line2[18:])
                    line3 = line2[:18]
                    line3.append(description)
                    if source == 'CSA':
                        line3[0] = line3[0][:6].lower().replace(':', '_')
                    # The first four characters of the target name are used
                    # as the ID for the lookup.
                    PDBinfo = pypdb.get_all_info(line3[0][:4])['polymer']
                    if isinstance(PDBinfo, list):
                        UniProtID = PDBinfo[0]['macroMolecule'][
                            'accession']['@id']
                    else:
                        UniProtID = PDBinfo['macroMolecule']['accession'][
                            '@id']
                    line3.append(UniProtID)
                    line3.append(source)
                    DataList.append(line3)

    df = pd.DataFrame(DataList, columns=headerstring)
    df['E-value'] = df['E-value'].astype(float)
    df.sort_values(by=['Source', 'E-value'], inplace=True)
    df.to_csv('HmmerInfo/CSV/' + fasta + '_fullhmmer.csv')
    df.drop_duplicates(subset='UniProtID', inplace=True)
    df.to_csv('HmmerInfo/CSV/' + fasta + '_noduphmmer.csv')
    targetlist = df['target-name'].tolist()
    # NOTE(review): ``fasta`` is interpolated into a shell command; names with
    # spaces or shell metacharacters will not behave as intended.
    os.system('mv ' + fasta + '.MCSA.table HmmerInfo/MCSAhm')
    os.system('mv ' + fasta + '.PDB.table HmmerInfo/PDBhm')
    return targetlist
예제 #6
0
def fetch_and_name_chains(cif_id):
    '''
DESCRIPTION

    1. Fetch a cif structure from the PDB
    2. Rename the cif file according to its 4-letter PDB code followed by its Genus_species name
    3. Create a PyMOL object for each of its chains and name them according to what is on the PDB

    Nothing is fetched when the structure reports no species or more than
    one species; a message is printed instead.

    '''

    polymers = pypdb.get_all_info(cif_id)['polymer']
    # A structure with a single polymer is reported as one dict, not a list.
    if not isinstance(polymers, list):
        polymers = [polymers]

    taxonomy = set()
    chain_to_name = {}
    for polymer in polymers:
        taxonomy.add(polymer['Taxonomy']['@name'])
        # Keep only the text before the first comma of the description.
        polymer_description = polymer['polymerDescription'][
            '@description'].split(',')[0]
        entry_name = re.sub(r'\(|\)', '', polymer_description)
        chain = polymer['chain']['@id']
        # Strip the generic ribosomal wording so only the specific name is left.
        name = re.sub(r'ribosomal\s|protein\s|60S\s|40S\s|subunit\s|\sprotein',
                      '',
                      entry_name,
                      flags=re.IGNORECASE)
        chain_to_name[name] = chain

    if len(taxonomy) > 1:
        print('multiple species in structure')
        # The former bare ``exit`` was a no-op expression; actually stop here.
        return
    if not taxonomy:
        print('no species found in structure')
        return

    organism = next(iter(taxonomy))
    Genus_species = organism.replace(' ', '_')
    ge_sp = organism.split()
    # Short tag: first two letters of the genus + first two of the species,
    # with the species initial capitalised.
    GeSp = ge_sp[0][:2] + ge_sp[1][0].capitalize() + ge_sp[1][1]

    cmd.fetch(cif_id, cif_id + '_' + GeSp)
    os.rename(cif_id + '.cif', cif_id + '_' + Genus_species + '.cif')

    for obj_name, chain_id in chain_to_name.items():
        cmd.create(
            GeSp + '_' + cif_id + '_' + obj_name,
            cif_id + '_' + GeSp + ' and chain ' + chain_id)

    cmd.show_as('cartoon')
예제 #7
0
    def _get_edge_cluster(pdb,pdb_names,percent):
        """Return the edges linking ``pdb`` to sequence-similar entries.

        For every chain of ``pdb`` the sequence cluster at ``percent`` is
        requested (up to 10 attempts per chain). Each cluster member whose
        entry ID is in ``pdb_names`` yields one ``(pdb, member_id)`` edge.

        Args:
            pdb: ID of the entry whose chains are queried.
            pdb_names: container of entry IDs allowed as neighbours.
            percent: value forwarded as ``percent`` to the cluster request.

        Returns:
            List of ``(pdb, neighbour_id)`` tuples; duplicates are kept.
        """
        edges = []

        # get info and chain labels
        polymer = pypdb.get_all_info(pdb)['polymer']
        # A single polymer is reported as one dict rather than a list.
        if not isinstance(polymer, list):
            polymer = [polymer]

        chain_labels = []
        for p in polymer:
            chain = p['chain']
            if not isinstance(chain, list):
                chain = [chain]
            chain_labels.extend(c['@id'] for c in chain)

        # get all the neighbours for all the chains
        for chain in chain_labels:

            # Start from an empty cluster so a chain whose requests all fail
            # neither raises NameError nor reuses the previous chain's result.
            cluster = []
            check, niter = False, 0
            while not check and niter < 10:
                try:
                    result, check = pypdb.get_seq_cluster_percent(
                        pdb + '.' + chain, percent=percent)
                    cluster = result['pdbChain']
                except Exception as e:
                    print(str(e))
                    print('Request failed for %s.%s -> Trying again' %(pdb,chain))
                niter += 1

            # add the (pdb, pdbneighbour) pairs to the edge list
            if len(cluster) > 0:

                # a single member is reported as one dict
                if not isinstance(cluster, list):
                    cluster = [cluster]

                for n in cluster:
                    # member names look like "<entry>.<chain>"
                    pdbid = n['@name'].split('.')[0]

                    # make sure the neighbour is in pdb_names
                    if pdbid in pdb_names:
                        edges.append((pdb, pdbid))

        return edges
예제 #8
0
def screen_pdb(pdb, dict_cond=None):
    """Check one entry against a set of acceptance criteria.

    The entry's ``rcsb_entry_info`` record is tested, in order, for
    experimental method, resolution, polymer entity type and monomer count.
    The first failing criterion stops the check.

    Args:
        pdb: ID of the entry to screen.
        dict_cond: criteria dict with the keys ``method``, ``resolution``
            (upper bound), ``type`` (list of accepted entity types),
            ``len_min`` and ``len_max``. ``number_of_entity`` is part of the
            defaults but is not evaluated. Defaults are used when ``None``.

    Returns:
        ``(check, reason)`` where ``check`` is truthy when the entry is
        accepted and ``reason`` is a human-readable explanation.
    """
    if dict_cond is None:
        dict_cond = {
            'method': 'xray',
            # float('inf') replaces np.float('Inf'): the np.float alias was
            # removed from NumPy.
            'resolution': float('inf'),
            'number_of_entity': 2,
            'type': ['protein'],
            'len_min': 50,
            'len_max': 5000
        }

    info = pypdb.get_all_info(pdb)['rcsb_entry_info']

    # method
    method = info['experimental_method']
    if method != dict_cond['method']:
        return False, 'Incorrect Method : %s' % method

    # resolution: an entry without a reported value counts as infinitely
    # coarse, so it only passes an unbounded resolution criterion
    if 'diffrn_resolution_high' in info:
        resolution = float(info['diffrn_resolution_high']['value'])
    else:
        resolution = float('inf')
    if not resolution <= dict_cond['resolution']:
        # Report the value that was actually tested.
        return False, 'Low Resolution (%1.2f)' % resolution

    # type of chain
    types = info['selected_polymer_entity_types']
    if types not in dict_cond['type']:
        return False, 'Incorrect chain Type %s' % types

    # length
    length = info['deposited_polymer_monomer_count']
    if not dict_cond['len_min'] <= length <= dict_cond['len_max']:
        return False, 'Incorrect chain length %d' % length

    return True, 'Entries accepted'
"""Show two methods for establishing the relation between UniProt IDs and PDB chain IDs."""
import os  # used to change the working directory
import pypdb
import pprint

# First method: fetch everything pypdb returns for entry 6fai and pretty-print
# it. NOTE(review): the working directory is a hard-coded, machine-specific
# path; adjust it before running elsewhere.
os.chdir('/Users/luho/PycharmProjects/pdb/code')
all_info = pypdb.get_all_info('6fai')
pprint.pprint(all_info)

# Second method (not implemented here):
# download the complete structure / chain ID / UniProt ID mapping from http://www.ebi.ac.uk/pdbe/docs/sifts/
예제 #10
0




# Drop every ID that is already listed in checked_list.
for i in tqdm(checked_list):
	if i in pdb_ids:
		pdb_ids.remove(i)

# Record entries with more than 200 and at most 400 residues that carry an
# enzyme class; every processed ID is appended to the checked file.
for pdb_id in tqdm(pdb_ids):
	pdb_info = describe_pdb(pdb_id)
	AAs = int(pdb_info["nr_residues"])

	if 200 < AAs <= 400:
		info = get_all_info(pdb_id)
		try:
			enzyClass = info["polymer"]["enzClass"]["@ec"]
		except (KeyError, TypeError):
			# No enzyme class on this entry (or a multi-polymer entry): skip.
			pass
		else:
			# Kept outside the try so a failing write is not silently hidden.
			ec.write("{}, ".format(pdb_id))
	checked.write("{}, ".format(pdb_id))


def read_list(path):
	"""Open the text file at ``path`` for reading.

	NOTE(review): as visible here the function only opens the file; nothing is
	read, returned or closed. The body looks truncated - confirm against the
	complete source before relying on it.
	"""
	list_enzymes = open(path,'r')
예제 #11
0
    def get_protein_cluster_graph(cluster, percent):
        """Build a graph whose nodes are protein names and whose edges are entries.

        For every entry in ``cluster`` the first chain of each polymer is
        given a name (polymer description, else macromolecule name). The name
        is shared with all chains returned by the sequence-cluster request at
        ``percent``, so similar chains in later entries map to the same node.
        Each entry adds one edge keyed by the sorted tuple of its chain names.

        Args:
            cluster: iterable of entry IDs.
            percent: value forwarded as ``percent`` to the cluster request.

        Returns:
            ``nx.Graph`` with node attributes ``number`` (occurrence count)
            and ``txt`` (``<br>``-joined chain IDs, cut off with ``...`` at
            the 35th occurrence) and edge attributes ``weight`` and ``txt``
            (``<br>``-joined entry IDs).
        """
        edges, nodes, dict_chains = {}, {}, {}
        Edge = namedtuple('Edge', ['weight', 'txt'])
        Node = namedtuple('Node', ['number', 'txt'])
        pdbid = None
        for pdb in tqdm(cluster):

            print_id(pdb, pdb, pdbid)

            # get the polymer infos, retrying up to 10 times
            check, niter = True, 0
            while check and niter < 10:
                try:
                    polymer = pypdb.get_all_info(pdb)['polymer']
                    check = False
                except Exception:
                    # ``Exception`` only, so Ctrl-C can still break the retries.
                    print('PDBUnique -> Issue getting info for :', pdb)
                    print('PDBUnique -> Trying again in 5 sec')
                    time.sleep(5)
                    niter += 1
            if check:
                print('PDBUnique -> Entry %s ignored' % pdb)
                continue

            # A single polymer is reported as one dict rather than a list.
            if not isinstance(polymer, list):
                polymer = [polymer]

            # get the chain labels
            chain_labels, chain_entity = [], []
            for ip, p in enumerate(polymer):

                chain = p['chain']

                # only conserve the first chain
                if isinstance(chain, list):
                    chain = chain[0]

                chain_labels.append(chain['@id'])
                chain_entity.append(ip)

            # init the names
            names = [None] * len(chain_labels)
            print_id(chain, pdb, pdbid)
            nup = 0
            # enumerate chains
            for ic, (chain, ip) in enumerate(zip(chain_labels, chain_entity)):

                # pdb.chain ID
                id_chain = pdb + '.' + chain

                # add the pdb.chain ID to the general dict
                # {pdb.chain: prot name}
                if id_chain not in dict_chains:

                    # use the polymer description or macromolecule name
                    for name_option, tag in zip(
                        ['polymerDescription', 'macroMolecule'],
                        ['@description', '@name']):
                        if name_option in polymer[ip]:
                            if isinstance(polymer[ip][name_option], list):
                                names[ic] = polymer[ip][name_option][0][tag]
                            else:
                                names[ic] = polymer[ip][name_option][tag]
                            break

                        # NOTE(review): this is only reached while no name has
                        # been assigned yet, so it never matches; it looks like
                        # it was meant to run after the loop. Left unchanged.
                        if names[ic] == 'Uncharacterized Protein':
                            names[ic] = 'UP_%03d' % nup
                            nup += 1

                    # add the pdb.chain to the dict
                    dict_chains[id_chain] = names[ic]

                    # get the seq similarity of the chain, retrying up to 10
                    # times; a dedicated local avoids shadowing ``cluster``
                    check, niter = True, 0
                    while check and niter < 10:
                        try:
                            neighbors, _ = pypdb.get_seq_cluster_percent(
                                id_chain, percent=percent)
                            check = False
                        except Exception:
                            print('PDBUnique -> Issue getting cluster for :',
                                  id_chain)
                            print('PDBUnique -> Trying again in 5 sec')
                            time.sleep(5)
                            niter += 1

                    if check:
                        print('PDBUnique -> Entry %s ignored' % id_chain)
                        neighbors = []
                    else:
                        neighbors = neighbors['pdbChain']
                    print_id(neighbors, pdb, pdbid)

                    # add all the chains with similar seq
                    # to the dict_chain {pdb.chain: prot_name}
                    if len(neighbors) > 0:
                        if not isinstance(neighbors, list):
                            neighbors = [neighbors]
                        for n in neighbors:
                            dict_chains[n['@name']] = names[ic]

                # reuse the previously defined entry
                else:
                    names[ic] = dict_chains[id_chain]

                # add the node to the dict of Node namedtuples
                key = names[ic]
                print_id(key, pdb, pdbid)
                if key not in nodes:
                    nodes[key] = Node(number=1, txt=id_chain)
                else:
                    nodes[key] = nodes[key]._replace(
                        number=nodes[key].number + 1)
                    if nodes[key].number < 35:
                        nodes[key] = nodes[key]._replace(
                            txt=nodes[key].txt + '<br>' + id_chain)
                    # Compare the counter, not the namedtuple itself, so the
                    # ellipsis marker is really added on the 35th occurrence.
                    elif nodes[key].number == 35:
                        nodes[key] = nodes[key]._replace(
                            txt=nodes[key].txt + '<br>' + '...')

            # add the edge to the dict of Edge namedtuples
            names.sort()
            key = tuple(names)
            print_id(key, pdb, pdbid)
            if key not in edges:
                edges[key] = Edge(weight=1, txt=pdb)
            else:
                edges[key] = edges[key]._replace(weight=edges[key].weight + 1)
                edges[key] = edges[key]._replace(txt=edges[key].txt + '<br>' +
                                                 pdb)

        # Create the graph
        g = nx.Graph()

        for node_key, node_val in nodes.items():
            g.add_node(node_key, number=node_val.number, txt=node_val.txt)

        # Only the first two names of an edge key are used as end points.
        for edge_key, edge_val in edges.items():
            g.add_edge(edge_key[0],
                       edge_key[1],
                       weight=edge_val.weight,
                       txt=edge_val.txt)

        return g
예제 #12
0
 def _get_pdb_length(pdb):
     """Return the length of every polymer of ``pdb`` as a list of floats.

     A record holding a single polymer as one dict (rather than a list) is
     handled as a one-element list.
     """
     polymer = pypdb.get_all_info(pdb)['polymer']
     if not isinstance(polymer, list):
         polymer = [polymer]
     return [float(p['@length']) for p in polymer]