def get_structure(pdb_data_folder, structure_PDB_ID):
    """
    Load the 3D structure of a molecule, downloading it first if necessary.

    Looks for ``<structure_PDB_ID>.pdb`` and ``<structure_PDB_ID>.cif`` inside
    ``pdb_data_folder``. When neither is a non-empty file, the entry is fetched
    with ``PDBList.retrieve_pdb_file`` using that method's default file format.
    A parsable mmCIF file takes precedence over a PDB file. When the mmCIF file
    cannot be parsed, the PDB-format file is downloaded and parsed instead.

    :param pdb_data_folder: path to the folder containing all 3D structures;
        must support the ``/`` operator (e.g. ``pathlib.Path``)
    :param structure_PDB_ID: PDB ID of the structure
    :return: the structure object
    :raises FileNotFoundError: if no structure file is available after the
        download attempt
    """
    pdb_path = pdb_data_folder / (structure_PDB_ID + ".pdb")
    cif_path = pdb_data_folder / (structure_PDB_ID + ".cif")

    if not is_non_zero_file(pdb_path) and not is_non_zero_file(cif_path):
        PDBList().retrieve_pdb_file(structure_PDB_ID, pdir=pdb_data_folder)

    structure = None
    if is_non_zero_file(pdb_path):
        structure = PDBParser().get_structure(structure_PDB_ID, pdb_path)

    if is_non_zero_file(cif_path):
        try:
            structure = MMCIFParser(QUIET=True).get_structure(
                structure_PDB_ID, cif_path)
        except Exception:
            # Parse the path reported by the downloader: PDB-format downloads
            # are stored as "pdb<id>.ent", not "<id>.pdb".
            fallback_path = PDBList().retrieve_pdb_file(
                structure_PDB_ID, pdir=pdb_data_folder, file_format='pdb')
            structure = PDBParser().get_structure(
                structure_PDB_ID, fallback_path)

    if structure is None:
        raise FileNotFoundError(
            "No structure file found for {} in {}".format(
                structure_PDB_ID, pdb_data_folder))
    return structure
def getPDB(self, ID=None):
    """
    Retrieve a PDB-format file from RCSB into the current directory.

    ``ID`` is used when supplied, otherwise ``self.id``. When both are
    ``None`` nothing is downloaded and ``None`` is returned. On a download
    the value returned by ``PDBList.retrieve_pdb_file`` is passed through.
    """
    from Bio.PDB import PDBList

    pdb_id = ID if ID is not None else self.id
    if pdb_id is None:
        return None
    return PDBList().retrieve_pdb_file(pdb_id, pdir='.', file_format='pdb')
def mmcif_Method(pdbId_list, filePath, info_dict):
    """
    Collect summary information for PDB entries from their mmCIF files.

    For every ID in ``pdbId_list`` the mmCIF dictionary is loaded (downloading
    the file when it cannot be opened locally) and one value is appended to
    each of the lists ``info_dict['PDBID']``, ``['name']``, ``['PMID']``,
    ``['resolution']`` and ``['Chain Information']``.

    :param pdbId_list: iterable of PDB IDs
    :param filePath: string prefix prepended to ``<pdbId>.cif``; also passed
        as the download directory
    :param info_dict: dict of lists that is extended in place
    :return: the same ``info_dict`` object
    """

    def loadPDB_CIF_format(pdbId, filePath, pdbl):
        """Return the mmCIF dictionary for ``pdbId``, downloading on demand."""
        pdbFileSavePath = '%s%s.cif' % (filePath, pdbId)
        try:
            return MMCIF2Dict(pdbFileSavePath)
        except IOError:
            # Parse the path reported by the downloader: it may differ from
            # the hand-built one (e.g. in letter case), which would make a
            # second open of pdbFileSavePath fail.
            downloaded = pdbl.retrieve_pdb_file(pdbId,
                                                file_format='mmCif',
                                                pdir=filePath)
            return MMCIF2Dict(downloaded or pdbFileSavePath)

    def getChainInfo_CIF(mmcif_dict, resListName, chainListName):
        """Build ``'<chain>: <count>; '`` for each chain, skipping '?' residues."""
        dfrm = pd.DataFrame({
            'res': mmcif_dict[resListName],
            'chain_id': mmcif_dict[chainListName]
        })
        return ''.join(
            '%s: %s; ' % (chain, sum(1 for res in df['res'] if res != '?'))
            for chain, df in dfrm.groupby('chain_id'))

    pdbl = PDBList()
    for pdbId in pdbId_list:
        mmcif_dict = loadPDB_CIF_format(pdbId, filePath, pdbl)
        info_dict['PDBID'].append(pdbId)
        info_dict['name'].append(mmcif_dict['_struct.title'])
        info_dict['PMID'].append(
            mmcif_dict['_citation.pdbx_database_id_PubMed'])
        info_dict['resolution'].append(mmcif_dict['_refine.ls_d_res_high'])
        info_dict['Chain Information'].append(
            getChainInfo_CIF(mmcif_dict, '_pdbx_poly_seq_scheme.pdb_mon_id',
                             '_pdbx_poly_seq_scheme.pdb_strand_id'))
    return info_dict
def get_unique(input_df):
    """
    Download every distinct entry of ``input_df.CPX`` in PDB format.

    Files are requested through ``PDBList.retrieve_pdb_file`` with ``'PDB'``
    as the target directory. Nothing is returned.
    """
    from Bio.PDB import PDBList

    codes = input_df.CPX.unique()
    downloader = PDBList()
    for code in codes:
        downloader.retrieve_pdb_file(code, pdir='PDB', file_format="pdb")
def get_pdb(pdb_list):
    """
    Download PDB-format files for the given identifiers.

    Only the first four characters of each identifier are used as the PDB
    code. An identifier is skipped when ``<out_dir><identifier>.pdb`` already
    exists.

    :param pdb_list: iterable of identifiers (PDB code in the first 4 chars)
    :return: list with the value returned by ``retrieve_pdb_file`` for every
        download that was attempted
    """
    import os
    from Bio.PDB import PDBList

    out_dir = "PDB_benchmark_structures\\"
    filename = []
    # A single client serves the whole batch.
    pdbl = PDBList()
    print("Downloading in %s:\n" % out_dir)
    for pdbid in pdb_list:
        print('%s' % pdbid[:4])
        try:
            if not os.path.exists("{}{}.pdb".format(out_dir, pdbid)):
                x = pdbl.retrieve_pdb_file(pdbid[:4],
                                           file_format='pdb',
                                           pdir=out_dir)
                filename.append(x)
        except FileNotFoundError:
            print("(NOTE) {} not found.".format(pdbid))
    return filename
def build_xtalstable(dbpathstr, sourcedbpathstrs, pdbformat='cif'):
    """
    Download crystal structures referenced by source databases and record
    them in the ``XTALS`` table of the database at ``dbpathstr``.

    For every source database, rows of ``CAZYSEQDATA`` with non-null
    ``pdbids`` are scanned for 4-character IDs followed by ``[``. All IDs are
    downloaded into an ``Xtals`` folder next to the target database, and one
    row per newly seen ID is inserted. IDs already recorded as successful are
    skipped. IDs recorded as failed are updated when the file is now present.

    :param dbpathstr: path of the database holding the ``XTALS`` table
    :param sourcedbpathstrs: iterable of paths to source databases
    :param pdbformat: value stored in the ``pdbformat`` column. It does not
        influence the download call (which uses ``PDBList``'s default
        format) nor the ``.cif`` extension used for the existence check.
    """
    dbpath = Path(dbpathstr)
    today = datetime.date.today()
    todaystr = f'{today.year}-{today.month:02d}-{today.day:02d}'
    conn = seqdbutils.gracefuldbopen(dbpath)
    try:
        # The folder containing the structures is named 'Xtals' and lives in
        # the same directory as the target database.
        xtaldirpath = dbpath.parent / 'Xtals'
        if not xtaldirpath.exists():
            os.mkdir(xtaldirpath)
        c = conn.cursor()
        c.execute(
            '''CREATE TABLE IF NOT EXISTS XTALS (pdbid text, acc text, srcdb text,dldate text, relpath text, pdbformat text, dlsuccess int, obsolete int)'''
        )
        pdbl = PDBList()
        # Raw string: '\[' is an invalid escape in a normal string literal.
        cxRE = re.compile(r'([A-Za-z0-9]{4})\[')
        for srcdbpathstr in sourcedbpathstrs:
            srcdbpath = Path(srcdbpathstr)
            srcdbstr = srcdbpath.name
            src_conn = seqdbutils.gracefuldbopen(srcdbpath)
            try:
                seqdbutils.check_tables_exist(src_conn, ['CAZYSEQDATA'])
                src_c = src_conn.cursor()
                src_c.execute(
                    'SELECT acc,pdbids FROM CAZYSEQDATA WHERE pdbids NOT NULL')
                pdbrows = src_c.fetchall()
                dbpdbs = []
                dbaccs = []
                for pdbrow in pdbrows:
                    pdbs = cxRE.findall(pdbrow['pdbids'])
                    dbaccs.extend([pdbrow['acc']] * len(pdbs))
                    dbpdbs.extend(pdbs)
                pdbl.download_pdb_files(dbpdbs, pdir=xtaldirpath)
                for acc, pdb in zip(dbaccs, dbpdbs):
                    # Downloaded files are named with the lowercased ID, so
                    # the existence check must use the same case to succeed
                    # on case-sensitive file systems.
                    rel_pdbpath = xtaldirpath / f'{pdb.lower()}.cif'
                    download_success = os.path.exists(rel_pdbpath)
                    str_relpath = str(rel_pdbpath)
                    c.execute(
                        '''SELECT COUNT(*) FROM XTALS WHERE pdbid=(?) AND dlsuccess=(?)''',
                        (pdb, 1))
                    already_downloaded = c.fetchone()[0]
                    if already_downloaded:
                        continue
                    c.execute(
                        '''SELECT COUNT(*) FROM XTALS WHERE pdbid=(?) AND dlsuccess=(?)''',
                        (pdb, 0))
                    previously_failed = c.fetchone()[0]
                    if previously_failed:
                        if download_success:
                            print(f'new download of previously failed {pdb}')
                            c.execute(
                                '''UPDATE XTALS SET dldate = (?), dlsuccess = (?)
                                WHERE pdbid=(?)''', (todaystr, 1, pdb))
                        # A failed row already exists: never insert a second
                        # row for the same ID.
                        continue
                    c.execute(
                        '''INSERT INTO XTALS VALUES (?,?,?,?,?,?,?,?)''',
                        (pdb, acc, srcdbstr, todaystr, str_relpath, pdbformat,
                         download_success, None))
                conn.commit()
            finally:
                src_conn.close()
    finally:
        conn.close()
def download_pdb(config, pdb_code: str) -> Path:
    """
    Download a PDB structure and store it as ``<pdb_dir>/<pdb_code>.pdb``.

    When ``config.pdb_dir`` is not set it is initialised to ``/tmp/``.

    :param config: object with a ``pdb_dir`` attribute (download directory).
    :param pdb_code: 4 character PDB accession code.
    :type pdb_code: str
    :raises FileNotFoundError: if no file was downloaded.
    :return: returns filepath to downloaded structure.
    :rtype: Path
    """
    if not config.pdb_dir:
        config.pdb_dir = Path("/tmp/")
    # Accept str as well as Path for the configured directory.
    pdb_dir = Path(config.pdb_dir)

    # Initialise class and download pdb file
    pdbl = PDBList()
    downloaded = pdbl.retrieve_pdb_file(pdb_code,
                                        pdir=pdb_dir,
                                        overwrite=True,
                                        file_format="pdb")
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not downloaded or not os.path.exists(downloaded):
        raise FileNotFoundError(
            f"Could not download PDB file for: {pdb_code}")

    # Rename file to .pdb from .ent, using the path the downloader reported
    # rather than rebuilding it from the (possibly upper-case) code.
    target = pdb_dir / f"{pdb_code}.pdb"
    os.rename(downloaded, target)

    log.info(f"Downloaded PDB file for: {pdb_code}")
    return target
def pdb_Method(pdbId_list, filePath, info_dict):
    """
    Collect summary information for PDB entries from their PDB-format files.

    For every ID in ``pdbId_list`` the structure is parsed from
    ``<filePath>pdb<id>.ent`` (downloading it when the file cannot be opened)
    and one value is appended to each of ``info_dict['PDBID']``, ``['name']``,
    ``['PMID']``, ``['resolution']`` and ``['Chain Information']``.
    The same ``info_dict`` object is returned.
    """

    def loadPDB_PDB_format(pdbId, filePath, parser, pdbl):
        """Parse the local .ent file, fetching it first if it is missing."""
        try:
            return parser.get_structure(
                pdbId, filePath + "pdb%s.ent" % pdbId.lower())
        except IOError:
            pdbl.retrieve_pdb_file(pdbId, file_format='pdb', pdir=filePath)
        return parser.get_structure(
            pdbId, filePath + "pdb%s.ent" % pdbId.lower())

    def getChainInfo_PDB(structure):
        """Build ``'<chain id>: <count>; '`` for each chain of the first model."""
        pieces = []
        for chain in structure[0].get_chains():
            n_aa = len([res for res in chain if is_aa(res)])
            pieces.append('%s: %s; ' % (chain.id, n_aa))
        return ''.join(pieces)

    parser = PDBParser(PERMISSIVE=1)
    pdbl = PDBList()
    findPMID = re.compile(r'PMID\s+([\d,]+)')

    for pdbId in pdbId_list:
        structure = loadPDB_PDB_format(pdbId, filePath, parser, pdbl)
        header = structure.header
        info_dict['PDBID'].append(pdbId)
        info_dict['name'].append(header['name'])
        info_dict['PMID'].append(findPMID.findall(header['journal']))
        info_dict['resolution'].append(header['resolution'])
        info_dict['Chain Information'].append(getChainInfo_PDB(structure))
    return info_dict
def transform(tab, pdbrepo='pdbs'):
    """
    Download the structure of every row and build a table of paths and ranges.

    For each row of ``tab`` the PDB-format file for ``row['accession']`` is
    fetched and copied next to the download as ``<accession>.pdb``.

    :param tab: DataFrame with columns ``A``, ``B``, ``accession``,
        ``chainA``, ``chainB``, ``fromA``, ``toA``, ``fromB``, ``toB``
    :param pdbrepo: local PDB directory handed to ``PDBList``
    :return: DataFrame whose columns are, in order, ``chainA``, ``chainB``,
        ``PDBid``, ``codeA``, ``codeB``, ``path``, ``locA``, ``locB``;
        ``path`` is relative to the current working directory, or ``None``
        when the copy failed
    """

    def trans1(p, Id):
        """Copy download ``p`` to ``<Id>.pdb`` beside it; return relative path."""
        try:
            path = os.path.join(os.path.dirname(p), '{}.pdb'.format(Id))
            copyfile(p, path)
        except Exception:
            # Best effort: report and carry on with the remaining rows, but
            # let KeyboardInterrupt/SystemExit propagate.
            sys.stderr.write(
                'error when downloading structure {}\n'.format(Id))
            return None
        return os.path.relpath(path, os.getcwd())

    def func1(row):
        path = pl.retrieve_pdb_file(row['accession'], file_format='pdb')
        return trans1(path, row['accession'])

    def func2(row):
        ret = [
            row['fromA'] + '-' + row['toA'], row['fromB'] + '-' + row['toB']
        ]
        return pd.Series(ret)

    ret = tab[['A', 'B', 'accession', 'chainA', 'chainB']]
    # prepare for path/ locA/ locB
    # Honour the pdbrepo argument instead of a hard-coded directory.
    pl = PDBList(pdb=pdbrepo)
    pdbs = tab.apply(func1, 1)
    ret = pd.concat([ret, pdbs], axis=1)
    locs = tab.apply(func2, 1)
    ret = pd.concat([ret, locs], axis=1)
    ret.columns = [
        'chainA', 'chainB', 'PDBid', 'codeA', 'codeB', 'path', 'locA', 'locB'
    ]
    return ret
def retrieve_cif_list():
    """
    Download, as mmCIF, every entry listed in ``input_files/protlist.txt``.

    The list file is split on whitespace and each token is treated as one
    PDB ID. Files are written to ``input_files/cif`` and existing files are
    overwritten.
    """
    server = PDBList(server='ftp://ftp.wwpdb.org',
                     pdb='input_files',
                     obsolete_pdb=None,
                     verbose=True)
    # Context manager guarantees the handle is closed even if read() fails.
    with open('input_files/protlist.txt', 'r') as pdb_list:
        content = pdb_list.read().split()
    server.download_pdb_files(content,
                              pdir="input_files/cif",
                              file_format='mmCif',
                              overwrite=True,
                              obsolete=False)
def download_pdbs(base_dir, protein_codes):
    """
    Downloads the PDB database (or a part of it) as PDB files. Every protein
    is stored in its own directory (named after the upper-cased PDB code)
    under base_dir.

    A download counts as failed when the downloader raises ``IOError`` or
    when the file it reports does not exist afterwards.

    :param base_dir: where to download all the proteins.
    :param protein_codes: the PDB codes of the proteins that should be
        downloaded; either an iterable of codes or a dict whose values are
        iterables of codes.
    """
    if isinstance(protein_codes, dict):
        prot_codes = [
            code for codes in protein_codes.values() for code in codes
        ]
    else:
        prot_codes = protein_codes
    prot_codes = list(set(prot_codes))

    from Bio.PDB import PDBList

    failed = 0
    attempted = len(prot_codes)
    for code in prot_codes:
        try:
            pl = PDBList(pdb=os.path.join(base_dir, code.upper()))
            pl.flat_tree = 1
            path = pl.retrieve_pdb_file(pdb_code=code)
        except IOError:
            path = None
        # The downloader may report a failure on stdout instead of raising,
        # so also verify that the file really exists.
        if not path or not os.path.exists(path):
            log.warning("Failed to download protein {}".format(code))
            failed += 1
    log.info("Downloaded {0}/{1} molecules".format(attempted - failed,
                                                   attempted))
def __fetch_pdb(self):
    """
    Download the entry named by ``self.__pdb_key`` in PDB format into
    ``mutaviz/atom_files`` and rename the download to ``<key>.pdb``.

    Returns the path of the renamed file.
    """
    downloaded = PDBList().retrieve_pdb_file(
        self.__pdb_key, pdir='mutaviz/atom_files', file_format="pdb")
    renamed = 'mutaviz/atom_files/%s.pdb' % self.__pdb_key
    os.rename(downloaded, renamed)
    return renamed
def from_id(cls, pdb_id):
    """
    Initialize structure by PDB ID (fetches
    structure from RCSB servers)

    Parameters
    ----------
    pdb_id : str
        PDB identifier (e.g. 1hzx)

    Returns
    -------
    PDB
        initialized PDB structure

    Raises
    ------
    ResourceError
        If the download raises ``URLError``
    """
    from urllib.error import URLError
    from Bio.PDB import PDBList

    pdblist = PDBList()

    try:
        # download PDB file to temporary directory; request the PDB format
        # explicitly so the file matches the file_format="pdb" used to parse
        # it (the downloader's default format is not PDB)
        pdb_file = pdblist.retrieve_pdb_file(pdb_id,
                                             pdir=tempdir(),
                                             file_format="pdb")
        return cls.from_file(pdb_file, file_format="pdb")
    except URLError as e:
        raise ResourceError(
            "Could not fetch PDB data for {}".format(pdb_id)
        ) from e
def generate_seq_file(score_file, save_file):
    """
    Write the mutated sequence of every mutation listed in a score file.

    The first column of ``./dataFile/<score_file>`` (tab separated) holds
    names of the form ``<pdb id>_<chain id>_<mutation>``. The first three
    characters of ``<mutation>`` are the wild-type residue, the last three
    the mutant residue, and its digits the 1-based position in the sequence
    built from the chain's peptides. Duplicate names are processed once.

    :param score_file: file name inside ``./dataFile/``
    :param save_file: output path; one ``<name>\\t<mutated sequence>`` line is
        written per mutation
    :raises AssertionError: if the residue at the mutated position differs
        from the wild-type residue
    """
    score_file = './dataFile/' + score_file
    sf = pd.read_csv(score_file, sep='\t')
    mut_chains = sf.iloc[:, 0]

    # pdb id -> list of mutation records, keeping first-seen order.
    mut_dict = dict()
    seen = set()
    for chain in mut_chains:
        info = chain.split('_')
        pdb_id = info[0]
        chain_id = info[1]
        wt_aa = info[2][0:3]
        mu_aa = info[2][-3:]
        mu_pos = int(''.join(ch for ch in info[2] if ch.isdigit()))
        if chain in seen:
            continue
        seen.add(chain)
        mut_dict.setdefault(pdb_id, []).append({
            'chain_id': chain_id,
            'wt_aa': wt_aa,
            'mu_aa': mu_aa,
            'mu_pos': mu_pos,
            'name': chain
        })

    parser = PDBParser()
    seq_builder = PPBuilder()
    pdb_dl_handle = PDBList()
    PDB_DIR = './dataFile/PDB_dl'

    mut_collect = dict()
    for pdb_id, mutations in mut_dict.items():
        pdb_file = PDB_DIR + '/pdb' + pdb_id.lower() + '.ent'
        # download only when the pdb file is not cached yet
        if not os.path.exists(pdb_file):
            pdb_dl_handle.retrieve_pdb_file(pdb_code=pdb_id,
                                            file_format='pdb',
                                            overwrite=False,
                                            pdir=PDB_DIR)
        model = parser.get_structure(pdb_id, pdb_file)[0]
        for mutation in mutations:
            protein_chain = model[mutation['chain_id']]
            sequence = "".join(
                str(pp.get_sequence())
                for pp in seq_builder.build_peptides(protein_chain))
            sequence = sequence.replace('\n', '').replace(' ', '')
            index = mutation['mu_pos'] - 1
            # Explicit raise keeps the check alive under `python -O`, where
            # assert statements are removed.
            if sequence[index] != three_to_one(mutation['wt_aa']):
                raise AssertionError('Wt amino acid failed to match')
            mut_Seq_list = list(sequence)
            mut_Seq_list[index] = three_to_one(mutation['mu_aa'])
            mut_collect[mutation['name']] = ''.join(mut_Seq_list)

    with open(save_file, 'w') as output_hl:
        for k, v in mut_collect.items():
            output_hl.write(k + '\t' + v + '\n')
def getpdbIds(dic, fastafile, fbase1, fbase2, pathToProg):
    """
    Pick one PDB structure per protein and download it into ``./PDB/``.

    For every key of ``dic`` the lines of ``fbase1`` containing that key are
    collected; when there are none, the lines of ``fbase2`` are used instead.
    The first tab-separated field of each matching line is a candidate PDB
    id, ``getBestPdbId`` picks one, and the file is fetched with ``wget``.

    Returns a dictionary whose keys are the protein identifiers and whose
    values are the chosen PDB id, or ``""`` when no candidate was found or
    the wget command produced no output.

    NOTE(review): relies on a ``commands`` module providing ``getoutput``;
    confirm it is available in the target interpreter.
    """

    def fetch(pdb_code):
        # Run wget and hand back whatever it printed.
        return commands.getoutput('wget -c http://www.rcsb.org/pdb/files/' +
                                  pdb_code + '.pdb -O ' + os.getcwd() +
                                  "/PDB/" + pdb_code + '.pdb')

    # Old downloads are deleted
    if os.path.exists("./PDB/"):
        commands.getoutput('rm ./PDB/*.pdb')
    # Create the folder that will contain the pdb files to download
    commands.getoutput('mkdir ./PDB/')

    # file 1
    with open(fbase1, 'r') as fd:
        database1 = fd.readlines()
    # file 2
    with open(fbase2, 'r') as fd:
        database2 = fd.readlines()

    # Output dictionary
    diProt = {}
    pdbl = PDBList()

    # Selecting structures from PDB
    for protId in dic:
        # The 3D structures associated with protId in each file
        hits1 = [line for line in database1 if protId in line]
        hits2 = [line for line in database2 if protId in line]

        if hits1:
            # Associated 3D structures were found in file 1
            candidates = [line.split('\t')[0] for line in hits1]
            pdbId = getBestPdbId(fastafile, protId, candidates, pathToProg)
            diProt[protId] = pdbId if fetch(pdbId) else ""
        elif hits2:
            # Associated 3D structures were found in file 2
            candidates = [line.split('\t')[0] for line in hits2]
            # NOTE(review): pathToProg is not passed here, unlike the file 1
            # branch - confirm whether that is intended.
            pdbId = getBestPdbId(fastafile, protId, candidates)
            diProt[protId] = pdbId if fetch(pdbId) else ""
        else:
            diProt[protId] = ""

    # Everything is renamed from .cif to .pdb if it exists
    commands.getoutput('bash ./Programs/rename.sh cif pdb')
    if os.path.exists("pdb_tmp"):
        commands.getoutput('rm -r ./pdb_tmp')
    return diProt
def download(filelist: list, q: Queue, lock: Lock, cursor: sqlite3.Cursor,
             conn: sqlite3.Connection, dir_name: str):
    """
    Download PDB entries, extract header statistics and store them in the
    ``Structures`` table.

    Progress is tracked in ``status_tmp.txt`` in the current directory: the
    file is emptied at the start, gets one line per stored ID, and is removed
    at the end. Each successfully stored ID is put on ``q``; a final ``None``
    is put on ``q`` when the list is exhausted.

    :param filelist: PDB IDs to process
    :param q: queue receiving each stored ID, then ``None``
    :param lock: lock held around the database insert/commit
    :param cursor: cursor used for the INSERT
    :param conn: connection committed after each successful INSERT
    :param dir_name: base directory; each entry goes to ``<dir_name>/<ID>``
    """
    status_path = 'status_tmp.txt'
    with open(status_path, 'w') as f:
        f.write('')

    insert_sql = ("INSERT INTO Structures (IDPDB, NAME, HEAD, METHOD, "
                  "RESOLUTION, NCOMP, NCHAIN, NRES, MMASS, EC) "
                  "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

    for file in filelist:
        # Lines carry a trailing newline; strip it so the membership test can
        # actually match, and close the handle once the file has been read.
        with open(status_path) as f:
            done = {line.strip() for line in f}
        if file in done:
            continue

        pdbl = PDBList()
        # Use the path reported by the downloader instead of rebuilding the
        # file name, which is stored lowercased.
        pdb_path = pdbl.retrieve_pdb_file(file,
                                          pdir=os.path.join(dir_name, file),
                                          file_format='pdb')
        if not pdb_path or not os.path.exists(pdb_path):
            print("File with ID PDB: {:s} not found!".format(file))
            continue

        parser = PDBParser()
        structure = parser.get_structure(file, pdb_path)
        header = parser.header
        name = header.get('name', '')
        head = header.get('head', '')
        method = header.get('structure_method', '')
        res = header.get('resolution', '')
        # A missing resolution is stored as NULL instead of crashing the
        # worker while formatting.
        resolution = round(float(res), 2) if isinstance(res,
                                                        (int, float)) else None

        ncomp = 0
        nchain = 0
        eclist = []
        for values in header['compound'].values():
            ncomp += 1
            chains = values.get('chain')
            if chains:
                nchain += len(chains.split(','))
            eclist.append(values.get('ec', '') or values.get('ec_number', ''))
        ec = ", ".join(eclist)

        nres = 0
        mmass = 0
        ppb = PPBuilder()
        for pp in ppb.build_peptides(structure):
            seq = pp.get_sequence()
            nres += len(seq)
            seqan = ProteinAnalysis(str(seq))
            mmass += int(seqan.molecular_weight())

        with lock:
            try:
                # Bound parameters: header text may contain quotes.
                cursor.execute(insert_sql,
                               (file, name, head, method, resolution, ncomp,
                                nchain, nres, mmass, ec))
            except sqlite3.DatabaseError as err:
                print("Error: ", err)
                continue
            print("Download Done for ID PDB: {:s}".format(file))
            conn.commit()
            q.put(file)

        with open(status_path, 'at') as f:
            f.write(file + '\n')

    os.remove(status_path)
    q.put(None)
def fetchPDB(name, path):
    """
    Fetch a PDB-format entry and save it as ``<path>/<name>.pdb``.

    :param name: PDB ID of the entry
    :param path: directory used for the download and the renamed file
    """
    from Bio.PDB import PDBList

    pdbname = os.path.join(path, name + '.pdb')
    pdbl = PDBList()
    # Request the PDB format explicitly: the result is renamed to ".pdb",
    # so it must not be downloaded in the downloader's default format.
    filename = pdbl.retrieve_pdb_file(name, pdir=path, file_format='pdb')
    os.rename(filename, pdbname)
    return
def struct_retrieve(self):
    """
    Download, in PDB format, the structure named by ``self.args.id_input``.

    The ID is stored as a string in ``self.pdb_id`` and the file is saved in
    the current directory.
    """
    self.pdb_id = str(self.args.id_input)
    downloader = PDBList()
    downloader.retrieve_pdb_file(self.pdb_id, pdir=".", file_format='pdb')
def struct_retrieve(self):
    """
    Download, in PDB format, the structure named by ``self.pdb_id`` into
    the directory ``self.out_dir``.
    """
    downloader = PDBList()
    downloader.retrieve_pdb_file(self.pdb_id,
                                 pdir=f"{self.out_dir}/",
                                 file_format='pdb')
def download_structure_file(pdb_id: str) -> None:
    """Fetch the structure file for ``pdb_id`` with BioPython's ``PDBList``.

    The downloader's default directory and file format are used.

    :param pdb_id: the protein id in protein data bank
    :type pdb_id: str
    """
    PDBList().retrieve_pdb_file(pdb_id)
def downloadPdb(pdb_list):
    """
    Download missing PDB-format files into ``original_pdbs/``.

    Each identifier is lowercased and cut to its first four characters. An
    entry is skipped when ``original_pdbs/<code>.pdb`` already exists.
    Otherwise it is downloaded to the current directory and moved to
    ``original_pdbs/<code>.pdb``.

    :param pdb_list: iterable of identifiers (PDB code in the first 4 chars)
    """
    # Function-scope import: shutil is only needed by this helper.
    import shutil

    # Standard-library calls replace shell commands so identifiers are never
    # interpreted by a shell and the code works without mkdir/mv.
    os.makedirs("original_pdbs", exist_ok=True)
    for pdb_id in pdb_list:
        pdb = pdb_id.lower()[:4]
        target = os.path.join("original_pdbs", pdb + ".pdb")
        if os.path.isfile(target):
            continue
        pdbl = PDBList()
        name = pdbl.retrieve_pdb_file(pdb, pdir='.', file_format='pdb')
        # Like the former `mv`, a failed download is not fatal.
        if name and os.path.isfile(name):
            shutil.move(name, target)
def load_data(experiment: str, in_file: str, out_dir: str) -> None:
    """
    Download, in PDB format, every molecule listed in ``in_file``.

    Each non-blank line holds one identifier. The part before the first
    ``_`` is used as the PDB ID.

    :param experiment: not used by this function
    :param in_file: path to the text file with one identifier per line
    :param out_dir: directory the files are downloaded to
    """
    print(in_file)
    pdbl = PDBList(server='http://ftp.wwpdb.org', verbose=False)
    with open(in_file, 'r') as id_file:
        # Strip all surrounding whitespace (handles CRLF files) and drop
        # blank lines so no empty ID is ever requested.
        molecule_ids = [line.strip() for line in id_file if line.strip()]
    for molecule_id in tqdm(molecule_ids):
        pdbl.retrieve_pdb_file(molecule_id.split('_')[0],
                               pdir=out_dir,
                               file_format='pdb')
def download_PDB(pdb_ids, pdb_dir='.'):
    """
    Download each entry of ``pdb_ids`` in PDB format into ``pdb_dir``.

    Two debug log records are emitted per entry before it is fetched.
    """
    # One fetcher object handles every protein
    fetcher = PDBList()
    for entry in pdb_ids:
        logging.debug('PDB file which will be downloaded')
        logging.debug(entry)
        # Fetch the protein (pdb file)
        fetcher.retrieve_pdb_file(entry, file_format='pdb', pdir=pdb_dir)
def descargarPDB(pdb):
    """
    Download entry ``pdb`` in PDB format and re-save its parsed structure.

    The download goes to ``./Script/PDB``. The structure parsed from
    ``pdb<lowercased id>.ent`` in that folder is then written with ``PDBIO``
    to ``./Script/PDBStructure/<pdb>.pdb``.
    """
    PDBList().retrieve_pdb_file(pdb, pdir='./Script/PDB', file_format='pdb')

    ent_file = './Script/PDB/pdb' + pdb.lower() + '.ent'
    structure = PDBParser().get_structure(pdb, ent_file)

    writer = PDBIO()
    writer.set_structure(structure)
    pdb_structure_file = './Script/PDBStructure/' + pdb + '.pdb'
    writer.save(pdb_structure_file)
def retrieve_cif(prot_id):
    """
    Download the mmCIF file of ``prot_id`` into ``input_files/cif``.

    An existing file is overwritten.
    """
    client_options = dict(server='ftp://ftp.wwpdb.org',
                          pdb='input_files',
                          obsolete_pdb=None,
                          verbose=True)
    client = PDBList(**client_options)
    client.retrieve_pdb_file(prot_id,
                             pdir="input_files/cif",
                             file_format='mmCif',
                             overwrite=True,
                             obsolete=False)
def download_PDB_structures(pdb_ID):
    """
    Download the mmCIF file of ``pdb_ID`` into ``PDB_files_raw/`` unless a
    non-empty ``.pdb`` or ``.cif`` file for the lowercased ID is already there.

    The folder is created when it does not exist.
    """
    pdb_data_folder = Path("PDB_files_raw/")
    if not os.path.exists(pdb_data_folder):
        os.makedirs(pdb_data_folder)

    pdb_ID = pdb_ID.lower()
    # Nothing to do when either format is already present and non-empty.
    if is_non_zero_file(pdb_data_folder / (pdb_ID + ".pdb")):
        return
    if is_non_zero_file(pdb_data_folder / (pdb_ID + ".cif")):
        return
    PDBList().retrieve_pdb_file(pdb_ID,
                                pdir=pdb_data_folder,
                                file_format='mmCif')
def obtian_seq_wo_seq_file(score_file):
    """
    Build the sequence of every chain named in a score file.

    The first column of ``./dataFile/<score_file>`` (tab separated) is read.
    The first four characters of each entry are the PDB id, the first six the
    chain name, whose last character is the chain id. Missing PDB-format
    files are downloaded to ``./dataFile/PDB_dl``.

    Returns a dict mapping chain name to the concatenated peptide sequence.
    """
    score_file = './dataFile/' + score_file
    sf = pd.read_csv(score_file, sep='\t')

    # pdb id -> set of chain names seen for it
    chains_by_pdb = dict()
    for entry in sf.iloc[:, 0]:
        chains_by_pdb.setdefault(entry[0:4], set()).add(entry[0:6])

    # local cache of downloaded files
    PDB_DIR = './dataFile/PDB_dl'
    if not os.path.exists(PDB_DIR):
        os.mkdir(PDB_DIR)

    # first pass: fetch whatever is not cached yet
    pdb_dl_handle = PDBList()
    for pdb_name in chains_by_pdb:
        if os.path.exists(PDB_DIR + '/pdb' + pdb_name.lower() + '.ent'):
            continue
        pdb_dl_handle.retrieve_pdb_file(pdb_code=pdb_name,
                                        file_format='pdb',
                                        overwrite=False,
                                        pdir=PDB_DIR)

    # second pass: parse each file and assemble the chain sequences
    seq_dict = dict()
    parser = PDBParser()
    seq_builder = PPBuilder()
    for pdb_id, chain_names in chains_by_pdb.items():
        pdb_file = PDB_DIR + '/pdb' + pdb_id.lower() + '.ent'
        model = parser.get_structure(pdb_id, pdb_file)[0]
        for chain in chain_names:
            # the last letter of the chain name is the chain id
            protein_chain = model[chain[-1]]
            pieces = [
                str(pp.get_sequence())
                for pp in seq_builder.build_peptides(protein_chain)
            ]
            # drop newlines and blanks from the joined sequence
            seq_dict[chain] = "".join(pieces).replace('\n',
                                                      '').replace(' ', '')
    return seq_dict
def download_structure_file(self,
                            outdir,
                            file_type=None,
                            load_header_metadata=True,
                            force_rerun=False):
    """Download a structure file from the PDB, specifying an output directory and a file type. Optionally download
    the mmCIF header file and parse data from it to store within this object.

    Args:
        outdir (str): Path to output directory
        file_type (str): ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB
        load_header_metadata (bool): If header metadata should be loaded into this object, fastest with mmtf files
        force_rerun (bool): If structure file should be downloaded even if it already exists

    Raises:
        URLError: If no file is present after the download attempt

    """
    ssbio.utils.double_check_attribute(
        object=self,
        setter=file_type,
        backup_attribute='file_type',
        custom_error_text=
        'Please set file type to be downloaded from the PDB: '
        'pdb, mmCif, xml, or mmtf')

    # XTODO: check if outfile exists using ssbio.utils.force_rerun, pdblist seems to take long if it exists
    # I know why - it's because we're renaming the ent to pdb. need to have mapping from file type to final extension
    # Then check if file exists, if not then download again
    p = PDBList()
    with ssbio.utils.suppress_stdout():
        structure_file = p.retrieve_pdb_file(pdb_code=self.id,
                                             pdir=outdir,
                                             file_format=file_type,
                                             overwrite=force_rerun)
    if not structure_file or not op.exists(structure_file):
        log.debug('{}: {} file not available'.format(self.id, file_type))
        raise URLError('{}.{}: file not available to download'.format(
            self.id, file_type))
    else:
        log.debug('{}: {} file saved'.format(self.id, file_type))

        # Rename .ent files to .pdb
        if file_type == 'pdb':
            # Only touch the file name: a blanket str.replace over the whole
            # path also rewrites directory names and IDs that happen to
            # contain "pdb" or "ent" (e.g. pdb1ent.ent -> 1pdb.pdb).
            folder, base = op.split(structure_file)
            stem, ext = op.splitext(base)
            if stem.startswith('pdb'):
                stem = stem[len('pdb'):]
            if ext == '.ent':
                ext = '.pdb'
            new_name = op.join(folder, stem + ext)
            os.rename(structure_file, new_name)
            structure_file = new_name

        self.load_structure_path(structure_file, file_type)
        if load_header_metadata and file_type == 'mmtf':
            self.update(parse_mmtf_header(structure_file))
        if load_header_metadata and file_type != 'mmtf':
            self.update(
                parse_mmcif_header(
                    download_mmcif_header(pdb_id=self.id,
                                          outdir=outdir,
                                          force_rerun=force_rerun)))
def download_pdb_structure(pdb_code, pdb_file_name, file_path='.'):
    """
    Download a PDB-format structure and rename it to ``pdb_file_name``.

    The file is fetched into ``file_path``, overwriting an existing download.
    Raises ``Exception`` when the downloaded file does not exist afterwards.
    """
    downloader = PDBList()
    downloaded = downloader.retrieve_pdb_file(pdb_code,
                                              file_format='pdb',
                                              pdir=file_path,
                                              overwrite=True)
    if not os.path.exists(downloaded):
        raise Exception("Can not download structure: {0}".format(pdb_code))
    os.rename(downloaded, pdb_file_name)
def get_pdb(pdb_list):
    """
    Download PDB-format files for the given identifiers.

    Only the first four characters of each identifier are used as the PDB
    code. Nothing is returned.

    :param pdb_list: iterable of identifiers (PDB code in the first 4 chars)
    """
    from Bio.PDB import PDBList

    out_dir = "PDB_benchmark_structures\\"
    # A single client serves the whole batch.
    pdbl = PDBList()
    print("Downloading in %s:\n" % out_dir)
    for ids in pdb_list:
        print('%s' % ids[:4])
        pdbl.retrieve_pdb_file(ids[:4], file_format='pdb', pdir=out_dir)