Exemplo n.º 1
0
def get_structure(pdb_data_folder, structure_PDB_ID):
    """
    Load the 3D structure of a molecule, downloading it first if needed.

    ``<ID>.pdb`` and ``<ID>.cif`` inside ``pdb_data_folder`` are checked with
    ``is_non_zero_file``. If neither passes, the entry is fetched with
    ``PDBList`` (default format). When both files are usable, the mmCIF
    structure is the one returned. If the mmCIF file cannot be parsed, the
    PDB-format file is downloaded and parsed instead.

    :param pdb_data_folder: path to folder containing all 3D structures
        (must support the ``/`` operator, e.g. ``pathlib.Path``)
    :param structure_PDB_ID: the structure's PDB ID
    :return: the structure object
    :raises FileNotFoundError: if no usable structure file is available
    """
    pdb_path = pdb_data_folder / (structure_PDB_ID + ".pdb")
    cif_path = pdb_data_folder / (structure_PDB_ID + ".cif")

    if not is_non_zero_file(pdb_path) and not is_non_zero_file(cif_path):
        PDBList().retrieve_pdb_file(structure_PDB_ID, pdir=pdb_data_folder)

    # Start from None so a missing file is reported explicitly instead of
    # surfacing as an UnboundLocalError at the return statement.
    structure = None
    if is_non_zero_file(pdb_path):
        structure = PDBParser().get_structure(structure_PDB_ID, pdb_path)
    if is_non_zero_file(cif_path):
        try:
            structure = MMCIFParser(QUIET=True).get_structure(
                structure_PDB_ID, cif_path)
        except Exception:
            # Best-effort fallback to the PDB format. PDBList stores that
            # download as "pdb<id>.ent", so parse the path it returns rather
            # than guessing "<ID>.pdb".
            fallback_path = PDBList().retrieve_pdb_file(
                structure_PDB_ID, pdir=pdb_data_folder, file_format='pdb')
            structure = PDBParser().get_structure(structure_PDB_ID,
                                                  fallback_path)
    if structure is None:
        raise FileNotFoundError(
            "No structure file available for {}".format(structure_PDB_ID))
    return structure
Exemplo n.º 2
0
 def getPDB(self, ID=None):
     '''
     Download a PDB-format file into the current directory.

     The explicit ``ID`` is used when supplied, otherwise ``self.id``.
     Returns the value of ``PDBList.retrieve_pdb_file``, or ``None`` when
     neither identifier is set.
     '''
     from Bio.PDB import PDBList
     code = ID if ID is not None else self.id
     if code is None:
         return None
     return PDBList().retrieve_pdb_file(code, pdir = '.', file_format = 'pdb')
Exemplo n.º 3
0
def mmcif_Method(pdbId_list, filePath, info_dict):
    """Collect summary information for PDB entries from their mmCIF files.

    For every ID the mmCIF dictionary is loaded (downloading the file into
    ``filePath`` when it cannot be opened) and one value is appended to each
    of the ``'PDBID'``, ``'name'``, ``'PMID'``, ``'resolution'`` and
    ``'Chain Information'`` lists of ``info_dict``.

    :param pdbId_list: iterable of PDB IDs
    :param filePath: directory prefix of the local mmCIF files; it is
        concatenated directly with ``<ID>.cif``, so it should end with a
        path separator
    :param info_dict: dict of lists that is extended in place
    :return: the same ``info_dict`` object
    """
    def loadPDB_CIF_format(pdbId, filePath, pdbl):
        """Return the mmCIF dictionary for ``pdbId``, downloading on demand."""
        pdbFileSavePath = '%s%s.cif' % (filePath, pdbId)
        try:
            return MMCIF2Dict(pdbFileSavePath)
        except IOError:
            # PDBList lower-cases the code when naming the file, so open the
            # path it reports instead of the one built from the caller's ID.
            downloaded = pdbl.retrieve_pdb_file(pdbId,
                                                file_format='mmCif',
                                                pdir=filePath)
            return MMCIF2Dict(downloaded)

    def getChainInfo_CIF(mmcif_dict, resListName, chainListName):
        """Return '<chain>: <count>; ' per chain, not counting '?' residues."""
        dfrm = pd.DataFrame({
            'res': mmcif_dict[resListName],
            'chain_id': mmcif_dict[chainListName]
        })
        return ''.join(
            '%s: %s; ' % (chain, sum(1 for res in df['res'] if res != '?'))
            for chain, df in dfrm.groupby('chain_id'))

    pdbl = PDBList()
    for pdbId in pdbId_list:
        mmcif_dict = loadPDB_CIF_format(pdbId, filePath, pdbl)
        info_dict['PDBID'].append(pdbId)
        info_dict['name'].append(mmcif_dict['_struct.title'])
        info_dict['PMID'].append(
            mmcif_dict['_citation.pdbx_database_id_PubMed'])
        info_dict['resolution'].append(mmcif_dict['_refine.ls_d_res_high'])
        info_dict['Chain Information'].append(
            getChainInfo_CIF(mmcif_dict, '_pdbx_poly_seq_scheme.pdb_mon_id',
                             '_pdbx_poly_seq_scheme.pdb_strand_id'))
    return info_dict
def get_unique(input_df):
    """Download the PDB-format file of every distinct value in ``input_df.CPX``.

    Files are fetched with ``Bio.PDB.PDBList`` into the ``PDB`` directory.
    Nothing is returned.
    """
    from Bio.PDB import PDBList

    codes = input_df.CPX.unique()
    downloader = PDBList()
    for code in codes:
        downloader.retrieve_pdb_file(code, pdir='PDB', file_format="pdb")
Exemplo n.º 5
0
def get_pdb(pdb_list):
    """Download PDB-format files for a list of identifiers.

    Only the first four characters of each identifier are used as the PDB
    code. Identifiers whose download raises ``FileNotFoundError`` are
    reported on stdout and skipped.

    :param pdb_list: iterable of identifiers (strings)
    :return: list of the paths returned by ``PDBList.retrieve_pdb_file``
    """
    import os
    from Bio.PDB import PDBList

    # NOTE(review): the trailing backslash makes this a Windows-style path.
    out_dir = "PDB_benchmark_structures\\"

    filename = []
    not_found = []
    # One downloader is enough; it keeps no per-entry state we rely on.
    pdbl = PDBList()
    print("Downloading in %s:\n" % out_dir)
    for pdbid in pdb_list:
        print('%s' % pdbid[:4])
        try:
            # NOTE(review): this looks for "<id>.pdb", not the "pdb<id>.ent"
            # name PDBList writes -- presumably another step renames files.
            if not os.path.exists("{}{}.pdb".format(out_dir, pdbid)):
                filename.append(
                    pdbl.retrieve_pdb_file(pdbid[:4],
                                           file_format='pdb',
                                           pdir=out_dir))
        except FileNotFoundError:
            not_found.append(pdbid)
            print("(NOTE) {} not found.".format(pdbid))
    return filename
Exemplo n.º 6
0
def build_xtalstable(dbpathstr, sourcedbpathstrs, pdbformat='cif'):
    """Download structures referenced by source databases and record them.

    For each source database, PDB IDs are extracted from the ``pdbids``
    column of ``CAZYSEQDATA`` (four alphanumeric characters followed by
    ``[``), downloaded into an ``Xtals`` folder next to the target database,
    and tracked in the ``XTALS`` table of the target database.

    :param dbpathstr: path of the database holding the ``XTALS`` table
    :param sourcedbpathstrs: iterable of paths to source databases
    :param pdbformat: value stored in the ``pdbformat`` column.
        NOTE(review): the download itself always looks for ``.cif`` files.
    """
    dbpath = Path(dbpathstr)
    today = datetime.date.today()
    todaystr = f'{today.year}-{today.month:02d}-{today.day:02d}'
    conn = seqdbutils.gracefuldbopen(dbpath)
    # The structure folder is named 'Xtals' and lives beside the database.
    xtaldirpath = Path(dbpath).parent / 'Xtals'
    if not xtaldirpath.exists():
        os.mkdir(xtaldirpath)
    c = conn.cursor()
    c.execute(
        '''CREATE TABLE IF NOT EXISTS XTALS (pdbid text, acc text, srcdb text,dldate text, 
              relpath text, pdbformat text, dlsuccess int, obsolete int)''')
    pdbl = PDBList()
    # Raw string: '\[' is an invalid escape sequence in a normal literal.
    cxRE = re.compile(r'([A-Za-z0-9]{4})\[')
    for srcdbpathstr in sourcedbpathstrs:
        srcdbpath = Path(srcdbpathstr)
        srcdbstr = srcdbpath.name
        src_conn = seqdbutils.gracefuldbopen(srcdbpath)
        seqdbutils.check_tables_exist(src_conn, ['CAZYSEQDATA'])
        src_c = src_conn.cursor()
        src_c.execute(
            'SELECT acc,pdbids FROM CAZYSEQDATA WHERE pdbids NOT NULL')
        pdbrows = src_c.fetchall()
        dbpdbs = []
        dbaccs = []
        for pdbrow in pdbrows:
            pdbs = cxRE.findall(pdbrow['pdbids'])
            # Repeat the accession once per PDB ID so both lists stay aligned.
            dbaccs.extend([pdbrow['acc']] * len(pdbs))
            dbpdbs.extend(pdbs)
        pdbl.download_pdb_files(dbpdbs, pdir=xtaldirpath)
        for acc, pdb in zip(dbaccs, dbpdbs):
            # PDBList lower-cases the code in the file name; checking the
            # original-case name would miss the file on case-sensitive
            # file systems.
            rel_pdbpath = xtaldirpath / f'{pdb.lower()}.cif'
            download_success = os.path.exists(rel_pdbpath)
            str_relpath = str(rel_pdbpath)
            c.execute(
                '''SELECT COUNT(*) FROM XTALS WHERE pdbid=(?) AND dlsuccess=(?)''',
                (pdb, 1))
            already_downloaded = c.fetchone()[0]
            if already_downloaded:
                continue
            c.execute(
                '''SELECT COUNT(*) FROM XTALS WHERE pdbid=(?) AND dlsuccess=(?)''',
                (pdb, 0))
            previously_failed = c.fetchone()[0]
            if previously_failed:
                # A row already exists: only flip it to success, never
                # insert a duplicate.
                if download_success:
                    print(f'new download of previously failed {pdb}')
                    c.execute(
                        '''UPDATE XTALS SET dldate = (?), dlsuccess = (?) WHERE pdbid=(?)''',
                        (todaystr, 1, pdb))
                continue
            c.execute(
                '''INSERT INTO XTALS VALUES (?,?,?,?,?,?,?,?)''',
                (pdb, acc, srcdbstr, todaystr, str_relpath, pdbformat,
                 download_success, None))
        conn.commit()
        src_conn.close()
    conn.close()
Exemplo n.º 7
0
def download_pdb(config, pdb_code: str) -> Path:
    """
    Download a PDB-format structure and store it as ``<pdb_code>.pdb``.

    :param config: object with a ``pdb_dir`` attribute; when it is falsy it
        is set to ``Path("/tmp/")``
    :param pdb_code: 4 character PDB accession code.
    :type pdb_code: str
    :return: returns filepath to downloaded structure.
    :rtype: Path
    """
    if not config.pdb_dir:
        config.pdb_dir = Path("/tmp/")

    # Initialise class and download pdb file
    pdbl = PDBList()
    downloaded = pdbl.retrieve_pdb_file(pdb_code,
                                        pdir=config.pdb_dir,
                                        overwrite=True,
                                        file_format="pdb")
    # Rename the .ent file to .pdb. Use the path PDBList reports: it
    # lower-cases the code, so a hand-built "pdb{pdb_code}.ent" misses the
    # file for upper-case codes on case-sensitive file systems.
    target = config.pdb_dir / f"{pdb_code}.pdb"
    os.rename(downloaded, target)

    # Sanity check that the file is present in the directory
    assert any(pdb_code in s for s in os.listdir(config.pdb_dir))
    log.info(f"Downloaded PDB file for: {pdb_code}")
    return target
Exemplo n.º 8
0
def pdb_Method(pdbId_list, filePath, info_dict):
    """Collect summary information for PDB entries from PDB-format files.

    Each entry is parsed from ``filePath + "pdb<id>.ent"`` (downloaded first
    when opening it fails) and one value is appended to each of the
    ``'PDBID'``, ``'name'``, ``'PMID'``, ``'resolution'`` and
    ``'Chain Information'`` lists of ``info_dict``, which is returned.
    """
    def loadPDB_PDB_format(pdbId, filePath, parser, pdbl):
        """Parse the local entry file, fetching it on IOError."""
        entry_path = filePath + "pdb%s.ent" % pdbId.lower()
        try:
            return parser.get_structure(pdbId, entry_path)
        except IOError:
            pdbl.retrieve_pdb_file(pdbId, file_format='pdb', pdir=filePath)
        return parser.get_structure(pdbId, entry_path)

    def getChainInfo_PDB(structure):
        """Return '<chain id>: <amino-acid count>; ' for the first model."""
        pieces = []
        for chain in structure[0].get_chains():
            aa_count = sum(1 for residue in chain if is_aa(residue))
            pieces.append('%s: %s; ' % (chain.id, aa_count))
        return ''.join(pieces)

    parser = PDBParser(PERMISSIVE=1)
    pdbl = PDBList()
    pmid_pattern = re.compile(r'PMID\s+([\d,]+)')
    for pdbId in pdbId_list:
        entry = loadPDB_PDB_format(pdbId, filePath, parser, pdbl)
        header = entry.header
        info_dict['PDBID'].append(pdbId)
        info_dict['name'].append(header['name'])
        info_dict['PMID'].append(pmid_pattern.findall(header['journal']))
        info_dict['resolution'].append(header['resolution'])
        info_dict['Chain Information'].append(getChainInfo_PDB(entry))
    return info_dict
Exemplo n.º 9
0
def transform(tab, pdbrepo='pdbs'):
    """Build a table of chain pairs with local structure paths and ranges.

    :param tab: DataFrame with columns ``A``, ``B``, ``accession``,
        ``chainA``, ``chainB``, ``fromA``, ``toA``, ``fromB`` and ``toB``
    :param pdbrepo: local repository directory handed to ``PDBList``
        (previously ignored in favour of a hard-coded ``'pdbs'``, which is
        still the default)
    :return: DataFrame with columns ``chainA``, ``chainB``, ``PDBid``,
        ``codeA``, ``codeB``, ``path``, ``locA``, ``locB``
    """
    def trans1(p, Id):
        """Copy the downloaded file to '<Id>.pdb' beside it.

        Returns the new path relative to the working directory, or None
        when the copy fails.
        """
        path = os.path.join(os.path.dirname(p), '{}.pdb'.format(Id))
        try:
            copyfile(p, path)
        except Exception:
            # Best effort: report and keep going, but do not swallow
            # KeyboardInterrupt/SystemExit as a bare except would.
            sys.stderr.write(
                'error when downloading structure {}\n'.format(Id))
            return None
        return os.path.relpath(path, os.getcwd())

    def func1(row):
        """Download the row's accession and return its local .pdb path."""
        path = pl.retrieve_pdb_file(row['accession'], file_format='pdb')
        return trans1(path, row['accession'])

    def func2(row):
        """Return the 'from-to' range strings for both partners."""
        return pd.Series([
            row['fromA'] + '-' + row['toA'], row['fromB'] + '-' + row['toB']
        ])

    ret = tab[['A', 'B', 'accession', 'chainA', 'chainB']]
    # Append the path / locA / locB columns.
    pl = PDBList(pdb=pdbrepo)
    pdbs = tab.apply(func1, 1)
    ret = pd.concat([ret, pdbs], axis=1)
    locs = tab.apply(func2, 1)
    ret = pd.concat([ret, locs], axis=1)
    ret.columns = [
        'chainA', 'chainB', 'PDBid', 'codeA', 'codeB', 'path', 'locA', 'locB'
    ]
    return ret
Exemplo n.º 10
0
def retrieve_cif_list():
    """Download the mmCIF file of every ID listed in input_files/protlist.txt.

    The list file is split on whitespace; files are written to
    ``input_files/cif`` and existing files are overwritten.
    """
    server = PDBList(server='ftp://ftp.wwpdb.org', pdb='input_files', obsolete_pdb=None ,verbose=True)
    # Context manager so the handle is closed even if reading fails.
    with open('input_files/protlist.txt', 'r') as pdb_list:
        content = pdb_list.read().split()

    server.download_pdb_files(content,pdir="input_files/cif",file_format='mmCif', overwrite=True,obsolete= False)
def download_pdbs(base_dir, protein_codes):
    """
    Download a set of PDB entries, one directory per protein.

    Each protein is stored under ``base_dir`` in a directory named after its
    upper-cased PDB code. Download failures (``IOError``) are logged and
    counted; a summary is logged at the end.

    :param base_dir: where to download all the proteins.
    :param protein_codes: the PDB codes to download, either an iterable of
        codes or a dict whose values are iterables of codes.
    """
    if isinstance(protein_codes, dict):
        collected = [
            code for key in protein_codes.keys()
            for code in protein_codes[key]
        ]
    else:
        collected = protein_codes
    unique_codes = list(set(collected))

    from Bio.PDB import PDBList
    attempted = len(unique_codes)
    failed = 0
    for code in unique_codes:
        try:
            fetcher = PDBList(pdb=os.path.join(base_dir, code.upper()))
            fetcher.flat_tree = 1
            fetcher.retrieve_pdb_file(pdb_code=code)
        except IOError:
            log.warning("Failed to download protein {}".format(code))
            failed += 1
    log.info("Downloaded {0}/{1} molecules".format(attempted - failed,
                                                   attempted))
Exemplo n.º 12
0
 def __fetch_pdb(self):
     """Download the PDB-format file for this object's key and rename it.

     The file is fetched into ``mutaviz/atom_files`` and renamed to
     ``<key>.pdb`` in the same directory; the new path is returned.
     """
     downloader = PDBList()
     downloaded = downloader.retrieve_pdb_file(self.__pdb_key,
                                               pdir='mutaviz/atom_files',
                                               file_format="pdb")
     renamed = 'mutaviz/atom_files/%s.pdb' % self.__pdb_key
     os.rename(downloaded, renamed)
     return renamed
Exemplo n.º 13
0
    def from_id(cls, pdb_id):
        """
        Initialize structure by PDB ID (fetches
        structure from RCSB servers)

        Parameters
        ----------
        pdb_id : str
            PDB identifier (e.g. 1hzx)

        Returns
        -------
        PDB
            initialized PDB structure

        Raises
        ------
        ResourceError
            If the download fails with a URLError
        """
        from urllib.error import URLError
        from Bio.PDB import PDBList
        pdblist = PDBList()

        try:
            # Download to a temporary directory. Request the PDB format
            # explicitly: the file is parsed as "pdb" below, while the
            # default download format of PDBList is mmCIF.
            pdb_file = pdblist.retrieve_pdb_file(
                pdb_id, pdir=tempdir(), file_format="pdb"
            )
            return cls.from_file(pdb_file, file_format="pdb")
        except URLError as e:
            raise ResourceError(
                "Could not fetch PDB data for {}".format(pdb_id)
            ) from e
Exemplo n.º 14
0
def generate_seq_file(score_file, save_file):
    """Write the mutated sequence of every mutation listed in a score file.

    The first column of ``./dataFile/<score_file>`` (tab-separated) holds
    names of the form ``<pdb>_<chain>_<mutation>``, where the mutation starts
    and ends with three-letter amino-acid codes around a position. For each
    distinct name the chain sequence is built from the PDB-format file
    (downloaded into ``./dataFile/PDB_dl`` when missing), the wild-type
    residue is checked, and the mutated sequence is written to ``save_file``
    as ``<name>\\t<sequence>`` lines.
    """
    score_path = './dataFile/' + score_file
    table = pd.read_csv(score_path, sep='\t')

    # Group the distinct mutations by PDB entry, keeping first-seen order.
    mutations_by_pdb = {}
    seen_names = set()
    for name in table.iloc[:, 0]:
        parts = name.split('_')
        pdb_id = parts[0]
        chain_id = parts[1]
        change = parts[2]
        record = {
            'chain_id': chain_id,
            'wt_aa': change[0:3],
            'mu_aa': change[-3:],
            'mu_pos': int(''.join(ch for ch in change if ch.isdigit())),
            'name': name,
        }
        if name in seen_names:
            continue
        seen_names.add(name)
        mutations_by_pdb.setdefault(pdb_id, []).append(record)
    del seen_names

    parser = PDBParser()
    seq_builder = PPBuilder()
    pdb_dl_handle = PDBList()
    PDB_DIR = './dataFile/PDB_dl'
    mutated = {}
    for pdb_id, records in mutations_by_pdb.items():
        # Download the entry only when its local file is missing.
        if not os.path.exists(PDB_DIR+'/pdb'+pdb_id.lower()+'.ent'):
            pdb_dl_handle.retrieve_pdb_file(pdb_code=pdb_id, file_format='pdb', overwrite=False, pdir=PDB_DIR)
        pdb_file = PDB_DIR+'/pdb'+pdb_id.lower()+'.ent'
        model = parser.get_structure(pdb_id, pdb_file)[0]

        for record in records:
            peptides = seq_builder.build_peptides(model[record['chain_id']])
            sequence = "".join([str(pp.get_sequence()) for pp in peptides])
            sequence = sequence.replace('\n', '').replace(' ', '')
            index = record['mu_pos']-1
            assert sequence[index] == three_to_one(record['wt_aa']), 'Wt amino acid failed to match'
            residues = list(sequence)
            residues[index] = three_to_one(record['mu_aa'])
            mutated[record['name']] = ''.join(residues)

    with open(save_file, 'w') as output_hl:
        for name, seq in mutated.items():
            output_hl.write(name+'\t'+seq+'\n')
Exemplo n.º 15
0
def getpdbIds(dic, fastafile, fbase1, fbase2, pathToProg):
    """Pick one PDB structure per protein and download it into ./PDB/.

    For every protein ID obtained by iterating ``dic``, the lines of
    ``fbase1`` (or, when none match, ``fbase2``) that contain the ID are
    collected; the first tab-separated field of each is a candidate PDB ID.
    ``getBestPdbId`` picks one candidate, which is then fetched with
    ``wget``.

    :param dic: iterable of protein IDs
    :param fastafile: passed through to ``getBestPdbId``
    :param fbase1: path of the first association file
    :param fbase2: path of the second association file, used as fallback
    :param pathToProg: passed to ``getBestPdbId`` (first-file branch only)
    :return: dict mapping each protein ID to the chosen PDB ID, or ``""``

    NOTE(review): ``commands`` is presumably the Python 2 standard-library
    module; it does not exist in Python 3.
    """
    # Delete previously downloaded .pdb files
    if os.path.exists("./PDB/"):
        commands.getoutput('rm  ./PDB/*.pdb')
    # Create the folder that will contain the downloaded pdb files
    commands.getoutput('mkdir ./PDB/')
    # File 1
    with open(fbase1, 'r') as fd:
        database1 = fd.readlines()
    # File 2
    with open(fbase2, 'r') as fd:
        database2 = fd.readlines()
    # Output dictionary
    diProt = {}

    # NOTE(review): pdbl is not used anywhere else in this function.
    pdbl = PDBList()
    '''Selecting structures from PDB'''
    for protId in dic:
        # The 3D structures associated with the protId protein in the first file
        # (substring match on the whole line)
        associate1 = list(filter(lambda line: protId in line, database1))
        # The 3D structures associated with the protId protein in the second file
        associate2 = list(filter(lambda line: protId in line, database2))
        associate_pdb = []

        if associate1 != []:
            # When associated 3D structures are found in file 1
            for line in associate1:
                pdbId = line.split('\t')[0]
                associate_pdb.append(pdbId)
            pdbId = getBestPdbId(fastafile, protId, associate_pdb, pathToProg)
            # The condition is true whenever the command produced any output;
            # NOTE(review): that is not necessarily a successful download.
            if commands.getoutput('wget -c http://www.rcsb.org/pdb/files/' +
                                  pdbId + '.pdb -O ' + os.getcwd() + "/PDB/" +
                                  pdbId + '.pdb'):
                diProt[protId] = pdbId
            else:
                diProt[protId] = ""

        elif associate2 != []:
            # When associated 3D structures are found in file 2
            for line in associate2:
                pdbId = line.split('\t')[0]
                associate_pdb.append(pdbId)
            # NOTE(review): called without pathToProg, unlike the branch
            # above -- confirm against getBestPdbId's signature.
            pdbId = getBestPdbId(fastafile, protId, associate_pdb)
            if commands.getoutput('wget -c http://www.rcsb.org/pdb/files/' +
                                  pdbId + '.pdb -O ' + os.getcwd() + "/PDB/" +
                                  pdbId + '.pdb'):
                diProt[protId] = pdbId
            else:
                diProt[protId] = ""

        else:
            diProt[protId] = ""
    # Run the project's rename script with the arguments "cif pdb"
    commands.getoutput('bash ./Programs/rename.sh cif pdb')
    if os.path.exists("pdb_tmp"):
        commands.getoutput('rm -r ./pdb_tmp')
    # Return a dictionary whose keys are the protein identifiers and whose
    # values are the PDB identifiers associated with those proteins
    return diProt
Exemplo n.º 16
0
def download(filelist: list, q: Queue, lock: Lock, cursor: sqlite3.Cursor, conn: sqlite3.Connection, dir_name: str):
    """Download PDB entries, summarise them and store one row per entry.

    For every ID a PDB-format file is fetched into ``<dir_name>/<ID>``,
    header fields and peptide statistics are gathered, and a row is inserted
    into the ``Structures`` table while holding ``lock``. Successfully
    stored IDs are put on ``q``; ``None`` is put on ``q`` when done.

    :param filelist: PDB IDs to process
    :param q: queue receiving each stored ID and a final ``None``
    :param lock: lock guarding the database access
    :param cursor: cursor used for the INSERT
    :param conn: connection committed after each successful INSERT
    :param dir_name: base directory for the downloads
    """
    status_path = 'status_tmp.txt'
    with open(status_path, 'w') as f:
        f.write('')
    for file in filelist:
        # Lines are stored with a trailing newline; strip it, otherwise the
        # membership test can never match.
        with open(status_path) as f:
            processed = {line.rstrip('\n') for line in f}
        if file in processed:
            continue
        target_dir = os.path.join(dir_name, file)
        pdbl = PDBList()
        pdbl.retrieve_pdb_file(file, pdir=target_dir, file_format='pdb')
        # PDBList lower-cases the code in the file name.
        ent_path = os.path.join(target_dir, 'pdb{:s}.ent'.format(file.lower()))
        if not os.path.exists(ent_path):
            print("File with ID PDB: {:s} not found!".format(file))
            continue
        parser = PDBParser()
        structure = parser.get_structure(file, ent_path)
        name = parser.header.get('name', '')
        head = parser.header.get('head', '')
        method = parser.header.get('structure_method', '')
        res = parser.header.get('resolution', '')
        # Missing resolutions become NULL instead of crashing the format;
        # rounding keeps the two-decimal precision stored before.
        resolution = round(res, 2) if isinstance(res, (int, float)) else None
        ncomp = 0
        nchain = 0
        eclist = []
        for values in parser.header['compound'].values():
            ncomp += 1
            nchain += len(values['chain'].split(','))
            eclist.append(values.get('ec', '') or values.get('ec_number', ''))
        ec = ", ".join(eclist)
        nres = 0
        mmass = 0
        ppb = PPBuilder()
        for pp in ppb.build_peptides(structure):
            seq = pp.get_sequence()
            nres += len(seq)
            seqan = ProteinAnalysis(str(seq))
            mmass += int(seqan.molecular_weight())
        lock.acquire()
        try:
            # Parameterised query: header text may contain quotes that
            # would break (or alter) a string-built statement.
            cursor.execute(
                """INSERT INTO Structures (IDPDB, NAME, HEAD, METHOD, RESOLUTION, NCOMP, NCHAIN,
NRES, MMASS, EC) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                (file, name, head, method, resolution, ncomp, nchain, nres,
                 mmass, ec))
        except sqlite3.DatabaseError as err:
            print("Error: ", err)
            continue
        else:
            print("Download Done for ID PDB: {:s}".format(file))
            conn.commit()
            q.put(file)
        finally:
            lock.release()
            # Recorded whether or not the INSERT succeeded.
            with open(status_path, 'at') as f:
                f.write((file + '\n'))
    os.remove(status_path)
    q.put(None)
Exemplo n.º 17
0
def fetchPDB(name, path):
    """Fetch a PDB-format file and save it as ``<path>/<name>.pdb``.

    :param name: PDB ID
    :param path: directory receiving the file
    """
    from Bio.PDB import PDBList
    pdbname = os.path.join(path, name + '.pdb')
    pdbl = PDBList()
    # Ask for the PDB format explicitly: the default download format of
    # PDBList is mmCIF, which would end up in a file named ".pdb".
    filename = pdbl.retrieve_pdb_file(name, pdir=path, file_format='pdb')
    os.rename(filename, pdbname)
    return
Exemplo n.º 18
0
    def struct_retrieve(self):
        """
            Store ``args.id_input`` as ``self.pdb_id`` (as a string) and
            download that entry in PDB format into the current directory.
        """
        self.pdb_id = str(self.args.id_input)
        PDBList().retrieve_pdb_file(self.pdb_id, file_format='pdb', pdir=".")
Exemplo n.º 19
0
    def struct_retrieve(self):
        """
            Download the entry ``self.pdb_id`` in PDB format into
            ``self.out_dir``.
        """
        target_dir = f"{self.out_dir}/"
        PDBList().retrieve_pdb_file(self.pdb_id,
                                    file_format='pdb',
                                    pdir=target_dir)
Exemplo n.º 20
0
def download_structure_file(pdb_id: str) -> None:
    """Download the structure file of a PDB entry with Biopython's PDBList.

    The download uses PDBList's default format and target directory.

    :param pdb_id: the protein id in the Protein Data Bank
    :type pdb_id: str
    """
    PDBList().retrieve_pdb_file(pdb_id)
Exemplo n.º 21
0
def downloadPdb(pdb_list):
    """Download PDB-format files into ``original_pdbs/<code>.pdb``.

    The code is the lower-cased first four characters of each identifier.
    Entries whose target file already exists are skipped.

    :param pdb_list: iterable of identifiers (strings)
    """
    import shutil

    out_dir = "original_pdbs"
    # Plain library calls instead of shell strings: no quoting problems and
    # no dependency on external mkdir/mv programs.
    os.makedirs(out_dir, exist_ok=True)
    for pdb_id in pdb_list:
        pdb = pdb_id.lower()[:4]
        target = os.path.join(out_dir, pdb + ".pdb")
        if os.path.isfile(target):
            continue
        pdbl = PDBList()
        name = pdbl.retrieve_pdb_file(pdb, pdir='.', file_format='pdb')
        # Stay best-effort like the old `mv`: a failed download is skipped
        # rather than aborting the remaining entries.
        if os.path.isfile(name):
            shutil.move(name, target)
Exemplo n.º 22
0
def load_data(experiment: str, in_file: str, out_dir: str) -> None:
    """Download the PDB-format file of every molecule listed in ``in_file``.

    Each line contributes the text before its first underscore as the PDB
    code. Files are written to ``out_dir``. ``experiment`` is not used by
    this function.
    """
    print(in_file)
    downloader = PDBList(server='http://ftp.wwpdb.org', verbose=False)
    with open(in_file, 'r') as handle:
        lines = handle.readlines()
        for line in tqdm(lines):
            pdb_code = line.strip('\n').split('_')[0]
            downloader.retrieve_pdb_file(pdb_code, pdir=out_dir, file_format='pdb')
Exemplo n.º 23
0
def download_PDB(pdb_ids, pdb_dir='.'):
    """Download the PDB-format file of each ID in ``pdb_ids`` into ``pdb_dir``.

    Every ID is logged at debug level before it is fetched.
    """
    fetcher = PDBList()
    for code in pdb_ids:
        logging.debug('PDB file which will be downloaded')
        logging.debug(code)
        fetcher.retrieve_pdb_file(code, pdir=pdb_dir, file_format='pdb')
Exemplo n.º 24
0
def descargarPDB(pdb):
    """Download a PDB entry and re-save its parsed structure.

    The PDB-format file is fetched into ``./Script/PDB``, parsed, and
    written with ``PDBIO`` to ``./Script/PDBStructure/<pdb>.pdb``.
    """
    PDBList().retrieve_pdb_file(pdb, pdir='./Script/PDB', file_format='pdb')

    source_path = './Script/PDB/pdb' + pdb.lower() + '.ent'
    parsed = PDBParser().get_structure(pdb, source_path)

    writer = PDBIO()
    writer.set_structure(parsed)
    writer.save('./Script/PDBStructure/' + pdb + '.pdb')
Exemplo n.º 25
0
def retrieve_cif(prot_id):
    """Download the mmCIF file of ``prot_id`` into ``input_files/cif``.

    An existing file is overwritten.
    """
    server_options = {
        'server': 'ftp://ftp.wwpdb.org',
        'pdb': 'input_files',
        'obsolete_pdb': None,
        'verbose': True,
    }
    download_options = {
        'pdir': "input_files/cif",
        'file_format': 'mmCif',
        'overwrite': True,
        'obsolete': False,
    }
    PDBList(**server_options).retrieve_pdb_file(prot_id, **download_options)
Exemplo n.º 26
0
def download_PDB_structures(pdb_ID):
    """Download the mmCIF file of ``pdb_ID`` into ``PDB_files_raw/``.

    The ID is lower-cased first. Nothing is downloaded when
    ``is_non_zero_file`` accepts either ``<id>.pdb`` or ``<id>.cif`` in
    that folder. The folder is created when it does not exist.
    """
    folder = Path("PDB_files_raw/")
    if not os.path.exists(folder):
        os.makedirs(folder)

    code = pdb_ID.lower()
    if is_non_zero_file(folder / (code + ".pdb")) or is_non_zero_file(
            folder / (code + ".cif")):
        return
    PDBList().retrieve_pdb_file(code, pdir=folder, file_format='mmCif')
Exemplo n.º 27
0
def obtian_seq_wo_seq_file(score_file):
    """Build the sequence of every chain named in a score file.

    The first column of ``./dataFile/<score_file>`` (tab-separated) holds
    entries whose first four characters are the PDB ID and whose first six
    characters name the chain (the sixth character is the chain ID).
    Missing PDB-format files are downloaded into ``./dataFile/PDB_dl``.

    :return: dict mapping each six-character chain name to its sequence
    """
    score_path = './dataFile/' + score_file
    table = pd.read_csv(score_path, sep='\t')

    # Map each PDB ID to the set of chain names seen for it.
    chains_by_pdb = {}
    for entry in table.iloc[:, 0]:
        chains_by_pdb.setdefault(entry[0:4], set()).add(entry[0:6])

    # Make sure every entry has a local file under ./dataFile/PDB_dl
    PDB_DIR = './dataFile/PDB_dl'
    if not os.path.exists(PDB_DIR):
        os.mkdir(PDB_DIR)
    fetcher = PDBList()
    for pdb_name in chains_by_pdb:
        if not os.path.exists(PDB_DIR + '/pdb' + pdb_name.lower() + '.ent'):
            fetcher.retrieve_pdb_file(pdb_code=pdb_name,
                                      file_format='pdb',
                                      overwrite=False,
                                      pdir=PDB_DIR)

    # Build the sequence of each requested chain from the first model.
    sequences = {}
    parser = PDBParser()
    seq_builder = PPBuilder()
    for pdb_id, chain_names in chains_by_pdb.items():
        ent_path = PDB_DIR + '/pdb' + pdb_id.lower() + '.ent'
        model = parser.get_structure(pdb_id, ent_path)[0]
        for chain_name in chain_names:
            # The last letter of the chain name is the chain ID
            peptides = seq_builder.build_peptides(model[chain_name[-1]])
            joined = "".join([str(pp.get_sequence()) for pp in peptides])
            # Remove newlines and spaces
            sequences[chain_name] = joined.replace('\n', '').replace(' ', '')

    return sequences
Exemplo n.º 28
0
Arquivo: pdb.py Projeto: le-yuan/ssbio
    def download_structure_file(self,
                                outdir,
                                file_type=None,
                                load_header_metadata=True,
                                force_rerun=False):
        """Download a structure file from the PDB, specifying an output directory and a file type. Optionally download
        the mmCIF header file and parse data from it to store within this object.

        Args:
            outdir (str): Path to output directory
            file_type (str): ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB
            load_header_metadata (bool): If header metadata should be loaded into this object, fastest with mmtf files
            force_rerun (bool): If structure file should be downloaded even if it already exists

        Raises:
            URLError: If the file does not exist after the download attempt

        """
        ssbio.utils.double_check_attribute(
            object=self,
            setter=file_type,
            backup_attribute='file_type',
            custom_error_text=
            'Please set file type to be downloaded from the PDB: '
            'pdb, mmCif, xml, or mmtf')

        # TODO: check whether the final file already exists before calling PDBList; because .ent files are renamed
        # to .pdb below, PDBList cannot detect an earlier download by itself
        p = PDBList()
        with ssbio.utils.suppress_stdout():
            structure_file = p.retrieve_pdb_file(pdb_code=self.id,
                                                 pdir=outdir,
                                                 file_format=file_type,
                                                 overwrite=force_rerun)
        if not op.exists(structure_file):
            log.debug('{}: {} file not available'.format(self.id, file_type))
            raise URLError('{}.{}: file not available to download'.format(
                self.id, file_type))

        log.debug('{}: {} file saved'.format(self.id, file_type))

        # Rename "pdb<id>.ent" to "<id>.pdb". Only the file name is edited:
        # replacing 'pdb'/'ent' across the whole path would also mangle
        # directory names and IDs that contain those letters.
        if file_type == 'pdb':
            folder, base = op.split(structure_file)
            if base.startswith('pdb') and base.endswith('.ent'):
                base = base[len('pdb'):-len('.ent')] + '.pdb'
            new_name = op.join(folder, base)
            os.rename(structure_file, new_name)
            structure_file = new_name

        self.load_structure_path(structure_file, file_type)
        if load_header_metadata and file_type == 'mmtf':
            self.update(parse_mmtf_header(structure_file))
        if load_header_metadata and file_type != 'mmtf':
            self.update(
                parse_mmcif_header(
                    download_mmcif_header(pdb_id=self.id,
                                          outdir=outdir,
                                          force_rerun=force_rerun)))
Exemplo n.º 29
0
def download_pdb_structure(pdb_code, pdb_file_name, file_path='.'):
    """Download a PDB-format structure and rename it to ``pdb_file_name``.

    The file is fetched into ``file_path``, overwriting an existing
    download. An ``Exception`` is raised when the downloaded file is missing.
    """
    downloaded = PDBList().retrieve_pdb_file(pdb_code,
                                             file_format='pdb',
                                             pdir=file_path,
                                             overwrite=True)
    if not os.path.exists(downloaded):
        raise Exception("Can not download structure: {0}".format(pdb_code))
    os.rename(downloaded, pdb_file_name)
Exemplo n.º 30
0
def get_pdb(pdb_list):
    """Download PDB-format files for a list of identifiers.

    Only the first four characters of each identifier are used as the PDB
    code; each code is printed before it is fetched.

    :param pdb_list: iterable of identifiers (strings)
    """
    from Bio.PDB import PDBList

    # NOTE(review): the trailing backslash makes this a Windows-style path.
    out_dir = "PDB_benchmark_structures\\"

    # One downloader is enough; it keeps no per-entry state we rely on.
    pdbl = PDBList()
    print("Downloading in %s:\n" % out_dir)
    for ids in pdb_list:
        print('%s' % ids[:4])
        pdbl.retrieve_pdb_file(ids[:4], file_format='pdb', pdir=out_dir)