Пример #1
0
 def build_multifasta(file_name, sequenceList, force=False):
     """
     Writes every sequence of sequenceList into file_name in FASTA format,
     one record per write followed by a newline, and returns a Fasta object
     built on the resulting file.

     @type  file_name: String
     @param file_name: Path of the multi-FASTA file to create

     @type  sequenceList: Iterable
     @param sequenceList: Objects exposing format('FASTA')

     @type  force: Boolean
     @param force: Forwarded to File as its overwrite flag

     @rtype: {Fasta}
     """
     fasta_out = File(file_name, 'w', overwrite=force)
     emit = fasta_out.descriptor.write
     for record in sequenceList:
         emit(record.format('FASTA') + "\n")
     fasta_out.close()
     return Fasta(fasta_file=fasta_out.full)
Пример #2
0
 def build(file_name, sequenceID, sequence, force=False):
     """
     Creates a single-sequence FASTA file and returns a Fasta object built on it.

     @type  file_name: String
     @param file_name: Path of the FASTA file to create

     @type  sequenceID: String
     @param sequenceID: Identifier given to the new Sequence

     @type  sequence: String
     @param sequence: Sequence content given to the new Sequence

     @type  force: Boolean
     @param force: Forwarded to File as its overwrite flag

     @rtype: {Fasta}
     """
     fasta_out = File(file_name, 'w', overwrite=force)
     record = Sequence(seqID=sequenceID, sequence=sequence)
     # No trailing newline is appended here.
     fasta_out.descriptor.write(record.format('FASTA'))
     fasta_out.close()
     return Fasta(fasta_file=fasta_out.full)
Пример #3
0
 def format2file(self, filename, extension='pdb', center=False):
     """
     Writes the object to '<filename>.<extension>' in the format selected by
     the extension: 'pdb' uses self.pdb_format, 'js' uses self.js_format.

     @type  filename: String
     @param filename: Output file name without extension

     @type  extension: String
     @param extension: Either 'pdb' or 'js'

     @type  center: Boolean
     @param center: Forwarded to the formatting method

     @raise AttributeError if the extension is not accepted
     """
     if extension != 'pdb' and extension != 'js':
         raise AttributeError('Not accepted extension')

     target = File(filename + '.' + extension, 'w')
     if extension == 'js':
         content = self.js_format(center=center)
     else:
         content = self.pdb_format(center=center)
     target.write(content)
     target.close()
Пример #4
0
    def pdb_file(self, value):
        """
        Sets the PDB file, but only if none has been assigned yet.

        @type  value: String or File
        @param value: A File object (stored as is) or a file name, which is
                      wrapped in a File opened for reading

        @raise AttributeError if a PDB file is already assigned
        """
        if self._pdb_file is not None:
            raise AttributeError(
                "The PDB object is loaded from file {0}. To load the new file {1} create a new PDB object"
                .format(self._pdb_file.full, value))

        if isinstance(value, File):
            self._pdb_file = value
        else:
            # File receives its open mode through 'action' everywhere else;
            # the former 'type' keyword did not match that interface.
            self._pdb_file = File(file_name=value, action='r')
Пример #5
0
    def __init__(self, cif_file):
        """
        Opens cif_file for reading, initialises every field to an empty
        value and then runs _parse() and _decompose_formula().

        @type  cif_file: String
        @param cif_file: Name of the file handed to File
        """
        self._file = File(file_name=cif_file, action='r')
        self.__name__ = 'databases.PDBeChem'  # This must be included in every class for the SBIglobals.alert()

        # Scalar fields start unset.
        for field in ('_id', '_name', '_type', '_formula', '_parent',
                      '_weight', '_fcharge', '_code1l'):
            setattr(self, field, None)
        self._flformula = {}

        self._parse()
        self._decompose_formula()
Пример #6
0
    def __init__(self, database, search_type = 'prot'):
        """
        Configures a blast executable against the given database.

        @type  database: String
        @param database: Path of the database; made absolute and validated
                         through _check_database

        @type  search_type: String
        @param search_type: Either 'prot' or 'nucl'

        @raise BE(-10) if search_type is not accepted
        """
        # Search type check
        if search_type not in {'prot', 'nucl'}:
            raise BE(-10)
        self._search_type = search_type

        # Blast executable configuration, read from the [blast] section
        self._configurator = ConfigParser.RawConfigParser(allow_no_value=True)
        self._configurator.read(os.getenv('SBI_CONFIG_FILE', default_configuration_file))
        blast_option = self._configurator.get
        self._exe = Executable(executable    = blast_option('blast', 'executable'),
                               path          = blast_option('blast', 'path'),
                               variable_path = blast_option('blast', 'variable_path'))

        # Database configuration; the index file is optional
        self._database = self._check_database(os.path.abspath(database))
        index_name = self._database.file.full + ".idx"
        if os.path.isfile(index_name):
            self._idx = File(file_name = index_name, action = 'r')
        else:
            self._idx = None

        # Fixed blast parameters
        self._exe.add_attribute(self._database.file.full, '-db')
        self._exe.add_attribute('5', '-outfmt')
        self._exe.add_parameter('-lcase_masking')

        SBIglobals.alert('debug', self, 'New Blast Executable created.\nBlast executable at {0}\n'.format(self._exe.full_executable))

        # Default run options
        self._selfHit     = False
        self._hitIDformat = 'single'
        self._overwritte  = False
        self._clean_files = True
Пример #7
0
    def make_PDBseq(self, log_file, resolution_threshold=None):
        """
        Builds the PDBseq database: 'PDBseq.fa' and 'PDBseq.fa.idx' are written
        to self.PDBseq (or the current directory when it is None) from every
        file in self.localPDBs. Sequences left empty once 'X'/'x' are removed
        are skipped.

        @type  log_file: String
        @param log_file: Name of the log file to (over)write

        @type  resolution_threshold: any
        @param resolution_threshold: Currently ignored. The resolution-filtered
                                     output is disabled and used to raise
                                     NameError; a warning is issued instead.

        @raise NameError if no local PDB database is defined
        """
        if not self.has_local:
            raise NameError(
                'A local PDB database must be defined to do create a PDBseq database.'
            )
        outdir = self.PDBseq if self.PDBseq is not None else os.curdir

        if resolution_threshold is not None:
            import warnings
            warnings.warn('resolution_threshold is ignored by make_PDBseq: '
                          'the resolution-filtered output is disabled.')

        # Create the directory that is actually written to (PDBseq may be None).
        Path.mkdir(outdir)
        fasta_file = File(file_name=os.path.join(outdir, 'PDBseq.fa'),
                          action='w',
                          overwrite=True)
        idx_file = File(file_name=os.path.join(outdir, 'PDBseq.fa.idx'),
                        action='w',
                        overwrite=True)
        log_file = File(file_name=log_file, action='w', overwrite=True)

        try:
            fasta_fd = fasta_file.descriptor
            idx_fd = idx_file.descriptor
            log_idx = log_file.descriptor

            for pdb_file in self.localPDBs:
                log_idx.write("Reading File: {0}\n".format(pdb_file))
                newPDB = PDB(pdb_file=pdb_file, dehydrate=True)
                fasta_idx = newPDB.FASTA_IDX(nucleotide=False)
                fastas = fasta_idx['FASTA']
                indexes = fasta_idx['IDX']
                if len(fastas) != len(indexes):
                    log_idx.write(
                        'ERROR!!!!! Number of fastas and indexes are different for pdb {0}!!\n'
                        .format(newPDB.id))
                if len(fastas) > 0:
                    log_idx.write('\tPrinting FASTA and IDX...\n')
                else:
                    log_idx.write('\tProblably just a nucleotide PDB...\n')
                for c, fasta_entry in enumerate(fastas):
                    # Second line of the FASTA record is the sequence itself.
                    sequence = fasta_entry.split('\n')[1]
                    sequence = sequence.replace('X', '').replace('x', '')
                    if len(sequence) > 0:
                        fasta_fd.write(fasta_entry + "\n")
                        idx_fd.write(indexes[c] + "\n")
                del (newPDB)
        finally:
            # Close every output, the log included, even if a PDB fails.
            fasta_file.close()
            idx_file.close()
            log_file.close()
Пример #8
0
    def _process(self):
        """
        Parses the GO source file (self._gfile inside self.local) into GOterm
        objects, completes the parents of each term through _search_parents and
        writes one term per line to self._gofile.

        Parsing stops at the first '[Typedef]' stanza. The last term read is
        stored whether or not that stanza exists.
        """
        go_dic = {}
        go = None
        parseFile = File(os.path.join(self.local, self._gfile), 'r')
        try:
            for line in parseFile.descriptor:
                # Escape single quotes (' -> \')
                line = line.replace("'", "\\'")
                if line.startswith('[Term]'):
                    # A new stanza starts: store the previous term, if any
                    if go is not None:
                        go_dic[go.id] = go
                elif line.startswith('id:'):
                    go = GOterm(id=line.split()[1].strip())
                elif line.startswith('name:'):
                    go.name = " ".join(line.split()[1:]).strip()
                elif line.startswith('namespace:'):
                    go.namespace = line.split()[1].strip()
                elif line.startswith('alt_id:'):
                    go.alt_id.append(line.split()[1].strip())
                elif line.startswith('is_obsolete:'):
                    go.obsolete = True
                elif line.startswith('is_a:'):
                    go.parents.add(line.split()[1].strip())
                elif line.startswith('relationship:'):
                    fields = line.split()
                    go.relations.append((fields[1].strip(), fields[2].strip()))
                elif line.startswith('[Typedef]'):
                    break
        finally:
            parseFile.close()

        # Store the last term, whether the loop ended on '[Typedef]' or on EOF
        if go is not None:
            go_dic[go.id] = go

        for go in go_dic:
            go_dic[go].parents = self._search_parents(go_dic, go)

        goFile = File(self._gofile, 'w', True)
        try:
            for go in go_dic:
                # Each term is also listed among its own parents
                go_dic[go].parents.add(go)
                goFile.write(str(go_dic[go]) + "\n")
        finally:
            goFile.close()
Пример #9
0
    def __init__(self,
                 pdb_file=None,
                 dehydrate=False,
                 header=False,
                 onlyheader=False,
                 biomolecule=False):
        """
        Builds a PDB object, reading it from pdb_file when one is given.

        @type  pdb_file: String
        @param pdb_file: PDB formated file to read; None gives an empty object

        @type  dehydrate: Boolean
        @param dehydrate: If True, dehydrate() is called once the object is built

        @type  header: Boolean
        @param header: Forwarded to _read_PDB_file; forced to True when
                       onlyheader or biomolecule is set

        @type  onlyheader: Boolean
        @param onlyheader: Forwarded to _read_PDB_file

        @type  biomolecule: Boolean
        @param biomolecule: Forwarded to _read_PDB_file

        @raise IOError if pdb_file does not exist and it is not an empty object
        """
        header = True if (biomolecule or onlyheader) else header

        self._pdb_file = pdb_file

        # Chain containers
        self._chains = []
        self._NMR = False
        self._NMR_chains = []
        self._chain_id = set()

        # -1 -> original; 0 -> symmetry; >0 -> biomolecule
        self._biomol_id = -1

        self._header = None
        self._has_prot = False
        self._has_nucl = False
        self._COMPND = None

        if self.pdb_file is not None:
            # Wrap the given name in a File before parsing it
            self._pdb_file = File(file_name=self._pdb_file, action='r')
            self._read_PDB_file(header=header,
                                onlyheader=onlyheader,
                                biomolecule=biomolecule)

        if dehydrate:
            self.dehydrate()
Пример #10
0
 def _process(self):
     """
     Parses every '*.xml' file under '<self._local>/pdbtm/database/' into a TM
     object and writes those with at least one chain, one per line, to
     self._pdbtmfile.

     Lines whose attributes do not match the expected pattern are skipped
     instead of raising AttributeError. NUM_TM accepts any number of digits,
     so chains with ten or more transmembrane segments are no longer dropped.
     """
     side_re = re.compile(r'Side1="(\S+)"')
     new_chain_re = re.compile(r'NEW_CHAINID="(\S)"')
     chain_re = re.compile(r'CHAINID="(\S)" NUM_TM="(\d+)" TYPE="(\S+)"')
     region_re = re.compile(
         r'pdb_beg="(-*\d+\w*)"[\s\S]+pdb_end="(-*\d+\w*)"\s+type="(\w)"')

     tmoFile = File(self._pdbtmfile, 'w', True)
     try:
         for xmlfile in Path.list_files(os.path.join(self._local, 'pdbtm/database/'), '*.xml'):
             xmldata = TM(pdb=os.path.splitext(os.path.split(xmlfile)[1])[0].upper())
             skip_chains = set()
             cdata = None
             # 'read' is True only while inside an accepted <CHAIN> element
             read = False
             with open(xmlfile) as fdxml:
                 for line in fdxml:
                     if line.startswith('    <TMRES>'):
                         xmldata.tmres = line
                     elif line.startswith('    <TMTYPE'):
                         xmldata.tmtype = line
                     elif line.startswith('    <PDBKWRES'):
                         xmldata.kwres = line
                     elif line.startswith('  <SIDEDEFINITION'):
                         m = side_re.search(line)
                         if m:
                             xmldata.side = m.group(1)
                     elif line.startswith('      <APPLY_TO_CHAIN'):
                         m = new_chain_re.search(line)
                         if m:
                             skip_chains.add(m.group(1))
                     elif line.startswith('  <CHAIN '):
                         m = chain_re.search(line)
                         if m:
                             chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                             if chain not in skip_chains:
                                 cdata = (chain, num, tmtype)
                                 xmldata.set_chain(cdata)
                                 read = True
                     elif line.startswith('    <REGION ') and read:
                         m = region_re.search(line)
                         if m:
                             xmldata.set_chain(cdata, (m.group(1), m.group(2), m.group(3)))
                     elif line.startswith('  </CHAIN>'):
                         read = False
             if len(xmldata.chains) > 0:
                 tmoFile.write(str(xmldata) + "\n")
     finally:
         tmoFile.close()
Пример #11
0
    def write(self, output_file=None, format='PDB', force=False, clean=False):
        """
        Writes the object to a file in the requested format.
        Only 'PDB' triggers a write; any other format does nothing after the
        output File object has been created.

        @type  output_file: String
        @param output_file: File to write

        @type  format: String
        @param format: Format of the file to print

        @type  force: Boolean
        @param force: Forwarded to File as its overwrite flag

        @type  clean: Boolean
        @param clean: Forwarded to _write_PDB_file
        """
        target = File(file_name=output_file, action='w', overwrite=force)
        if format != 'PDB':
            return
        self._write_PDB_file(pdb_file=target, clean=clean)
Пример #12
0
    def _process(self):
        """
        Joins the results of _parse_enzclass() and _parse_enzymedat(), sorts
        them and writes the repr of each entry, one per line, to self._enzfile.
        """
        ordered = sorted(self._parse_enzclass() + self._parse_enzymedat())

        out_file = File(self._enzfile, 'w', True)
        for enzyme in ordered:
            out_file.write(repr(enzyme) + "\n")
        out_file.close()
Пример #13
0
    def __init__(self, fasta_file):
        """
        Builds a Fasta object on a file given by name or as a File object.

        @type  fasta_file: String or File
        @param fasta_file: A file name is wrapped in a File opened for reading;
                           a File is stored as is with its action set to 'r'

        @raise AttributeError if fasta_file is neither a string nor a File
        """
        if isinstance(fasta_file, basestring):
            source = File(file_name=fasta_file, action='r')
            self._file = source
        elif isinstance(fasta_file, File):
            self._file = fasta_file
            self._file.action = 'r'
        else:
            raise AttributeError('Check the input of the Fasta object')

        self._is_multifasta = self._check_multifasta()

        # Sequence containers, empty at construction
        self._sequences = []
        self._seqfinder = {}
Пример #14
0
    def _process(self):
        """
        Processes the targets, builds the drugs from them and writes the repr
        of each drug, one per line, to self._drugfile.
        """
        drugs = self._process_drugs(self._process_targets())

        out_file = File(self._drugfile, 'w', True)
        for drug in drugs:
            out_file.write(repr(drug) + "\n")
        out_file.close()
Пример #15
0
    def get_PDBeChem(self, chemID):
        """
        Returns the path of the file describing the given chemical component.
        The local database is searched first; if the component is not there
        (or there is no local database) the file is downloaded into the current
        directory.

        @type  chemID: String
        @param chemID: Identifier of the component (case insensitive)

        @rtype: String with the file path, or False if the download fails
        """
        if self.has_local:
            for chem_file in self.localPDBeChems:
                newfile = File(file_name=chem_file, action='r')
                if newfile.prefix.upper() == chemID.upper():
                    return chem_file

        #If we do not find it in local (or we do not have a local) we search it on the FTP
        chem_file = chemID.upper() + '.cif'
        source = PDBeChemftp['single'] + chem_file
        try:
            urllib.urlretrieve(source, chem_file)
        except Exception:
            # Best effort: a failed download is reported as False, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return False
        return os.path.abspath(chem_file)
Пример #16
0
    def get_PDBs(self, pdbIDset):
        """
        Iterates over the files of the requested PDB identifiers.
        With a local database, its files are filtered by identifier; otherwise
        each identifier is resolved through get_PDB.

        @type  pdbIDset: Iterable of String (a single String is also accepted)
        @param pdbIDset: PDB identifiers to retrieve (case insensitive)

        @rtype: String (iterator)
        """
        if isinstance(pdbIDset, str):
            warnings.warn(
                'For single PDB search the get_PDB function is recomended.')
            yield self.get_PDB(pdbIDset)
            # A single identifier is fully served by get_PDB; do not fall
            # through to the set-based search with a plain string.
            return

        pdbIDset = set([x.upper() for x in pdbIDset])

        if self.has_local:
            for pdb_file in self.localPDBs:
                newfile = File(file_name=pdb_file, action='r')
                if newfile.prefix.lstrip('pdb').upper() in pdbIDset:
                    yield pdb_file
        else:
            for pdbID in pdbIDset:
                yield self.get_PDB(pdbID)
Пример #17
0
    def get_PDB(self, pdbID):
        """
        Returns the path of the file of the given PDB identifier.
        The local database is searched first, in the sub-directory named after
        characters 2-3 of the identifier; if the file is not there (or there is
        no local database) it is downloaded into the current directory.

        @type  pdbID: String
        @param pdbID: PDB identifier (case insensitive)

        @rtype: String with the file path, or False if the download fails
        """
        if self.has_local:
            rootdir = os.path.join(self.local, pdbID.lower()[1:3])
            for pdb_file in Path.list_files(root=rootdir, pattern='*.ent.gz'):
                newfile = File(file_name=pdb_file, action='r')
                if newfile.prefix.lstrip('pdb').upper() == pdbID.upper():
                    return pdb_file

        #If we do not find it in local (or we do not have a local) we search it on the FTP
        pdb_file = 'pdb' + pdbID.lower() + '.ent.gz'
        source = 'ftp://' + PDBftp['address'] + os.path.join(
            PDBftp['structures'], pdbID[1:3].lower(), pdb_file)
        try:
            urllib.urlretrieve(source, pdb_file)
        except Exception:
            # Best effort: a failed download is reported as False, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return False
        return os.path.abspath(pdb_file)
Пример #18
0
    def get_PDBeChems(self, chemIDset):
        """
        Iterates over the files of the requested chemical components.
        With a local database, its files are filtered by identifier; otherwise
        each identifier is resolved through get_PDBeChem.

        @type  chemIDset: Iterable of String (a single String is also accepted)
        @param chemIDset: Component identifiers to retrieve (case insensitive)

        @rtype: String (iterator)
        """
        if isinstance(chemIDset, str):
            warnings.warn(
                'For single PDBeChem search the get_PDBeChem function is recomended.'
            )
            yield self.get_PDBeChem(chemIDset)
            # A single identifier is fully served by get_PDBeChem.
            return

        chemIDset = set([x.upper() for x in chemIDset])

        if self.has_local:
            for chem_file in self.localPDBeChems:
                newfile = File(file_name=chem_file, action='r')
                # Same matching rule as get_PDBeChem: the file prefix is the
                # component identifier (there is no 'pdb' prefix to strip).
                if newfile.prefix.upper() in chemIDset:
                    yield chem_file
        else:
            for chemID in chemIDset:
                yield self.get_PDBeChem(chemID)
Пример #19
0
    def get_resolutions(self):
        """
        Returns the resolution of every known PDB, keyed by PDB identifier.
        Values come from the resolution index retrieved from the PDB FTP;
        local files missing from that index are resolved by reading their
        header.

        @rtype: Dictionary
        """
        # resolutions (-1) are for methods that do not define resolution
        connection = ftplib.FTP(PDBftp['address'])
        connection.login()
        connection.cwd(PDBftp['derived'])
        index_lines = []
        connection.retrlines('RETR ' + PDBftp['resolution'], index_lines.append)
        connection.quit()

        SBIglobals.alert('debug', self,
                         'Retrieving resolution data from PDB FTP...')

        resolutions = {}
        # Data rows start after the first line beginning with '-'
        in_table = False
        for raw_line in index_lines:
            if raw_line.startswith('-'):
                in_table = True
                continue
            if not in_table or not raw_line.strip():
                continue
            fields = [piece.strip() for piece in raw_line.split(';')]
            if fields[1]:
                SBIglobals.alert(
                    'debug', self,
                    '\tResolution for {0[0]} is {0[1]}...'.format(fields))
                resolutions[fields[0]] = fields[1]

        #rsync is accumulative, we might have structures that are not in the residu.idx anymore.. must check
        for pdb_file in self.localPDBs:
            local_file = File(file_name=pdb_file, action='r')
            pdbid = local_file.prefix.lstrip('pdb').upper()
            if pdbid in resolutions:
                continue
            pdbobj = PDB(pdb_file=pdb_file, header=True, onlyheader=True)
            SBIglobals.alert(
                'debug', self,
                '\tGrabbing Resolution for {0} is {1}...'.format(
                    pdbid, pdbobj.header.resolution))
            resolutions[pdbid] = pdbobj.header.resolution

        return resolutions
Пример #20
0
class PDBeChem(object):
    """
    Chemical component read from a file of '_chem_comp.' records.
    The file is parsed on construction and closed afterwards.
    """
    def __init__(self, cif_file):
        """
        @type  cif_file: String
        @param cif_file: Name of the file to read
        """
        self._file = File(file_name=cif_file, action='r')
        self.__name__ = 'databases.PDBeChem'  # This must be included in every class for the SBIglobals.alert()

        self._id = None
        self._name = None
        self._type = None
        self._formula = None
        self._parent = None
        self._weight = None
        self._fcharge = None
        self._code1l = None
        self._flformula = {}

        self._parse()
        self._decompose_formula()

    #
    # ATTRIBUTES
    #
    @property
    def id(self):
        return self._id

    @property
    def name(self):
        return self._name

    @property
    def type(self):
        return self._type

    @property
    def formula(self):
        return self._formula

    @property
    def full_formula(self):
        """
        Formula split by element: {element: count}
        @rtype: Dictionary
        """
        return self._flformula

    @property
    def parent(self):
        """
        @rtype: Set of String, or None
        """
        return self._parent

    @property
    def weight(self):
        return self._weight

    @property
    def formal_charge(self):
        return self._fcharge

    @property
    def code1(self):
        return self._code1l

    @property
    def code3(self):
        """
        Same value as id
        """
        return self._id

    #
    # PRIVATE METHODS
    #
    def _parse(self):
        """
        Reads the '_chem_comp.' records of the file into the object fields.
        The value of a record is taken from column 35 onwards (after the
        '_chem_comp.' prefix is removed); a value of '?' is stored as None.
        The file is closed when parsing ends, also on error.
        """
        try:
            for line in self._file.descriptor:
                if line.startswith('_chem_comp.'):
                    line = line.replace('_chem_comp.', '')
                    value = line[35:].strip().strip('"')
                    value = value.replace(' (NON-PREFERRED NAME)', '')
                    value = value if value != '?' else None
                    if line.startswith('id'): self._id = value
                    if line.startswith('pdbx_type'): self._type = value
                    if line.startswith('formula '): self._formula = value
                    if line.startswith('formula_weight'): self._weight = value
                    if line.startswith('pdbx_formal_charge'): self._fcharge = value
                    if line.startswith('one_letter_code'): self._code1l = value
                    if line.startswith('name'):
                        # An undefined name ('?') stays None instead of
                        # failing on None.upper()
                        self._name = value.upper() if value is not None else None
                    if line.startswith('mon_nstd_parent_comp_id'):
                        self._parent = set([x.strip() for x in value.split(',')
                                            ]) if value is not None else None
                # A name left empty on its record continues on a ';' line
                if line.startswith(';') and self._name == '':
                    self._name += line.strip().lstrip(';').upper()
        finally:
            self._file.close()

    def _decompose_formula(self):
        """
        Splits the formula into {element: count} for every symbol found in
        element_dic. Tokens that do not start with a non-digit are skipped.
        """
        if self.formula is not None:
            atregex = re.compile(r'(\D+)(\d*)')
            for atom in self.formula.split():
                m = atregex.search(atom)
                if m is None:
                    continue
                if m.group(1) in element_dic:
                    # NOTE(review): explicit counts are kept as strings while
                    # the implicit count is the integer 1 - confirm callers.
                    self._flformula[m.group(1)] = m.group(
                        2) if m.group(2) != '' else 1

    #
    # OVERWRITE INHERITED FUNCTIONS
    #
    def __str__(self):
        if self.code1 is not None and self.parent is not None:
            return "[{0.id} - {0.code1} from {0.parent}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(
                self)
        elif self.code1 is not None:
            return "[{0.id} - {0.code1}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(
                self)
        elif self.parent is not None:
            return "[{0.id} from {0.parent}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(
                self)
        else:
            return "[{0.id}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(
                self)
Пример #21
0
 def localTM(self):
     """
     Iterates over the lines of the file self._pdbtmfile.
     The file is closed when the iteration ends or the generator is discarded.
     @rtype: String (iterator)
     """
     tmoFile = File(self._pdbtmfile, 'r')
     try:
         for tm_line in tmoFile.descriptor:
             yield tm_line
     finally:
         tmoFile.close()
Пример #22
0
 def localEnzymes(self):
     """
     Iterates over the lines of the file self._enzfile.
     The file is closed when the iteration ends or the generator is discarded.
     @rtype: String (iterator)
     """
     enzFile = File(self._enzfile, 'r')
     try:
         for enz_line in enzFile.descriptor:
             yield enz_line
     finally:
         enzFile.close()
Пример #23
0
 def localTrembls(self):
     """
     Iterates over the lines of the file self._trbfile.
     The file is closed when the iteration ends or the generator is discarded.
     @rtype: String (iterator)
     """
     tblFile = File(self._trbfile, 'r')
     try:
         for uni_line in tblFile.descriptor:
             yield uni_line
     finally:
         tblFile.close()
Пример #24
0
 def descriptions(self):
     """
     Iterates over the lines of the file self._desc, skipping those that
     start with '#'.
     The file is closed when the iteration ends or the generator is discarded.
     @rtype: String (iterator)
     """
     dscFile = File(self._desc, 'r')
     try:
         for dsc_line in dscFile.descriptor:
             if not dsc_line.startswith('#'):
                 yield dsc_line
     finally:
         dscFile.close()
Пример #25
0
 def localGOs(self):
     """
     Iterates over the lines of the file self._gofile.
     The file is closed when the iteration ends or the generator is discarded.
     @rtype: String (iterator)
     """
     goFile = File(self._gofile, 'r')
     try:
         for go_line in goFile.descriptor:
             yield go_line
     finally:
         goFile.close()
Пример #26
0
 def relations(self):
     """
     Iterates over the lines of the file self._rel, skipping those that
     start with '#'.
     The file is closed when the iteration ends or the generator is discarded.
     @rtype: String (iterator)
     """
     relFile = File(self._rel, 'r')
     try:
         for rel_line in relFile.descriptor:
             if not rel_line.startswith('#'):
                 yield rel_line
     finally:
         relFile.close()
Пример #27
0
class PDB(StorableObject):
    """
    A {PDB} is a collection of {Chain}
    """
    def __init__(self,
                 pdb_file=None,
                 dehydrate=False,
                 header=False,
                 onlyheader=False,
                 biomolecule=False):
        """
        Builds a PDB object, reading it from pdb_file when one is given.

        @type  pdb_file: String
        @param pdb_file: PDB formated file to read; None gives an empty object

        @type  dehydrate: Boolean
        @param dehydrate: If True, dehydrate() is called once the object is built

        @type  header: Boolean
        @param header: Forwarded to _read_PDB_file; forced to True when
                       onlyheader or biomolecule is set

        @type  onlyheader: Boolean
        @param onlyheader: Forwarded to _read_PDB_file

        @type  biomolecule: Boolean
        @param biomolecule: Forwarded to _read_PDB_file

        @raise IOError if pdb_file does not exist and it is not an empty object
        """
        header = True if (biomolecule or onlyheader) else header

        self._pdb_file = pdb_file

        # Chain containers
        self._chains = []
        self._NMR = False
        self._NMR_chains = []
        self._chain_id = set()

        # -1 -> original; 0 -> symmetry; >0 -> biomolecule
        self._biomol_id = -1

        self._header = None
        self._has_prot = False
        self._has_nucl = False
        self._COMPND = None

        if self.pdb_file is not None:
            # Wrap the given name in a File before parsing it
            self._pdb_file = File(file_name=self._pdb_file, action='r')
            self._read_PDB_file(header=header,
                                onlyheader=onlyheader,
                                biomolecule=biomolecule)

        if dehydrate:
            self.dehydrate()

    #
    # ATTRIBUTES
    #
    @property
    def pdb_file(self):
        """
        File the PDB was loaded from, or None for an empty object.
        The constructor wraps a given file name in a File object.
        @rtype: File
        """
        return self._pdb_file

    @pdb_file.setter
    def pdb_file(self, value):
        """
        Sets the PDB file, but only if none has been assigned yet.

        @type  value: String or File
        @param value: A File object (stored as is) or a file name, which is
                      wrapped in a File opened for reading

        @raise AttributeError if a PDB file is already assigned
        """
        if self._pdb_file is not None:
            raise AttributeError(
                "The PDB object is loaded from file {0}. To load the new file {1} create a new PDB object"
                .format(self._pdb_file.full, value))

        if isinstance(value, File):
            self._pdb_file = value
        else:
            # File receives its open mode through 'action' (as in __init__);
            # the former 'type' keyword did not match that interface.
            self._pdb_file = File(file_name=value, action='r')

    @property
    def chain_identifiers(self):
        """
        Identifiers of the chains registered through add_chain
        @rtype: Set
        """
        return self._chain_id

    @property
    def id(self):
        """
        PDB identifier, read from the 'pdb' attribute of the first chain
        @raise IndexError if the PDB has no chains
        """
        return self._chains[0].pdb

    @property
    def chains(self):
        """
        List of {Chain} contained in the PDB w/out NMR replicas.
        This is the internal list itself, not a copy.
        @rtype: List of {Chain}
        """
        return self._chains

    @property
    def proteins(self):
        """
        Iterates over the chains of the PDB (w/out NMR replicas) that are
        {ChainOfProtein}
        @rtype: {ChainOfProtein} (iterator)
        """
        for candidate in self.chains:
            if not isinstance(candidate, ChainOfProtein):
                continue
            yield candidate

    @property
    def nucleotides(self):
        """
        Iterates over the chains of the PDB (w/out NMR replicas) that are
        {ChainOfNucleotide}
        @rtype: {ChainOfNucleotide} (iterator)
        """
        for candidate in self.chains:
            if not isinstance(candidate, ChainOfNucleotide):
                continue
            yield candidate

    @property
    def non_standard_chains(self):
        """
        Iterates over the chains of the PDB (w/out NMR replicas) that are
        neither {ChainOfNucleotide} nor {ChainOfProtein}
        @rtype: {Chain} (iterator)
        """
        for candidate in self.chains:
            if isinstance(candidate, (ChainOfNucleotide, ChainOfProtein)):
                continue
            yield candidate

    @property
    def all_models(self):
        """
        List of {Chain} contained in the PDB w/ NMR replicas.
        A new list is built on every access (chains followed by NMR chains).
        @rtype: List of {Chain}
        """
        return self._chains + self._NMR_chains

    @property
    def header(self):
        """
        Header of the PDB, or an empty string if no header has been read
        """
        return '' if self._header is None else self._header

    @property
    def biomolecule_identifier(self):
        """
        Origin of the object: -1 -> original, 0 -> symmetry, >0 -> biomolecule
        """
        return self._biomol_id

    #
    # COMPLEX GETTERS & SETTERS
    #
    def get_chain_by_id(self, id):
        """
        Returns the first chain (w/out NMR replicas) with the given id,
        or None if no chain has that id
        @rtype: {Chain}
        """
        matching = (chain for chain in self._chains if chain.chain == id)
        return next(matching, None)

    def add_chain(self, chain, NMR=False):
        """
        Adds a new chain to the PDB and registers its identifier.
        A chain flagged as NMR replica is only stored when the PDB itself is
        marked as NMR; its identifier is registered in any case.
        """
        if NMR:
            if self._NMR:
                self._NMR_chains.append(chain)
        else:
            self._chains.append(chain)

        self._chain_id.add(chain.chain)

    def add_chains(self, chains, NMR=False):
        """
        Adds several chains to the PDB, one add_chain call per chain,
        all of them with the same NMR flag
        """
        for new_chain in chains:
            self.add_chain(chain=new_chain, NMR=NMR)

    def _get_chain_position_by_id(self, id):
        """
        Returns the position in the chain array of the first chain with the
        given id, or None if no chain has that id
        @rtype: Integer
        """
        for position, chain in enumerate(self._chains):
            if chain.chain == id:
                return position
        return None

    #
    # BOOLEANS
    #
    @property
    def is_NMR(self):
        """
        Identifies if the PDB contains NMRs (value of the internal NMR flag)
        @rtype: Boolean
        """
        return self._NMR

    def chain_exists(self, chain):
        """
        Confirms if a given chain identifier is registered in the PDB
        @type  chain: String
        @param chain: Chain identifier to look for
        @rtype: Boolean
        """
        return chain in self._chain_id

    @property
    def has_protein(self):
        """
        Checks if the PDB contains a protein (not only).
        Returns the stored flag; chains are not inspected here.
        @rtype: Boolean
        """
        return self._has_prot

    @property
    def has_nucleotide(self):
        """
        Checks if the PDB contains a nucleotide chain (not only).
        Returns the stored flag; chains are not inspected here.
        @rtype: Boolean
        """
        return self._has_nucl

    @property
    def repeated_chain_ids(self):
        """
        Checks if more than one {Chain} has the same assigned ID, i.e. there
        are fewer registered identifiers than chains (w/out NMR replicas)
        @rtype: Boolean
        """
        return len(self._chain_id) < len(self._chains)

    @property
    def is_all_ca(self):
        """
        Checks if at least one protein chain reports is_only_ca()
        @rtype: Boolean
        """
        return any(protein.is_only_ca() for protein in self.proteins)

    #
    # METHODS
    #
    def dehydrate(self):
        """
        Calls dehydrate() on every chain and then drops the chains that are
        left empty, removing their identifiers as well
        """
        found_empty = False
        for chain in self.chains:
            chain.dehydrate()
            if chain.is_empty:
                found_empty = True
        if not found_empty:
            return

        survivors = []
        for chain in self.chains:
            if chain.is_empty:
                self._chain_id.remove(chain.chain)
            else:
                survivors.append(chain)
        self._chains = survivors

    def duplicate(self, hetero=True, water=False, NMR=False):
        """
        Returns a {PDB} identical to the original but as a new object

        @type  hetero: Boolean
        @param hetero: Forwarded to the duplicate() of every chain

        @type  water: Boolean
        @param water: Forwarded to the duplicate() of every chain

        @type  NMR: Boolean
        @param NMR: If True, the NMR replicas are duplicated too

        @rtype: {PDB}
        """
        new_PDB = PDB()
        # An object without file (e.g. built from chains) has nothing to copy
        if self.pdb_file is not None:
            new_PDB.pdb_file = self.pdb_file

        # The flags go first: add_chain(NMR=True) only stores a replica when
        # the receiving PDB is already marked as NMR.
        new_PDB._NMR = self._NMR
        new_PDB._has_prot = self._has_prot
        new_PDB._has_nucl = self._has_nucl

        for chain in self.chains:
            new_PDB.add_chain(
                chain=chain.duplicate(hetero=hetero, water=water))

        if NMR:
            for chain in self._NMR_chains:
                new_PDB.add_chain(chain=chain.duplicate(hetero=hetero,
                                                        water=water),
                                  NMR=True)

        return new_PDB

    def apply_symmetry_matrices(self):
        """
        Builds a new {PDB} by applying the symmetry matrix of the header.
        Only works if the PDB file is an original PDB file
        or the matrices have been added in the correct PDB format
        @rtype: {PDB}
        """
        # The header is read on demand if it has not been loaded yet
        if self._header is None:
            self._read_PDB_file(header=True, onlyheader=True)
        return self._apply_matrix(matrix=self.header.symmetry_matrix)

    def apply_biomolecule_matrices(self, keepchains=False, water=True):
        """
        Builds one new {PDB} per biomolecule matrix of the header.
        Only works if the PDB file is an original PDB file or
        the matrices have been added in the correct PDB format

        @type  keepchains: Boolean
        @param keepchains: Forwarded to _apply_matrix

        @type  water: Boolean
        @param water: Forwarded to _apply_matrix

        @rtype: List of {PDB}
        """
        # The header is read on demand if it has not been loaded yet
        if self._header is None:
            self._read_PDB_file(header=True, onlyheader=True)
        return [
            self._apply_matrix(matrix=biomolecule,
                               keepchains=keepchains,
                               water=water)
            for biomolecule in self.header.biomolecules
        ]

    def _apply_matrix(self, matrix, keepchains=False, water=True):
        """
        Builds a new {PDB} with a repositioned copy of every chain listed in
        the matrix, one copy per transformation; empty copies are skipped.
        Unless keepchains is set, the result goes through tmpclean.
        @rtype: {PDB}
        """
        new_PDB = PDB()
        new_PDB._biomol_id = matrix.identifier

        for chain in self.chains:
            if chain.chain not in matrix.chains:
                continue
            for transformation in matrix.matrices:
                moved = chain.duplicate(water=water)
                moved.reposition(matrix=transformation.matrix,
                                 vector=transformation.vector)
                if len(moved) >= 1:
                    new_PDB.add_chain(chain=moved)

        if not keepchains:
            new_PDB.tmpclean(cluster_by_alternative_id=True)
        return new_PDB

    def clean(self):
        """
        Calls clean() on every chain in order; the first starts at atom 1 and
        each following one right after the last atom number of the previous
        """
        next_atom = 1
        for chain in self.chains:
            chain.clean(initatom=next_atom)
            next_atom = 1 + chain.last_residue.last_atom_number

    def tmpclean(self, cluster_by_alternative_id=False):
        """
        Makes a clean version of the PDB, rechaining in order and renumerating atoms.
        Renumbering residues is optional

        @type  cluster_by_alternative_id: Boolean
        @param cluster_by_alternative_id: If True, every renamed chain is
                                          recorded in _COMPND under its
                                          alternative_id
        """
        pchainsIDs = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890"
        chainsIDs = ""   # identifiers still free
        chainsNIDs = ""  # identifiers already in use
        chainID = 0
        atom_count = 1

        for candidate in pchainsIDs:
            if not self.chain_exists(chain=candidate):
                chainsIDs += candidate
            else:
                chainsNIDs += candidate

        # Chains are only renamed if there are enough free identifiers
        chain_change = len(self) <= len(chainsIDs)

        for chain in self.chains:
            if (not chain.chain in chainsNIDs) and chain_change:
                # NOTE(review): the old identifier is (re)added, not removed - confirm intent
                self._chain_id.add(chain.chain)
                chain.chain = chainsIDs[chainID]
                chainID += 1
                self._chain_id.add(chain.chain)
                if cluster_by_alternative_id:
                    if self._COMPND is None:
                        self._COMPND = {}
                    # 'in' replaces dict.has_key, which Python 3 removed
                    if chain.alternative_id not in self._COMPND:
                        self._COMPND[chain.alternative_id] = [
                            chain.alternative_id
                        ]
                    self._COMPND[chain.alternative_id].append(chain.chain)
            else:
                chainsNIDs = chainsNIDs.replace(chain.chain, '')

            chain.renumerate_atoms(init=atom_count)
            atom_count += (chain.atom_length)

    def fuse_chains(self, chains_ids):
        """
        Fuses several chains into the first one.
        It will not allow to fuse different structural chains.
        It does not alter the {PDB}, but provides a new one
        @type  chains_ids: List of String
        @param chains_ids: Identifiers of the chains to fuse, in order
        @rtype: {PDB} holding the single fused chain

        @raise AttributeError if:
            a) A given chain ID is not present
            b) Try to fuse different structural chains
        """
        if len(self._chain_id.intersection(set(chains_ids))) < len(chains_ids):
            raise AttributeError(
                "Some of the given chains to fues do not exist")

        # error_control flags [protein seen, nucleotide seen]
        error_counter = 0
        error_control = [False, False]
        new_PDB = PDB()
        for c in chains_ids:
            chain = self.get_chain_by_id(id=c)
            new_PDB.add_chain(chain=chain.duplicate())
            if isinstance(chain, ChainOfProtein) and not error_control[0]:
                error_counter += 1
                error_control[0] = True
            elif isinstance(chain, ChainOfNucleotide) and not error_control[1]:
                error_counter += 1
                error_control[1] = True
            if error_counter == 2:
                raise AttributeError(
                    "Fuse different kinds of structural chain is not possible\n"
                )

        target = new_PDB.chains[0]
        for x in range(1, len(new_PDB.chains)):
            # Read the last residue number on every pass, i.e. after the
            # previous fuse, so each chain continues where the fused one ends.
            init_chain_num = target.last_residue.number
            new_PDB.chains[x].renumerate_residues(init=init_chain_num + 1)
            target.fuse(chain=new_PDB.chains[x])

        return_PDB = PDB()
        return_PDB.add_chain(chain=target)
        return return_PDB

    # def calculate_dssp(self, out_dir = None, store = True):
    #     """
    #     Executes DSSP and assigns the prediction to each chain

    #     @param  out_dir: directory to save the output
    #     @defaut out_dir: None

    #     @param store: Save the dssp output(?)
    #     """

    #     for chain in self.proteins:
    #         if out_dir is None:
    #             pdb_file  = chain.globalID + ".pdb2dssp"
    #             dssp_file = chain.globalID + ".dssp"
    #         else:
    #             Path.mkdir(newdir = out_dir)
    #             pdb_file  = os.path.join(os.path.abspath(out_dir), chain.globalID + ".pdb2dssp")
    #             dssp_file = os.path.join(os.path.abspath(out_dir), chain.globalID + ".dssp")

    #         pdb_fd = open(pdb_file, 'w')
    #         pdb_fd.write(chain.PDB_format())
    #         pdb_fd.close()

    #         dssp_calc = DSSPexec(pdb_file = pdb_file, dssp_file = dssp_file,
    #                              chain    = chain,    store     = store)

    def rotate(self, matrix=None):
        """
        Applies a rotation matrix to every {Chain} listed in all_models

        @type  matrix: numpy.matrix
        @param matrix: Rotation to apply. When omitted, the 3x3 identity
                       is used.
        """
        rotation = numpy.identity(3, float) if matrix is None else matrix
        for model_chain in self.all_models:
            model_chain.rotate(matrix=rotation)

    def translate(self, vector=None):
        """
        Applies a translational vector to every {Chain} listed in all_models

        @type  vector: numpy.array
        @param vector: Translation to apply. When omitted, a zero vector of
                       length 3 is used.
        """
        shift = numpy.zeros(3, float) if vector is None else vector
        for model_chain in self.all_models:
            model_chain.translate(vector=shift)

    def reposition(self, matrix=None, vector=None):
        """
        Rotates and translates every {Chain} listed in all_models in one call

        @type  matrix: numpy.matrix
        @param matrix: Rotation to apply. When omitted, the 3x3 identity
                       is used.

        @type  vector: numpy.array
        @param vector: Translation to apply. When omitted, a zero vector of
                       length 3 is used.
        """
        rotation = numpy.identity(3, float) if matrix is None else matrix
        shift = numpy.zeros(3, float) if vector is None else vector
        for model_chain in self.all_models:
            model_chain.reposition(matrix=rotation, vector=shift)

    # def calculate_protein_heteroatom_contacts(self, distance = 6):
    #     """
    #     Returns a {HeteroatomContacts} list with the contacts between a protein and its heteroatoms
    #     at a maximum given distance
    #     @type distance: Integer
    #     @rtype: list of {HeteroatomContacts}
    #     """
    #     data = []
    #     for protein in self.proteins:
    #         data.append(HeteroatomContacts(chain = protein, max_distance = distance))
    #     return data

    #
    # OVERRIDE PARENT'S FUNCTIONS
    #
    @staticmethod
    def read(input_file, format='PDB'):
        """
        Reads a file of data in a specific format and returns the object

        @type  input_file: String
        @param input_file: File to read

        @type  format: String
        @param format: Format of the file to read
        """
        if format == 'PDB':
            pdb = PDB(pdb_file=input_file)
            return pdb

    def write(self, output_file=None, format='PDB', force=False, clean=False):
        """
        Writes the object into a file in a specific format

        @type  output_file: String
        @param output_file: File to write

        @type  format: String
        @param format: Format of the file to print. Only 'PDB' produces output.

        @type  force: Boolean
        @param force: Forwarded to {File} as its overwrite flag

        @type  clean: Boolean
        @param clean: Forwarded to the PDB writer
        """
        # The File object is created whatever the format is, exactly as before.
        destination = File(file_name=output_file, action='w', overwrite=force)
        if format != 'PDB':
            return
        self._write_PDB_file(pdb_file=destination, clean=clean)

    #
    # IO
    #
    def _read_PDB_file(self,
                       header=False,
                       onlyheader=False,
                       biomolecule=False):
        """
        Process and load crystal data from a PDB formatted file

        @type  header: Boolean
        @param header: When True, read_PDB_header is run on this object first

        @type  onlyheader: Boolean
        @param onlyheader: When True, read_PDB_file is not run

        @type  biomolecule: Boolean
        @param biomolecule: Currently unused: the call that forwarded it is
                            commented out below
        """
        # Imported inside the method rather than at module level
        # (presumably to avoid a circular import -- TODO confirm).
        from parse_pdb import read_PDB_file, read_PDB_header
        if header:
            read_PDB_header(self)
            # The file is closed between the header pass and the body pass.
            # NOTE(review): presumably so the body is read from the start of
            # the file again -- confirm against File/parse_pdb.
            self._pdb_file.close()
        if not onlyheader:
            # read_PDB_file(self, biomolecule=biomolecule)
            read_PDB_file(self)
        # Always closed at the end; with header=True this is a second close().
        self._pdb_file.close()

    # def _represent_COMPND(self):
    #     if self._COMPND is None: return ''

    #     data = []
    #     mol_counter = 1
    #     for chain in self._COMPND:
    #         data.append("COMPND    MOL_ID: %d;" %mol_counter)
    #         data.append("COMPND   2 CHAIN: " + ",".join(self._COMPND[chain]) + ";")
    #         if len(self._biomolecA) > 0:
    #             matrices = []
    #             for mat in self._biomolecA:
    #                 if mat[1] == chain: matrices.append(str(mat[0]))
    #             data.append("COMPND   3 MATRICES: " + ",".join(sorted(matrices)))
    #         mol_counter += 1
    #     return "\n".join(data) + "\n"

    def _write_PDB_file(self, pdb_file, clean=False):
        """
        Dumps the crystal, in PDB format, into an already created {File}
        and closes it afterwards

        @type  pdb_file: {File}
        @param pdb_file: Destination; its descriptor is written to

        @type  clean: Boolean
        @param clean: Forwarded to PDB_format
        """
        handle = pdb_file.descriptor
        # out_fd.write(self._represent_COMPND())
        content = self.PDB_format(clean=clean) + "\n"
        handle.write(content)
        pdb_file.close()

    def PDB_format(self, clean=False, terminal=True):
        """
        Strings a {PDB} in PDB format: the PDB text of every chain, one after
        the other, followed by a closing END line

        @type  clean: Boolean
        @param clean: When True, clean() is called before formatting

        @type  terminal: Boolean
        @param terminal: Forwarded to the PDB_format of each chain

        @rtype: String
        """
        if clean:
            self.clean()
        records = [
            chain.PDB_format(terminal=terminal) for chain in self._chains
        ]
        records.append("END")
        return "\n".join(records)

    def FASTA_format(self, gapped=True, protein=True, nucleotide=False):
        """
        Strings the sequences of the selected chains in FASTA format.
        Each header holds the globalID of the chain and the identifier of its
        first aminoacid/nucleotide, separated by a tab.

        @type  gapped: Boolean
        @param gapped: Use the gapped version of the sequence

        @type  protein: Boolean
        @param protein: Include the {ChainOfProtein} chains

        @type  nucleotide: Boolean
        @param nucleotide: Include the {ChainOfNucleotide} chains

        @rtype: String (empty when no chain is selected)
        """
        header_layout = ">{0}\t{1}"
        records = []
        for chain in self.chains:
            if isinstance(chain, ChainOfProtein) and protein:
                records.append(
                    header_layout.format(chain.globalID,
                                         chain.aminoacids[0].identifier))
                # Protein sequences are read as attributes.
                if gapped:
                    sequence = chain.gapped_protein_sequence
                else:
                    sequence = chain.protein_sequence
                records.append("{0}".format(sequence))
            if isinstance(chain, ChainOfNucleotide) and nucleotide:
                records.append(
                    header_layout.format(chain.globalID,
                                         chain.nucleotides[0].identifier))
                # Nucleotide sequences are obtained through method calls.
                if gapped:
                    sequence = chain.gapped_nucleotide_sequence()
                else:
                    sequence = chain.nucleotide_sequence()
                records.append("{0}".format(sequence))
        if not records:
            return ""
        return "\n".join(records) + "\n"

    def IDX_format(self, protein=True, nucleotide=False):
        """
        Strings the index of the selected chains, one line per chain:
        '>' + globalID, a tab, and the index of the chain

        @type  protein: Boolean
        @param protein: Include the {ChainOfProtein} chains

        @type  nucleotide: Boolean
        @param nucleotide: Include the {ChainOfNucleotide} chains

        @rtype: String (empty when no chain is selected)
        """
        entry_layout = ">{0}\t{1}"
        entries = []
        for chain in self.chains:
            if isinstance(chain, ChainOfProtein) and protein:
                entries.append(
                    entry_layout.format(chain.globalID, chain.protein_idx))
            if isinstance(chain, ChainOfNucleotide) and nucleotide:
                entries.append(
                    entry_layout.format(chain.globalID,
                                        chain.nucleotide_idx()))
        if not entries:
            return ""
        return "\n".join(entries) + "\n"

    def FASTA_IDX(self, protein=True, nucleotide=False):
        """
        Collects, for the selected chains, both the gapped FASTA entry and
        the index entry

        @type  protein: Boolean
        @param protein: Include the {ChainOfProtein} chains

        @type  nucleotide: Boolean
        @param nucleotide: Include the {ChainOfNucleotide} chains

        @rtype: Dictionary with keys 'FASTA' and 'IDX', each one a list
                of Strings with one entry per selected chain
        """
        fasta_entries = []
        idx_entries = []
        for chain in self.chains:
            if isinstance(chain, ChainOfProtein) and protein:
                fasta_entries.append(">{0}\n{1}".format(
                    chain.globalID, chain.gapped_protein_sequence))
                idx_entries.append(">{0}\t{1}".format(
                    chain.globalID, chain.protein_idx))
            if isinstance(chain, ChainOfNucleotide) and nucleotide:
                fasta_entries.append(">{0}\n{1}".format(
                    chain.globalID, chain.gapped_nucleotide_sequence()))
                idx_entries.append(">{0}\t{1}".format(
                    chain.globalID, chain.nucleotide_idx()))

        return {'FASTA': fasta_entries, 'IDX': idx_entries}

    #
    # OVERRIDE DEFAULT METHODS
    #
    def __len__(self):
        """
        Number of entries held in the internal chain list
        @rtype: Integer
        """
        return len(self._chains)
Пример #28
0
    def get_FASTA_IDX_by_names_to_file(self, names, outfile):
        """
        Writes the FASTA entries and the index entries of the requested names.

        The sequences are retrieved from the FASTA file self.PDBseq and
        written to outfile; the matching lines of self.PDBseq + '.idx' are
        copied to outfile + '.idx'.

        @type  names: collection of String
        @param names: Names of the sequences to extract

        @type  outfile: String
        @param outfile: Name of the FASTA file to create
        """
        fastafile = Fasta(self.PDBseq)
        # retrieve() is given a deep copy, presumably because it may modify
        # the collection it receives -- 'names' is needed intact below.
        selectedfasta = fastafile.retrieve(copy.deepcopy(names))
        output_fasta = File(outfile, 'w')
        try:
            for sequence in selectedfasta:
                output_fasta.write(sequence.format('FASTA') + "\n")
        finally:
            # Release the file even if formatting/writing fails.
            output_fasta.close()

        idxfile = self.PDBseq + '.idx'
        output_idx = File(outfile + '.idx', 'w')
        try:
            input_idx = File(idxfile, 'r')
            try:
                for line in input_idx.descriptor:
                    info = line.split()
                    if not info:
                        # Blank line: there is no name field to look at
                        # (info[0] would raise IndexError).
                        continue
                    # The first field is the name preceded by one marker
                    # character, which is dropped.
                    pdbname = info[0][1:]
                    if pdbname in names:
                        output_idx.write(line)
            finally:
                input_idx.close()
        finally:
            output_idx.close()
Пример #29
0
class CDhitList(StorableObject):
    """
    List of the clusters read from a cluster file (the cdhitfile), with an
    index that maps each sequence name to the position of its cluster.
    """

    def __init__(self, cdhitfile):
        """
        @type  cdhitfile: String
        @param cdhitfile: Name of the cluster file; it is parsed right away
        """
        self._clusters = []
        # sequence name -> position of its cluster in self._clusters
        self._allseqids = {}
        self._file = File(file_name=cdhitfile)

        self._parse_file()

    @property
    def clusters(self):
        """List of clusters, in the order in which they were added"""
        return self._clusters

    def get_cluster4sequence(self, sequence):
        """
        Cluster registered for the given sequence name; None when the name
        is not known
        """
        position = self._allseqids.get(sequence)
        if position is None:
            return None
        return self._clusters[position]

    def is_in_cluster(self, sequence):
        """
        Role of a sequence name: 'N' when it belongs to no cluster, 'M' when
        its cluster reports it as master, 'H' otherwise
        """
        cluster = self.get_cluster4sequence(sequence)
        if cluster is None:
            return 'N'
        if cluster.is_master(sequence):
            return 'M'
        return 'H'

    def add_cluster(self, cluster):
        """Appends a cluster at the end of the list"""
        self._clusters.append(cluster)

    def add_sequence2cluster(self, sequence, clusterid=None):
        """
        Adds a sequence to a cluster and indexes it by its name.
        Without clusterid the last cluster is used; otherwise the first
        cluster whose identifier equals clusterid (nothing happens when
        there is none).
        """
        if clusterid is None:
            self.clusters[-1].add_sequence(sequence)
            self._allseqids[sequence.name] = len(self.clusters) - 1
            return

        for position, cluster in enumerate(self._clusters):
            if cluster.identifier == clusterid:
                cluster.add_sequence(sequence)
                self._allseqids[sequence.name] = position
                break

    def dictionary_role_summary(self):
        """
        Dictionary with two lists: 'master' holds the name of the master of
        every cluster and 'homolog' holds every item of the 'sequences' of
        every cluster
        """
        masters = []
        homologs = []
        for cluster in self.clusters:
            masters.append(cluster.master.name)
            homologs.extend(cluster.sequences)
        return {'master': masters, 'homolog': homologs}

    def _parse_file(self):
        """
        Reads the file line by line: a line starting with '>' opens a new
        cluster identified by its last field; any other line describes a
        sequence that is added to the cluster opened last
        """
        for record in self._file.descriptor:
            if record.startswith('>'):
                cluster_tag = record.split()[-1].strip()
                self.add_cluster(CDhit(clusterid=cluster_tag))
            else:
                # The first field of the line is dropped; of the remaining
                # ones: [0] is the length, [1] the name, the last the homology
                fields = record.split()[1:]
                homolog = CDhitHomolog(name=fields[1],
                                       length=fields[0],
                                       homology=fields[-1])
                self.add_sequence2cluster(sequence=homolog)
        self._file.close()

    def __repr__(self):
        """One line per cluster, each one formatted with '{0}'"""
        return '\n'.join('{0}'.format(cluster) for cluster in self.clusters)
Пример #30
0
    def __init__(self, cdhitfile):
        """
        Sets up the empty containers, wraps cdhitfile in a File object and
        parses it right away.

        @type  cdhitfile: String
        @param cdhitfile: Name of the file to parse
        """
        self._clusters = []
        self._allseqids = {}
        self._file = File(file_name=cdhitfile)

        # The attributes above are assigned before parsing starts.
        self._parse_file()