Example #1
    @staticmethod
    def build_multifasta(file_name, sequence_list, force=None):
        '''
        Creates a Fasta object and a FASTA file from multiple sequences.

        @param:    file_name
        @pdef:     name of the fasta file (with path, if necessary)
        @ptype:    {String}

        @param:    sequence_list
        @pdef:     list of sequences to create the FASTA from.
        @ptype:    {List} or {Set} of {Sequence}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @return: {Fasta}
        '''
        newFasta = File(file_name, 'w', overwrite=force)
        for sequence in sequence_list:
            newFasta.write(sequence.format('FASTA') + '\n')
        newFasta.close()
        fasta_file = Fasta(fasta_file=newFasta.full, auto_load=0)
        return fasta_file
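A minimal usage sketch for the example above. The import path is hypothetical (adjust to the actual SBI layout); the {Sequence} constructor signature is taken from Example #3 below:

    # Hypothetical import path -- adjust to your SBI installation.
    from SBI.sequence import Sequence, Fasta

    seqs = [Sequence(sequence_id='seq1', sequence='MARNDCEQGH'),
            Sequence(sequence_id='seq2', sequence='ILKMFPSTWYV')]
    # Writes both records to multi.fa and wraps the file in a {Fasta}.
    fasta = Fasta.build_multifasta('multi.fa', seqs, force=True)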
Example #2
def sortarchs(inputdir, outputdir):
    
    archsdir              = outputdir
    Path.mkdir(archsdir)
    sorted_archs          = {}
    loop_file_name        = os.path.join(archsdir, 'ArchDB.{0}.db')
    loop_split_file_name  = os.path.join(archsdir, 'ArchDB.{0}.{1:02d}-{2:02d}.db')
    sections_ini          = [ 0, 4, 7,14,21]
    sections_end          = [ 4, 6,13,20, 0]
    for archfile in Path.list_files(root = inputdir, pattern = '*.archObj'):
        filename = os.path.basename(archfile)
        data     = filename.split('_')
        length   = int(data[0])
        archtype = data[1] 
        sorted_archs.setdefault(archtype,{}).setdefault(length,[])
        sorted_archs[archtype][length].append(archfile)
    
    for archtype in sorted_archs:
        SBIglobals.alert('verbose', None, "ARCHS: " + archtype + "\n")
        fd  = File(loop_file_name.format(archtype), 'w')
        fdp = []
        for x in range(len(sections_ini)):
            fdp.append(File(loop_split_file_name.format(archtype, sections_ini[x], sections_end[x]), 'w'))
        for length in sorted(sorted_archs[archtype]):
            SBIglobals.alert('verbose', None, '\t{0}'.format(length))
            for archfile in sorted_archs[archtype][length]:
                SBIglobals.alert('verbose', None, '\t\t{0}\n'.format(archfile))
                nsp = Arch.load(archfile)
                fd.descriptor.write(nsp.archtype_format() + "\n")
                for x in range(len(fdp)):
                    if length >= sections_ini[x] and (sections_end[x] == 0 or length <= sections_end[x]):
                        fdp[x].descriptor.write(nsp.archtype_format() + "\n")
        fd.close()
        for x in range(len(fdp)):
            fdp[x].close()
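The paired `sections_ini`/`sections_end` lists above define overlapping length buckets, where `0` in `sections_end` means "no upper bound". A self-contained sketch of the same membership test used in the inner loop:

    sections_ini = [0, 4, 7, 14, 21]
    sections_end = [4, 6, 13, 20, 0]

    def buckets_for(length):
        # end == 0 marks the open-ended top section, as in the loop above.
        return [(ini, end) for ini, end in zip(sections_ini, sections_end)
                if length >= ini and (end == 0 or length <= end)]

    print(buckets_for(4))   # [(0, 4), (4, 6)] -- boundary lengths overlap
    print(buckets_for(25))  # [(21, 0)]        -- open-ended top section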
Example #3
    @staticmethod
    def build(file_name, sequence_id, sequence, force=None):
        '''
        Creates a Fasta object and a FASTA file from a sequence.

        @param:    file_name
        @pdef:     name of the fasta file (with path, if necessary)
        @ptype:    {String}

        @param:    sequence_id
        @pdef:     name of the sequence
        @ptype:    {String}

        @param:    sequence
        @pdef:     sequence
        @ptype:    {String} or {List}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @return: {Fasta}
        '''
        newFasta = File(file_name, 'w', overwrite=force)
        newSeq = Sequence(sequence_id=sequence_id, sequence=sequence)
        newFasta.write(newSeq.format('FASTA'))
        newFasta.close()
        return Fasta(fasta_file=newFasta.full, auto_load=0)
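The single-sequence variant in use, again with a hypothetical import path:

    from SBI.sequence import Fasta  # hypothetical import path

    fasta = Fasta.build('single.fa', sequence_id='seq1',
                        sequence='MARNDCEQGH', force=True)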
Example #4
    def _parse(self):
        file_fd    = File(self._dsspfile)
        read       = False
        continuity = -1000
        readline   = 0
        for line in file_fd.read():
            if line.startswith("  #  RESIDUE AA STRUCTURE BP1 BP2  ACC"):
                read = True
                continue
            if read:
                if line[13:14] != '!':
                    res_num = int(line[6:10].strip())
                    ss      = line[16:17] if line[16:17] != ' ' else '-'
                    buried  = int(line[35:38].strip())
                    aa      = line[13:15].strip()

                    self._dsspdata.append(DSSP(secondary_structure = ss,
                                               accessibility       = buried,
                                               amino               = aa))
                    self._dsspdata[-1].add_hydrogen_links(line[39:50],
                                                          line[50:61],
                                                          line[61:72],
                                                          line[72:84])
                    if readline > 0:
                        if res_num != continuity + 1:
                            self._gapped = True
                    continuity = res_num  # track even the first residue
                    readline += 1
                else:
                    msg = "truncated chain! {0}\n".format(self._dsspfile)
                    sys.stderr.write(msg)
                    SBIg.warn(self, msg)
                    self._gapped = True
        file_fd.close()
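The parser relies on DSSP's fixed-width columns. A self-contained check of the same slices on a fabricated line (only the sliced columns are filled in; this is not a real DSSP record):

    line = list(' ' * 50)
    line[6:10] = list('  42')    # residue number, columns 6-9
    line[13] = 'M'               # one-letter amino acid, column 13
    line[16] = 'H'               # secondary structure, column 16
    line[35:38] = list(' 97')    # solvent accessibility, columns 35-37
    line = ''.join(line)

    print(int(line[6:10].strip()))   # 42
    print(line[13:15].strip())       # M
    print(line[16:17])               # H
    print(int(line[35:38].strip()))  # 97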
Example #5
    def _save_release(self):
        '''
        Store the release data into a file.
        '''
        f = File(os.path.join(self.local, self._CONTROL_FILE), 'w', True)
        f.write(json.dumps(self._RELEASE))
        f.close()
Example #6
    def _process(self):
        enzymes = self._parse_enzclass() + self._parse_enzymedat()
        enzymes.sort()

        enzFile = File(self._enzfile, 'w', True)
        for e in enzymes:
            enzFile.write(repr(e) + "\n")
        enzFile.close()
Example #7
    def _process(self):

        targets = self._process_targets()
        drugs = self._process_drugs(targets)

        drugFile = File(self._drugfile, 'w', True)
        for d in drugs:
            drugFile.write(repr(d) + "\n")
        drugFile.close()
Example #8
    def format2file(self, filename, extension='pdb', center=False):
        if extension not in ('pdb', 'js'):
            raise AttributeError('Extension not accepted')
        structure = File('.'.join([filename, extension]), 'w')
        if extension == 'pdb':
            structure.write(self.pdb_format(center=center))
        elif extension == 'js':
            structure.write(self.js_format(center=center))
        structure.close()
Example #9
    def subset(self,
               sequence_ids,
               new_fasta_file,
               all_but=False,
               prefix_size=None,
               index=False,
               force=None):
        '''
        Creates a new {Fasta} with the requested subset of sequences.

        @param:    sequence_ids
        @pdef:     sequence identifier(s)
        @ptype:    {String}, {List} or {Set}

        @param:    new_fasta_file
        @pdef:     name of the new fasta file
        @ptype:    {String}

        @param:    all_but
        @pdef:     Flag. Instead of retrieving the given ids, we retrieve all
                   except the given ids.
        @pdefault: _False_
        @ptype:    {Boolean}

        @param:    prefix_size
        @pdef:     maximum characters for the prefix. If _None_, all the
                   characters are included.
        @pdefault: _None_
        @ptype:    {Integer}

        @param:    index
        @pdef:     also create the index file, if one exists
        @pdefault: _False_
        @ptype:    {Boolean}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @raises: {AttributeError} if sequence_ids is not a valid type.
        @return: {Fasta}
        '''
        sequences = self.retrieve(sequence_ids, all_but, prefix_size)
        fasta_file = Fasta.build_multifasta(new_fasta_file, sequences, force)
        if self.has_index and index:
            idxfile = File(self.index_file)
            newidx = File(fasta_file.file.full + '.idx', 'w')
            seqids = set(fasta_file.sequence_identifiers)
            for idx in idxfile.read():
                if idx.split()[0].strip('>') in seqids:
                    newidx.write(idx)
            idxfile.close()
            newidx.close()
            fasta_file.index_file = newidx.full
        return fasta_file
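A hedged call sketch, assuming `fa` is an already loaded {Fasta} and the identifiers exist in it:

    # Keep only two sequences (and build the matching .idx if fa has one).
    sub = fa.subset(['1ABC_A', '2XYZ_B'], 'subset.fa', index=True, force=True)

    # Keep everything EXCEPT those two identifiers.
    rest = fa.subset(['1ABC_A', '2XYZ_B'], 'rest.fa', all_but=True, force=True)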
Example #10
    @staticmethod
    def read_compacted_blast(compacted_blast_file):
        '''
        Read data from a printed compacted blast into {BlastResult}.
        Not all options will be available in that new object.

        @param:    compacted_blast_file
        @pdef:     file of the compacted blast print
        @ptype:    {String}

        @return: {BlastResult}
        '''
        from BlastHit import BlastHit
        query_name, query_sequence = None, None
        version, matrix, database = None, None, None
        gap_open, gap_extend, self_hit = None, None, None

        br = None

        cbf = File(compacted_blast_file)
        for line in cbf.read():
            if line.startswith('#'):
                if line.startswith('#Query:'):
                    query_name = line.strip().split()[-1]
                if line.startswith('#Query Sequence:'):
                    query_sequence = line.strip().split()[-1]
                if line.startswith('#Blast Version:'):
                    version = line.strip().split()[-1]
                if line.startswith('#Search on matrix:'):
                    matrix = line.strip().split()[-1]
                if line.startswith('#Gap open penalty:'):
                    gap_open = line.strip().split()[-1]
                if line.startswith('#Gap extension penalty:'):
                    gap_extend = line.strip().split()[-1]
                if line.startswith('#Database searched:'):
                    database = line.strip().split()[-1]
                if line.startswith('#Self Hit is omitted:'):
                    self_hit = line.strip().split()[-1]
            else:
                if br is None:
                    if version is None:
                        bh = None
                    else:
                        bh = BlastHeader(version, matrix, gap_open, gap_extend,
                                         database, self_hit)
                    br = BlastResult(query_name, query_sequence, bh)
                d = line.strip().split()
                hit = BlastHit(
                    [d[2], d[3]], [d[8], d[9]],
                    [int(x) for x in d[10].split(',')[0].split(':')], 1,
                    [d[4], d[5], d[6], d[7]])
                br.add_hit(hit)
        cbf.close()

        return br
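Read in reverse, the parser fixes the expected layout: '#'-prefixed header lines whose value is the last whitespace-separated token, then one hit per line with at least eleven columns, where column 10 holds a 'start:end' pair (only the part before the first comma is used). A hedged round trip over a fabricated file, assuming {BlastResult} is importable from the surrounding module:

    # Columns 0-1 are placeholders; the parser only indexes columns 2-10.
    rows = ['#Query: QUERY_1',
            '#Blast Version: 2.2.31+',
            'q 0 1ABC_A 1ABC 1e-30 95.0 100 55 MARND MARND 1:5']
    with open('demo.compacted', 'w') as fd:
        fd.write('\n'.join(rows) + '\n')
    br = BlastResult.read_compacted_blast('demo.compacted')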
Example #11
    def make_PDBseq(self, log_file, resolution_threshold=None):
        if not self.has_local:
            raise NameError(
                'A local PDB database must be defined to create a PDBseq database.'
            )
        outdir = self.PDBseq if self.PDBseq is not None else os.curdir

        Path.mkdir(self.PDBseq)
        fasta_file = File(file_name=os.path.join(outdir, 'PDBseq.fa'),
                          action='w',
                          overwrite=True)
        fasta_fd = fasta_file.descriptor
        idx_file = File(file_name=os.path.join(outdir, 'PDBseq.fa.idx'),
                        action='w',
                        overwrite=True)
        idx_fd = idx_file.descriptor
        # Restored from the commented-out draft: filtered_fd and resolutions
        # are referenced below when resolution_threshold is set.
        if resolution_threshold is not None:
            filtered_file_name = self.get_PDBseq_filtered(resolution_threshold)
            filtered_file = File(file_name=filtered_file_name,
                                 action='w', overwrite=True)
            filtered_fd = filtered_file.descriptor
            resolutions = self.get_resolutions(resolution_threshold=resolution_threshold)
        log_file = File(file_name=log_file, action='w', overwrite=True)
        log_idx = log_file.descriptor

        for pdb_file in self.localPDBs:
            log_idx.write("Reading File: {0}\n".format(pdb_file))
            newPDB = PDB(pdb_file=pdb_file, dehydrate=True)
            fasta_idx = newPDB.FASTA_IDX(nucleotide=False)
            if len(fasta_idx['FASTA']) != len(fasta_idx['IDX']):
                log_idx.write(
                    'ERROR!!!!! Number of fastas and indexes are different for pdb {0}!!\n'
                    .format(newPDB.id))
            if len(fasta_idx['FASTA']) > 0:
                log_idx.write('\tPrinting FASTA and IDX...\n')
            else:
                log_idx.write('\tProbably just a nucleotide PDB...\n')
            for c in range(len(fasta_idx['FASTA'])):
                sequence = fasta_idx['FASTA'][c].split('\n')[1]
                sequence = sequence.replace('X', '').replace('x', '')
                if len(sequence) > 0:
                    fasta_fd.write(fasta_idx['FASTA'][c] + "\n")
                    if resolution_threshold is not None and newPDB.id in resolutions and not newPDB.is_all_ca:
                        filtered_fd.write(fasta_idx['FASTA'][c] + "\n")
                    idx_fd.write(fasta_idx['IDX'][c] + "\n")
            del (newPDB)

        #CLOSE & END
        fasta_file.close()
        idx_file.close()
        if resolution_threshold is not None:
            filtered_fd.close()
Example #12
    def release(self):
        '''
        Retrieves release data for the database.
        Based on when we downloaded it, not on the DB's own release.

        @returns: {Dictionary}
        '''
        if os.path.isfile(os.path.join(self.local, self._CONTROL_FILE)):
            f = File(os.path.join(self.local, self._CONTROL_FILE))
            data = json.loads(f.read())
            f.close()
        else:
            data = self._RELEASE
        return data
Example #13
    def print_compacted_blast(self, out_file=None):
        '''
        Print the compacted format of the blast hit.

        @param:    out_file
        @pdef:     file to print the blast data into.
        @pdefault: _None_
        @ptype:    {String}
        '''
        if out_file is not None:
            output = File(out_file, 'w')
            output.write("%s\n" % self.str_compacted_blast())
            output.close()
        else:
            print(self.str_compacted_blast())
Example #14
    def items(self):
        '''
        Loops through the items of the database

        @yields: Object depending on the database.
        '''
        if not self.has_local:
            SBIg.throw(self, 'A local database needs to be built first',
                       IOError)

        for ifile in self._ITEM_FILES:
            ifile = os.path.join(self.local, ifile)
            f = File(ifile)
            for line in f.read():
                yield self._DBOBJECT.grab(line.strip())
            f.close()
Example #15
    def _process(self):
        go_dic = {}
        parseFile = File(os.path.join(self.local, self._gfile), 'r')
        go = None
        for line in parseFile.descriptor:
            line = re.sub('\'', '\\\'', line)
            if line.startswith('[Term]'):
                if go is not None:
                    go_dic[go.id] = go
            if line.startswith('id:'):
                go = GOterm(id = line.split()[1].strip())
                continue
            if line.startswith('name:'):
                go.name = " ".join(line.split()[1:]).strip()
                continue
            if line.startswith('namespace:'):
                go.namespace = line.split()[1].strip()
                continue
            if line.startswith('alt_id:'):
                go.alt_id.append(line.split()[1].strip())
                continue
            if line.startswith('is_obsolete:'):
                go.obsolete = True
                continue
            if line.startswith('is_a:'):
                go.parents.add(line.split()[1].strip())
                continue
            if line.startswith('relationship:'):
                go.relations.append((line.split()[1].strip(),line.split()[2].strip()))
                continue
            if line.startswith('[Typedef]'):
                go_dic[go.id] = go
                break
        parseFile.close()

        for go in go_dic:
            go_dic[go].parents = self._search_parents(go_dic, go)

        goFile = File(self._gofile, 'w', True)
        for go in go_dic:
            go_dic[go].parents.add(go)
            goFile.write(str(go_dic[go]) + "\n")
        goFile.close()
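The method consumes the OBO flat-file format. A minimal stanza covering the tags it recognizes (the identifiers are real GO terms, but the stanza itself is illustrative):

    [Term]
    id: GO:0000001
    name: mitochondrion inheritance
    namespace: biological_process
    alt_id: GO:0000002
    is_a: GO:0048308 ! organelle inheritance
    relationship: part_of GO:0048311 ! mitochondrion distribution

    [Typedef]
    id: part_of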
Example #16
    def print_representation(self, line_split=160, out_file=None):
        '''
        Print the alignment representation of the blast hit.

        @param:    line_split
        @pdef:     number of characters per line
        @pdefault: 160
        @ptype:    {Integer}

        @param:    out_file
        @pdef:     file to print the blast data into.
        @pdefault: _None_
        @ptype:    {String}
        '''
        if out_file is not None:
            output = File(out_file, 'w')
            output.write("%s\n" % self.str_representation(line_split))
            output.close()
        else:
            print(self.str_representation(line_split))
Example #17
    def _process(self):
        tmoFile = File(self._pdbtmfile, 'w', True)
        for xmlfile in Path.list_files(
                os.path.join(self._local, 'pdbtm/database/'), '*.xml'):
            xmldata = TM(
                pdb=os.path.splitext(os.path.split(xmlfile)[1])[0].upper())
            skip_chains = set()
            read = False
            fdxml = open(xmlfile)
            for line in fdxml:
                if line.startswith('    <TMRES>'):
                    xmldata.tmres = line
                elif line.startswith('    <TMTYPE'):
                    xmldata.tmtype = line
                elif line.startswith('    <PDBKWRES'):
                    xmldata.kwres = line
                elif line.startswith('  <SIDEDEFINITION'):
                    m = re.search(r'Side1="(\S+)"', line)
                    xmldata.side = m.group(1)
                elif line.startswith('      <APPLY_TO_CHAIN'):
                    m = re.search(r'NEW_CHAINID="(\S{1})"', line)
                    if m:
                        skip_chains.add(m.group(1))
                elif line.startswith('  <CHAIN '):
                    m = re.search(
                        r'CHAINID="(\S{1})" NUM_TM="(\d{1})" TYPE="(\S+)"',
                        line)
                    if m:
                        chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                        if chain not in skip_chains:
                            cdata = tuple([chain, num, tmtype])
                            xmldata.set_chain(cdata)
                            read = True
                elif line.startswith('    <REGION ') and read:
                    m = re.search(
                        r'pdb_beg="(\-*\d+\w*)"[\s\S]+pdb_end="(\-*\d+\w*)"\s+type="(\w{1})"',
                        line)
                    ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                    xmldata.set_chain(cdata, tuple([ini, end, tmtype]))
                elif line.startswith('  </CHAIN>'):
                    read = False
            fdxml.close()
            if len(xmldata.chains) > 0:
                tmoFile.write(str(xmldata) + "\n")
        tmoFile.close()
Example #18
    def reduce(self, new_fasta_file, list_file, force=None):
        '''
        Reduces the {Fasta} by removing identical sequences.

        @param:    new_fasta_file
        @pdef:     name of the new fasta file
        @ptype:    {String}

        @param:    list_file
        @pdef:     name of the repetition list file
        @ptype:    {String}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @return: {Fasta} and {File} with the list of identical sequences.
        '''
        seq_md5 = {}
        sequences = []
        for seq in self.live_show():
            md5 = seq.md5
            if md5 not in seq_md5:
                sequences.append(seq)
                seq_md5.setdefault(md5, [])
            else:
                SBIg.alert(
                    'debug', self,
                    '{0} repeats of {1}'.format(seq.id, seq_md5[md5][0]))
            seq_md5[md5].append(seq.id)
        fasta = Fasta.build_multifasta(new_fasta_file, sequences, force)
        listfile = File(list_file, 'w')
        for md5 in seq_md5:
            listfile.write('\t'.join(seq_md5[md5]) + '\n')
        listfile.close()

        return fasta, listfile
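A hedged usage sketch: collapse identical sequences and report which identifiers were merged (assumes `fa` is an existing {Fasta}):

    nr_fasta, repeats = fa.reduce('nr.fa', 'nr.repeats.list', force=True)
    # Each line of the list file groups the ids that shared one sequence.
    with open('nr.repeats.list') as fd:
        for line in fd:
            ids = line.rstrip('\n').split('\t')
            if len(ids) > 1:
                print('kept {0}, dropped {1}'.format(ids[0], ids[1:]))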
Example #19
    def _process(self):
        inh = {}
        nodefile = File(file_name = self._nodes, action = 'r')
        for line in nodefile.descriptor:
            line = re.sub('\'', '\\\'', line)
            line_data = line.split('|')
            inh[line_data[0].strip()] = TaxID(line_data[0].strip())
            inh[line_data[0].strip()].parent = line_data[1].strip()
            inh[line_data[0].strip()].rank   = line_data[2].strip()
        nodefile.close()

        namefile = File(file_name = self._names, action = 'r')
        for line in namefile.descriptor:
            line = re.sub('\'', '\\\'', line)
            line_data = line.split('|')
            if line_data[3].strip() == 'scientific name':
                inh[line_data[0].strip()].name = line_data[1].strip()
        namefile.close()

        delefile = File(file_name = self._delet, action = 'r')
        for line in delefile.descriptor:
            data = line.split('|')
            inh[data[0].strip()]     = TaxID(data[0].strip())
            inh[data[0].strip()].old = True
        delefile.close()

        mrgefile = File(file_name = self._merged, action = 'r')
        for line in mrgefile.descriptor:
            data = line.split('|')
            inh[data[0].strip()]     = TaxID(data[0].strip())
            inh[data[0].strip()].old = True
            inh[data[0].strip()].new = data[1].strip()
        mrgefile.close()

        taxFile = File(self._taxid, 'w', True)
        for taxid in inh:
            taxFile.write(str(inh[taxid]) + "\n")
        taxFile.close()
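The four inputs are NCBI Taxonomy dump files: '|'-separated columns padded with whitespace, which is why every field goes through .strip(). The columns the parser actually reads:

    nodes.dmp     tax_id | parent_tax_id | rank | ...
    names.dmp     tax_id | name_txt | unique_name | name_class |
    delnodes.dmp  tax_id |
    merged.dmp    old_tax_id | new_tax_id |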
Example #20
    def _process(self, update=False):
        '''
        Transform the source files into the final local db files.

        @param:    update
        @pdef:     toggles between create and update processing
        @pdefault: _False_
        @ptype:    {Boolean}
        '''
        if update:
            old = self._RELEASE['total_items'].copy()
        j = 0
        for i in range(len(self._SOURCES)):
            dfilen = os.path.join(self.local, self._SOURCES[i])
            ofilen = os.path.join(self.local, self._MANDATORY_FILES[j])
            ffilen = os.path.join(self.local, self._MANDATORY_FILES[j + 1])
            if not os.path.isfile(dfilen):
                continue
            SBIg.alert('verbose', self, 'Parsing:       {0}'.format(dfilen))
            SBIg.alert('verbose', self, 'DB file to:    {0}'.format(ofilen))
            SBIg.alert('verbose', self, 'Fasta file to: {0}'.format(ffilen))
            dfile = File(dfilen)
            ofile = File(ofilen, 'w', update)
            ffile = File(ffilen, 'w', update)
            protein = None
            for protein in Connect._parse_uniprot(dfile):
                pname = protein.entry_name
                pvers = protein.version
                SBIg.alert('verbose', self, 'Protein: {0}'.format(pname))
                if not update:
                    self._RELEASE['total_items'][pname] = pvers
                else:
                    if pname not in self._RELEASE['total_items']:
                        self._RELEASE['new_items'][pname] = pvers
                    else:
                        del (old[pname])
                        if self._RELEASE['total_items'][pname] != pvers:
                            self._RELEASE['update_items'][pname] = pvers

                ffile.write(protein.sequence.format('FASTA') + '\n')
                ofile.write(protein.json() + '\n')
            j += 2
            dfile.close()
            ofile.close()
            ffile.close()

        if update:
            self._RELEASE['total_items'].update(self._RELEASE['new_items'])
            self._RELEASE['total_items'].update(self._RELEASE['update_items'])
            self._RELEASE['deleted_items'] = old
            for k in self._RELEASE['deleted_items']:
                del (self._RELEASE['total_items'][k])
Example #21
    def get_FASTA_IDX_by_names_to_file(self, names, outfile):

        fastafile = Fasta(self.PDBseq)
        selectedfasta = fastafile.retrieve(copy.deepcopy(names))
        output_fasta = File(outfile, 'w')
        for sequence in selectedfasta:
            output_fasta.write(sequence.format('FASTA') + "\n")
        output_fasta.close()
        idxfile = self.PDBseq + '.idx'
        output_idx = File(outfile + '.idx', 'w')
        input_idx = File(idxfile, 'r')
        for line in input_idx.descriptor:
            info = line.split()
            pdbname = info[0][1:]
            if pdbname in names:
                output_idx.write(line)
        input_idx.close()
        output_idx.close()
Example #22
    def localTaxIDs(self):
        taxFile = File(self._taxid, 'r')
        for tax_line in taxFile.descriptor:
            yield tax_line
        taxFile.close()
Example #23
    def correct_hit_count(self,
                          count_hit_file=None,
                          count_query_file=None,
                          return_correction_dict=False):
        '''
        Corrects the starting point of the hits and the query, if needed.
        Why? When blasting vs. PDB (for example), the hit positions reported
        by blast can be wrong: blast always counts the first position of the
        hit sequence as 1, while PDB does not. Moreover, the position
        reference does not even need to be a number. As the specific location
        in the PDB matters, we need to adapt our blasts so that we can read
        that data. Keep in mind that hits and query must be corrected
        together in this step, as this function cannot be called twice on the
        same instance.

        @param:    count_hit_file
        @pdef:     file containing the index data for the query database.
                   Each sequence in this file has a format such as:
                   >3K2K_A -7 ;-6 ;-5 ;-4 ;-3 ;-2 ;-1 ;0 ;1 ;2 ;3 ;4 ;5 ;6 ;7 ...
        @ptype:    {String}

        @param:    count_query_file
        @pdef:     sometimes we might also need to correct the query (if PDB vs.
                   PDB). Same format as count_hit_file. They might be the same file.
        @ptype:    {String}

        @param:    return_correction_dict
        @pdef:     instead of actually executing the correction, it only returns
                   the dictionary for further use.
        @pdefault: _False_
        @ptype:    {Boolean}

        @raises: {IOError} if the correction index file does not exist.
        @raises: {AttributeError} if the BlastResult does not contain any BlastHit.
        @raises: {BlastError} if it has been called before for this instance.

        '''
        if not self.has_hits:
            SBIg.warn(
                self,
                "BlastResult of {0} has no hits to correct".format(self.query))
            return

        if self.are_hits_corrected:
            be = BlastExe.BlastError()
            raise be.corrected_hits()

        SBIg.alert('debug', self,
                   'Correcting indexes for {0}'.format(self.query))
        cfile = File(count_hit_file)
        cq = False

        codes_of_interest = set([hit.sequenceID for hit in self.raw_hits])
        if count_query_file == count_hit_file:
            codes_of_interest.add(self.query)
            count_query_file = None
            cq = True

        start_index_dic = {}
        for line in cfile.read():
            if len(line.strip()) > 0:
                k = line.split('\t')
                if k[0].lstrip('>') in codes_of_interest:
                    start_index_dic[k[0].lstrip('>')] = k[1].strip().split(';')
        cfile.close()

        if count_query_file is not None:
            cfile = File(count_query_file)
            for line in cfile.read():
                if len(line.strip()) > 0:
                    k = line.split('\t')
                    if k[0].lstrip('>') == self.query:
                        start_index_dic[k[0].lstrip('>')] = k[1].strip().split(
                            ';')
            cfile.close()
            cq = True

        if cq:
            SBIg.alert('debug', self, '\tFixing Query {0}'.format(self.query))
            self._query_index = start_index_dic[self.query]

        if return_correction_dict:
            return start_index_dic

        for hit in self._hits:
            # This tests between the options PDB/PDB_ID or PDB_ID in case
            # the TAB file has different codification
            h = hit.sequenceID
            hit_ID = h if h in start_index_dic else h.split("/")[-1]
            SBIg.alert('debug', self, '\tFixing {0}'.format(hit_ID))
            hit.correct_hit_count(new_index=start_index_dic[hit_ID])
            if cq:
                SBIg.alert('debug', self,
                           '\tFixing Query {0}'.format(self.query))
                hit.correct_query_count(new_index=start_index_dic[self.query])

        self._correctedHits = True
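A self-contained sketch of the index-record convention documented above: a '>'-prefixed identifier, a tab, then ';'-separated position labels (values fabricated):

    record = '>3K2K_A\t-7 ;-6 ;-5 ;-4 ;-3 ;-2 ;-1 ;1 ;2 ;3'
    k = record.split('\t')
    name = k[0].lstrip('>')            # '3K2K_A'
    index = k[1].strip().split(';')    # ['-7 ', '-6 ', ..., '3']
    print('{0}: {1} positions'.format(name, len(index)))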
Example #24
class PDBeChem(object):

    """
    Parse a PDBeChem chemical-component CIF file into its basic descriptors.
    """

    def __init__(self, cif_file):
        self._file    = File(file_name = cif_file, action = 'r')
        self.__name__ = 'databases.PDBeChem'    # This must be included in every class for the SBIglobals.alert()

        self._id        = None
        self._name      = None
        self._type      = None
        self._formula   = None
        self._parent    = None
        self._weight    = None
        self._fcharge   = None
        self._code1l    = None
        self._flformula = {}

        self._parse()
        self._decompose_formula()

    """ATTRIBUTES"""
    @property
    def id(self): return self._id

    @property
    def name(self): return self._name

    @property
    def type(self): return self._type

    @property
    def formula(self): return self._formula

    @property
    def full_formula(self): return self._flformula

    @property
    def parent(self): return self._parent

    @property
    def weight(self): return self._weight

    @property
    def formal_charge(self): return self._fcharge

    @property
    def code1(self): return self._code1l

    @property
    def code3(self): return self._id
    """PRIVATE METHODS"""

    def _parse(self):
        for line in self._file.descriptor:
            if line.startswith('_chem_comp.'):
                line  = line.replace('_chem_comp.', '')
                value = line[35:].strip().strip('"')
                value = value.replace(' (NON-PREFERRED NAME)', '')
                value = value if value != '?' else None
                if line.startswith('id'): self._id      = value
                if line.startswith('pdbx_type'): self._type    = value
                if line.startswith('formula '): self._formula = value
                if line.startswith('formula_weight'): self._weight  = value
                if line.startswith('pdbx_formal_charge'): self._fcharge = value
                if line.startswith('one_letter_code'): self._code1l  = value
                if line.startswith('name'): self._name    = value.upper()
                if line.startswith('mon_nstd_parent_comp_id'):
                    self._parent  = set([x.strip() for x in value.split(',')]) if value is not None else None
            if line.startswith(';') and self._name == '': self._name  += line.strip().lstrip(';').upper()
        self._file.close()

    def _decompose_formula(self):
        if self.formula is not None:
            data    = self.formula.split()
            atregex = re.compile(r'(\D+)(\d*)')
            for atom in data:
                m = atregex.search(atom)
                if m.group(1) in element_dic:
                    self._flformula[m.group(1)] = m.group(2) if m.group(2) != '' else 1

    """OVERWRITE INHERITED FUNCTIONS"""

    def __str__(self):
        if self.code1 is not None and self.parent is not None:
            return "[{0.id} - {0.code1} from {0.parent}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(self)
        elif self.code1 is not None:
            return "[{0.id} - {0.code1}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(self)
        elif self.parent is not None:
            return "[{0.id} from {0.parent}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(self)
        else:
            return "[{0.id}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(self)
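A hedged usage sketch for the class above; the component file path is hypothetical:

    chem = PDBeChem('ATP.cif')
    print(chem)               # summary via __str__
    print(chem.full_formula)  # e.g. {'C': '10', 'H': '16', 'N': '5', ...}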
Example #25
class CDhitList(StorableObject):
    '''
    List of cd-hit clusters.

    '''
    def __init__(self, cdhit_file=None):
        '''
        @param:    cdhit_file
        @pdef:     name of the cd-hit output file
        @pdefault: _None_. Create an empty list
        @ptype:    {String}

        '''
        self._clusters = []
        self._allseqids = {}
        if cdhit_file is not None:
            self._file = File(file_name=cdhit_file)
        else:
            self._file = None

        if self._file is not None:
            self._parse_file()

    ##############
    # ATTRIBUTES #
    ##############
    @property
    def clusters(self):
        '''
        List of cd-hit clusters.

        @return: {List} of {CDhit}
        '''
        return self._clusters

    ###########
    # METHODS #
    ###########
    def get_cluster4sequence(self, sequence):
        '''
        Retrieve a cluster for a given sequence. _None_ if the sequence is not
        found.

        @param:    sequence
        @pdef:     name of the query sequence
        @ptype:    {String}

        @return: {CDhit}
        '''
        if sequence in self._allseqids:
            return self._clusters[self._allseqids[sequence]]
        else:
            return None

    def is_in_cluster(self, sequence):
        '''
        Evaluate if the sequence is in a cluster.

        @param:    sequence
        @pdef:     name of the query sequence
        @ptype:    {String}

        @return: {String}: 'N' if absent, 'M' if cluster master, 'H' if homolog
        '''
        c = self.get_cluster4sequence(sequence)
        if c is None:
            return 'N'
        else:
            return 'M' if c.is_master(sequence) else 'H'

    def add_cluster(self, cluster):
        '''
        Add a cd-hit cluster to the object.

        @param:    cluster
        @pdef:     new cd-hit cluster to add
        @ptype:    {CDhit}
        '''
        self._clusters.append(cluster)

    def add_sequence2cluster(self, sequence, cluster_id=None):
        '''
        Add a new sequence to a given cluster.

        @param:    sequence
        @pdef:     name of the query sequence
        @ptype:    {String}

        @param:    cluster_id
        @pdef:     identifier of the cluster
        @pdefault: _None_. Refers to the last added cluster.
        @ptype:    {String}
        '''
        if cluster_id is None:
            self.clusters[-1].add_sequence(sequence)
            self._allseqids[sequence.name] = len(self.clusters) - 1
        else:
            for x in range(len(self._clusters)):
                if self._clusters[x].identifier == cluster_id:
                    self._clusters[x].add_sequence(sequence)
                    self._allseqids[sequence.name] = x
                    break

    def dictionary_role_summary(self):
        '''
        Creates a dictionary separating master sequences and homolog sequences.

        @return: {Dictionary}
        '''
        data = {'master': [], 'homolog': []}
        for c in self.clusters:
            data['master'].append(c.master.name)
            for s in c.sequences:
                data['homolog'].append(s)
        return data

    def merge_clusters(self, cluster_file):
        '''
        When using an intermediate state to cluster by homology,
        the result of the second clustering is a clustering of clusters.
        We need to map this back onto the original sequences.

        @param:    cluster_file
        @pdef:     name of the second-step cluster output
        @ptype:    {String}
        '''
        clustlist = CDhitList(cluster_file)
        newlist = CDhitList()
        cluster_re = re.compile(r'Cluster\s+(\d+)')
        for cl in clustlist.clusters:
            c = CDhit(cluster_id=cl.identifier)
            newlist.add_cluster(c)
            cnum = int(cluster_re.search(cl.master.name).group(1))
            oldclust = self.clusters[cnum]
            newlist.add_sequence2cluster(sequence=oldclust.master)
            for s in oldclust.sequences:
                newlist.add_sequence2cluster(sequence=oldclust.sequences[s])
            for s in cl.sequences:
                idclust = cl.sequences[s]
                cnum = int(cluster_re.search(idclust.name).group(1))
                oldclust = self.clusters[cnum]
                master = oldclust.master
                master.homology = idclust.homology
                newlist.add_sequence2cluster(sequence=master)
                for s in oldclust.sequences:
                    h = oldclust.sequences[s]
                    h.homology = int(h.homology * float(idclust.homology) / 10)
                    newlist.add_sequence2cluster(sequence=h)

        self._clusters = newlist._clusters
        self._allseqids = newlist._allseqids

    ###################
    # PRIVATE METHODS #
    ###################
    def _parse_file(self):
        '''
        Read the cd-hit output file into a {CDhitList}

        '''
        homolog_re = re.compile(r'(\d+)aa,\s+>([\s\w]+)\.{3}')
        for line in self._file.read():
            if line.startswith('>'):
                c = CDhit(cluster_id=line.split()[-1].strip())
                self.add_cluster(c)
            else:
                data = homolog_re.search(line)
                d = line.split()
                h = CDhitHomolog(name=data.group(2),
                                 length=data.group(1),
                                 homology=d[-1])
                self.add_sequence2cluster(sequence=h)
        self._file.close()

    def __len__(self):
        return len(self._clusters)

    def __repr__(self):
        text = []
        for c in self.clusters:
            text.append('{0}'.format(c))
        return '\n'.join(text)
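`_parse_file` expects cd-hit's .clstr output. A minimal fabricated example of the format it consumes (masters end with '*', homologs with their identity):

    >Cluster 0
    0	2799aa, >PROT_A... *
    1	2700aa, >PROT_B... at 95.00%
    >Cluster 1
    0	1055aa, >PROT_C... *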
Example #26
class PDB(StorableObject):

    """
    A {PDB} is a collection of {Chain}
    """
    def __init__(self, pdb_file=None, dehydrate=False, header=False,
                 onlyheader=False, biomolecule=False):
        """
        @type  pdb_file: String
        @param pdb_file: PDB-formatted file to read

        @raise IOError if pdb_file does not exist and it is not an empty object
        """
        if biomolecule or onlyheader:
            header = True

        self._pdb_file      = pdb_file
        self._chains        = []
        self._NMR           = False
        self._NMR_chains    = []
        self._chain_id      = set()

        self._biomol_id     = -1    # -1 -> original
                                    #  0 -> symmetry
                                    # >0 -> biomolecule

        self._header        = None

        self._has_prot      = False
        self._has_nucl      = False

        self._COMPND        = None

        if self.pdb_file is not None:
            self._pdb_file  = File(file_name=self._pdb_file, action='r')
            self._read_PDB_file(header=header,
                                onlyheader=onlyheader,
                                biomolecule=biomolecule)

        if dehydrate:
            self.dehydrate()

    #
    # ATTRIBUTES
    #
    @property
    def pdb_file(self):
        """
        PDB file name
        @rtype: String
        """
        return self._pdb_file

    @pdb_file.setter
    def pdb_file(self, value):
        """
        Sets a PDB file if none has been given
        @raise AttributeError if a file has already been assigned
        """
        if self._pdb_file is not None:
            raise AttributeError(
                "The PDB object is loaded from file {0}. To load the new file {1} create a new PDB object".format(self._pdb_file.full, value))

        if isinstance(value, File):
            self._pdb_file = value
        else:
            self._pdb_file = File(file_name=value, action='r')

    @property
    def chain_identifiers(self):
        return self._chain_id

    @property
    def id(self):
        return self._chains[0].pdb

    @property
    def chains(self):
        """
        List of {Chain} contained in the PDB w/out NMR replicas
        @rtype: List of {Chain}
        """
        return self._chains

    @property
    def proteins(self):
        """
        List of {ProteinChain} contained in the PDB w/out NMR replicas
        @rtype: List of {ProteinChain} (iterator)
        """
        for chain in self.chains:
            if isinstance(chain, ChainOfProtein):
                yield chain

    @property
    def nucleotides(self):
        """
        List of {NucleotideChain} contained in the PDB w/out NMR replicas
        @rtype: List of {NucleotideChain} (iterator)
        """
        for chain in self.chains:
            if isinstance(chain, ChainOfNucleotide):
                yield chain

    @property
    def non_standard_chains(self):
        """
        List of non {NucleotideChain}/ non {ProteinChain} contained in the PDB w/out NMR replicas
        @rtype: List of non {NucleotideChain}/ non {ProteinChain} (iterator)
        """
        for chain in self.chains:
            if not isinstance(chain, ChainOfNucleotide) and not isinstance(chain, ChainOfProtein):
                yield chain

    @property
    def all_models(self):
        """
        List of {Chain} contained in the PDB w/ NMR replicas
        @rtype: List of {Chain}
        """
        return self._chains + self._NMR_chains

    @property
    def header(self):
        if self._header is None:
            return ''
        else:
            return self._header

    @property
    def biomolecule_identifier(self):
        return self._biomol_id

    #
    # COMPLEX GETTERS & SETTERS
    #
    def get_chain_by_id(self, id):
        """
        Returns a chain according to its id or None if no chain with that id is found
        @rtype: {Chain}
        """
        for chain in self._chains:
            if chain.chain == id:
                return chain
        return None

    def add_chain(self, chain, NMR=False):
        """
        Adds a new chain to the PDB
        """
        if not NMR:
            self._chains.append(chain)
        elif NMR and self._NMR:
            self._NMR_chains.append(chain)

        self._chain_id.add(chain.chain)

    def add_chains(self, chains, NMR=False):
        """
        Adds new chains to the PDB
        """
        for chain in chains:
            self.add_chain(chain=chain, NMR=NMR)

    def _get_chain_position_by_id(self, id):
        """
        Returns the position in the chain array where the chain is
        @rtype: Integer
        """
        for x in range(len(self._chains)):
            if self._chains[x].chain == id:
                return x
        return None

    #
    # BOOLEANS
    #
    @property
    def is_NMR(self):
        """
        Identifies if the PDB contains NMRs
        @rtype: Boolean
        """
        return self._NMR

    def chain_exists(self, chain):
        """
        Confirms if a given chain exists in the PDB
        @rtype: Boolean
        """
        return chain in self._chain_id

    @property
    def has_protein(self):
        """
        Checks if the PDB contains a protein (not only)
        @rtype: Boolean
        """
        return self._has_prot

    @property
    def has_nucleotide(self):
        """
        Checks if the PDB contains a nucleotide chain (not only)
        @rtype: Boolean
        """
        return self._has_nucl

    @property
    def repeated_chain_ids(self):
        """
        Checks if more than one {Chain} has the same assigned ID
        @rtype: Boolean
        """
        return len(self._chain_id) < len(self._chains)

    @property
    def is_all_ca(self):
        for p in self.proteins:
            if p.is_only_ca():
                return True
        return False

    #
    # METHODS
    #
    def dehydrate(self):
        recheck_chains = False
        for c in self.chains:
            c.dehydrate()
            if c.is_empty:
                recheck_chains = True
        if recheck_chains:
            c = []
            for ch in self.chains:
                if not ch.is_empty:
                    c.append(ch)
                else:
                    self._chain_id.remove(ch.chain)
            self._chains = c

    def duplicate(self, hetero=True, water=False, NMR=False):
        """
        Returns a {PDB} identical to the original but as a new object
        @rtype: {PDB}
        """
        new_PDB = PDB()
        new_PDB.pdb_file = self.pdb_file

        for chain in self.chains:
            new_PDB.add_chain(
                chain=chain.duplicate(hetero=hetero, water=water))

        if NMR:
            for chain in self._NMR_chains:
                new_PDB.add_chain(chain=chain.duplicate(
                    hetero=hetero, water=water), NMR=True)

        new_PDB._NMR = self._NMR
        new_PDB._has_prot = self._has_prot
        new_PDB._has_nucl = self._has_nucl

        return new_PDB

    def apply_symmetry_matrices(self):
        """
        Only works if the PDB file is an original PDB file
        or the matrices have been added in the correct PDB format
        @rtype: {PDB}
        """
        if self._header is None:
            self._read_PDB_file(header=True, onlyheader=True)
        return self._apply_matrix(matrix=self.header.symmetry_matrix)

    def apply_biomolecule_matrices(self, keepchains=False, water=True):
        """
        Only works if the PDB file is an original PDB file or
        the matrices have been added in the correct PDB format
        @rtype: {PDB}
        """
        if self._header is None:
            self._read_PDB_file(header=True, onlyheader=True)
        PDB_list = []
        for matrix in self.header.biomolecules:
            PDB_list.append(self._apply_matrix(matrix=matrix,
                                               keepchains=keepchains,
                                               realchains=self._chain_id,
                                               water=water))
        return PDB_list

    def _apply_matrix(self, matrix, keepchains=False, realchains=None, water=True):
        new_PDB            = PDB()
        new_PDB._biomol_id = matrix.identifier

        for chain in self.chains:
            if chain.chain in matrix.chains:
                for mat in matrix.matrices:
                    new_chain = chain.duplicate(water=water)
                    new_chain.reposition(matrix=mat.matrix, vector=mat.vector)
                    if len(new_chain) >= 1:
                        new_PDB.add_chain(chain=new_chain)
        if not keepchains:
            new_PDB.tmpclean(cluster_by_alternative_id=True, exclude_chains=realchains)
        return new_PDB

    def clean(self):
        first_atom = 1
        for c in self.chains:
            c.clean(initatom=first_atom)
            first_atom = c.last_residue.last_atom_number + 1

    def tmpclean(self, cluster_by_alternative_id=False, exclude_chains = None):
        """
        Makes a clean version of the PDB, rechaining in order and renumbering
        atoms. Renumbering residues is optional.
        """
        pchainsIDs = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890"
        chainsIDs = ""
        chainsNIDs = ""
        chainID = 0
        atom_count = 1

        for x in range(len(pchainsIDs)):
            if exclude_chains is not None and pchainsIDs[x] in exclude_chains:
                chainsNIDs += pchainsIDs[x]
            elif not self.chain_exists(chain=pchainsIDs[x]):
                chainsIDs += pchainsIDs[x]
            else:
                chainsNIDs += pchainsIDs[x]

        chain_change = len(self) <= len(chainsIDs)

        for chain in self.chains:
            if (chain.chain not in chainsNIDs) and chain_change:
                self._chain_id.add(chain.chain)
                chain.chain = chainsIDs[chainID]
                chainID += 1
                self._chain_id.add(chain.chain)
                if cluster_by_alternative_id:
                    if self._COMPND is None:
                        self._COMPND = {}
                    if chain.alternative_id not in self._COMPND:
                        self._COMPND.setdefault(
                            chain.alternative_id, []).append(chain.alternative_id)
                    self._COMPND[chain.alternative_id].append(chain.chain)
            else:
                chainsNIDs = chainsNIDs.replace(chain.chain, '')

            chain.renumerate_atoms(init=atom_count)
            atom_count += (chain.atom_length)

    def fuse_chains(self, chains_ids):
        """
        Fuses several chains into the first one.
        It will not allow fusing different types of structural chains.
        It does not alter the {PDB}, but provides a new one
        @rtype: {Chain}

        @raise AttributeError if:
            a) A given chain ID is not present
            b) Try to fuse different structural chains
        """
        if len(self._chain_id.intersection(set(chains_ids))) < len(chains_ids):
            raise AttributeError(
                "Some of the given chains to fuse do not exist")

        error_counter = 0
        error_control = [False, False]
        new_PDB = PDB()
        for c in chains_ids:
            chain = self.get_chain_by_id(id=c)
            new_PDB.add_chain(chain=chain.duplicate())
            if isinstance(chain, ChainOfProtein) and not error_control[0]:
                error_counter += 1
                error_control[0] = True
            elif isinstance(chain, ChainOfNucleotide) and not error_control[1]:
                error_counter += 1
                error_control[1] = True
            if error_counter == 2:
                raise AttributeError(
                    "Fusing different kinds of structural chains is not possible\n")

        init_chain_num = new_PDB.chains[0].last_residue.number
        for x in range(1, len(new_PDB.chains)):
            new_PDB.chains[x].renumerate_residues(init=init_chain_num + 1)
            init_chain_num = new_PDB.chains[0].last_residue.number
            new_PDB.chains[0].fuse(chain=new_PDB.chains[x])

        return_PDB = PDB()
        return_PDB.add_chain(chain=new_PDB.chains[0])
        return return_PDB

    # def calculate_dssp(self, out_dir = None, store = True):
    #     """
    #     Executes DSSP and assigns the prediction to each chain

    #     @param  out_dir: directory to save the output
    #     @defaut out_dir: None

    #     @param store: Save the dssp output(?)
    #     """

    #     for chain in self.proteins:
    #         if out_dir is None:
    #             pdb_file  = chain.globalID + ".pdb2dssp"
    #             dssp_file = chain.globalID + ".dssp"
    #         else:
    #             Path.mkdir(newdir = out_dir)
    #             pdb_file  = os.path.join(os.path.abspath(out_dir), chain.globalID + ".pdb2dssp")
    #             dssp_file = os.path.join(os.path.abspath(out_dir), chain.globalID + ".dssp")

    #         pdb_fd = open(pdb_file, 'w')
    #         pdb_fd.write(chain.PDB_format())
    #         pdb_fd.close()

    #         dssp_calc = DSSPexec(pdb_file = pdb_file, dssp_file = dssp_file,
    #                              chain    = chain,    store     = store)

    def rotate(self, matrix=None):
        """
        Rotates each {Chain} according to a given matrix

        @type matrix: numpy.matrix
        """
        if matrix is None:
            matrix = numpy.identity(3, float)
        for chain in self.all_models:
            chain.rotate(matrix=matrix)

    def translate(self, vector=None):
        """
        Translates each {Chain} according to a translational vector

        @type vector: numpy.array
        """
        if vector is None:
            vector = numpy.zeros(3, float)
        for chain in self.all_models:
            chain.translate(vector=vector)

    def reposition(self, matrix=None, vector=None):
        """
        Rotates and Translates each {Chain} according to a matrix and a translational vector

        @type matrix: numpy.matrix

        @type vector: numpy.array
        """
        if matrix is None:
            matrix = numpy.identity(3, float)
        if vector is None:
            vector = numpy.zeros(3, float)
        for chain in self.all_models:
            chain.reposition(matrix=matrix, vector=vector)

    # def calculate_protein_heteroatom_contacts(self, distance = 6):
    #     """
    #     Returns a {HeteroatomContacts} list with the contacts between a protein and its heteroatoms
    #     at a maximum given distance
    #     @type distance: Integer
    #     @rtype: list of {HeteroatomContacts}
    #     """
    #     data = []
    #     for protein in self.proteins:
    #         data.append(HeteroatomContacts(chain = protein, max_distance = distance))
    #     return data

    #
    # OVERRIDE PARENT'S FUNCTIONS
    #
    @staticmethod
    def read(input_file, format='PDB'):
        """
        Reads a file of data in a specific format and returns the object

        @type  input_file: String
        @param input_file: File to read

        @type  format: String
        @param format: Format of the file to read
        """
        if format == 'PDB':
            pdb = PDB(pdb_file=input_file)
            return pdb

    def write(self, output_file=None, format='PDB', force=None, clean=False):
        """
        Writes the object in a specific format

        @type  output_file: String
        @param output_file: File to write

        @type  format: String
        @param format: Format of the file to print
        """
        outfile = File(
            file_name=output_file, action='w', overwrite=SBIg.decide_overwrite(force))
        if format == 'PDB':
            self._write_PDB_file(pdb_file=outfile, clean=clean)

    #
    # IO
    #
    def _read_PDB_file(self, header=False, onlyheader=False, biomolecule=False):
        """
        Process and load crystal data from a PDB-formatted file
        """
        from parse_pdb import read_PDB_file, read_PDB_header
        if header:
            read_PDB_header(self)
            self._pdb_file.close()
            self._pdb_file.open()
        if not onlyheader:
            # read_PDB_file(self, biomolecule=biomolecule)
            read_PDB_file(self)
        self._pdb_file.close()

    # def _represent_COMPND(self):
    #     if self._COMPND is None: return ''

    #     data = []
    #     mol_counter = 1
    #     for chain in self._COMPND:
    #         data.append("COMPND    MOL_ID: %d;" %mol_counter)
    #         data.append("COMPND   2 CHAIN: " + ",".join(self._COMPND[chain]) + ";")
    #         if len(self._biomolecA) > 0:
    #             matrices = []
    #             for mat in self._biomolecA:
    #                 if mat[1] == chain: matrices.append(str(mat[0]))
    #             data.append("COMPND   3 MATRICES: " + ",".join(sorted(matrices)))
    #         mol_counter += 1
    #     return "\n".join(data) + "\n"

    def _write_PDB_file(self, pdb_file, clean=False):
        """
        Print a crystal into a PDB-formatted file
        """
        # out_fd = pdb_file.descriptor
        # out_fd.write(self._represent_COMPND())
        pdb_file.write(self.PDB_format(clean=clean) + "\n")
        pdb_file.close()

    def PDB_format(self, clean=False, terminal=True):
        """
        Strings a {PDB} in PDB format
        @rtype: String
        """
        lines = []
        if clean:
            self.clean()
        for chain in self._chains:
            lines.append(chain.PDB_format(terminal=terminal))
        lines.append("END")
        return "\n".join(lines)

    def FASTA_format(self, gapped=True, protein=True, nucleotide=False):
        # TODO: return fasta object
        lines = []
        for c in self.chains:
            if isinstance(c, ChainOfProtein) and protein:
                lines.append(
                    ">{0}\t{1}".format(c.globalID, c.aminoacids[0].identifier))
                if gapped:
                    lines.append("{0}".format(c.gapped_protein_sequence))
                else:
                    lines.append("{0}".format(c.protein_sequence))
            if isinstance(c, ChainOfNucleotide) and nucleotide:
                lines.append(
                    ">{0}\t{1}".format(c.globalID, c.nucleotides[0].identifier))
                if gapped:
                    lines.append("{0}".format(c.gapped_nucleotide_sequence()))
                else:
                    lines.append("{0}".format(c.nucleotide_sequence()))
        if len(lines) == 0:
            return ""
        else:
            return "\n".join(lines) + "\n"

    def IDX_format(self, protein=True, nucleotide=False):
        lines = []
        for c in self.chains:
            if isinstance(c, ChainOfProtein) and protein:
                lines.append(">{0}\t{1}".format(c.globalID, c.protein_idx))
            if isinstance(c, ChainOfNucleotide) and nucleotide:
                lines.append(
                    ">{0}\t{1}".format(c.globalID, c.nucleotide_idx()))
        if len(lines) == 0:
            return ""
        else:
            return "\n".join(lines) + "\n"

    def FASTA_IDX(self, protein=True, nucleotide=False):
        data = {}
        data.setdefault('FASTA', [])
        data.setdefault('IDX', [])
        for c in self.chains:
            if isinstance(c, ChainOfProtein) and protein:
                data['FASTA'].append(
                    ">{0}\n{1}".format(c.globalID, c.gapped_protein_sequence))
                data['IDX'].append(
                    ">{0}\t{1}".format(c.globalID, c.protein_idx))
            if isinstance(c, ChainOfNucleotide) and nucleotide:
                data['FASTA'].append(
                    ">{0}\n{1}".format(c.globalID, c.gapped_nucleotide_sequence()))
                data['IDX'].append(
                    ">{0}\t{1}".format(c.globalID, c.nucleotide_idx()))

        return data

    #
    # OVERRIDE DEFAULT METHODS
    #
    def __len__(self):
        return len(self._chains)
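A hedged end-to-end sketch of the class above (file names are hypothetical):

    pdb = PDB(pdb_file='1abc.pdb', dehydrate=True)
    print(pdb.chain_identifiers)            # e.g. set(['A', 'B'])
    for protein in pdb.proteins:
        print(protein.globalID)
    pdb.write(output_file='1abc.clean.pdb', clean=True, force=True)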