Exemplo n.º 1
0
    def _parse(self):
        '''
        Read the DSSP output file and store one {DSSP} record per residue.

        Residue lines are the ones that follow the column-header line.
        The structure is flagged as gapped (_self._gapped_) when a line
        carries a '!' in the amino-acid column (reported as a truncated
        chain) or when two consecutive residue numbers are not sequential.
        '''
        file_fd  = File(self._dsspfile)
        read     = False
        previous = None     # number of the last residue read, if any
        try:
            for line in file_fd.read():
                if line.startswith("  #  RESIDUE AA STRUCTURE BP1 BP2  ACC"):
                    read = True
                    continue
                if not read:
                    continue

                if line[13:14] == '!':
                    msg = "truncated chain!{0}\n".format(self._dsspfile)
                    sys.stderr.write(msg)
                    SBIg.warn(self, msg)
                    self._gapped = True
                    continue

                res_num = int(line[6:10].strip())
                ss      = line[16:17] if line[16:17] != ' ' else '-'
                buried  = int(line[35:38].strip())
                aa      = line[13:15].strip()

                self._dsspdata.append(DSSP(secondary_structure = ss,
                                           accessibility       = buried,
                                           amino               = aa))
                self._dsspdata[-1].add_hydrogen_links(line[39:50],
                                                      line[50:61],
                                                      line[61:72],
                                                      line[72:84])

                # The last residue number has to be remembered for EVERY
                # residue, the first one included; otherwise the second
                # residue is compared against a stale value and any chain
                # with two or more residues ends up flagged as gapped.
                if previous is not None and res_num != previous + 1:
                    self._gapped = True
                previous = res_num
        finally:
            # Release the file even if a malformed line aborts the parsing.
            file_fd.close()
Exemplo n.º 2
0
def sortarchs(inputdir, outputdir):
    """
    Merge the *.archObj files found in inputdir into ArchDB text files.

    File names are expected to start with '<length>_<archtype>_'. For each
    archtype one 'ArchDB.<archtype>.db' file is written to outputdir with
    every arch, plus one file per length range with the archs whose length
    falls inside that range.
    """
    archsdir = outputdir
    Path.mkdir(archsdir)
    merged_template = os.path.join(archsdir, 'ArchDB.{0}.db')
    split_template  = os.path.join(archsdir, 'ArchDB.{0}.{1:02d}-{2:02d}.db')
    # (first, last) length of every split file; last == 0 means "no upper limit"
    length_ranges = list(zip([ 0, 4, 7,14,21], [ 4, 6,13,20, 0]))

    # archtype -> length -> list of files
    files_by_type = {}
    for archfile in Path.list_files(root = inputdir, pattern = '*.archObj'):
        fields   = os.path.basename(archfile).split('_')
        length   = int(fields[0])
        archtype = fields[1]
        files_by_type.setdefault(archtype, {}).setdefault(length, []).append(archfile)

    for archtype in files_by_type:
        by_length = files_by_type[archtype]
        SBIglobals.alert('verbose', None, "ARCHS: " + archtype + "\n")
        merged = File(merged_template.format(archtype), 'w')
        splits = [File(split_template.format(archtype, first, last), 'w')
                  for first, last in length_ranges]
        for length in sorted(by_length):
            SBIglobals.alert('verbose', None, '\t{0}'.format(length))
            for archfile in by_length[length]:
                SBIglobals.alert('verbose', None, '\t\t{0}\n'.format(archfile))
                nsp = Arch.load(archfile)
                merged.descriptor.write(nsp.archtype_format() + "\n")
                for (first, last), split in zip(length_ranges, splits):
                    if length < first:
                        continue
                    if last == 0 or length <= last:
                        split.descriptor.write(nsp.archtype_format() + "\n")
        merged.close()
        for split in splits:
            split.close()
Exemplo n.º 3
0
    def __init__(self, fasta_file, auto_load=10):
        '''
        @param:    fasta_file
        @pdef:     FASTA file, given by name or as a {File} (in which case
                   only its full path is reused to open a new {File}).
        @ptype:    {String} or {File}

        @param:    auto_load
        @pdef:     maximum number of sequences to autoload.
        @pdefault: 10
        @ptype:    {Integer}

        @raises: {AttributeError} if fasta_file is neither {String} nor {File}.
        '''
        if isinstance(fasta_file, basestring):
            source_name = fasta_file
        elif isinstance(fasta_file, File):
            source_name = fasta_file.full
        else:
            raise AttributeError('Check the input of the Fasta object')
        self._file = File(file_name=source_name, action='r')

        # Sequence containers, empty until something is loaded
        self._sequences, self._sequenceID = [], {}

        # Loading state; must be in place before the multifasta check runs
        self._total_sequences = 0
        self._loaded = False
        self._auto_load = auto_load
        self._check_multifasta()

        # Index file, looked up once the content check is done
        self._index_file = None
        self._check_index()
Exemplo n.º 4
0
    def read_compacted_blast(compacted_blast_file):
        '''
        Read data from a printed compacted blast into {BlastResult}.
        Not all options will be available in that new object.

        Lines starting with '#' are header lines; the last word of each
        recognised header is kept. Every other non-empty line is read as
        one hit. Empty lines are skipped.

        @param:    compacted_blast_file
        @pdef:     file of the compacted blast print
        @ptype:    {String}

        @return: {BlastResult}, or _None_ if the file contains no hit line.
        '''
        from BlastHit import BlastHit

        # Header prefix -> name under which its value is stored
        header_prefixes = (
            ('#Query:',                 'query_name'),
            ('#Query Sequence:',        'query_sequence'),
            ('#Blast Version:',         'version'),
            ('#Search on matrix:',      'matrix'),
            ('#Gap open penalty:',      'gap_open'),
            ('#Gap extension penalty:', 'gap_extend'),
            ('#Database searched:',     'database'),
            ('#Self Hit is omitted:',   'self_hit'),
        )
        header = {}
        for _prefix, key in header_prefixes:
            header[key] = None

        br = None

        cbf = File(compacted_blast_file)
        try:
            for line in cbf.read():
                if line.startswith('#'):
                    for prefix, key in header_prefixes:
                        if line.startswith(prefix):
                            header[key] = line.strip().split()[-1]
                            break
                    continue

                d = line.strip().split()
                if not d:
                    # A blank line holds no hit; indexing it would fail.
                    continue

                if br is None:
                    # The result is built on the first hit, once the whole
                    # header has been read.
                    if header['version'] is None:
                        bh = None
                    else:
                        bh = BlastHeader(header['version'], header['matrix'],
                                         header['gap_open'],
                                         header['gap_extend'],
                                         header['database'],
                                         header['self_hit'])
                    br = BlastResult(header['query_name'],
                                     header['query_sequence'], bh)

                hit = BlastHit(
                    [d[2], d[3]], [d[8], d[9]],
                    [int(x) for x in d[10].split(',')[0].split(':')], 1,
                    [d[4], d[5], d[6], d[7]])
                br.add_hit(hit)
        finally:
            # Do not leave the file open if a malformed line aborts the read.
            cbf.close()

        return br
Exemplo n.º 5
0
    def pdb_file(self, value):
        """
        Sets a PDB file if none has been given

        @type  value: String or File
        @param value: PDB file to attach; a File instance is stored as is,
                      anything else is used as the name of a file to read

        @raise AttributeError if the object already has a PDB file
        """
        if self._pdb_file is not None:
            raise AttributeError(
                "The PDB object is loaded from file {0}. To load the new file {1} create a new PDB object".format(self._pdb_file.full, value))

        if isinstance(value, File):
            self._pdb_file = value
        else:
            # The opening mode is given through 'action' ('type' is not the
            # keyword used to build File objects).
            self._pdb_file = File(file_name=value, action='r')
Exemplo n.º 6
0
    def release(self):
        '''
        Retrieves release data for the database.
        Not according to the DB release, but to when we downloaded it.

        The data is read from the JSON control file of the local database
        when that file exists; otherwise the class default is returned.

        @returns: {Dictionary}
        '''
        control_path = os.path.join(self.local, self._CONTROL_FILE)
        if not os.path.isfile(control_path):
            return self._RELEASE

        control = File(control_path)
        content = control.read()
        data = json.loads(content)
        control.close()
        return data
    def __init__(self, cif_file):
        '''
        Build the object from a cif file: the file is parsed and the
        formula is decomposed right away.
        '''
        self._file    = File(file_name = cif_file, action = 'r')
        # This must be included in every class for the SBIglobals.alert()
        self.__name__ = 'databases.PDBeChem'

        # Fields left empty until the file is parsed
        self._id = self._name = self._type = self._formula = None
        self._parent = self._weight = self._fcharge = self._code1l = None
        self._flformula = {}

        self._parse()
        self._decompose_formula()
Exemplo n.º 8
0
    def items(self):
        '''
        Loops through the items of the database

        Every line of every item file of the local database is handed to
        the _grab_ method of the database object class.

        @yields: Object depending on the database.
        '''
        if not self.has_local:
            SBIg.throw(self, 'A local database needs to be build first',
                       IOError)

        for ifile in self._ITEM_FILES:
            f = File(os.path.join(self.local, ifile))
            try:
                for line in f.read():
                    yield self._DBOBJECT.grab(line.strip())
            finally:
                # Also runs when the caller stops iterating early or when
                # grab() fails, so the file is never left open.
                f.close()
Exemplo n.º 9
0
 def format2file(self, filename, extension = 'pdb', center = False):
     '''
     Write the object into '<filename>.<extension>'.

     Only the 'pdb' and 'js' extensions are accepted; each one selects the
     matching formatter of the object.
     '''
     if extension != 'pdb' and extension != 'js':
         raise AttributeError('Not accepted extension')

     target = File(filename + '.' + extension, 'w')
     formatter = self.pdb_format if extension == 'pdb' else self.js_format
     target.write(formatter(center = center))
     target.close()
Exemplo n.º 10
0
    def make_PDBseq(self, log_file, resolution_threshold=None):
        '''
        Build the PDBseq FASTA file and its index from the local PDB files.

        'PDBseq.fa' and 'PDBseq.fa.idx' are written in self.PDBseq (or in
        the current directory when it is not set). Sequences that are empty
        once every 'X'/'x' is removed are skipped.

        @param:    log_file
        @pdef:     name of the file where the progress is logged
        @ptype:    {String}

        @param:    resolution_threshold
        @pdef:     kept for compatibility. The code that wrote a
                   resolution-filtered FASTA is disabled, so a value here
                   is only reported in the log and otherwise ignored.
        @pdefault: _None_

        @raises: {NameError} if there is no local PDB database.
        '''
        if not self.has_local:
            raise NameError(
                'A local PDB database must be defined to do create a PDBseq database.'
            )
        outdir = self.PDBseq if self.PDBseq is not None else os.curdir

        # Create the directory that is actually written to (self.PDBseq
        # may be None).
        Path.mkdir(outdir)
        fasta_file = File(file_name=os.path.join(outdir, 'PDBseq.fa'),
                          action='w',
                          overwrite=True)
        fasta_fd = fasta_file.descriptor
        idx_file = File(file_name=os.path.join(outdir, 'PDBseq.fa.idx'),
                        action='w',
                        overwrite=True)
        idx_fd = idx_file.descriptor
        log_file = File(file_name=log_file, action='w', overwrite=True)
        log_idx = log_file.descriptor

        try:
            if resolution_threshold is not None:
                # The filtered output relied on names that are no longer
                # defined here; say so instead of failing with a NameError
                # halfway through the database.
                log_idx.write(
                    'WARNING: resolution filtering is disabled; '
                    'resolution_threshold={0} is ignored.\n'.format(
                        resolution_threshold))

            for pdb_file in self.localPDBs:
                log_idx.write("Reading File: {0}\n".format(pdb_file))
                newPDB = PDB(pdb_file=pdb_file, dehydrate=True)
                fasta_idx = newPDB.FASTA_IDX(nucleotide=False)
                if len(fasta_idx['FASTA']) != len(fasta_idx['IDX']):
                    log_idx.write(
                        'ERROR!!!!! Number of fastas and indexes are different for pdb {0}!!\n'
                        .format(newPDB.id))
                if len(fasta_idx['FASTA']) > 0:
                    log_idx.write('\tPrinting FASTA and IDX...\n')
                else:
                    log_idx.write('\tProblably just a nucleotide PDB...\n')
                for c in range(len(fasta_idx['FASTA'])):
                    # Second line of the FASTA entry is the sequence itself
                    sequence = fasta_idx['FASTA'][c].split('\n')[1]
                    sequence = sequence.replace('X', '').replace('x', '')
                    if len(sequence) > 0:
                        fasta_fd.write(fasta_idx['FASTA'][c] + "\n")
                        idx_fd.write(fasta_idx['IDX'][c] + "\n")
                # Drop the structure before loading the next one
                del (newPDB)
        finally:
            #CLOSE & END
            fasta_file.close()
            idx_file.close()
            log_file.close()
Exemplo n.º 11
0
    def __init__(self, cdhit_file=None):
        '''
        @param:    cdhit_file
        @pdef:     name of the cd-hit output file; when given, it is parsed
                   right away.
        @pdefault: _None_. Create an empty list
        @ptype:    {String}

        '''
        self._clusters = []
        self._allseqids = {}
        self._file = None

        if cdhit_file is None:
            return

        self._file = File(file_name=cdhit_file)
        if self._file is not None:
            self._parse_file()
Exemplo n.º 12
0
    def _process(self):
        '''
        Parse the ontology file into GO terms, complete the parents of
        every term and write one term per line into the GO file.

        A term is stored when the next '[Term]' line is found; reading
        stops at the first '[Typedef]' line, which stores the last term.
        '''
        terms = {}
        source = File(os.path.join(self.local, self._gfile), 'r')
        current = None
        for line in source.descriptor:
            # Single quotes are escaped with a backslash
            line = re.sub('\'', '\\\'', line)
            words = line.split()
            if line.startswith('[Term]'):
                if current is not None:
                    terms[current.id] = current
            elif line.startswith('id:'):
                current = GOterm(id = words[1].strip())
            elif line.startswith('name:'):
                current.name = " ".join(words[1:]).strip()
            elif line.startswith('namespace:'):
                current.namespace = words[1].strip()
            elif line.startswith('alt_id:'):
                current.alt_id.append(words[1].strip())
            elif line.startswith('is_obsolete:'):
                current.obsolete = True
            elif line.startswith('is_a:'):
                current.parents.add(words[1].strip())
            elif line.startswith('relationship:'):
                current.relations.append((words[1].strip(), words[2].strip()))
            elif line.startswith('[Typedef]'):
                terms[current.id] = current
                break
        source.close()

        for go_id in terms:
            terms[go_id].parents = self._search_parents(terms, go_id)

        target = File(self._gofile, 'w', True)
        for go_id in terms:
            term = terms[go_id]
            # Every term is listed among its own parents before printing
            term.parents.add(go_id)
            target.write(str(term) + "\n")
        target.close()
Exemplo n.º 13
0
 def _save_release(self):
     '''
     Dump the release data as JSON into the control file of the local
     database.
     '''
     control_path = os.path.join(self.local, self._CONTROL_FILE)
     control = File(control_path, 'w', True)
     serialized = json.dumps(self._RELEASE)
     control.write(serialized)
     control.close()
Exemplo n.º 14
0
    def build_multifasta(file_name, sequence_list, force=None):
        '''
        Writes several sequences into a FASTA file and returns the {Fasta}
        object built on that file.

        @param:    file_name
        @pdef:     name of the fasta file (with path, if necessary)
        @ptype:    {String}

        @param:    sequence_list
        @pdef:     list of sequences to create the FASTA from.
        @ptype:    {List} or {Set} of {Sequence}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @return: {Fasta}
        '''
        target = File(file_name, 'w', overwrite=force)
        for entry in sequence_list:
            record = entry.format('FASTA')
            target.write(record + '\n')
        target.close()
        # Nothing is auto loaded; the new object only points to the file
        return Fasta(fasta_file=target.full, auto_load=0)
Exemplo n.º 15
0
    def build(file_name, sequence_id, sequence, force=None):
        '''
        Writes a single sequence into a FASTA file and returns the {Fasta}
        object built on that file.

        @param:    file_name
        @pdef:     name of the fasta file (with path, if necessary)
        @ptype:    {String}

        @param:    sequence_id
        @pdef:     name of the sequence
        @ptype:    {String}

        @param:    sequence
        @pdef:     sequence
        @ptype:    {String} or {List}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @return: {Fasta}
        '''
        target = File(file_name, 'w', overwrite=force)
        record = Sequence(sequence_id=sequence_id, sequence=sequence)
        target.write(record.format('FASTA'))
        target.close()
        # Nothing is auto loaded; the new object only points to the file
        built = Fasta(fasta_file=target.full, auto_load=0)
        return built
Exemplo n.º 16
0
    def _process(self):
        '''
        Gather the enzymes of both source parsers, sort them and write
        their repr(), one per line, into the enzyme file.
        '''
        enzymes = sorted(self._parse_enzclass() + self._parse_enzymedat())

        target = File(self._enzfile, 'w', True)
        for enzyme in enzymes:
            target.write(repr(enzyme) + "\n")
        target.close()
Exemplo n.º 17
0
    def write(self, output_file=None, format='PDB', force=None, clean=False):
        """
        Writes the object in a specific format

        The output file is opened for writing in every case; content is
        only produced for the 'PDB' format.

        @type  output_file: String
        @param output_file: File to write

        @type  format: String
        @param format: Format of the file to print
        """
        overwrite = SBIg.decide_overwrite(force)
        outfile = File(file_name=output_file, action='w', overwrite=overwrite)
        if format != 'PDB':
            return
        self._write_PDB_file(pdb_file=outfile, clean=clean)
Exemplo n.º 18
0
    def _process(self):
        '''
        Process the targets, then the drugs (which need the targets), and
        write the repr() of every drug, one per line, into the drug file.
        '''
        drugs = self._process_drugs(self._process_targets())

        target = File(self._drugfile, 'w', True)
        for drug in drugs:
            line = repr(drug) + "\n"
            target.write(line)
        target.close()
    def get_PDBeChems(self, chemIDset):
        '''
        Yield the files of several PDBeChem entries.

        With a local database, the local files whose name matches one of
        the requested identifiers are yielded; otherwise every identifier
        is resolved through get_PDBeChem. Identifiers are compared in
        upper case.

        @param chemIDset: identifiers to look for. A single {String} is
                          accepted too: a warning is issued and the result
                          of get_PDBeChem for it is the only item yielded.
        '''
        if isinstance(chemIDset, str):
            warnings.warn('For single PDBeChem search the get_PDBeChem function is recomended.')
            yield self.get_PDBeChem(chemIDset)
            # Stop here: the code below expects a collection of identifiers
            # and would treat the string as a collection of characters.
            return

        chemIDset = set([x.upper() for x in chemIDset])

        if self.has_local:
            for chem_file in self.localPDBeChems:
                newfile = File(file_name = chem_file, action = 'r')
                if newfile.prefix.lstrip('pdb').upper() in chemIDset:
                    yield chem_file
        else:
            for chemID in chemIDset:
                yield self.get_PDBeChem(chemID)
    def get_PDBeChem(self, chemID):
        '''
        Return the file of a PDBeChem entry.

        The local database is searched first (when there is one); if the
        entry is not found there, '<CHEMID>.cif' is downloaded into the
        current directory.

        @param chemID: identifier of the entry (case insensitive)
        @return: path of the file, or _False_ if the download fails.
        '''
        if self.has_local:
            for chem_file in self.localPDBeChems:
                newfile = File(file_name = chem_file, action = 'r')
                if newfile.prefix.upper() == chemID.upper():
                    return chem_file

        # If we do not find it in local (or we do not have a local) we search it on the FTP
        chem_file = chemID.upper() + '.cif'
        source = PDBeChemftp['single'] + chem_file
        try:
            urllib.urlretrieve(source, chem_file)
        except Exception:
            # Best effort: any download problem is reported as False, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return False
        return os.path.abspath(chem_file)
Exemplo n.º 21
0
    def __init__(self, pdb_file=None, dehydrate=False, header=False,
                 onlyheader=False, biomolecule=False):
        """
        Build the object, reading pdb_file when one is given.

        @type  pdb_file: String
        @param pdb_file: PDB formated file to read

        @type  dehydrate: Boolean
        @param dehydrate: call dehydrate() once the object is built

        @type  header: Boolean
        @param header: passed to the file reader; forced to True when
                       onlyheader or biomolecule are requested

        @type  onlyheader: Boolean
        @param onlyheader: passed to the file reader

        @type  biomolecule: Boolean
        @param biomolecule: passed to the file reader

        @raise IOError if pdb_file does not exist and it is not an empty object
        """
        header = True if (biomolecule or onlyheader) else header

        self._pdb_file   = pdb_file
        self._header     = None
        self._COMPND     = None

        # Chain containers
        self._chains     = []
        self._chain_id   = set()
        self._NMR        = False
        self._NMR_chains = []

        # -1 -> original
        #  0 -> symmetry
        # >0 -> biomolecule
        self._biomol_id  = -1

        # Content flags
        self._has_prot   = False
        self._has_nucl   = False

        if self.pdb_file is not None:
            self._pdb_file = File(file_name=self._pdb_file, action='r')
            self._read_PDB_file(header=header,
                                onlyheader=onlyheader,
                                biomolecule=biomolecule)

        if dehydrate:
            self.dehydrate()
Exemplo n.º 22
0
    def get_PDBs(self, pdbIDset):
        '''
        Yield the files of several PDB entries.

        With a local database, the local files whose name (without the
        leading 'pdb') matches one of the requested identifiers are
        yielded; otherwise every identifier is resolved through get_PDB.
        Identifiers are compared in upper case.

        @param pdbIDset: identifiers to look for. A single {String} is
                         accepted too: a warning is issued and the result
                         of get_PDB for it is the only item yielded.
        '''
        if isinstance(pdbIDset, str):
            warnings.warn(
                'For single PDB search the get_PDB function is recomended.')
            yield self.get_PDB(pdbIDset)
            # Stop here: the code below expects a collection of identifiers
            # and would treat the string as a collection of characters.
            return

        pdbIDset = set([x.upper() for x in pdbIDset])

        if self.has_local:
            for pdb_file in self.localPDBs:
                newfile = File(file_name=pdb_file, action='r')
                if newfile.prefix.lstrip('pdb').upper() in pdbIDset:
                    yield pdb_file
        else:
            for pdbID in pdbIDset:
                yield self.get_PDB(pdbID)
Exemplo n.º 23
0
    def get_PDB(self, pdbID):
        '''
        Return the file of a PDB entry.

        The local database is searched first (when there is one), inside
        the subdirectory named after characters 2-3 of the identifier; if
        the entry is not found there, 'pdb<id>.ent.gz' is downloaded into
        the current directory.

        @param pdbID: identifier of the entry (case insensitive)
        @return: path of the file, or _False_ if the download fails.
        '''
        if self.has_local:
            rootdir = os.path.join(self.local, pdbID.lower()[1:3])
            for pdb_file in Path.list_files(root=rootdir, pattern='*.ent.gz'):
                newfile = File(file_name=pdb_file, action='r')
                if newfile.prefix.lstrip('pdb').upper() == pdbID.upper():
                    return pdb_file

        #If we do not find it in local (or we do not have a local) we search it on the FTP
        pdb_file = 'pdb' + pdbID.lower() + '.ent.gz'
        source = 'ftp://' + PDBftp['address'] + os.path.join(
            PDBftp['structures'], pdbID[1:3].lower(), pdb_file)
        try:
            urllib.urlretrieve(source, pdb_file)
        except Exception:
            # Best effort: any download problem is reported as False, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return False
        return os.path.abspath(pdb_file)
Exemplo n.º 24
0
    def print_compacted_blast(self, out_file=None):
        '''
        Print the compacted format of the blast hit.

        @param:    out_file
        @pdef:     file to print the blast data into. When omitted, the
                   data is printed to the standard output.
        @pdefault: _None_
        @ptype:    {String}
        '''
        if out_file is not None:
            output = File(out_file, 'w')
            output.write("%s\n" % self.str_compacted_blast())
            output.close()
        else:
            # Parenthesised single argument: same output with the print
            # statement (Python 2) and valid as a function call (Python 3).
            print(self.str_compacted_blast())
Exemplo n.º 25
0
    def get_resolutions(self):
        '''
        Map every PDB identifier to its resolution.

        The resolution index is downloaded from the PDB FTP; local files
        missing from that index get the resolution stored in their own
        header.

        @return: {Dictionary} identifier -> resolution
        '''
        # resolutions (-1) are for methods that do not define resolution
        resolutions = {}

        index_lines = []
        ftp = ftplib.FTP(PDBftp['address'])
        ftp.login()
        ftp.cwd(PDBftp['derived'])
        ftp.retrlines('RETR ' + PDBftp['resolution'], index_lines.append)
        ftp.quit()

        SBIglobals.alert('debug', self,
                         'Retrieving resolution data from PDB FTP...')

        # Data starts after the first line that begins with '-'
        in_table = False
        for line in index_lines:
            if line.startswith('-'):
                in_table = True
                continue
            if not in_table or not line.strip():
                continue
            data = [field.strip() for field in line.split(';')]
            if not data[1]:
                continue
            SBIglobals.alert(
                'debug', self,
                '\tResolution for {0[0]} is {0[1]}...'.format(data))
            resolutions[data[0]] = data[1]

        #rsync is accumulative, we might have structures that are not in the residu.idx anymore.. must check
        for pdb_file in self.localPDBs:
            local_file = File(file_name=pdb_file, action='r')
            pdbid = local_file.prefix.lstrip('pdb').upper()
            if pdbid in resolutions:
                continue
            pdbobj = PDB(pdb_file=pdb_file, header=True, onlyheader=True)
            SBIglobals.alert(
                'debug', self,
                '\tGrabbing Resolution for {0} is {1}...'.format(
                    pdbid, pdbobj.header.resolution))
            resolutions[pdbid] = pdbobj.header.resolution

        return resolutions
Exemplo n.º 26
0
    def print_representation(self, line_split=160, out_file=None):
        '''
        Print the alignment representation of the blast hit.

        @param:    line_split
        @pdef:     number of characters per line
        @pdefault: 160
        @ptype:    {Integer}

        @param:    out_file
        @pdef:     file to print the blast data into. When omitted, the
                   data is printed to the standard output.
        @pdefault: _None_
        @ptype:    {String}
        '''
        if out_file is not None:
            output = File(out_file, 'w')
            output.write("%s\n" % self.str_representation(line_split))
            output.close()
        else:
            # Parenthesised single argument: same output with the print
            # statement (Python 2) and valid as a function call (Python 3).
            print(self.str_representation(line_split))
Exemplo n.º 27
0
 def _process(self):
     '''
     Scan every '*.xml' file under 'pdbtm/database/' of the local directory
     and write one line per entry into self._pdbtmfile.

     Each XML file is read line by line (matching on the exact leading
     indentation of the tags) into a TM object; entries that end up
     without chains are not written.
     '''
     tmoFile = File(self._pdbtmfile, 'w', True)
     for xmlfile in Path.list_files(
             os.path.join(self._local, 'pdbtm/database/'), '*.xml'):
         # The entry is named after the XML file name, upper-cased and
         # without extension
         xmldata = TM(
             pdb=os.path.splitext(os.path.split(xmlfile)[1])[0].upper())
         # Chain ids found as NEW_CHAINID in APPLY_TO_CHAIN lines; CHAIN
         # elements with these ids are ignored
         skip_chains = set()
         # True only while inside a CHAIN element that has been accepted
         read = False
         fdxml = open(xmlfile)
         for line in fdxml:
             if line.startswith('    <TMRES>'): xmldata.tmres = line
             elif line.startswith('    <TMTYPE'): xmldata.tmtype = line
             elif line.startswith('    <PDBKWRES'): xmldata.kwres = line
             elif line.startswith('  <SIDEDEFINITION'):
                 # NOTE(review): m is used without checking for a failed match
                 m = re.search('Side1="(\S+)"', line)
                 xmldata.side = m.group(1)
             elif line.startswith('      <APPLY_TO_CHAIN'):
                 m = re.search('NEW_CHAINID=\"(\S{1})\"', line)
                 if m: skip_chains.add(m.group(1))
             elif line.startswith('  <CHAIN '):
                 # NOTE(review): NUM_TM only matches a single digit, so a
                 # chain with a two-digit NUM_TM would not be read -- confirm
                 # this is intended
                 m = re.search(
                     'CHAINID=\"(\S{1})\" NUM_TM=\"(\d{1})\" TYPE=\"(\S+)\"',
                     line)
                 if m:
                     chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                     if not chain in skip_chains:
                         # cdata identifies the chain for the REGION lines
                         # that follow
                         cdata = tuple([chain, num, tmtype])
                         xmldata.set_chain(cdata)
                         read = True
             elif line.startswith('    <REGION ') and read:
                 # NOTE(review): m is used without checking for a failed match
                 m = re.search(
                     'pdb_beg=\"(\-*\d+\w*)\"[\s\S]+pdb_end=\"(\-*\d+\w*)\"\s+type=\"(\w{1})\"',
                     line)
                 ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                 xmldata.set_chain(cdata, tuple([ini, end, tmtype]))
             elif line.startswith('  </CHAIN>'):
                 read = False
         fdxml.close()
         if len(xmldata.chains) > 0:
             tmoFile.write(str(xmldata) + "\n")
     tmoFile.close()
Exemplo n.º 28
0
    def reduce(self, new_fasta_file, list_file, force=None):
        '''
        Reduces the {Fasta} by removing identical sequences.

        Sequences are compared through their md5; the first sequence of
        every md5 is kept in the new FASTA. The list file gets one line
        per md5 with the tab-separated identifiers that share it.

        @param:    new_fasta_file
        @pdef:     name of the new fasta file
        @ptype:    {String}

        @param:    list_file
        @pdef:     name of the repetition list file
        @ptype:    {String}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @return: {Fasta} and {File} with the list of identical sequences.
        '''
        ids_by_md5 = {}
        unique = []
        for seq in self.live_show():
            digest = seq.md5
            if digest in ids_by_md5:
                SBIg.alert(
                    'debug', self,
                    '{0} repeats of {1}'.format(seq.id, ids_by_md5[digest][0]))
            else:
                unique.append(seq)
                ids_by_md5[digest] = []
            ids_by_md5[digest].append(seq.id)

        fasta = Fasta.build_multifasta(new_fasta_file, unique, force)

        listfile = File(list_file, 'w')
        for digest in ids_by_md5:
            same_sequence_ids = ids_by_md5[digest]
            listfile.write('\t'.join(same_sequence_ids) + '\n')
        listfile.close()

        return fasta, listfile
Exemplo n.º 29
0
    def correct_hit_count(self,
                          count_hit_file=None,
                          count_query_file=None,
                          return_correction_dict=False):
        '''
        Corrects the starting point of the hits and the query, if needed.
        Why?
        When blasting vs. PDB (for example), sometimes the hit positions given
        by blast are wrong, as the blast always consider the first position of
        the hit sequence as 1 and PDB does not.
        Even more, the position reference doesn't even need to be a number.
        As the specific location in the PDB is important, we need to adapt our
        blasts so than we can read that data.
        Keep in mind that hits and query must be corrected together in this step,
        as this function cannot be called twice for a same instance.

        If the BlastResult has no hits, a warning is issued and nothing is
        done.

        @param:    count_hit_file
        @pdef:     file containing the index data for the query database
                   each sequence in this file will have a format such as:
                   >3K2K_A -7 ;-6 ;-5 ;-4 ;-3 ;-2 ;-1 ;0 ;1 ;2 ;3 ;4 ;5 ;6 ;7 ...
                   (identifier and index list separated by a tabulator)
        @ptype:    {String}

        @param:    count_query_file
        @pdef:     sometimes we might also need to correct the query (if PDB vs.
                   PDB). Same format as count_hit_file. They might be the same file.
        @ptype:    {String}

        @param:    return_correction_dict
        @pdef:     instead of actually executing the correction, it only returns
                   the dictionary for further use.
        @pdefault: _False_
        @ptype:    {Boolean}

        @return: {Dictionary} only when return_correction_dict is _True_.

        @raises: {IOError} if the correction index file does not exist.
        @raises: {BlastError} if it has been called before for this instance.

        '''
        if not self.has_hits:
            SBIg.warn(
                self,
                "BlastResult of {0} has no hits to correct".format(self.query))
            return

        if self.are_hits_corrected:
            be = BlastExe.BlastError()
            raise be.corrected_hits()

        SBIg.alert('debug', self,
                   'Correcting indexes for {0}'.format(self.query))
        cfile = File(count_hit_file)
        cq = False

        codes_of_interest = set([hit.sequenceID for hit in self.raw_hits])
        if count_query_file == count_hit_file:
            # Same file for hits and query: read both in a single pass
            codes_of_interest.add(self.query)
            count_query_file = None
            cq = True

        start_index_dic = {}
        for line in cfile.read():
            if len(line.strip()) > 0:
                k = line.split('\t')
                if k[0].lstrip('>') in codes_of_interest:
                    start_index_dic[k[0].lstrip('>')] = k[1].strip().split(';')
        cfile.close()

        if count_query_file is not None:
            cfile = File(count_query_file)
            for line in cfile.read():
                if len(line.strip()) > 0:
                    k = line.split('\t')
                    if k[0].lstrip('>') == self.query:
                        start_index_dic[k[0].lstrip('>')] = k[1].strip().split(
                            ';')
            # Close the file itself; closing what read() returns leaves the
            # query index file open.
            cfile.close()
            cq = True

        if cq:
            SBIg.alert('debug', self, '\tFixing Query {0}'.format(self.query))
            self._query_index = start_index_dic[self.query]

        if return_correction_dict:
            return start_index_dic

        for hit in self._hits:
            # This tests between the options PDB/PDB_ID or PDB_ID in case
            # the TAB file has different codification
            h = hit.sequenceID
            hit_ID = h if h in start_index_dic else h.split("/")[-1]
            SBIg.alert('debug', self, '\tFixing {0}'.format(hit_ID))
            hit.correct_hit_count(new_index=start_index_dic[hit_ID])
            if cq:
                SBIg.alert('debug', self,
                           '\tFixing Query {0}'.format(self.query))
                hit.correct_query_count(new_index=start_index_dic[self.query])

        self._correctedHits = True
Exemplo n.º 30
0
 def relations(self):
     '''
     Yield every line of the relations file that does not start with '#'.
     Lines are yielded as read, without stripping.
     '''
     relFile = File(self._rel, 'r')
     try:
         for rel_line in relFile.descriptor:
             if not rel_line.startswith('#'):
                 yield rel_line
     finally:
         # Runs at the end of the file and also when the caller stops
         # iterating early, so the file is always released.
         relFile.close()
Exemplo n.º 31
0
 def descriptions(self):
     '''
     Yield every line of the descriptions file that does not start with
     '#'. Lines are yielded as read, without stripping.
     '''
     dscFile = File(self._desc, 'r')
     try:
         for dsc_line in dscFile.descriptor:
             if not dsc_line.startswith('#'):
                 yield dsc_line
     finally:
         # Runs at the end of the file and also when the caller stops
         # iterating early, so the file is always released.
         dscFile.close()