Exemplo n.º 1
0
    def build_multifasta(file_name, sequence_list, force=None):
        '''
        Dump several sequences into one FASTA file and wrap it in a {Fasta}.

        Every sequence contributes its own format('FASTA') output followed by
        a newline. The returned {Fasta} is created with auto_load=0.

        @param:    file_name
        @pdef:     name of the fasta file (with path, if necessary)
        @ptype:    {String}

        @param:    sequence_list
        @pdef:     sequences to write, in iteration order
        @ptype:    {List} or {Set} of {Sequence}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @return: {Fasta}
        '''
        handle = File(file_name, 'w', overwrite=force)
        for entry in sequence_list:
            record = entry.format('FASTA')
            handle.write(record + '\n')
        handle.close()
        # The path is read back from the (already closed) File object.
        return Fasta(fasta_file=handle.full, auto_load=0)
Exemplo n.º 2
0
 def _save_release(self):
     '''
     Serialise self._RELEASE as JSON into the control file.

     The control file is self._CONTROL_FILE inside the self.local directory.
     '''
     control_path = os.path.join(self.local, self._CONTROL_FILE)
     # Third positional argument is presumably the overwrite flag of File.
     control = File(control_path, 'w', True)
     control.write(json.dumps(self._RELEASE))
     control.close()
Exemplo n.º 3
0
    def build(file_name, sequence_id, sequence, force=None):
        '''
        Write a single sequence to a FASTA file and wrap it in a {Fasta}.

        The record is written exactly as produced by format('FASTA'); no
        extra newline is appended. The returned {Fasta} is created with
        auto_load=0.

        @param:    file_name
        @pdef:     name of the fasta file (with path, if necessary)
        @ptype:    {String}

        @param:    sequence_id
        @pdef:     name of the sequence
        @ptype:    {String}

        @param:    sequence
        @pdef:     sequence
        @ptype:    {String} or {List}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @return: {Fasta}
        '''
        handle = File(file_name, 'w', overwrite=force)
        record = Sequence(sequence_id=sequence_id, sequence=sequence)
        handle.write(record.format('FASTA'))
        handle.close()
        fasta = Fasta(fasta_file=handle.full, auto_load=0)
        return fasta
    def _process(self):
        '''
        Merge both enzyme sources, sort them and write one repr() per line.

        The output goes to self._enzfile.
        '''
        class_entries = self._parse_enzclass()
        dat_entries = self._parse_enzymedat()
        ordered = sorted(class_entries + dat_entries)

        # Third positional argument is presumably the overwrite flag of File.
        out = File(self._enzfile, 'w', True)
        for enzyme in ordered:
            out.write(repr(enzyme) + "\n")
        out.close()
Exemplo n.º 5
0
    def _process(self):
        '''
        Build the drug entries from the processed targets and write one
        repr() per line into self._drugfile.
        '''
        drugs = self._process_drugs(self._process_targets())

        # Third positional argument is presumably the overwrite flag of File.
        out = File(self._drugfile, 'w', True)
        for drug in drugs:
            out.write(repr(drug) + "\n")
        out.close()
Exemplo n.º 6
0
 def format2file(self, filename, extension = 'pdb', center = False):
     '''
     Write the structure into the file <filename>.<extension>.

     @param:    filename
     @pdef:     output file name, without the extension
     @ptype:    {String}

     @param:    extension
     @pdef:     output format; either 'pdb' or 'js'
     @pdefault: 'pdb'
     @ptype:    {String}

     @param:    center
     @pdef:     passed through unchanged to the selected formatter
     @pdefault: _False_
     @ptype:    {Boolean}

     @raises: {AttributeError} if extension is neither 'pdb' nor 'js'.
     '''
     accepted = ('pdb', 'js')
     if extension not in accepted:
         raise AttributeError('Not accepted extension')

     # The file is opened before the content is rendered, as in the
     # original statement order.
     target = File('.'.join((filename, extension)), 'w')
     formatter = self.pdb_format if extension == 'pdb' else self.js_format
     target.write(formatter(center = center))
     target.close()
Exemplo n.º 7
0
    def subset(self,
               sequence_ids,
               new_fasta_file,
               all_but=False,
               prefix_size=None,
               index=False,
               force=None):
        '''
        Build a new {Fasta} holding only the requested sequences.

        When this {Fasta} has an index and index is requested, a filtered
        copy of the index is written next to the new fasta file (same name
        plus '.idx') and attached to the returned object.

        @param:    sequence_ids
        @pdef:     sequence identifier(s)
        @ptype:    {String}, {List} or {Set}

        @param:    new_fasta_file
        @pdef:     name of the new fasta file
        @ptype:    {String}

        @param:    all_but
        @pdef:     Flag. Instead of retrieving the given ids, retrieve all
                   except the given ids.
        @pdefault: _False_
        @ptype:    {Boolean}

        @param:    prefix_size
        @pdef:     maximum characters for the prefix. If _None_, all the
                   characters are included.
        @pdefault: _None_
        @ptype:    {Integer}

        @param:    index
        @pdef:     also create the index file; only honoured when this
                   {Fasta} already has an index
        @pdefault: _False_
        @ptype:    {Boolean}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @raises: {AttributeError} if sequence_ids is not a valid type.
        @return: {Fasta}
        '''
        selected = self.retrieve(sequence_ids, all_but, prefix_size)
        fasta_file = Fasta.build_multifasta(new_fasta_file, selected, force)

        if not self.has_index or not index:
            return fasta_file

        source_idx = File(self.index_file)
        target_idx = File(fasta_file.file.full + '.idx', 'w')
        kept_ids = set(fasta_file.sequence_identifiers)
        for entry in source_idx.read():
            # First whitespace-separated field, without surrounding '>'.
            entry_id = entry.split()[0].strip('>')
            if entry_id in kept_ids:
                target_idx.write(entry)
        source_idx.close()
        target_idx.close()

        fasta_file.index_file = target_idx.full
        return fasta_file
Exemplo n.º 8
0
    def _process(self, update=False):
        '''
        Transform the source files into the final local db files.

        For every source file found on disk, each parsed protein is written
        as a FASTA record into one output file and as a JSON line into
        another, while the bookkeeping in self._RELEASE is kept up to date.

        @param:    update
        @pdef:     toggles between create and update processing
        @pdefault: _False_
        @ptype:    {Boolean}
        '''
        if update:
            # Snapshot of the known items. Entries seen again are removed
            # from it below, so whatever remains ends up as 'deleted_items'.
            old = self._RELEASE['total_items'].copy()
        # j walks _MANDATORY_FILES in pairs: [j] db file, [j + 1] fasta file.
        j = 0
        for i in range(len(self._SOURCES)):
            dfilen = os.path.join(self.local, self._SOURCES[i])
            ofilen = os.path.join(self.local, self._MANDATORY_FILES[j])
            ffilen = os.path.join(self.local, self._MANDATORY_FILES[j + 1])
            if not os.path.isfile(dfilen):
                # NOTE(review): j is not advanced on this path, so the next
                # source reuses this pair of output names -- confirm intended.
                continue
            SBIg.alert('verbose', self, 'Parsing:       {0}'.format(dfilen))
            SBIg.alert('verbose', self, 'DB file to:    {0}'.format(ofilen))
            SBIg.alert('verbose', self, 'Fasta file to: {0}'.format(ffilen))
            dfile = File(dfilen)
            # The third positional argument is `update`; presumably it is
            # the overwrite flag of File -- TODO confirm.
            ofile = File(ofilen, 'w', update)
            ffile = File(ffilen, 'w', update)
            # Pre-binds the loop variable; it is not read after the loop here.
            protein = None
            for protein in Connect._parse_uniprot(dfile):
                pname = protein.entry_name
                pvers = protein.version
                SBIg.alert('verbose', self, 'Protein: {0}'.format(pname))
                if not update:
                    # Create mode: every entry is simply registered.
                    self._RELEASE['total_items'][pname] = pvers
                else:
                    if pname not in self._RELEASE['total_items']:
                        self._RELEASE['new_items'][pname] = pvers
                    else:
                        # Still present upstream, so not a deletion.
                        # NOTE(review): raises KeyError if the same entry
                        # name shows up twice during one update run.
                        del (old[pname])
                        if self._RELEASE['total_items'][pname] != pvers:
                            self._RELEASE['update_items'][pname] = pvers

                ffile.write(protein.sequence.format('FASTA') + '\n')
                ofile.write(protein.json() + '\n')
            j += 2
            dfile.close()
            ofile.close()
            ffile.close()

        if update:
            # Fold new and changed entries into the totals, then drop the
            # entries that were never seen again.
            self._RELEASE['total_items'].update(self._RELEASE['new_items'])
            self._RELEASE['total_items'].update(self._RELEASE['update_items'])
            self._RELEASE['deleted_items'] = old
            for k in self._RELEASE['deleted_items']:
                del (self._RELEASE['total_items'][k])
Exemplo n.º 9
0
    def print_compacted_blast(self, out_file=None):
        '''
        Emit the compacted format of the blast hit.

        With no out_file the text goes to standard output; otherwise it is
        written, followed by a newline, into out_file.

        @param:    out_file
        @pdef:     file to print the blast data into.
        @pdefault: _None_
        @ptype:    {String}
        '''
        if out_file is None:
            print(self.str_compacted_blast())
            return

        # The file is opened before the text is generated.
        output = File(out_file, 'w')
        output.write("%s\n" % self.str_compacted_blast())
        output.close()
Exemplo n.º 10
0
    def get_FASTA_IDX_by_names_to_file(self, names, outfile):
        '''
        Write the sequences called names, and their index lines, to outfile
        and outfile + '.idx'.

        Sequences come from the fasta file self.PDBseq; index lines come
        from self.PDBseq + '.idx'.

        @param:    names
        @pdef:     names of the sequences to extract
        @ptype:    presumably {List} or {Set} of {String} -- TODO confirm

        @param:    outfile
        @pdef:     name of the output fasta file
        @ptype:    {String}
        '''
        source_fasta = Fasta(self.PDBseq)
        # retrieve() receives a deep copy, so the caller's names object is
        # the one used for the index filter below.
        wanted = source_fasta.retrieve(copy.deepcopy(names))
        fasta_out = File(outfile, 'w')
        for entry in wanted:
            fasta_out.write(entry.format('FASTA') + "\n")
        fasta_out.close()

        source_idx_name = self.PDBseq + '.idx'
        idx_out = File(outfile + '.idx', 'w')
        idx_in = File(source_idx_name, 'r')
        for row in idx_in.descriptor:
            # First field with its leading character dropped (presumably
            # the '>' marker).
            if row.split()[0][1:] in names:
                idx_out.write(row)
        idx_in.close()
        idx_out.close()
Exemplo n.º 11
0
    def _process(self):
        '''
        Parse the GO term file and write one line per term to self._gofile.

        The input file (self._gfile inside self.local) is read line by line.
        A term is created at every 'id:' line and filled in from the tagged
        lines that follow it. Parsing stops at the first '[Typedef]' line.
        Afterwards the parents of every term are expanded through
        self._search_parents, each term is added to its own parents, and
        str(term) is written to the output file.

        Changes with respect to the previous version:
          - the last parsed term is stored when the file ends without a
            '[Typedef]' line (it used to be dropped silently);
          - a '[Typedef]' line met before any term no longer raises
            AttributeError;
          - both file handles are closed even if parsing or writing fails.
        '''
        go_dic = {}
        parseFile = File(os.path.join(self.local, self._gfile), 'r')
        go = None
        try:
            for line in parseFile.descriptor:
                # Escape single quotes in every field that gets stored.
                line = re.sub('\'', '\\\'', line)
                if line.startswith('[Term]'):
                    # A new stanza starts: commit the term parsed so far.
                    if go is not None:
                        go_dic[go.id] = go
                    continue
                if line.startswith('[Typedef]'):
                    # Only the leading term stanzas are parsed.
                    break
                if line.startswith('id:'):
                    go = GOterm(id = line.split()[1].strip())
                    continue
                if line.startswith('name:'):
                    go.name = " ".join(line.split()[1:]).strip()
                    continue
                if line.startswith('namespace:'):
                    go.namespace = line.split()[1].strip()
                    continue
                if line.startswith('alt_id:'):
                    go.alt_id.append(line.split()[1].strip())
                    continue
                if line.startswith('is_obsolete:'):
                    # NOTE(review): the value of the tag is not inspected.
                    go.obsolete = True
                    continue
                if line.startswith('is_a:'):
                    go.parents.add(line.split()[1].strip())
                    continue
                if line.startswith('relationship:'):
                    fields = line.split()
                    go.relations.append((fields[1].strip(), fields[2].strip()))
                    continue
        finally:
            parseFile.close()

        # Commit the last term, whether the loop ended on '[Typedef]' or on
        # end of file. Re-storing an already committed term is harmless.
        if go is not None:
            go_dic[go.id] = go

        for go_id in go_dic:
            go_dic[go_id].parents = self._search_parents(go_dic, go_id)

        goFile = File(self._gofile, 'w', True)
        try:
            for go_id in go_dic:
                # Every term is listed among its own parents.
                go_dic[go_id].parents.add(go_id)
                goFile.write(str(go_dic[go_id]) + "\n")
        finally:
            goFile.close()
Exemplo n.º 12
0
    def print_representation(self, line_split=160, out_file=None):
        '''
        Emit the alignment representation of the blast hit.

        With no out_file the text goes to standard output; otherwise it is
        written, followed by a newline, into out_file.

        @param:    line_split
        @pdef:     number of characters per line
        @pdefault: 160
        @ptype:    {Integer}

        @param:    out_file
        @pdef:     file to print the blast data into.
        @pdefault: _None_
        @ptype:    {String}
        '''
        if out_file is None:
            print(self.str_representation(line_split))
            return

        # The file is opened before the text is generated.
        output = File(out_file, 'w')
        output.write("%s\n" % self.str_representation(line_split))
        output.close()
Exemplo n.º 13
0
 def _process(self):
     '''
     Parse every PDBTM xml file and write one line per entry that has
     chains into self._pdbtmfile.

     The xml files are those matching '*.xml' under 'pdbtm/database/'
     inside self._local. Each file is scanned line by line, relying on
     the exact leading indentation of the tags of interest. Each entry is
     written as str(TM) followed by a newline.

     Both the xml handle and the output file are now closed even when a
     line fails to parse; the parsing rules themselves are unchanged.
     '''
     tmoFile = File(self._pdbtmfile, 'w', True)
     try:
         for xmlfile in Path.list_files(
                 os.path.join(self._local, 'pdbtm/database/'), '*.xml'):
             # The entry is named after the upper-cased file base name.
             xmldata = TM(
                 pdb=os.path.splitext(os.path.split(xmlfile)[1])[0].upper())
             skip_chains = set()
             # True while inside a <CHAIN> that was accepted below.
             read = False
             with open(xmlfile) as fdxml:
                 for line in fdxml:
                     if line.startswith('    <TMRES>'):
                         xmldata.tmres = line
                     elif line.startswith('    <TMTYPE'):
                         xmldata.tmtype = line
                     elif line.startswith('    <PDBKWRES'):
                         xmldata.kwres = line
                     elif line.startswith('  <SIDEDEFINITION'):
                         # NOTE(review): raises AttributeError when the
                         # line carries no Side1 attribute.
                         m = re.search('Side1="(\S+)"', line)
                         xmldata.side = m.group(1)
                     elif line.startswith('      <APPLY_TO_CHAIN'):
                         # Chains listed as NEW_CHAINID are skipped later.
                         m = re.search('NEW_CHAINID=\"(\S{1})\"', line)
                         if m:
                             skip_chains.add(m.group(1))
                     elif line.startswith('  <CHAIN '):
                         # NOTE(review): NUM_TM only matches one digit, so
                         # chains with a two-digit NUM_TM are not read.
                         m = re.search(
                             'CHAINID=\"(\S{1})\" NUM_TM=\"(\d{1})\" TYPE=\"(\S+)\"',
                             line)
                         if m:
                             chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                             if chain not in skip_chains:
                                 cdata = tuple([chain, num, tmtype])
                                 xmldata.set_chain(cdata)
                                 read = True
                     elif line.startswith('    <REGION ') and read:
                         # cdata is the chain accepted by the latest
                         # <CHAIN> line; `read` guarantees it is bound.
                         # NOTE(review): raises AttributeError when the
                         # REGION line does not match the pattern.
                         m = re.search(
                             'pdb_beg=\"(\-*\d+\w*)\"[\s\S]+pdb_end=\"(\-*\d+\w*)\"\s+type=\"(\w{1})\"',
                             line)
                         ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                         xmldata.set_chain(cdata, tuple([ini, end, tmtype]))
                     elif line.startswith('  </CHAIN>'):
                         read = False
             if len(xmldata.chains) > 0:
                 tmoFile.write(str(xmldata) + "\n")
     finally:
         tmoFile.close()
Exemplo n.º 14
0
    def _process(self):
        '''
        Combine the nodes, names, deleted and merged taxonomy files into
        self._taxid, writing one str(TaxID) per line.

        All four inputs are split on '|'. Later files replace entries
        created by earlier ones when they share the same first field.
        '''
        inh = {}

        # Nodes: identifier | parent | rank
        nodefile = File(file_name = self._nodes, action = 'r')
        for line in nodefile.descriptor:
            fields = re.sub('\'', '\\\'', line).split('|')
            taxid = fields[0].strip()
            inh[taxid] = TaxID(taxid)
            inh[taxid].parent = fields[1].strip()
            inh[taxid].rank = fields[2].strip()
        nodefile.close()

        # Names: only the 'scientific name' rows are kept.
        namefile = File(file_name = self._names, action = 'r')
        for line in namefile.descriptor:
            fields = re.sub('\'', '\\\'', line).split('|')
            if fields[3].strip() != 'scientific name':
                continue
            inh[fields[0].strip()].name = fields[1].strip()
        namefile.close()

        # Deleted identifiers become fresh entries flagged as old.
        delefile = File(file_name = self._delet, action = 'r')
        for line in delefile.descriptor:
            taxid = line.split('|')[0].strip()
            inh[taxid] = TaxID(taxid)
            inh[taxid].old = True
        delefile.close()

        # Merged identifiers are flagged as old and point to their new id.
        mrgefile = File(file_name = self._merged, action = 'r')
        for line in mrgefile.descriptor:
            fields = line.split('|')
            taxid = fields[0].strip()
            inh[taxid] = TaxID(taxid)
            inh[taxid].old = True
            inh[taxid].new = fields[1].strip()
        mrgefile.close()

        taxFile = File(self._taxid, 'w', True)
        for taxid in inh:
            record = inh[taxid]
            taxFile.write(str(record) + "\n")
        taxFile.close()
Exemplo n.º 15
0
    def reduce(self, new_fasta_file, list_file, force=None):
        '''
        Collapse identical sequences, as judged by their md5, into one.

        The first sequence seen for each md5 goes into the new fasta file.
        The list file gets one tab-separated line per md5 with the ids of
        all the sequences sharing it, first-seen id first.

        @param:    new_fasta_file
        @pdef:     name of the new fasta file
        @ptype:    {String}

        @param:    list_file
        @pdef:     name of the repetition list file
        @ptype:    {String}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @return: {Fasta} and {File} with the list of identical sequences.
                 The {File} has already been closed when it is returned.
        '''
        ids_by_md5 = {}
        unique = []
        for entry in self.live_show():
            digest = entry.md5
            if digest in ids_by_md5:
                SBIg.alert(
                    'debug', self,
                    '{0} repeats of {1}'.format(entry.id,
                                                ids_by_md5[digest][0]))
            else:
                unique.append(entry)
                ids_by_md5[digest] = []
            ids_by_md5[digest].append(entry.id)

        fasta = Fasta.build_multifasta(new_fasta_file, unique, force)

        listfile = File(list_file, 'w')
        for digest in ids_by_md5:
            listfile.write('\t'.join(ids_by_md5[digest]) + '\n')
        listfile.close()

        return fasta, listfile