def build_multifasta(file_name, sequence_list, force=None):
    '''
    Creates a Fasta object and a FASTA file for multiple sequences.

    @param:    file_name
    @pdef:     name of the fasta file (with path, if necessary)
    @ptype:    {String}

    @param:    sequence_list
    @pdef:     list of sequences to create the FASTA from.
    @ptype:    {List} or {Set} of {Sequence}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta}
    '''
    newFasta = File(file_name, 'w', overwrite=force)
    for sequence in sequence_list:
        newFasta.write(sequence.format('FASTA') + '\n')
    newFasta.close()
    fasta_file = Fasta(fasta_file=newFasta.full, auto_load=0)
    return fasta_file
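# A minimal usage sketch for build_multifasta, assuming it is exposed as a
# staticmethod of Fasta (as subset() below suggests). The sequence ids and
# residues are made-up examples.
def _example_build_multifasta():
    seqs = [Sequence(sequence_id='seq1', sequence='MKTAYIAKQR'),
            Sequence(sequence_id='seq2', sequence='GSHMTEYKLV')]
    # Writes example.fa with both records and returns a Fasta over it.
    return Fasta.build_multifasta('example.fa', seqs, force=True)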
def _save_release(self):
    '''
    Store the release data into a file.
    '''
    f = File(os.path.join(self.local, self._CONTROL_FILE), 'w', True)
    f.write(json.dumps(self._RELEASE))
    f.close()
def build(file_name, sequence_id, sequence, force=None):
    '''
    Creates a Fasta object and a FASTA file from a sequence.

    @param:    file_name
    @pdef:     name of the fasta file (with path, if necessary)
    @ptype:    {String}

    @param:    sequence_id
    @pdef:     name of the sequence
    @ptype:    {String}

    @param:    sequence
    @pdef:     sequence
    @ptype:    {String} or {List}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta}
    '''
    newFasta = File(file_name, 'w', overwrite=force)
    newSeq = Sequence(sequence_id=sequence_id, sequence=sequence)
    newFasta.write(newSeq.format('FASTA'))
    newFasta.close()
    return Fasta(fasta_file=newFasta.full, auto_load=0)
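# Companion sketch for build(): the single-sequence variant. Assumes build()
# is reachable as Fasta.build, mirroring Fasta.build_multifasta above; the
# file name, id and residues are illustrative.
def _example_build():
    # Creates single.fa holding one FASTA record and returns the Fasta.
    return Fasta.build('single.fa', 'seq1', 'MKTAYIAKQR', force=True)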
def _process(self):
    enzymes = self._parse_enzclass() + self._parse_enzymedat()
    enzymes.sort()

    enzFile = File(self._enzfile, 'w', True)
    for e in enzymes:
        enzFile.write(repr(e) + "\n")
    enzFile.close()
def _process(self):
    targets = self._process_targets()
    drugs = self._process_drugs(targets)

    drugFile = File(self._drugfile, 'w', True)
    for d in drugs:
        drugFile.write(repr(d) + "\n")
    drugFile.close()
def format2file(self, filename, extension='pdb', center=False):
    if extension not in ('pdb', 'js'):
        raise AttributeError('Accepted extensions are "pdb" and "js"')
    structure = File('.'.join([filename, extension]), 'w')
    if extension == 'pdb':
        structure.write(self.pdb_format(center=center))
    elif extension == 'js':
        structure.write(self.js_format(center=center))
    structure.close()
def subset(self, sequence_ids, new_fasta_file, all_but=False,
           prefix_size=None, index=False, force=None):
    '''
    Creates a new {Fasta} with the requested subset of sequences.

    @param:    sequence_ids
    @pdef:     sequence identifier(s)
    @ptype:    {String}, {List} or {Set}

    @param:    new_fasta_file
    @pdef:     name of the new fasta file
    @ptype:    {String}

    @param:    all_but
    @pdef:     Flag. Instead of retrieving the given ids, we retrieve all
               except the given ids.
    @pdefault: _False_
    @ptype:    {Boolean}

    @param:    prefix_size
    @pdef:     maximum characters for the prefix. If _None_, all the
               characters are included.
    @pdefault: _None_
    @ptype:    {Integer}

    @param:    index
    @pdef:     also create the index file, if the source index exists
    @pdefault: _False_
    @ptype:    {Boolean}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @raises: {AttributeError} if sequence_ids is not a valid type.
    @return: {Fasta}
    '''
    sequences = self.retrieve(sequence_ids, all_but, prefix_size)
    fasta_file = Fasta.build_multifasta(new_fasta_file, sequences, force)
    if self.has_index and index:
        idxfile = File(self.index_file)
        newidx = File(fasta_file.file.full + '.idx', 'w')
        seqids = set(fasta_file.sequence_identifiers)
        for idx in idxfile.read():
            if idx.split()[0].strip('>') in seqids:
                newidx.write(idx)
        idxfile.close()
        newidx.close()
        fasta_file.index_file = newidx.full
    return fasta_file
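# Usage sketch for subset(), assuming `fasta` is an indexed Fasta instance;
# the ids and file name are made up. Passing all_but=True would instead keep
# every sequence except the ones listed.
def _example_subset(fasta):
    # Keep only the two named sequences (and their index entries, if any).
    return fasta.subset(['seq1', 'seq2'], 'subset.fa', index=True, force=True)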
def _process(self, update=False):
    '''
    Transform the source files into the final local db files.

    @param:    update
    @pdef:     toggles between create and update processing
    @pdefault: _False_
    @ptype:    {Boolean}
    '''
    if update:
        old = self._RELEASE['total_items'].copy()

    j = 0
    for i in range(len(self._SOURCES)):
        dfilen = os.path.join(self.local, self._SOURCES[i])
        ofilen = os.path.join(self.local, self._MANDATORY_FILES[j])
        ffilen = os.path.join(self.local, self._MANDATORY_FILES[j + 1])
        # Each source maps to a pair of output files; advance the pair
        # index even when a source is skipped so the pairing stays aligned.
        j += 2
        if not os.path.isfile(dfilen):
            continue
        SBIg.alert('verbose', self, 'Parsing: {0}'.format(dfilen))
        SBIg.alert('verbose', self, 'DB file to: {0}'.format(ofilen))
        SBIg.alert('verbose', self, 'Fasta file to: {0}'.format(ffilen))
        dfile = File(dfilen)
        ofile = File(ofilen, 'w', update)
        ffile = File(ffilen, 'w', update)
        for protein in Connect._parse_uniprot(dfile):
            pname = protein.entry_name
            pvers = protein.version
            SBIg.alert('verbose', self, 'Protein: {0}'.format(pname))
            if not update:
                self._RELEASE['total_items'][pname] = pvers
            else:
                if pname not in self._RELEASE['total_items']:
                    self._RELEASE['new_items'][pname] = pvers
                else:
                    del old[pname]
                    if self._RELEASE['total_items'][pname] != pvers:
                        self._RELEASE['update_items'][pname] = pvers
            ffile.write(protein.sequence.format('FASTA') + '\n')
            ofile.write(protein.json() + '\n')
        dfile.close()
        ofile.close()
        ffile.close()
    if update:
        # Whatever remains in `old` was never seen again: deleted entries.
        self._RELEASE['total_items'].update(self._RELEASE['new_items'])
        self._RELEASE['total_items'].update(self._RELEASE['update_items'])
        self._RELEASE['deleted_items'] = old
        for k in self._RELEASE['deleted_items']:
            del self._RELEASE['total_items'][k]
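# Self-contained sketch of the release bookkeeping used by _process(update=True):
# the previous version map is copied, every re-seen entry is removed from the
# copy, and whatever remains is treated as deleted. All data is illustrative.
def _example_release_diff():
    total = {'P1': 1, 'P2': 1, 'P3': 1}      # previous release
    seen = {'P1': 1, 'P2': 2, 'P4': 1}       # entries found during the update
    old = total.copy()
    new_items, update_items = {}, {}
    for name, version in seen.items():
        if name not in total:
            new_items[name] = version        # P4: brand new entry
        else:
            del old[name]                    # P1, P2 were re-seen
            if total[name] != version:
                update_items[name] = version # P2: version bumped
    deleted = old                            # P3 was never re-seen
    total.update(new_items)
    total.update(update_items)
    for k in deleted:
        del total[k]
    return new_items, update_items, deleted, total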
def print_compacted_blast(self, out_file=None):
    '''
    Print the compacted format of the blast hit.

    @param:    out_file
    @pdef:     file to print the blast data into.
    @pdefault: _None_
    @ptype:    {String}
    '''
    if out_file is not None:
        output = File(out_file, 'w')
        output.write("%s\n" % self.str_compacted_blast())
        output.close()
    else:
        print(self.str_compacted_blast())
def get_FASTA_IDX_by_names_to_file(self, names, outfile):
    fastafile = Fasta(self.PDBseq)
    selectedfasta = fastafile.retrieve(copy.deepcopy(names))
    output_fasta = File(outfile, 'w')
    for sequence in selectedfasta:
        output_fasta.write(sequence.format('FASTA') + "\n")
    output_fasta.close()

    idxfile = self.PDBseq + '.idx'
    output_idx = File(outfile + '.idx', 'w')
    input_idx = File(idxfile, 'r')
    for line in input_idx.descriptor:
        info = line.split()
        pdbname = info[0][1:]  # strip the leading '>' of the identifier
        if pdbname in names:
            output_idx.write(line)
    input_idx.close()
    output_idx.close()
def _process(self):
    go_dic = {}
    parseFile = File(os.path.join(self.local, self._gfile), 'r')
    go = None
    for line in parseFile.descriptor:
        line = re.sub('\'', '\\\'', line)
        if line.startswith('[Term]'):
            # A new stanza starts: store the previously parsed term.
            if go is not None:
                go_dic[go.id] = go
        if line.startswith('id:'):
            go = GOterm(id=line.split()[1].strip())
            continue
        if line.startswith('name:'):
            go.name = " ".join(line.split()[1:]).strip()
            continue
        if line.startswith('namespace:'):
            go.namespace = line.split()[1].strip()
            continue
        if line.startswith('alt_id:'):
            go.alt_id.append(line.split()[1].strip())
            continue
        if line.startswith('is_obsolete:'):
            go.obsolete = True
            continue
        if line.startswith('is_a:'):
            go.parents.add(line.split()[1].strip())
            continue
        if line.startswith('relationship:'):
            go.relations.append((line.split()[1].strip(),
                                 line.split()[2].strip()))
            continue
        if line.startswith('[Typedef]'):
            # The [Typedef] stanzas close the [Term] section: store the
            # last term and stop parsing.
            go_dic[go.id] = go
            break
    parseFile.close()

    for go in go_dic:
        go_dic[go].parents = self._search_parents(go_dic, go)
    goFile = File(self._gofile, 'w', True)
    for go in go_dic:
        go_dic[go].parents.add(go)
        goFile.write(str(go_dic[go]) + "\n")
    goFile.close()
def print_representation(self, line_split=160, out_file=None):
    '''
    Print the alignment representation of the blast hit.

    @param:    line_split
    @pdef:     number of characters per line
    @pdefault: 160
    @ptype:    {Integer}

    @param:    out_file
    @pdef:     file to print the blast data into.
    @pdefault: _None_
    @ptype:    {String}
    '''
    if out_file is not None:
        output = File(out_file, 'w')
        output.write("%s\n" % self.str_representation(line_split))
        output.close()
    else:
        print(self.str_representation(line_split))
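# Usage sketch covering both print helpers above, assuming `hit` is a blast
# hit object exposing them; 'hit.txt' is an illustrative path.
def _example_print_blast(hit):
    hit.print_compacted_blast()                # one-line form to stdout
    hit.print_representation(line_split=80,    # alignment form to a file
                             out_file='hit.txt')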
def _process(self):
    tmoFile = File(self._pdbtmfile, 'w', True)
    for xmlfile in Path.list_files(
            os.path.join(self._local, 'pdbtm/database/'), '*.xml'):
        xmldata = TM(
            pdb=os.path.splitext(os.path.split(xmlfile)[1])[0].upper())
        skip_chains = set()
        read = False
        fdxml = open(xmlfile)
        for line in fdxml:
            if line.startswith(' <TMRES>'):
                xmldata.tmres = line
            elif line.startswith(' <TMTYPE'):
                xmldata.tmtype = line
            elif line.startswith(' <PDBKWRES'):
                xmldata.kwres = line
            elif line.startswith(' <SIDEDEFINITION'):
                m = re.search(r'Side1="(\S+)"', line)
                xmldata.side = m.group(1)
            elif line.startswith(' <APPLY_TO_CHAIN'):
                m = re.search(r'NEW_CHAINID="(\S{1})"', line)
                if m:
                    skip_chains.add(m.group(1))
            elif line.startswith(' <CHAIN '):
                m = re.search(
                    r'CHAINID="(\S{1})" NUM_TM="(\d{1})" TYPE="(\S+)"',
                    line)
                if m:
                    chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                    if chain not in skip_chains:
                        cdata = tuple([chain, num, tmtype])
                        xmldata.set_chain(cdata)
                        read = True
            elif line.startswith(' <REGION ') and read:
                m = re.search(
                    r'pdb_beg="(\-*\d+\w*)"[\s\S]+pdb_end="(\-*\d+\w*)"'
                    r'\s+type="(\w{1})"', line)
                ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                xmldata.set_chain(cdata, tuple([ini, end, tmtype]))
            elif line.startswith(' </CHAIN>'):
                read = False
        fdxml.close()
        if len(xmldata.chains) > 0:
            tmoFile.write(str(xmldata) + "\n")
    tmoFile.close()
def _process(self):
    inh = {}
    nodefile = File(file_name=self._nodes, action='r')
    for line in nodefile.descriptor:
        line = re.sub('\'', '\\\'', line)
        line_data = line.split('|')
        taxid = line_data[0].strip()
        inh[taxid] = TaxID(taxid)
        inh[taxid].parent = line_data[1].strip()
        inh[taxid].rank = line_data[2].strip()
    nodefile.close()

    namefile = File(file_name=self._names, action='r')
    for line in namefile.descriptor:
        line = re.sub('\'', '\\\'', line)
        line_data = line.split('|')
        if line_data[3].strip() == 'scientific name':
            inh[line_data[0].strip()].name = line_data[1].strip()
    namefile.close()

    delefile = File(file_name=self._delet, action='r')
    for line in delefile.descriptor:
        data = line.split('|')
        taxid = data[0].strip()
        inh[taxid] = TaxID(taxid)
        inh[taxid].old = True
    delefile.close()

    mrgefile = File(file_name=self._merged, action='r')
    for line in mrgefile.descriptor:
        data = line.split('|')
        taxid = data[0].strip()
        inh[taxid] = TaxID(taxid)
        inh[taxid].old = True
        inh[taxid].new = data[1].strip()
    mrgefile.close()

    taxFile = File(self._taxid, 'w', True)
    for taxid in inh:
        taxFile.write(str(inh[taxid]) + "\n")
    taxFile.close()
def reduce(self, new_fasta_file, list_file, force=None):
    '''
    Reduces the {Fasta} by removing identical sequences.

    @param:    new_fasta_file
    @pdef:     name of the new fasta file
    @ptype:    {String}

    @param:    list_file
    @pdef:     name of the repetition list file
    @ptype:    {String}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta} and {File} with the list of identical sequences.
    '''
    seq_md5 = {}
    sequences = []
    for seq in self.live_show():
        md5 = seq.md5
        if md5 not in seq_md5:
            sequences.append(seq)
            # Register the kept sequence's id so later repeats can refer to
            # it (and so it heads its group in the repetition list file).
            seq_md5.setdefault(md5, []).append(seq.id)
        else:
            SBIg.alert(
                'debug', self,
                '{0} is a repeat of {1}'.format(seq.id, seq_md5[md5][0]))
            seq_md5[md5].append(seq.id)
    fasta = Fasta.build_multifasta(new_fasta_file, sequences, force)
    listfile = File(list_file, 'w')
    for md5 in seq_md5:
        listfile.write('\t'.join(seq_md5[md5]) + '\n')
    listfile.close()
    return fasta, listfile
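# Usage sketch for reduce(), assuming `fasta` is a loaded Fasta; file names
# are illustrative. The list file groups the ids of identical sequences, one
# tab-separated group per line, the first id of each group being the one kept.
def _example_reduce(fasta):
    nr_fasta, repeats = fasta.reduce('nr.fa', 'nr.list', force=True)
    return nr_fasta, repeats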