def process_file(self, filename):
    """Process one input file: parse it as FASTA and print every sequence ID."""
    self.log.info("Processing %s..." % filename)
    # Build the reader pipeline: raw FASTA parser, then ID remapping.
    reader = fasta.regexp_remapper(
        fasta.Parser(open_anything(filename)),
        self.options.sequence_id_regexp,
    )
    for record in reader:
        print(record.id)
def process_file(self, filename):
    """Processes the given input file.

    Parses *filename* as FASTA (remapping sequence IDs via the configured
    regexp) and prints each sequence ID to stdout.
    """
    self.log.info("Processing %s..." % filename)
    parser = fasta.Parser(open_anything(filename))
    parser = fasta.regexp_remapper(parser, self.options.sequence_id_regexp)
    for seq in parser:
        # Fixed: `print seq.id` is Python 2 statement syntax and a
        # SyntaxError under Python 3; use the print() function.
        print(seq.id)
def process_sequences_file(self, fname):
    """Build `self.seq_ids_to_length`, mapping each sequence ID in *fname*
    (after regexp remapping) to the length of its sequence."""
    self.log.info("Loading sequences from %s..." % fname)
    records = fasta.regexp_remapper(
        fasta.Parser(open_anything(fname)),
        self.sequence_id_regexp,
    )
    self.seq_ids_to_length = {rec.id: len(rec.seq) for rec in records}
def load_sequences(self, seq_file):
    """Loads the sequences from the given sequence file in FASTA format.

    Stores them in `self.seqs`, keyed by (possibly remapped) sequence ID.
    """
    self.log.info("Loading sequences from %s..." % seq_file)
    records = fasta.regexp_remapper(
        fasta.Parser(open_anything(seq_file)),
        self.options.sequence_id_regexp,
    )
    self.seqs = {record.id: record for record in records}
def process_sequences_file_old(self, fname):
    """This is the old version, all the entries are loaded into memory.

    Fills `self.seq_ids_to_length` with an ID -> sequence-length mapping,
    logging progress every million records.
    """
    self.log.info("Loading sequences from %s..." % fname)
    records = fasta.regexp_remapper(
        fasta.Parser(open_anything(fname)),
        self.sequence_id_regexp,
    )
    id_to_length = {}
    for count, record in enumerate(records):
        id_to_length[record.id] = len(record.seq)
        if count % 1000000 == 0:
            self.log.info("Read {} seqs".format(count))
    self.log.info("...loaded")
    self.seq_ids_to_length = id_to_length
def run_real(self):
    """Runs the application"""
    # Load valid sequence IDs (if necessary)
    if self.options.sequences_file:
        self.log.info(
            "Loading sequences from %s..." % self.options.sequences_file)
        self.total_sequence_length = 0
        self.valid_sequence_ids = set()
        reader = fasta.regexp_remapper(
            fasta.Parser(open_anything(self.options.sequences_file)),
            self.options.sequence_id_regexp,
        )
        for record in reader:
            self.valid_sequence_ids.add(record.id)
            self.total_sequence_length += len(record.seq)
    else:
        # No sequence file: accept every ID, total length unknown.
        self.valid_sequence_ids = complementerset()
        self.total_sequence_length = None

    # Find which sources will be allowed
    self.sources = (set(self.options.include_sources)
                    if self.options.include_sources else complementerset())
    self.sources.difference_update(self.options.exclude_sources)
    if isinstance(self.sources, complementerset):
        self.log.info("Ignored sources: %s"
                      % ", ".join(self.sources.iterexcluded()))
    else:
        self.log.info("Accepted sources: %s" % ", ".join(self.sources))

    # Default to stdin when no input files were given.
    if not self.args:
        self.args = ["-"]

    for arg in self.args:
        # Set up the output formatter, one per input file.
        formatter_cls = (GenomeLevelOutputFormatter
                         if self.options.print_totals
                         else SequenceLevelOutputFormatter)
        self.output_formatter = formatter_cls(self)
        # Process the file, then print the results.
        self.process_infile(arg)
        self.output_formatter.finish()
def process_sequences_file(self, fname):
    """
    In this version we use `shelve` to save memory (the pairs
    (protein accession, length) are stored in a temporary database).
    See `process_sequences_file_old` for the old version.

    Side effects: sets `self.filename_shelve` (path of the database) and
    `self.seq_ids_to_length` (an open shelf mapping ID -> length).
    """
    self.log.info("Loading sequences from {}...".format(fname))
    parser = fasta.Parser(open_anything(fname))
    parser = fasta.regexp_remapper(parser, self.sequence_id_regexp)
    # Fixed: the previous fixed path gettempdir()/"shelve_file" was shared
    # by all runs, so concurrent processes would corrupt each other's
    # database (and it is a predictable temp-file name). mkdtemp() creates
    # a fresh private directory instead.
    self.filename_shelve = os.path.join(tempfile.mkdtemp(), "shelve_file")
    self.seq_ids_to_length = shelve.open(self.filename_shelve)
    for i, seq in enumerate(parser):
        self.seq_ids_to_length[seq.id] = len(seq.seq)
        if i % 1000000 == 0:
            self.log.info("Read {} seqs".format(i))
            # Flush periodically so a crash loses at most one chunk.
            self.seq_ids_to_length.sync()
    self.log.info("...loaded")
def process_sequences_file(self, seq_file):
    """Processes the sequences one by one, extracting all the pieces
    into an output fasta file.

    Fragment coordinates come from `self.parts` (ID -> list of
    (left, right) 1-based pairs; negative values count from the end).
    Returns 1 if some fragment IDs were never seen in the FASTA file,
    None otherwise.
    """
    self.log.info("Processing fasta file %s..." % seq_file)
    parser = fasta.Parser(open_anything(seq_file))
    parser = fasta.regexp_remapper(parser, self.options.sequence_id_regexp)
    ids_to_process = set(self.parts.keys())
    writer = fasta.FastWriter(sys.stdout)
    if self.output_file is not None:
        output_fd = open(self.output_file, "w")
        writer_file = fasta.FastWriter(output_fd)
    for seq in parser:
        seq_id = seq.id
        if seq_id not in self.parts:
            if self.options.try_alternative_splicing:
                # Fixed: rstrip(".1") strips any trailing run of '.' and
                # '1' characters (e.g. "X.11" -> "X", "X1" -> "X"); we
                # only want to drop a literal ".1" suffix.
                seq_id = seq_id.strip()
                if seq_id.endswith(".1"):
                    seq_id = seq_id[:-2]
                if seq_id not in self.parts:
                    continue
            else:
                continue
        sequence = seq.seq
        length_seq = len(sequence)
        ids_to_process.remove(seq_id)
        for left, right in self.parts[seq_id]:
            # Negative coordinates count from the end of the sequence.
            if left < 0:
                left = length_seq + left + 1
            if right < 0:
                right = length_seq + right + 1
            right = min(right, length_seq)  # just in case...
            if left > right:  # again, just in case
                self.log.warning("Problem with fragment of %s, "
                                 "the right part is smaller than the left"
                                 % seq_id)
                continue
            new_record = None
            if left == 1 and right == length_seq:
                # Whole sequence requested; fragment() handles the ID.
                new_record = seq.fragment(not self.options.keep_ids)
            else:
                if not self.options.keep_ids:
                    new_id = "%s:%d-%d" % (seq_id, left, right)
                else:
                    new_id = seq_id
                new_record = SeqRecord(sequence[(left - 1):right],
                                       id=new_id, name=seq.name,
                                       description="")
            writer.write(new_record)
            if self.output_file is not None:
                writer_file.write(new_record)
    if self.output_file is not None:
        output_fd.close()
    if len(ids_to_process) > 0:
        # Fixed: the concatenated message was missing the space between
        # "were" and "found".
        self.log.fatal("The following identifiers of sequences (%s) were "
                       "found in the fragments file, but not in the fasta "
                       "file" % ",".join(ids_to_process))
        return 1
def process_sequences_file(self, seq_file):
    """Processes the sequences one by one, extracting all the pieces
    into an output fasta file.

    Fragment coordinates come from `self.parts` (ID -> list of
    (left, right) 1-based pairs; negative values count from the end).
    Returns 1 if some fragment IDs were never seen in the FASTA file,
    0 otherwise.
    """
    self.log.info("Processing fasta file %s...", seq_file)
    parser = fasta.Parser(open_anything(seq_file))
    parser = fasta.regexp_remapper(parser,
                                   self.options.sequence_id_regexp)
    ids_to_process = set(self.parts.keys())
    writer = fasta.FastWriter(sys.stdout)
    if self.output_file is not None:
        output_fd = open(self.output_file, "w")
        writer_file = fasta.FastWriter(output_fd)
    for seq in parser:
        seq_id = seq.id
        if seq_id not in self.parts:
            if self.options.try_alternative_splicing:
                # Fixed: rstrip(".1") strips any trailing run of '.' and
                # '1' characters (e.g. "X.11" -> "X", "X1" -> "X"); we
                # only want to drop a literal ".1" suffix.
                seq_id = seq_id.strip()
                if seq_id.endswith(".1"):
                    seq_id = seq_id[:-2]
                if seq_id not in self.parts:
                    continue
            else:
                continue
        sequence = seq.seq
        length_seq = len(sequence)
        ids_to_process.remove(seq_id)
        for left, right in self.parts[seq_id]:
            # Negative coordinates count from the end of the sequence.
            if left < 0:
                left = length_seq + left + 1
            if right < 0:
                right = length_seq + right + 1
            right = min(right, length_seq)  # just in case...
            if left > right:  # again, just in case
                self.log.warning(
                    "Problem with fragment of %s, "
                    "the right part is smaller than "
                    "the left", seq_id)
                continue
            new_record = None
            if left == 1 and right == length_seq:
                # Whole sequence requested; fragment() handles the ID.
                new_record = seq.fragment(not self.options.keep_ids)
            else:
                if not self.options.keep_ids:
                    new_id = "%s:%d-%d" % (seq_id, left, right)
                else:
                    new_id = seq_id
                new_record = SeqRecord(sequence[(left - 1):right],
                                       id=new_id, name=seq.name,
                                       description="")
            writer.write(new_record)
            if self.output_file is not None:
                writer_file.write(new_record)
    if self.output_file is not None:
        output_fd.close()
    if ids_to_process:
        # Fixed: the concatenated message was missing the space between
        # "were" and "found" (and carried a stray trailing space).
        self.log.fatal(
            "The following identifiers of sequences (%s) were "
            "found in the fragments file, but not in the "
            "fasta file", ",".join(ids_to_process))
        return 1
    return 0