def parse_fastx_sam_parallel(fastx_infile, sam_infile):
    """ Parse fastx and resulting sam file in parallel - generator yielding (name, seq, alignment_list) tuples.

    The sam file may contain multiple alignments per read; they are bundled per read with
    HTSeq.bundle_multiple_alignments, so each yielded alignment_list belongs to a single read.

    The two files are advanced in lockstep (one fastx record per one alignment bundle), and the
    readnames are checked on every step: only the first whitespace-delimited token of each name is
    compared, and only the first alignment of the bundle is looked at. The yielded name is that
    first token of the fastx name.

    Raises DeepseqError if one file runs out of records before the other, or if the readnames
    at the same position differ.
    """
    fastx_generator = basic_seq_utilities.name_seq_generator_from_fasta_fastq(fastx_infile)
    sam_generator = iter(HTSeq.bundle_multiple_alignments(HTSeq.SAM_Reader(sam_infile)))
    if_finished_fastx, if_finished_sam = False, False
    while True:
        # Always try to advance BOTH inputs before deciding anything, so that a length mismatch
        # is detected regardless of which file is the shorter one.
        # The builtin next() is used instead of the .next() method: it works on python 2.6+ and 3.
        try:
            name, seq = next(fastx_generator)
        except StopIteration:
            if_finished_fastx = True
        try:
            alns = next(sam_generator)
        except StopIteration:
            if_finished_sam = True
        # if both finished, good, we're done.  A plain return ends the generator; raising
        # StopIteration inside a generator body is turned into a RuntimeError by PEP 479.
        if if_finished_fastx and if_finished_sam:
            return
        # if one file was finished but the other wasn't, error!
        if if_finished_fastx or if_finished_sam:
            raise DeepseqError("Parsing seq/aln files in parallel - inconsistent finished states! "
                               +"(If finished: %s %s, %s %s)"%(fastx_infile, if_finished_fastx,
                                                               sam_infile, if_finished_sam))
        # if all the files still contained data, check the names and yield it
        name = name.split()[0]
        name2 = alns[0].read.name.split()[0]
        if name2 != name:
            # argument order matches the "<name> in <file>" wording of the message
            raise DeepseqError("Non-matching readnames between files! %s in %s, %s in %s"
                               %(name, fastx_infile, name2, sam_infile))
        yield (name, seq, alns)
def trim_prefix(prefix_bases, infile, trimmed_outfile, wrong_prefix_outfile=os.devnull, INFOFILE=None, verbosity=1):
    """ Trim prefix_bases from seqs in infile and print to trimmed_outfile; print other seqs to wrong_prefix_outfile.

    Reads fasta or fastq files; outputs fasta files only.
    For each seq in infile, if seq starts with prefix_bases, trim them and print result to trimmed_outfile;
    otherwise print full seq to wrong_prefix_outfile.  If wrong_prefix_outfile is None, os.devnull is used,
    i.e. the untrimmed sequences are discarded.  Both output files are opened in 'w' mode (overwritten).

    INFOFILE should be an open file handle to print summary info to, or None.
    verbosity governs how much is printed to stdout: the header line is printed if verbosity>0,
    the trimmed/untrimmed summary if verbosity>1, and (verbosity>2) is passed on as the second argument
    of name_seq_generator_from_fasta_fastq (presumably its own verbosity switch - TODO confirm).

    Returns a (N_trimmed, N_untrimmed) tuple of sequence counts.
    """
    # The docstring has always allowed wrong_prefix_outfile=None, but open(None, 'w') raises -
    #  so map None to the null device explicitly.
    if wrong_prefix_outfile is None:
        wrong_prefix_outfile = os.devnull
    text = "### Trimming %s from start of each sequence in %s (output to %s, untrimmed to %s)\n"%(prefix_bases, infile,
                                                                                  trimmed_outfile, wrong_prefix_outfile)
    # MAYBE-TODO modify so it can output fastq too?
    if INFOFILE is not None:
        INFOFILE.write(text+'\n')
    # print(text) with a single string gives the same output under the python2 print statement and python3
    if verbosity>0:
        print(text)
    N_trimmed, N_untrimmed = 0, 0
    with open(trimmed_outfile, 'w') as TRIMMED_OUTFILE:
        with open(wrong_prefix_outfile, 'w') as WRONG_PREFIX_OUTFILE:
            # MAYBE-TODO right now if wrong_prefix_outfile is None, /dev/null is used - it would be faster with
            #  a custom file-like object that doesn't touch the OS, but I'm not sure how to write one so it can
            #  be opened!  See general_utilities.FAKE_OUTFILE for an already open one.
            name_seq_generator = name_seq_generator_from_fasta_fastq(infile, verbosity>2)
            for name,seq in name_seq_generator:
                # the helper is handed both output handles and reports (truthy/falsy) whether the seq was trimmed
                if_trimmed = _trim_prefix_single(name, seq, prefix_bases, TRIMMED_OUTFILE, WRONG_PREFIX_OUTFILE)
                if if_trimmed:
                    N_trimmed += 1
                else:
                    N_untrimmed += 1
    N_total = N_trimmed + N_untrimmed
    text = "Trimmed sequences: %s\nUntrimmed sequences: %s\n"%(value_and_percentages(N_trimmed, [N_total]),
                                                               value_and_percentages(N_untrimmed, [N_total]))
    if INFOFILE is not None:
        INFOFILE.write(text+'\n')
    if verbosity>1:
        print(text)
    return N_trimmed, N_untrimmed
def trim_prefix(prefix_bases, infile, trimmed_outfile, wrong_prefix_outfile=os.devnull, INFOFILE=None, verbosity=1):
    """ Trim prefix_bases from seqs in infile and print to trimmed_outfile; print other seqs to wrong_prefix_outfile.

    Reads fasta or fastq files; outputs fasta files only.
    For each seq in infile, if seq starts with prefix_bases, trim them and print result to trimmed_outfile;
    otherwise print full seq to wrong_prefix_outfile.  Passing wrong_prefix_outfile=None is treated the same
    as the default (os.devnull): sequences without the prefix are thrown away.
    Both output files are opened for writing, so existing contents are overwritten.

    INFOFILE should be an open file handle to print summary info to, or None.
    verbosity governs how much is printed to stdout:
     - above 0, the "### Trimming ..." header is printed;
     - above 1, the trimmed/untrimmed counts are printed as well;
     - (verbosity > 2) is forwarded as the second argument of name_seq_generator_from_fasta_fastq
        (looks like a verbosity flag for the parser - TODO confirm).

    Returns the sequence counts as an (N_trimmed, N_untrimmed) tuple.
    """
    # NOTE(review): the same signature is defined a second time in the visible source; if both
    #  definitions live in one module, the later one is the one that stays bound.

    # open() cannot take None, although the docstring promises None is acceptable - fall back to the null device.
    if wrong_prefix_outfile is None:
        wrong_prefix_outfile = os.devnull

    text = "### Trimming %s from start of each sequence in %s (output to %s, untrimmed to %s)\n" % (
        prefix_bases, infile, trimmed_outfile, wrong_prefix_outfile)
    # MAYBE-TODO modify so it can output fastq too?
    if INFOFILE is not None:
        INFOFILE.write(text + '\n')
    if verbosity > 0:
        # parenthesized single-argument print: same output as the python2 statement, and valid python3
        print(text)

    N_trimmed, N_untrimmed = 0, 0
    with open(trimmed_outfile, 'w') as TRIMMED_OUTFILE:
        with open(wrong_prefix_outfile, 'w') as WRONG_PREFIX_OUTFILE:
            # MAYBE-TODO when the untrimmed seqs are not wanted, /dev/null is opened - it would be faster with
            #  a custom file-like object that doesn't touch the OS, but I'm not sure how to write one so it can
            #  be opened!  See general_utilities.FAKE_OUTFILE for an already open one.
            name_seq_generator = name_seq_generator_from_fasta_fastq(
                infile, verbosity > 2)
            for name, seq in name_seq_generator:
                # _trim_prefix_single gets both handles; its truthy/falsy result says which counter to bump
                if_trimmed = _trim_prefix_single(name, seq, prefix_bases,
                                                 TRIMMED_OUTFILE, WRONG_PREFIX_OUTFILE)
                if if_trimmed:
                    N_trimmed += 1
                else:
                    N_untrimmed += 1

    N_total = N_trimmed + N_untrimmed
    text = "Trimmed sequences: %s\nUntrimmed sequences: %s\n" % (
        value_and_percentages(N_trimmed, [N_total]),
        value_and_percentages(N_untrimmed, [N_total]))
    if INFOFILE is not None:
        INFOFILE.write(text + '\n')
    if verbosity > 1:
        print(text)
    return N_trimmed, N_untrimmed