def categorize_reads_print_to_files(readname_to_aln_list, UNALIGNED_FILE, CASSETTE_FILE, MULTIPLE_GENOMIC_FILE,
                                    GENOMIC_UNIQUE_FILE, unaligned_as_fasta=True, multiple_to_write=-1,
                                    input_collapsed_to_unique=False, no_warnings=False):
    """ Decide the proper category for each read, write to appropriate output file; return category counts.

    readname_to_aln_list is a readname:alignment_list dictionary; reads are processed in sorted order.

    Categories: unaligned, cassette (one or more cassette alignments - print warning if multiple),
     genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments).

    If input_collapsed_to_unique, for the purpose of category counts each read will be counted as N reads,
     with N determined from readname by get_seq_count_from_collapsed_header (fastx-collapser encoding).

    In the output category counts, cassette-multiple is a special subcategory -
     anything in it is also counted in cassette.

    Each read is printed to the appropriate outfile (all outfiles should be open file handles):
     - for multiple-genomic, at most multiple_to_write alignments are written;
        a negative value (the default) means all of them.
     - if unaligned_as_fasta, unaligned reads will be written as fasta instead of SAM format
        (and so will multiple-genomic if multiple_to_write is 0).
     - multiple_to_write=0 without unaligned_as_fasta is not implemented and raises Exception.
    """
    category_readcounts = {'unaligned': 0, 'cassette': 0, 'multiple-genomic': 0,
                           'genomic-unique': 0, 'cassette-multiple': 0}
    for readname, aln_list in sorted(readname_to_aln_list.items()):
        readcount = 1 if not input_collapsed_to_unique else get_seq_count_from_collapsed_header(readname)
        # if there's a single alignment, it's unaligned, cassette or genomic-unique
        if len(aln_list) == 1:
            aln = aln_list[0]
            if not aln.aligned:
                category_readcounts['unaligned'] += readcount
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
                else:
                    write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
            elif is_cassette_chromosome(aln.iv.chrom):
                category_readcounts['cassette'] += readcount
                write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
            else:
                category_readcounts['genomic-unique'] += readcount
                write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
        # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
        else:
            assert all([aln.aligned for aln in aln_list]), "Shouldn't see multiple unaligned lines per read!"
            # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
            # MAYBE-TODO come up with something better to do for multiple-cassette cases?  If they ever happen.
            if any([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]):
                assert all([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]), "Mixed cassette/other!"
                category_readcounts['cassette'] += readcount
                if not no_warnings:
                    # single parenthesized argument, so this prints the same text under Python 2 and 3;
                    #  the one-element tuple keeps the formatting safe if aln_list is itself a tuple.
                    print("Warning: multiple cassette alignments! Printing all to cassette file.\n\t%s" % (aln_list,))
                # NOTE(review): counted whether or not the warning is printed - the original indentation
                #  was lost, so confirm this wasn't meant to depend on no_warnings.
                category_readcounts['cassette-multiple'] += readcount
                for aln in aln_list:
                    write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
            # multiple genomic alignments - how many get written depends on multiple_to_write;
            #  if it's 0, the outfile should be fasta, or else I guess it should be written as unaligned?
            #  (MAYBE-TODO writing single multiple as unaligned not implemented!)
            else:
                category_readcounts['multiple-genomic'] += readcount
                if multiple_to_write == 0:
                    if unaligned_as_fasta:
                        write_fasta_line(readname, aln_list[0].read.seq, MULTIPLE_GENOMIC_FILE)
                    else:
                        raise Exception("Writing 0 multiple alignments in SAM format NOT IMPLEMENTED!")
                else:
                    # a plain aln_list[:-1] slice would silently drop the last alignment,
                    #  so negative values are handled explicitly as "no limit".
                    if multiple_to_write < 0:
                        alns_to_write = aln_list
                    else:
                        alns_to_write = aln_list[:multiple_to_write]
                    for aln in alns_to_write:
                        write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    return category_readcounts
def subsequence_counts(infile_reader, seq_length=None, input_collapsed_to_unique=False):
    """ Given an iterator of sequence records and desired subsequence length/end, return subsequence:count dict.

    Each record must have a .seq attribute (sliceable, convertible with str()); .name is only read
     if input_collapsed_to_unique is True.

    seq_length: if None (or 0), take whole seq; if N>0, take first N bases, if N<0, take last -N bases.

    If input_collapsed_to_unique is True, consider each sequence to be X reads, determined from the record name
     by get_seq_count_from_collapsed_header (the format used by fastx_collapser from FastX Toolkit).

    Returns a plain dict mapping subsequence string to total read count.
    """
    seq_counter = defaultdict(int)      # a counter with a default value of 0
    for sequence in infile_reader:
        N_seqs = get_seq_count_from_collapsed_header(sequence.name) if input_collapsed_to_unique else 1
        # convert Biopython Seq objects to plain strings - Seq objects aren't hashable correctly
        if seq_length is None:
            # handled explicitly: comparing None to an int raises TypeError on Python 3
            subsequence = str(sequence.seq)
        elif seq_length > 0:
            subsequence = str(sequence.seq[0:seq_length])
        else:
            subsequence = str(sequence.seq[seq_length:])
        seq_counter[subsequence] += N_seqs
    return dict(seq_counter)
def seq_count_and_lengths(seq_iterator, count_only=False, input_collapsed_to_unique=False):
    """ Count the sequences in seq_iterator; return (total_count, {seq_length: seq_count}) tuple.

    The length dictionary is left empty if count_only is True.
    Lengths come from len(seq), so every element of seq_iterator must support len().

    If input_collapsed_to_unique, each element counts as the number of reads encoded in its .name
     (decoded by get_seq_count_from_collapsed_header) instead of as 1.
    """
    n_total = 0
    counts_by_length = {}
    for record in seq_iterator:
        if input_collapsed_to_unique:
            weight = get_seq_count_from_collapsed_header(record.name)
        else:
            weight = 1
        n_total += weight
        if count_only:
            continue
        record_length = len(record)
        counts_by_length[record_length] = counts_by_length.get(record_length, 0) + weight
    return n_total, counts_by_length
def seq_split_by_length(infile, min_length=None, max_length=None, force_fasta_output=False, include_empty_files=False,
                        ignore_zero_length_sequences=False, pad_filenames_for_sort=0, input_collapsed_to_unique=False,
                        quiet=False):
    """ Split the sequences in infile into one output file per sequence length.

    See module docstring and optparse option help messages for the full option descriptions.

    The input format is picked from the infile extension (fa/fasta or fq/fastq, case-insensitive);
     any other extension ends the program through sys.exit with an error message.
    The output folder is infile without its extension, created with os.mkdir.
    Outfiles are named <length>bp.<extension>, with the length zero-padded to pad_filenames_for_sort digits;
     sequences shorter than min_length all go to the (min_length-1)bp_or_less file,
     and sequences longer than max_length to the (max_length+1)bp_or_more file.
    If force_fasta_output, everything is written as fasta with a 'fa' extension.
    If ignore_zero_length_sequences, zero-length sequences are counted but not written.
    If include_empty_files, empty outfiles are also created for every length from min_length to max_length
     that had no sequences (missing bounds are taken from the lengths that were seen).
    If input_collapsed_to_unique, each sequence counts as the number of reads encoded in its name
     (decoded by get_seq_count_from_collapsed_header) in the printed summary.

    Unless quiet, prints the by-length sequence counts as formatted by _format_lengths.  Returns None.
    """
    # file format recognition (I could do it by trying to use FastaReader/FastqReader on it, but it's annoying)
    fasta_extensions = ['fa', 'fasta']
    fastq_extensions = ['fq', 'fastq']
    extension = os.path.splitext(infile)[1].lower()[1:]
    if extension in fasta_extensions:
        infile_reader = FastaReader(infile)
    elif extension in fastq_extensions:
        infile_reader = FastqReader(infile, qual_scale="solexa")
    else:
        sys.exit("Error: input file %s (extension %s) needs to have a %s extension to be recognized!"%(infile,
                                                    extension, '/'.join(fasta_extensions+fastq_extensions)))
    write_as_fasta = force_fasta_output or extension in fasta_extensions
    if force_fasta_output:
        extension = 'fa'

    ### make the output folder, and outfiles
    infile_base = os.path.splitext(infile)[0]
    outfolder = infile_base
    os.mkdir(outfolder)

    def _open_outfile(seqlen):
        """ Open (for writing) the outfile for seqlen, named the same way for real and empty files. """
        seqlen_string = "%0*dbp"%(pad_filenames_for_sort, seqlen)
        if min_length is not None and seqlen < min_length:
            seqlen_string += '_or_less'
        elif max_length is not None and seqlen > max_length:
            seqlen_string += '_or_more'
        filename = "%s.%s"%(seqlen_string, extension)
        return open(os.path.join(outfolder, filename), 'w')

    # a (length: open file object) dictionary - all the files are kept open while reading the input,
    #  and the try/finally makes sure they get closed even if reading or writing fails midway.
    len_to_outfile_dict = {}
    # a counter with a default value of 0
    seq_counter = defaultdict(int)
    try:
        for seq in infile_reader:
            seqlen = len(seq)
            # add the N_seqs to the seq counter (this uses the real length, before min/max adjustment)
            N_seqs = get_seq_count_from_collapsed_header(seq.name) if input_collapsed_to_unique else 1
            seq_counter[seqlen] += N_seqs
            if ignore_zero_length_sequences and seqlen == 0:
                continue
            # special length cases for when min/max length is set
            if min_length is not None and seqlen < min_length:
                seqlen = min_length - 1
            elif max_length is not None and seqlen > max_length:
                seqlen = max_length + 1
            # if outfile for that length doesn't exist, create it
            if seqlen not in len_to_outfile_dict:
                len_to_outfile_dict[seqlen] = _open_outfile(seqlen)
            # write the sequence (fasta or fastq!) to the outfile
            if write_as_fasta:
                seq.write_to_fasta_file(len_to_outfile_dict[seqlen])
            else:
                seq.write_to_fastq_file(len_to_outfile_dict[seqlen])
        # optionally add the empty files that had no sequences of that length
        if include_empty_files:
            seen_lengths = sorted(len_to_outfile_dict)
            # a bound not given by the caller is taken from the lengths seen; if there were none, stays None
            low = min_length if min_length is not None else (seen_lengths[0] if seen_lengths else None)
            high = max_length if max_length is not None else (seen_lengths[-1] if seen_lengths else None)
            if low is not None and high is not None:
                # the range includes both ends, so caller-given min/max lengths get their files too
                for seqlen in range(low, high + 1):
                    if seqlen not in len_to_outfile_dict:
                        len_to_outfile_dict[seqlen] = _open_outfile(seqlen)
    finally:
        # close all the files
        for FILE in len_to_outfile_dict.values():
            FILE.close()

    # format and print the seq counts by length
    if not quiet:
        if 0 in seq_counter and ignore_zero_length_sequences:
            print("(discarding zero-length sequences)")
        for line in _format_lengths(seq_counter, include_empty_files, 1):
            # kept exactly as it was: under the Python 2 print statement the trailing comma suppresses
            #  the added newline.  NOTE(review): presumably the lines already end with a newline - confirm.
            print(line),
def categorize_reads_print_to_files(readname, aln_list, category_readcounts, UNALIGNED_FILE, CASSETTE_FILE,
                                    MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE, unaligned_as_fasta=False,
                                    multiple_to_write=-1, input_collapsed_to_unique=False,
                                    no_multi_cassette_warnings=False):
    """ Decide the proper category for the read, write to appropriate output file; adjust category counts.

    Categories: unaligned, cassette (one or more cassette alignments - print warning if multiple),
     genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments).

    The read will be categorized, and printed to the appropriate file (all the uppercase arguments should be
     open file objects; they can all be the SAME file object if desired.)

    If input_collapsed_to_unique, for the purpose of category counts the read will be counted as N reads,
     with N determined from readname by get_seq_count_from_collapsed_header (fastx-collapser encoding).

    Exactly one entry of category_readcounts (a category:count dictionary, modified in place) is incremented:
     a read with multiple cassette alignments is counted as cassette-multiple when the warning is enabled,
     and as plain cassette when no_multi_cassette_warnings is True.

    For multiple cassette alignments only one alignment is written to CASSETTE_FILE (the first one when sorted
     by chromosome, strand, start, end), with '_and_others' appended to its chromosome name;
     this changes the chromosome name on that alignment object in aln_list.

    For multiple-genomic, only N=multiple_to_write lines will be written (negative N, the default, means all);
     if N=0, one line will be written that treats the read as unaligned, but with XM:i:M optional tag field added,
      where M is the number of multiple alignments.
    If unaligned_as_fasta, unaligned reads will be written as fasta instead of SAM format,
     and so will multiple if multiple_to_write is 0.

    Returns the category name.
    """
    readcount = 1 if not input_collapsed_to_unique else get_seq_count_from_collapsed_header(readname)
    # if there's a single alignment, it's unaligned, cassette or genomic-unique
    if len(aln_list) == 1:
        aln = aln_list[0]
        if not aln.aligned:
            category = 'unaligned'
            if unaligned_as_fasta:
                write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
            else:
                write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
        elif is_cassette_chromosome(aln.iv.chrom):
            category = 'cassette'
            write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
        else:
            category = 'genomic-unique'
            write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
    # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
    else:
        assert all([aln.aligned for aln in aln_list]), "Shouldn't see multiple unaligned lines per read!"
        # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
        # MAYBE-TODO come up with something better to do for multiple-cassette cases?  If they ever happen.
        # (NOTE: sometimes they happen because I'm actually aligning to multiple cassettes - then they're fine.)
        if any([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]):
            assert all([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]), "Mixed cassette/other!"
            if not no_multi_cassette_warnings:
                first_positions = ', '.join(["%s %s %s" % (a.iv.chrom, a.iv.strand, a.iv.start)
                                             for a in aln_list[:3]])
                # one pre-formatted string: passing two strings in parentheses makes the Python 2
                #  print statement print a tuple repr instead of the message.
                print("Warning: multiple cassette alignments! Printing only one to cassette file. "
                      "Seq %s, first 3 positions %s" % (aln_list[0].read.seq, first_positions))
                category = 'cassette-multiple'
            else:
                category = 'cassette'
            # first position alphabetically is chosen - MAYBE-TODO add other choice options?
            aln_to_print = min(aln_list, key=lambda a: (a.iv.chrom, a.iv.strand, a.iv.start, a.iv.end))
            # just add _and_others to the chromosome - MAYBE-TODO add something more informative, like list of names?
            #  but that would be tricky, need to strip matching prefixes from them,
            #  what about multiple alignments to SAME chromosome, etc.
            aln_to_print.iv.chrom = aln_to_print.iv.chrom + '_and_others'
            write_SAM_line_from_HTSeq_aln(aln_to_print, CASSETTE_FILE)
        # multiple genomic alignments:
        # - if multiple_to_write=0, treat multiple as unaligned - if unaligned_as_fasta, print fasta line,
        #    else single unaligned SAM line, with XM:i:M optional tag field added, where M is the number of alignments.
        # - if multiple_to_write>0, print that many normal SAM lines; if negative, print all of them
        # MAYBE-TODO add an option to write multiple as unaligned to the main SAM file AND full lines to another file?
        else:
            category = 'multiple-genomic'
            if multiple_to_write == 0:
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln_list[0].read.seq, MULTIPLE_GENOMIC_FILE)
                else:
                    aln = aln_list[0]
                    MULTIPLE_GENOMIC_FILE.write('%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\tXM:i:%s\n' % (
                        aln.read.name, aln.read.seq, aln.read.qualstr, len(aln_list)))
            else:
                # a plain aln_list[:-1] slice would silently drop the last alignment,
                #  so negative values are handled explicitly as "no limit".
                if multiple_to_write < 0:
                    alns_to_write = aln_list
                else:
                    alns_to_write = aln_list[:multiple_to_write]
                for aln in alns_to_write:
                    write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    category_readcounts[category] += readcount
    return category
def categorize_reads_print_to_files(readname, aln_list, category_readcounts, UNALIGNED_FILE, CASSETTE_FILE,
                                    MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE, unaligned_as_fasta=False,
                                    multiple_to_write=-1, input_collapsed_to_unique=False,
                                    no_multi_cassette_warnings=False):
    """ Decide the proper category for the read, write to appropriate output file; adjust category counts.

    Categories: unaligned, cassette (one or more cassette alignments - print warning if multiple),
     genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments).

    The read will be categorized, and printed to the appropriate file (all the uppercase arguments should be
     open file objects; they can all be the SAME file object if desired.)

    If input_collapsed_to_unique, for the purpose of category counts the read will be counted as N reads,
     with N determined from readname by get_seq_count_from_collapsed_header (fastx-collapser encoding).

    category_readcounts is a category:count dictionary, modified in place: exactly one entry is incremented.
     A read with multiple cassette alignments goes to cassette-multiple if the warning is enabled,
     or to plain cassette if no_multi_cassette_warnings is True.

    When there are multiple cassette alignments, a single one is written to CASSETTE_FILE - the lowest by
     (chromosome, strand, start, end) - after '_and_others' is appended to its chromosome name
     (the alignment object from aln_list itself is modified).

    For multiple-genomic, only N=multiple_to_write lines will be written (all of them if N is negative,
     which is the default); if N=0, one line will be written that treats the read as unaligned,
     but with XM:i:M optional tag field added, where M is the number of multiple alignments.
    If unaligned_as_fasta, unaligned reads will be written as fasta instead of SAM format,
     and so will multiple if multiple_to_write is 0.

    Returns the name of the category the read was assigned to.
    """
    readcount = 1 if not input_collapsed_to_unique else get_seq_count_from_collapsed_header(readname)
    # if there's a single alignment, it's unaligned, cassette or genomic-unique
    if len(aln_list) == 1:
        aln = aln_list[0]
        if not aln.aligned:
            category = 'unaligned'
            if unaligned_as_fasta:
                write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
            else:
                write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
        elif is_cassette_chromosome(aln.iv.chrom):
            category = 'cassette'
            write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
        else:
            category = 'genomic-unique'
            write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
    # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
    else:
        assert all([aln.aligned for aln in aln_list]), "Shouldn't see multiple unaligned lines per read!"
        # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
        # MAYBE-TODO come up with something better to do for multiple-cassette cases?  If they ever happen.
        # (NOTE: sometimes they happen because I'm actually aligning to multiple cassettes - then they're fine.)
        if any([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]):
            assert all([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]), "Mixed cassette/other!"
            if not no_multi_cassette_warnings:
                positions_text = ', '.join(["%s %s %s"%(a.iv.chrom, a.iv.strand, a.iv.start) for a in aln_list[:3]])
                # the message is built as ONE string: two parenthesized strings after the Python 2
                #  print statement get printed as a tuple repr rather than as text.
                print("Warning: multiple cassette alignments! Printing only one to cassette file. "
                      "Seq %s, first 3 positions %s"%(aln_list[0].read.seq, positions_text))
                category = 'cassette-multiple'
            else:
                category = 'cassette'
            # first position alphabetically is chosen - MAYBE-TODO add other choice options?
            aln_to_print = min(aln_list, key=lambda a: (a.iv.chrom, a.iv.strand, a.iv.start, a.iv.end))
            # just add _and_others to the chromosome - MAYBE-TODO add something more informative, like list of names?
            #  but that would be tricky, need to strip matching prefixes from them,
            #  what about multiple alignments to SAME chromosome, etc.
            aln_to_print.iv.chrom = aln_to_print.iv.chrom + '_and_others'
            write_SAM_line_from_HTSeq_aln(aln_to_print, CASSETTE_FILE)
        # multiple genomic alignments:
        # - if multiple_to_write=0, treat multiple as unaligned - if unaligned_as_fasta, print fasta line,
        #    else single unaligned SAM line, with XM:i:M optional tag field added, where M is the number of alignments.
        # - if multiple_to_write>0, print that many normal SAM lines; if it's negative, print all the alignments
        # MAYBE-TODO add an option to write multiple as unaligned to the main SAM file AND full lines to another file?
        else:
            category = 'multiple-genomic'
            if multiple_to_write == 0:
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln_list[0].read.seq, MULTIPLE_GENOMIC_FILE)
                else:
                    aln = aln_list[0]
                    MULTIPLE_GENOMIC_FILE.write('%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\tXM:i:%s\n'%(aln.read.name,
                                                        aln.read.seq, aln.read.qualstr, len(aln_list)))
            else:
                # negative means "no limit" - slicing with aln_list[:-1] would leave out the last alignment
                alns_to_write = aln_list if multiple_to_write < 0 else aln_list[:multiple_to_write]
                for aln in alns_to_write:
                    write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    category_readcounts[category] += readcount
    return category