def categorize_reads_print_to_files(readname_to_aln_list, UNALIGNED_FILE, CASSETTE_FILE, MULTIPLE_GENOMIC_FILE,
                                    GENOMIC_UNIQUE_FILE, unaligned_as_fasta=True, multiple_to_write=-1,
                                    input_collapsed_to_unique=False, no_warnings=False):
    """ Decide the proper category for each read, write to appropriate output file; return category counts.

    Categories: unaligned, cassette (one or more cassette alignments - print warning if multiple),
     genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments).
    In the output category counts, cassette-multiple is a special subcategory - anything in it
     is also counted in cassette.

    Arguments:
     - readname_to_aln_list - readname:alignment_list dictionary; reads are processed in sorted readname order.
     - UNALIGNED_FILE, CASSETTE_FILE, MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE - open file handles
        that the reads of each category are written to.
     - unaligned_as_fasta - if True, unaligned reads are written as fasta instead of SAM format
        (and so are multiple-genomic reads if multiple_to_write is 0).
     - multiple_to_write - how many alignments to write for each multiple-genomic read.
        0 writes a single fasta line (only implemented if unaligned_as_fasta is True);
        a negative value (the default) or None writes ALL the alignments.
     - input_collapsed_to_unique - if True, for the purpose of category counts each read is counted as N reads,
        with N determined from readname (by get_seq_count_from_collapsed_header) using the fastx-collapser encoding.
     - no_warnings - if True, the multiple-cassette-alignment warning is not printed.

    Returns the category:readcount dictionary.
    Raises NotImplementedError if multiple_to_write is 0 and unaligned_as_fasta is False and a multiple-genomic
     read is encountered; AssertionError on a multi-alignment read with unaligned or mixed cassette/genomic lines.
    """
    category_readcounts = {'unaligned':0, 'cassette':0, 'multiple-genomic':0, 'genomic-unique':0, 'cassette-multiple':0}
    for readname,aln_list in sorted(readname_to_aln_list.items()):
        readcount = 1 if not input_collapsed_to_unique else get_seq_count_from_collapsed_header(readname)
        # if there's a single alignment, it's unaligned, cassette or genomic-unique
        if len(aln_list) == 1:
            aln = aln_list[0]
            if not aln.aligned:
                category_readcounts['unaligned'] += readcount
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
                else:
                    write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
            elif is_cassette_chromosome(aln.iv.chrom):
                category_readcounts['cassette'] += readcount
                write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
            else:
                category_readcounts['genomic-unique'] += readcount
                write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
        # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
        else:
            assert all([aln.aligned for aln in aln_list]), "Shouldn't see multiple unaligned lines per read!"
            # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
            # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
            if any([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]):
                assert all([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]), "Mixed cassette/other!"
                category_readcounts['cassette'] += readcount
                if not no_warnings:
                    # print() with a single argument behaves the same on python2 and python3;
                    #  the one-element tuple keeps the formatting valid even if aln_list is itself a tuple.
                    print("Warning: multiple cassette alignments! Printing all to cassette file.\n\t%s"%(aln_list,))
                # the subcategory count doesn't depend on whether the warning was printed
                category_readcounts['cassette-multiple'] += readcount
                for aln in aln_list:
                    write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
            # multiple genomic alignments - how many get written depends on multiple_to_write;
            #  if it's 0, the outfile should be fasta, or else I guess it should be written as unaligned?
            #  (MAYBE-TODO writing single multiple as unaligned not implemented!)
            else:
                category_readcounts['multiple-genomic'] += readcount
                if multiple_to_write == 0:
                    if unaligned_as_fasta:
                        write_fasta_line(readname, aln_list[0].read.seq, MULTIPLE_GENOMIC_FILE)
                    else:
                        raise NotImplementedError("Writing 0 multiple alignments in SAM format NOT IMPLEMENTED!")
                else:
                    # a negative limit means "no limit" - slicing with it directly (aln_list[:-1])
                    #  would silently drop the last alignment instead.
                    if multiple_to_write is None or multiple_to_write < 0:
                        alns_to_write = aln_list
                    else:
                        alns_to_write = aln_list[:multiple_to_write]
                    for aln in alns_to_write:
                        write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    return category_readcounts
def grab_flanking_regions_from_pos_dict(
    insertion_position_dict,
    genome,
    flanksize=200,
    padding_char=".",
    chromosome_check_function=lambda x: True,
    ignore_both_strand_mutants=False,
):
    """Return a list of (flanking_seq, 1) tuples for all the positions in insertion_position_dict.

    Same as grab_flanking_regions_from_mutantfile, but takes input as a (chrom,strand):pos_list
    dictionary instead, and assumes all readcounts to be 1.

    Arguments:
     - insertion_position_dict - (chromosome,strand):position_list dictionary.
     - genome - chromosome_name:sequence dictionary.
     - flanksize, padding_char - passed on to flanking_region_from_pos.
     - chromosome_check_function - positions on chromosomes for which this returns False are skipped.
     - ignore_both_strand_mutants - if True, positions with strand "both" are skipped instead of
        causing a ValueError.

    Positions on a cassette chromosome (per mutant_analysis_classes.is_cassette_chromosome) that are
    0 or the chromosome length are skipped (cassette tandems).

    Raises ValueError for any strand other than exactly "+" or "-" (except for "both" when
    ignore_both_strand_mutants is True).
    """
    flanking_region_count_list = []
    for (chromosome, strand), pos_list in insertion_position_dict.items():
        for position_before_insertion in pos_list:
            # filter out positions in wrong chromosomes; filter out both-stranded positions if desired
            if not chromosome_check_function(chromosome):
                continue
            # tuple membership, not a substring test: `strand not in "+-"` would accept "" and "+-" as valid
            if strand not in ("+", "-"):
                if strand == "both" and ignore_both_strand_mutants:
                    continue
                raise ValueError("Unexpected strand! %s" % strand)
            # ignore cassette tandems (i.e. insertions that map to start or end of cassette)
            if mutant_analysis_classes.is_cassette_chromosome(chromosome):
                if position_before_insertion in [0, len(genome[chromosome])]:
                    continue
            # grab the actual flanking sequence, with padding, correct orientation etc
            full_flanking_seq = flanking_region_from_pos(
                position_before_insertion, chromosome, strand, genome, flanksize, padding_char
            )
            # append the sequence and readcount to output data
            flanking_region_count_list.append((full_flanking_seq, 1))
    return flanking_region_count_list
def grab_flanking_region_motif_counts_from_pos_dict(insertion_position_dict, genome, flanksize=2,
                                                    chromosome_check_function=lambda x: True, ignore_both_strand_mutants=False):
    """ Get a flanking_seq:count dictionary for the flanking seqs for insertion_position_dict.

    Only really makes sense for small flanksizes - otherwise the total number of possible motifs
     will be huge (4^(flanksize*2)).  Assumes all readcounts to be 1.

    Arguments:
     - insertion_position_dict - (chromosome,strand):position_list dictionary.
     - genome - chromosome_name:sequence dictionary.
     - flanksize - passed on to flanking_region_from_pos; the motifs counted have length flanksize*2.
     - chromosome_check_function - positions on chromosomes for which this returns False are skipped.
     - ignore_both_strand_mutants - if True, positions with strand 'both' are skipped instead of
        causing a ValueError.

    Returns a motif:count dictionary with one key for every possible flanksize*2-length sequence of
     NORMAL_DNA_BASES (so motifs that never show up have a count of 0).
    Flanking sequences that aren't one of those keys are silently ignored.
    Positions on a cassette chromosome (per mutant_analysis_classes.is_cassette_chromosome) that are
     0 or the chromosome length are skipped (cassette tandems).

    Raises ValueError for any strand other than exactly '+' or '-' (except for 'both' when
     ignore_both_strand_mutants is True).
    """
    # initialize the motif-count dict with all possible motifs of the right length (flanksize on each side) -
    #  a fixed 4-base table would never match the flanking sequences for any flanksize other than 2.
    motif_length = flanksize*2
    motif_count_dict = {''.join(motif_bases): 0
                        for motif_bases in itertools.product(NORMAL_DNA_BASES, repeat=motif_length)}
    # for each position, grab the flanking region and add it to the motif_count_dict
    for (chromosome,strand),pos_list in insertion_position_dict.items():
        for position_before_insertion in pos_list:
            # filter out positions in wrong chromosomes; filter out both-stranded positions if desired
            if not chromosome_check_function(chromosome):
                continue
            # tuple membership, not a substring test: `strand not in '+-'` would accept '' and '+-' as valid
            if strand not in ('+', '-'):
                if strand=='both' and ignore_both_strand_mutants:
                    continue
                raise ValueError("Unexpected strand! %s"%strand)
            # ignore cassette tandems (i.e. insertions that map to start or end of cassette)
            if mutant_analysis_classes.is_cassette_chromosome(chromosome):
                if position_before_insertion in [0, len(genome[chromosome])]:
                    continue
            # grab the actual flanking sequence, with padding, correct orientation etc
            full_flanking_seq = flanking_region_from_pos(position_before_insertion, chromosome, strand, genome, flanksize)
            # add motif-count of full_flanking_seq to motif_count_dict
            try:
                motif_count_dict[full_flanking_seq] += 1
            except KeyError:
                pass
            # MAYBE-TODO add an option to NOT ignore motifs that aren't in NORMAL_DNA_BASES?
    return motif_count_dict
def grab_flanking_region_base_counts_from_pos_dict(insertion_position_dict, genome, flanksize=200,
                                                   chromosome_check_function=lambda x: True, ignore_both_strand_mutants=False):
    """ Same as base_count_dict(grab_flanking_regions_from_mutantfile(*args)), but saves memory by not keeping all the sequences.

    Basically instead of making a full dataset of flanking regions with grab_flanking_regions_from_mutantfile
     (which can be BIG) and then converting those to a base-count dict with base_count_dict,
     just go over each position in insertion_position_dict, grab that flanking region,
     add it to the current base-count dict, and go on to the next one, without saving.
    Assumes all readcounts to be 1.

    Arguments:
     - insertion_position_dict - (chromosome,strand):position_list dictionary.
     - genome - chromosome_name:sequence dictionary.
     - flanksize - passed on to flanking_region_from_pos; the count lists have length flanksize*2.
     - chromosome_check_function - positions on chromosomes for which this returns False are skipped.
     - ignore_both_strand_mutants - if True, positions with strand 'both' are skipped instead of
        causing a ValueError.

    Returns a base:count_list dictionary with one key per base in NORMAL_DNA_BASES, where count_list[i]
     is the number of flanking sequences with that base (case-insensitive) at position i.
    Characters that aren't in NORMAL_DNA_BASES are silently ignored.
    Positions on a cassette chromosome (per mutant_analysis_classes.is_cassette_chromosome) that are
     0 or the chromosome length are skipped (cassette tandems).

    Raises ValueError for any strand other than exactly '+' or '-' (except for 'both' when
     ignore_both_strand_mutants is True).
    """
    # initialize the base-count lists to the right length, and fill it out by going over all the seqs
    flank_total_length = flanksize*2
    base_count_dict = {base: [0]*flank_total_length for base in NORMAL_DNA_BASES}
    # for each position, grab the flanking region and add it to the base_count_dict
    for (chromosome,strand),pos_list in insertion_position_dict.items():
        for position_before_insertion in pos_list:
            # filter out positions in wrong chromosomes; filter out both-stranded positions if desired
            if not chromosome_check_function(chromosome):
                continue
            # tuple membership, not a substring test: `strand not in '+-'` would accept '' and '+-' as valid
            if strand not in ('+', '-'):
                if strand=='both' and ignore_both_strand_mutants:
                    continue
                raise ValueError("Unexpected strand! %s"%strand)
            # ignore cassette tandems (i.e. insertions that map to start or end of cassette)
            if mutant_analysis_classes.is_cassette_chromosome(chromosome):
                if position_before_insertion in [0, len(genome[chromosome])]:
                    continue
            # grab the actual flanking sequence, with padding, correct orientation etc
            full_flanking_seq = flanking_region_from_pos(position_before_insertion, chromosome, strand, genome, flanksize)
            # add base-counts from full_flanking_seq to base_count_dict
            for position, base in enumerate(full_flanking_seq.upper()):
                try:
                    base_count_dict[base][position] += 1
                except KeyError:
                    pass
                # MAYBE-TODO add an option to NOT ignore bases that aren't in NORMAL_DNA_BASES?
    return base_count_dict
def grab_flanking_regions_from_mutantfile(
    mutant_dataset_infile,
    genome,
    flanksize=200,
    padding_char=".",
    min_readcount=0,
    chromosome_check_function=lambda x: True,
    ignore_both_strand_mutants=False,
):
    """Return (flanking_seq,readcount) with both-side genomic flanking sequences for insertional mutants in mutant_dataset_infile.

    Grab all the insertion positions from mutant_dataset_infile (pickled
    mutant_analysis_classes.Insertional_mutant_dataset object), use genome (a chrom_name:seq dict) to
    figure out the flanksize-length flanking sequences on both sides (padded with padding_char if the
    end of the chromosome is too close), reverse-complement if needed (if strand=='-') to get it in
    the same orientation as the insertion.

    Filter the mutants:
     - by readcount - ignore mutants with total readcount below min_readcount (default 0)
     - by chromosome - ignore mutants in chromosomes for which chromosome_check_function returns False
     - by strand - both-strand (merged tandem) mutants will be ignored if ignore_both_strand_mutants
        is True, otherwise ValueError will be raised; ValueError will be raised for any other strand
        value that isn't exactly "+" or "-".
     - cassette tandems - mutants on a cassette chromosome (per
        mutant_analysis_classes.is_cassette_chromosome) with min_position of 0 or the chromosome
        length are ignored.

    For all remaining mutants (in order sorted by position), append
    (flanking region seq, total_readcount) to output list.
    """
    # NOTE(review): the infile is described above as a pickle - presumably read_mutant_file unpickles it,
    #  so it should only be used on trusted files; confirm against read_mutant_file.
    dataset = mutant_analysis_classes.read_mutant_file(mutant_dataset_infile)
    flanking_region_count_list = []
    for mutant in sorted(dataset, key=lambda m: m.position):
        # filter out mutants with wrong readcounts or in wrong chromosomes
        if not chromosome_check_function(mutant.position.chromosome):
            continue
        if mutant.total_read_count < min_readcount:
            continue
        # filter out both-stranded mutants if desired;
        #  tuple membership, not a substring test: `strand not in "+-"` would accept "" and "+-" as valid
        if mutant.position.strand not in ("+", "-"):
            if mutant.position.strand == "both" and ignore_both_strand_mutants:
                continue
            raise ValueError("Unexpected mutant strand! %s" % mutant.position)
        # grab mutant position/chromosome
        position_before_insertion = mutant.position.min_position
        # ignore cassette tandems (i.e. insertions that map to start or end of cassette)
        if mutant_analysis_classes.is_cassette_chromosome(mutant.position.chromosome):
            if position_before_insertion in [0, len(genome[mutant.position.chromosome])]:
                continue
        # grab the actual flanking sequence, with padding, correct orientation etc
        full_flanking_seq = flanking_region_from_pos(
            position_before_insertion,
            mutant.position.chromosome,
            mutant.position.strand,
            genome,
            flanksize,
            padding_char,
        )
        # append the sequence and readcount to output data
        flanking_region_count_list.append((full_flanking_seq, mutant.total_read_count))
    return flanking_region_count_list
def categorize_reads_print_to_files(readname,
                                    aln_list,
                                    category_readcounts,
                                    UNALIGNED_FILE,
                                    CASSETTE_FILE,
                                    MULTIPLE_GENOMIC_FILE,
                                    GENOMIC_UNIQUE_FILE,
                                    unaligned_as_fasta=False,
                                    multiple_to_write=-1,
                                    input_collapsed_to_unique=False,
                                    no_multi_cassette_warnings=False):
    """ Decide the proper category for the read, write to appropriate output file; adjust category counts.

    Categories: unaligned, cassette (single cassette alignment), cassette-multiple (multiple cassette
     alignments), genomic-unique (single non-cassette alignment), multiple-genomic (multiple
     non-cassette alignments).

    Arguments:
     - readname - the read name (used for the fasta header and, if input_collapsed_to_unique, the readcount).
     - aln_list - list of the alignments for this read.
     - category_readcounts - category:count dictionary, modified in place: the count for the read's
        category is increased by the readcount.
     - UNALIGNED_FILE, CASSETTE_FILE, MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE - open file objects
        that the read is written to depending on its category; they can all be the SAME file object if desired.
     - unaligned_as_fasta - if True, unaligned reads are written as fasta instead of SAM format,
        and so are multiple-genomic reads if multiple_to_write is 0.
     - multiple_to_write - for multiple-genomic reads, how many alignments to write as SAM lines.
        If 0, one line is written that treats the read as unaligned (fasta if unaligned_as_fasta, otherwise
        an unaligned SAM line with an XM:i:M optional tag field, where M is the number of alignments).
        A negative value (the default) or None writes ALL the alignments.
     - input_collapsed_to_unique - if True, the read is counted as N reads, with N determined from readname
        (by get_seq_count_from_collapsed_header) using the fastx-collapser encoding.
     - no_multi_cassette_warnings - if True, multiple cassette alignments are treated as expected:
        no warning is printed and the read is counted as 'cassette' instead of 'cassette-multiple'.

    For reads with multiple cassette alignments only one SAM line is written: the alignment that sorts first
     by (chromosome, strand, start, end), with '_and_others' appended to its chromosome name.
     Note that this changes that alignment object in place.

    NOTE(review): only the returned category's counter is incremented here, so 'cassette' does NOT include
     the 'cassette-multiple' reads - callers that want the total have to add the two; confirm against callers.

    Returns the category name.  Raises AssertionError if a multi-alignment read has unaligned lines
     or a mix of cassette and non-cassette alignments.
    """
    readcount = 1 if not input_collapsed_to_unique else get_seq_count_from_collapsed_header(
        readname)
    # if there's a single alignment, it's unaligned, cassette or genomic-unique
    if len(aln_list) == 1:
        aln = aln_list[0]
        if not aln.aligned:
            category = 'unaligned'
            if unaligned_as_fasta:
                write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
            else:
                write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
        elif is_cassette_chromosome(aln.iv.chrom):
            category = 'cassette'
            write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
        else:
            category = 'genomic-unique'
            write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
    # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
    else:
        assert all([aln.aligned for aln in aln_list
                    ]), "Shouldn't see multiple unaligned lines per read!"
        # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
        # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
        # (NOTE: sometimes they happen because I'm actually aligning to multiple cassettes - then they're fine.)
        if any([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]):
            assert all([
                is_cassette_chromosome(aln.iv.chrom) for aln in aln_list
            ]), "Mixed cassette/other!"
            if not no_multi_cassette_warnings:
                print(
                    "Warning: multiple cassette alignments! Printing only one to cassette file. Seq %s, "
                    % aln_list[0].read.seq,
                    "first 3 positions %s" % ', '.join([
                        "%s %s %s" % (a.iv.chrom, a.iv.strand, a.iv.start)
                        for a in aln_list[:3]
                    ]))
                category = 'cassette-multiple'
            else:
                category = 'cassette'
            # first position alphabetically is chosen - MAYBE-TODO add other choice options?
            aln_to_print = sorted(
                aln_list,
                key=lambda a: (a.iv.chrom, a.iv.strand, a.iv.start, a.iv.end))[0]
            # just add _and_others to the chromosome - MAYBE-TODO add something more informative, like list of names?
            #  but that would be tricky, need to strip matching prefixes from them,
            #  what about multiple alignments to SAME chromosome, etc.
            aln_to_print.iv.chrom = aln_to_print.iv.chrom + '_and_others'
            write_SAM_line_from_HTSeq_aln(aln_to_print, CASSETTE_FILE)
        # multiple genomic alignments:
        # - if multiple_to_write=0, treat multiple as unaligned - if unaligned_as_fasta, print fasta line,
        #   else single unaligned SAM line, with XM:i:M optional tag field added, where M is the number of multiple alignments.
        # - if multiple_to_write>0, print that many normal SAM lines for N alignments
        # - if multiple_to_write<0 (or None), print normal SAM lines for all the alignments
        # MAYBE-TODO add an option to write multiple as unaligned to the main SAM file AND full multiple lines to another file?
        else:
            category = 'multiple-genomic'
            if multiple_to_write == 0:
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln_list[0].read.seq,
                                     MULTIPLE_GENOMIC_FILE)
                else:
                    aln = aln_list[0]
                    MULTIPLE_GENOMIC_FILE.write(
                        '%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\tXM:i:%s\n' %
                        (aln.read.name, aln.read.seq, aln.read.qualstr,
                         len(aln_list)))
            else:
                # a negative limit means "no limit" - slicing with it directly (aln_list[:-1])
                #  would silently drop the last alignment instead.
                if multiple_to_write is None or multiple_to_write < 0:
                    alns_to_write = aln_list
                else:
                    alns_to_write = aln_list[:multiple_to_write]
                for aln in alns_to_write:
                    write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    category_readcounts[category] += readcount
    return category
def categorize_reads_print_to_files(readname, aln_list, category_readcounts, UNALIGNED_FILE, CASSETTE_FILE,
                                    MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE, unaligned_as_fasta=False, multiple_to_write=-1,
                                    input_collapsed_to_unique=False, no_multi_cassette_warnings=False):
    """ Decide the proper category for the read, write to appropriate output file; adjust category counts.

    Categories: unaligned, cassette (single cassette alignment), cassette-multiple (multiple cassette alignments),
     genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments).

    The read is categorized, and printed to the appropriate file (all the uppercase arguments should be
     open file objects; they can all be the SAME file object if desired.)

    The count for the read's category in category_readcounts (a category:count dictionary) is increased in place:
     by 1, or if input_collapsed_to_unique, by N, with N determined from readname
     (by get_seq_count_from_collapsed_header) using the fastx-collapser encoding.
    NOTE(review): only the returned category's counter is incremented here, so 'cassette' does NOT include
     the 'cassette-multiple' reads - callers that want the total have to add the two; confirm against callers.

    What gets written:
     - unaligned - a fasta line if unaligned_as_fasta, otherwise a SAM line.
     - cassette, genomic-unique - a SAM line.
     - multiple cassette alignments - one SAM line, for the alignment that sorts first by
        (chromosome, strand, start, end), with '_and_others' appended to its chromosome name
        (this changes that alignment object in place).  A warning is printed and the category is
        'cassette-multiple', unless no_multi_cassette_warnings is True - then multiple cassette alignments
        are treated as expected, and the category is 'cassette'.
     - multiple-genomic - only N=multiple_to_write SAM lines; if N=0, one line is written that treats the read
        as unaligned (fasta if unaligned_as_fasta, otherwise an unaligned SAM line with XM:i:M optional tag field
        added, where M is the number of multiple alignments); if N is negative (the default) or None,
        ALL the alignments are written.

    Returns the category name.  Raises AssertionError if a multi-alignment read has unaligned lines
     or a mix of cassette and non-cassette alignments.
    """
    readcount = 1 if not input_collapsed_to_unique else get_seq_count_from_collapsed_header(readname)
    # if there's a single alignment, it's unaligned, cassette or genomic-unique
    if len(aln_list) == 1:
        aln = aln_list[0]
        if not aln.aligned:
            category = 'unaligned'
            if unaligned_as_fasta:  write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
            else:                   write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
        elif is_cassette_chromosome(aln.iv.chrom):
            category = 'cassette'
            write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
        else:
            category = 'genomic-unique'
            write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
    # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
    else:
        assert all([aln.aligned for aln in aln_list]), "Shouldn't see multiple unaligned lines per read!"
        # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
        # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
        # (NOTE: sometimes they happen because I'm actually aligning to multiple cassettes - then they're fine.)
        if any([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]):
            assert all([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]), "Mixed cassette/other!"
            if not no_multi_cassette_warnings:
                print ("Warning: multiple cassette alignments! Printing only one to cassette file. Seq %s, "%aln_list[0].read.seq,
                       "first 3 positions %s"%', '.join(["%s %s %s"%(a.iv.chrom, a.iv.strand, a.iv.start)
                                                         for a in aln_list[:3]]))
                category = 'cassette-multiple'
            else:
                category = 'cassette'
            # first position alphabetically is chosen - MAYBE-TODO add other choice options?
            aln_to_print = sorted(aln_list, key=lambda a: (a.iv.chrom, a.iv.strand, a.iv.start, a.iv.end))[0]
            # just add _and_others to the chromosome - MAYBE-TODO add something more informative, like list of names?
            #  but that would be tricky, need to strip matching prefixes from them,
            #  what about multiple alignments to SAME chromosome, etc.
            aln_to_print.iv.chrom = aln_to_print.iv.chrom + '_and_others'
            write_SAM_line_from_HTSeq_aln(aln_to_print, CASSETTE_FILE)
        # multiple genomic alignments:
        # - if multiple_to_write=0, treat multiple as unaligned - if unaligned_as_fasta, print fasta line,
        #   else single unaligned SAM line, with XM:i:M optional tag field added, where M is the number of multiple alignments.
        # - if multiple_to_write>0, print that many normal SAM lines for N alignments
        # - if multiple_to_write<0 (or None), print normal SAM lines for all the alignments
        # MAYBE-TODO add an option to write multiple as unaligned to the main SAM file AND full multiple lines to another file?
        else:
            category = 'multiple-genomic'
            if multiple_to_write == 0:
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln_list[0].read.seq, MULTIPLE_GENOMIC_FILE)
                else:
                    aln = aln_list[0]
                    MULTIPLE_GENOMIC_FILE.write('%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\tXM:i:%s\n'%(aln.read.name, aln.read.seq,
                                                                                          aln.read.qualstr, len(aln_list)))
            else:
                # a negative limit means "no limit" - using it as the slice end directly (aln_list[:-1])
                #  would silently skip the last alignment.
                write_all = (multiple_to_write is None or multiple_to_write < 0)
                for aln in (aln_list if write_all else aln_list[:multiple_to_write]):
                    write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    category_readcounts[category] += readcount
    return category