def categorize_reads_print_to_files(readname_to_aln_list, UNALIGNED_FILE, CASSETTE_FILE, MULTIPLE_GENOMIC_FILE,
                                    GENOMIC_UNIQUE_FILE, unaligned_as_fasta=True, multiple_to_write=-1,
                                    input_collapsed_to_unique=False, no_warnings=False):
    """ Decide the proper category for each read, write to appropriate output file; return category counts.

    Categories: unaligned, cassette (one or more cassette alignments - print warning if multiple),
     genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments).
    In the output category counts, cassette-multiple is a special subcategory - anything in it is also counted in cassette.

    Arguments:
     - readname_to_aln_list: readname:alignment_list dictionary; reads are processed in sorted readname order.
        Each alignment must provide .aligned, .iv.chrom and .read.seq (presumably HTSeq alignment objects - confirm).
     - UNALIGNED_FILE, CASSETTE_FILE, MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE: open file handles, one per category.
     - unaligned_as_fasta: if True, unaligned reads are written as fasta instead of SAM format
        (and so are multiple-genomic reads if multiple_to_write is 0).
     - multiple_to_write: how many alignments to write for each multiple-genomic read; a negative value (the default)
        means ALL of them; 0 means one fasta line (only implemented when unaligned_as_fasta is True).
     - input_collapsed_to_unique: if True, each read is counted as N reads, with N determined from readname
        using the fastx-collapser encoding (via get_seq_count_from_collapsed_header).
     - no_warnings: if True, don't print the warning about reads with multiple cassette alignments.

    Returns the category:readcount dictionary.
    Raises AssertionError if a read has multiple alignment lines that are unaligned, or that mix cassette and
     non-cassette chromosomes; raises Exception for multiple_to_write==0 with unaligned_as_fasta False.

    NOTE(review): the visible part of this file defines the same function name again further down (with a per-read
     signature); at import time the last definition replaces this one.
    """
    category_readcounts = {'unaligned':0, 'cassette':0, 'multiple-genomic':0, 'genomic-unique':0, 'cassette-multiple':0}

    for readname, aln_list in sorted(readname_to_aln_list.items()):
        if input_collapsed_to_unique:
            readcount = get_seq_count_from_collapsed_header(readname)
        else:
            readcount = 1

        # a single alignment line means the read is unaligned, cassette or genomic-unique
        if len(aln_list) == 1:
            aln = aln_list[0]
            if not aln.aligned:
                category_readcounts['unaligned'] += readcount
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
                else:
                    write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
            elif is_cassette_chromosome(aln.iv.chrom):
                category_readcounts['cassette'] += readcount
                write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
            else:
                category_readcounts['genomic-unique'] += readcount
                write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
            continue

        # multiple alignment lines: cassette-multiple (weird!) or multiple-genomic
        assert all(aln.aligned for aln in aln_list), "Shouldn't see multiple unaligned lines per read!"
        cassette_flags = [is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]

        # multiple-cassette - shouldn't really happen, but write all of them to CASSETTE_FILE
        # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
        if any(cassette_flags):
            assert all(cassette_flags), "Mixed cassette/other!"
            category_readcounts['cassette'] += readcount
            if not no_warnings:
                # single-argument print() call: same output under Python 2 and Python 3;
                #  the one-element tuple keeps the %-formatting safe whatever sequence type aln_list is.
                print("Warning: multiple cassette alignments! Printing all to cassette file.\n\t%s" % (aln_list,))
            category_readcounts['cassette-multiple'] += readcount
            for aln in aln_list:
                write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
            continue

        # multiple genomic alignments - how many get written depends on multiple_to_write;
        #  if it's 0, the outfile should be fasta, or else I guess it should be written as unaligned?
        #   (MAYBE-TODO writing single multiple as unaligned not implemented!)
        category_readcounts['multiple-genomic'] += readcount
        if multiple_to_write == 0:
            if not unaligned_as_fasta:
                raise Exception("Writing 0 multiple alignments in SAM format NOT IMPLEMENTED!")
            write_fasta_line(readname, aln_list[0].read.seq, MULTIPLE_GENOMIC_FILE)
        else:
            # a negative limit means "no limit": a plain aln_list[:-1] slice would silently drop the last alignment
            alns_to_write = aln_list if multiple_to_write < 0 else aln_list[:multiple_to_write]
            for aln in alns_to_write:
                write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    return category_readcounts
def grab_flanking_regions_from_pos_dict(
    insertion_position_dict,
    genome,
    flanksize=200,
    padding_char=".",
    chromosome_check_function=lambda x: True,
    ignore_both_strand_mutants=False,
):
    """ Same as grab_flanking_regions_from_mutantfile, but takes input as a (chrom,strand):pos_list dictionary instead, 
    and assumes all readcounts to be 1.

    Arguments:
     - insertion_position_dict: (chromosome, strand):position_list dictionary.
     - genome: chromosome_name:sequence dictionary (only its sequence lengths are used directly here; 
        the sequences themselves are passed on to flanking_region_from_pos).
     - flanksize, padding_char: passed on unchanged to flanking_region_from_pos.
     - chromosome_check_function: positions on chromosomes for which this returns a false value are skipped.
     - ignore_both_strand_mutants: if True, positions with strand 'both' are skipped instead of raising ValueError.

    Positions on cassette chromosomes (per mutant_analysis_classes.is_cassette_chromosome) that are 0 or equal to 
     the chromosome length are skipped as cassette tandems.

    Returns a list of (flanking_seq, 1) tuples.
    Raises ValueError for any strand that is not exactly '+' or '-' (or 'both', if that's being ignored).
    """
    flanking_region_count_list = []
    for (chromosome, strand), pos_list in insertion_position_dict.items():
        for position_before_insertion in pos_list:
            # filter out positions in wrong chromosomes; filter out both-stranded positions if desired
            if not chromosome_check_function(chromosome):
                continue
            # exact membership test on purpose: `strand not in "+-"` is a SUBSTRING test on a string, 
            #  which would wrongly accept '' and '+-' as valid strands.
            if strand not in ("+", "-"):
                if strand == "both" and ignore_both_strand_mutants:
                    continue
                raise ValueError("Unexpected strand! %s" % (strand,))
            # ignore cassette tandems (i.e. insertions that map to start or end of cassette)
            if mutant_analysis_classes.is_cassette_chromosome(chromosome):
                if position_before_insertion in (0, len(genome[chromosome])):
                    continue
            # grab the actual flanking sequence, with padding, correct orientation etc
            full_flanking_seq = flanking_region_from_pos(
                position_before_insertion, chromosome, strand, genome, flanksize, padding_char
            )
            # append the sequence and readcount (always 1 here) to output data
            flanking_region_count_list.append((full_flanking_seq, 1))
    return flanking_region_count_list
def grab_flanking_region_motif_counts_from_pos_dict(insertion_position_dict, genome, flanksize=2,
                                                    chromosome_check_function=lambda x: True, ignore_both_strand_mutants=False):
    """ Get a flanking_seq:count dictionary for the flanking seqs for insertion_position_dict ((chrom,strand):pos_list dictionary).

    Only really makes sense for small flanksizes - otherwise the total number of possible motifs will be huge 
     (4^(flanksize*2)), and ALL of them are created as dictionary keys up front.
    Assumes all readcounts to be 1.

    Arguments:
     - insertion_position_dict: (chromosome, strand):position_list dictionary.
     - genome: chromosome_name:sequence dictionary.
     - flanksize: passed to flanking_region_from_pos; the motif table is built for motifs of length flanksize*2 
        (assumes flanking_region_from_pos returns flanksize bases for each side - confirm against that function).
     - chromosome_check_function: positions on chromosomes for which this returns a false value are skipped.
     - ignore_both_strand_mutants: if True, positions with strand 'both' are skipped instead of raising ValueError.

    Returns a motif:count dictionary with one key for every possible NORMAL_DNA_BASES motif of that length; 
     flanking sequences that aren't one of those keys (e.g. containing padding or other characters) are not counted.
    Raises ValueError for any strand that is not exactly '+' or '-' (or 'both', if that's being ignored).
    """
    # initialize the motif-count dict with every possible motif of the right length
    #  (previously hard-coded to 4 bases, which made every count stay 0 for any flanksize other than 2)
    motif_length = flanksize * 2
    motif_count_dict = {''.join(bases): 0 for bases in itertools.product(NORMAL_DNA_BASES, repeat=motif_length)}
    # for each position, grab the flanking region and add it to the motif_count_dict
    for (chromosome, strand), pos_list in insertion_position_dict.items():
        for position_before_insertion in pos_list:
            # filter out positions in wrong chromosomes; filter out both-stranded positions if desired
            if not chromosome_check_function(chromosome):
                continue
            # exact membership test on purpose: `strand not in '+-'` is a SUBSTRING test on a string, 
            #  which would wrongly accept '' and '+-' as valid strands.
            if strand not in ('+', '-'):
                if strand == 'both' and ignore_both_strand_mutants:
                    continue
                raise ValueError("Unexpected strand! %s" % (strand,))
            # ignore cassette tandems (i.e. insertions that map to start or end of cassette)
            if mutant_analysis_classes.is_cassette_chromosome(chromosome):
                if position_before_insertion in (0, len(genome[chromosome])):
                    continue
            # grab the actual flanking sequence, with padding, correct orientation etc
            full_flanking_seq = flanking_region_from_pos(position_before_insertion, chromosome, strand, genome, flanksize)
            # count the motif; anything that isn't a known all-NORMAL_DNA_BASES motif is deliberately ignored
            if full_flanking_seq in motif_count_dict:
                motif_count_dict[full_flanking_seq] += 1
            # MAYBE-TODO add an option to NOT ignore motifs that aren't in NORMAL_DNA_BASES?
    return motif_count_dict
def grab_flanking_region_base_counts_from_pos_dict(insertion_position_dict, genome, flanksize=200,
                                                   chromosome_check_function=lambda x: True, ignore_both_strand_mutants=False):
    """ Same as base_count_dict(grab_flanking_regions_from_mutantfile(*args)), but saves memory by not keeping all the sequences.

    Basically instead of making a full dataset of flanking regions with grab_flanking_regions_from_mutantfile (which can be BIG)
     and then converting those to a base-count dict with base_count_dict, just go over each position in insertion_position_dict,
     grab that flanking region, add it to the current base-count dict, and go on to the next one, without saving.
    Assumes all readcounts to be 1.

    Arguments:
     - insertion_position_dict: (chromosome, strand):position_list dictionary.
     - genome: chromosome_name:sequence dictionary.
     - flanksize: passed to flanking_region_from_pos; each per-base count list has flanksize*2 entries.
     - chromosome_check_function: positions on chromosomes for which this returns a false value are skipped.
     - ignore_both_strand_mutants: if True, positions with strand 'both' are skipped instead of raising ValueError.

    Returns a base:count_list dictionary with one key per base in NORMAL_DNA_BASES; count_list[i] is the number of 
     flanking sequences with that base (case-insensitive) at index i.  Other characters (e.g. padding) are not counted.
    Raises ValueError for any strand that is not exactly '+' or '-' (or 'both', if that's being ignored).
    """
    # initialize the base-count lists to the right length (a separate list per base), then fill them in below
    base_count_dict = {base: [0] * (flanksize * 2) for base in NORMAL_DNA_BASES}
    # for each position, grab the flanking region and add it to the base_count_dict
    for (chromosome, strand), pos_list in insertion_position_dict.items():
        for position_before_insertion in pos_list:
            # filter out positions in wrong chromosomes; filter out both-stranded positions if desired
            if not chromosome_check_function(chromosome):
                continue
            # exact membership test on purpose: `strand not in '+-'` is a SUBSTRING test on a string, 
            #  which would wrongly accept '' and '+-' as valid strands.
            if strand not in ('+', '-'):
                if strand == 'both' and ignore_both_strand_mutants:
                    continue
                raise ValueError("Unexpected strand! %s" % (strand,))
            # ignore cassette tandems (i.e. insertions that map to start or end of cassette)
            if mutant_analysis_classes.is_cassette_chromosome(chromosome):
                if position_before_insertion in (0, len(genome[chromosome])):
                    continue
            # grab the actual flanking sequence, with padding, correct orientation etc
            full_flanking_seq = flanking_region_from_pos(position_before_insertion, chromosome, strand, genome, flanksize)
            # add base-counts from full_flanking_seq to base_count_dict
            for position, base in enumerate(full_flanking_seq.upper()):
                try:
                    base_count_dict[base][position] += 1
                except KeyError:
                    # bases that aren't in NORMAL_DNA_BASES are deliberately ignored
                    # MAYBE-TODO add an option to NOT ignore bases that aren't in NORMAL_DNA_BASES?
                    pass
    return base_count_dict
def grab_flanking_regions_from_mutantfile(
    mutant_dataset_infile,
    genome,
    flanksize=200,
    padding_char=".",
    min_readcount=0,
    chromosome_check_function=lambda x: True,
    ignore_both_strand_mutants=False,
):
    """ Return (flanking_seq,readcount) with both-side genomic flanking sequences for insertional mutants in mutant_dataset_infile.

    Grab all the insertion positions from mutant_dataset_infile (pickled mutant_analysis_classes.Insertional_mutant_dataset object), 
     use genome (a chrom_name:seq dict) to figure out the flanksize-length flanking sequences on both sides
      (padded with padding_char if the end of the chromosome is too close), reverse-complement if needed (if strand=='-') 
      to get it in the same orientation as the insertion.

    Filter the mutants: 
     - by readcount - ignore mutants with total readcount below min_readcount (default 0)
     - by chromosome - ignore mutants in chromosomes for which chromosome_check_function returns False
     - by strand - both-strand (merged tandem) mutants will be ignored if ignore_both_strand_mutants is True, 
                otherwise ValueError will be raised; ValueError will be raised for other unexpected strand values
                (anything that is not exactly '+' or '-').
     - cassette tandems - mutants on cassette chromosomes at position 0 or at the chromosome length are ignored.

    For all remaining mutants (processed in order of mutant.position), append (flanking region seq, total_read_count) 
     to output list; return the list.

    NOTE(review): per the description above the input file is pickled - if read_mutant_file unpickles it, 
     only pass files from trusted sources (unpickling can execute arbitrary code).
    """
    dataset = mutant_analysis_classes.read_mutant_file(mutant_dataset_infile)
    flanking_region_count_list = []
    for mutant in sorted(dataset, key=lambda m: m.position):
        chromosome = mutant.position.chromosome
        # filter out mutants with wrong readcounts or in wrong chromosomes
        if not chromosome_check_function(chromosome):
            continue
        if mutant.total_read_count < min_readcount:
            continue
        # filter out both-stranded mutants if desired;
        #  exact membership test on purpose: `strand not in "+-"` is a SUBSTRING test on a string, 
        #  which would wrongly accept '' and '+-' as valid strands.
        strand = mutant.position.strand
        if strand not in ("+", "-"):
            if strand == "both" and ignore_both_strand_mutants:
                continue
            raise ValueError("Unexpected mutant strand! %s" % (mutant.position,))
        # grab mutant position/chromosome
        position_before_insertion = mutant.position.min_position
        # ignore cassette tandems (i.e. insertions that map to start or end of cassette)
        if mutant_analysis_classes.is_cassette_chromosome(chromosome):
            if position_before_insertion in (0, len(genome[chromosome])):
                continue
        # grab the actual flanking sequence, with padding, correct orientation etc
        full_flanking_seq = flanking_region_from_pos(
            position_before_insertion,
            chromosome,
            strand,
            genome,
            flanksize,
            padding_char,
        )
        # append the sequence and readcount to output data
        flanking_region_count_list.append((full_flanking_seq, mutant.total_read_count))
    return flanking_region_count_list
def categorize_reads_print_to_files(readname,
                                    aln_list,
                                    category_readcounts,
                                    UNALIGNED_FILE,
                                    CASSETTE_FILE,
                                    MULTIPLE_GENOMIC_FILE,
                                    GENOMIC_UNIQUE_FILE,
                                    unaligned_as_fasta=False,
                                    multiple_to_write=-1,
                                    input_collapsed_to_unique=False,
                                    no_multi_cassette_warnings=False):
    """ Decide the proper category for the read, write to appropriate output file; adjust category counts. 

    Categories: unaligned, cassette (single cassette alignment), cassette-multiple (multiple cassette alignments), 
     genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments). 
    If no_multi_cassette_warnings is True, reads with multiple cassette alignments are treated as normal: 
     no warning is printed and they're categorized as plain cassette instead of cassette-multiple.

    The read will be categorized, and printed to the appropriate file (all the uppercase arguments should be 
     open file objects; they can all be the SAME file object if desired.)

    category_readcounts[category] is increased by the readcount (so that key must already be usable with +=): 
     the readcount is 1, or if input_collapsed_to_unique, N determined from readname using the fastx-collapser encoding.
    NOTE(review): only the ONE returned category is incremented - a cassette-multiple read is NOT also added 
     to the cassette count by this function; callers that want cassette totals need to add the two up.

    What gets written:
     - unaligned: one fasta line if unaligned_as_fasta, otherwise one SAM line.
     - cassette / genomic-unique: one SAM line.
     - multiple cassette alignments: only the first alignment by (chrom, strand, start, end) is written, 
        with '_and_others' appended to its chromosome name.  NOTE: this modifies that alignment object in place.
     - multiple-genomic: N=multiple_to_write SAM lines; a negative N (the default) means ALL alignments; 
        if N=0 the read is treated as unaligned - one fasta line if unaligned_as_fasta, otherwise one unaligned SAM line 
        with XM:i:M optional tag field added, where M is the number of multiple alignments.

    Returns the category name.
    Raises AssertionError if the read has multiple alignment lines that are unaligned, or that mix cassette and 
     non-cassette chromosomes.

    NOTE(review): the visible part of this file defines this function name more than once; 
     at import time the last definition is the one that stays in effect.
    """
    if input_collapsed_to_unique:
        readcount = get_seq_count_from_collapsed_header(readname)
    else:
        readcount = 1

    # if there's a single alignment, it's unaligned, cassette or genomic-unique
    if len(aln_list) == 1:
        aln = aln_list[0]
        if not aln.aligned:
            category = 'unaligned'
            if unaligned_as_fasta:
                write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
            else:
                write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
        elif is_cassette_chromosome(aln.iv.chrom):
            category = 'cassette'
            write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
        else:
            category = 'genomic-unique'
            write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
    # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
    else:
        assert all(aln.aligned for aln in aln_list), "Shouldn't see multiple unaligned lines per read!"
        cassette_flags = [is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]
        # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
        # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
        # (NOTE: sometimes they happen because I'm actually aligning to multiple cassettes - then they're fine.)
        if any(cassette_flags):
            assert all(cassette_flags), "Mixed cassette/other!"
            if no_multi_cassette_warnings:
                category = 'cassette'
            else:
                first_positions = ', '.join(["%s %s %s" % (a.iv.chrom, a.iv.strand, a.iv.start)
                                             for a in aln_list[:3]])
                # two-argument print, as before (under Python 2 without print_function this shows up as a tuple)
                print(
                    "Warning: multiple cassette alignments! Printing only one to cassette file. Seq %s, "
                    % aln_list[0].read.seq,
                    "first 3 positions %s" % first_positions)
                category = 'cassette-multiple'
            # first position alphabetically is chosen - MAYBE-TODO add other choice options?
            aln_to_print = min(aln_list, key=lambda a: (a.iv.chrom, a.iv.strand, a.iv.start, a.iv.end))
            # just add _and_others to the chromosome - MAYBE-TODO add something more informative, like list of names?
            #   but that would be tricky, need to strip matching prefixes from them,
            #   what about multiple alignments to SAME chromosome, etc.
            aln_to_print.iv.chrom = aln_to_print.iv.chrom + '_and_others'
            write_SAM_line_from_HTSeq_aln(aln_to_print, CASSETTE_FILE)
        # multiple genomic alignments:
        # - if multiple_to_write=0, treat multiple as unaligned - if unaligned_as_fasta, print fasta line,
        #   else single unaligned SAM line, with XM:i:M optional tag field added, where M is the number of multiple alignments.
        # - if multiple_to_write>0, print that many normal SAM lines; if it's negative, print all of them
        # MAYBE-TODO add an option to write multiple as unaligned to the main SAM file AND full multiple lines to another file?
        else:
            category = 'multiple-genomic'
            if multiple_to_write == 0:
                first_aln = aln_list[0]
                if unaligned_as_fasta:
                    write_fasta_line(readname, first_aln.read.seq, MULTIPLE_GENOMIC_FILE)
                else:
                    MULTIPLE_GENOMIC_FILE.write(
                        '%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\tXM:i:%s\n' %
                        (first_aln.read.name, first_aln.read.seq, first_aln.read.qualstr, len(aln_list)))
            else:
                # a negative limit means "no limit": a plain aln_list[:-1] slice would silently drop the last alignment
                alns_to_write = aln_list if multiple_to_write < 0 else aln_list[:multiple_to_write]
                for aln in alns_to_write:
                    write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    category_readcounts[category] += readcount
    return category
def categorize_reads_print_to_files(readname, aln_list, category_readcounts, UNALIGNED_FILE, CASSETTE_FILE,
                                    MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE, unaligned_as_fasta=False, multiple_to_write=-1,
                                    input_collapsed_to_unique=False, no_multi_cassette_warnings=False):
    """ Decide the proper category for the read, write to appropriate output file; adjust category counts. 

    Categories: unaligned, cassette (single cassette alignment), cassette-multiple (multiple cassette alignments), 
     genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments). 
    If no_multi_cassette_warnings is True, reads with multiple cassette alignments are treated as normal: 
     no warning is printed and they're categorized as plain cassette instead of cassette-multiple.

    The read will be categorized, and printed to the appropriate file (all the uppercase arguments should be 
     open file objects; they can all be the SAME file object if desired.)

    category_readcounts[category] is increased by the readcount (so that key must already be usable with +=): 
     the readcount is 1, or if input_collapsed_to_unique, N determined from readname using the fastx-collapser encoding.
    NOTE(review): only the ONE returned category is incremented - a cassette-multiple read is NOT also added 
     to the cassette count by this function; callers that want cassette totals need to add the two up.

    What gets written:
     - unaligned: one fasta line if unaligned_as_fasta, otherwise one SAM line.
     - cassette / genomic-unique: one SAM line.
     - multiple cassette alignments: only the first alignment by (chrom, strand, start, end) is written, 
        with '_and_others' appended to its chromosome name.  NOTE: this modifies that alignment object in place.
     - multiple-genomic: N=multiple_to_write SAM lines; a negative N (the default) means ALL alignments; 
        if N=0 the read is treated as unaligned - one fasta line if unaligned_as_fasta, otherwise one unaligned SAM line 
        with XM:i:M optional tag field added, where M is the number of multiple alignments.

    Returns the category name.
    Raises AssertionError if the read has multiple alignment lines that are unaligned, or that mix cassette and 
     non-cassette chromosomes.
    """
    readcount = get_seq_count_from_collapsed_header(readname) if input_collapsed_to_unique else 1
    # if there's a single alignment, it's unaligned, cassette or genomic-unique
    if len(aln_list) == 1:
        aln = aln_list[0]
        if not aln.aligned:
            category = 'unaligned'
            if unaligned_as_fasta:
                write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
            else:
                write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
        elif is_cassette_chromosome(aln.iv.chrom):
            category = 'cassette'
            write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
        else:
            category = 'genomic-unique'
            write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
    # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
    else:
        assert all(aln.aligned for aln in aln_list), "Shouldn't see multiple unaligned lines per read!"
        on_cassette = [is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]
        # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
        # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
        # (NOTE: sometimes they happen because I'm actually aligning to multiple cassettes - then they're fine.)
        if any(on_cassette):
            assert all(on_cassette), "Mixed cassette/other!"
            if not no_multi_cassette_warnings:
                position_info = ', '.join(["%s %s %s"%(a.iv.chrom, a.iv.strand, a.iv.start) for a in aln_list[:3]])
                # two-argument print, as before (under Python 2 without print_function this shows up as a tuple)
                print ("Warning: multiple cassette alignments! Printing only one to cassette file. Seq %s, "%aln_list[0].read.seq, 
                       "first 3 positions %s"%position_info)
                category = 'cassette-multiple'
            else:
                category = 'cassette'
            # first position alphabetically is chosen - MAYBE-TODO add other choice options?
            aln_to_print = min(aln_list, key=lambda a: (a.iv.chrom, a.iv.strand, a.iv.start, a.iv.end))
            # just add _and_others to the chromosome - MAYBE-TODO add something more informative, like list of names?
            #   but that would be tricky, need to strip matching prefixes from them, 
            #   what about multiple alignments to SAME chromosome, etc.
            aln_to_print.iv.chrom = aln_to_print.iv.chrom + '_and_others'
            write_SAM_line_from_HTSeq_aln(aln_to_print, CASSETTE_FILE)
        # multiple genomic alignments: 
        # - if multiple_to_write=0, treat multiple as unaligned - if unaligned_as_fasta, print fasta line, 
        #   else single unaligned SAM line, with XM:i:M optional tag field added, where M is the number of multiple alignments.
        # - if multiple_to_write>0, print that many normal SAM lines; if it's negative, print all of them
        # MAYBE-TODO add an option to write multiple as unaligned to the main SAM file AND full multiple lines to another file?
        else:
            category = 'multiple-genomic'
            if multiple_to_write == 0:
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln_list[0].read.seq, MULTIPLE_GENOMIC_FILE)
                else:
                    read = aln_list[0].read
                    MULTIPLE_GENOMIC_FILE.write('%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\tXM:i:%s\n'%(read.name, read.seq, 
                                                                                                 read.qualstr, len(aln_list)))
            else:
                # a negative limit means "no limit": a plain aln_list[:-1] slice would silently drop the last alignment
                if multiple_to_write < 0:
                    alns_to_write = aln_list
                else:
                    alns_to_write = aln_list[:multiple_to_write]
                for aln in alns_to_write:
                    write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    category_readcounts[category] += readcount
    return category