def print_base_count_fraction_for_dist(flanking_region_count_list, distance_from_middle, convert_counts=lambda x: x,
                                       ignore_bases_pattern=None, average_both_sides=False):
    """ Return text with the base counts/fractions for the N bases around the middle, weighed by converted count.

    Despite the name, nothing is printed: the formatted lines are returned as a single string
    (one " - position X: ..." line per position that has any non-N base data; '' if the input list is empty).

    Flanking_region_count_list should be a (seq,count) list; each sequence will be weighed as convert_counts(count).

    Distance_from_middle is the number of bases to look at on each side of the middle
    (so 2*distance_from_middle bases are taken from the centre of each sequence).

    Ignore_bases_pattern should be None to count all the bases, or a string giving a base pattern
    (centered around the middle), in which case only the bases with an N will be counted -
    so for instance if ignore_bases_pattern is CANN, bases -2 and -1 will be ignored
    (presumed to have been filtered through a pattern that requires them to be CA, although this isn't checked),
    and only data for bases 1 and 2 (as well as bases before -2 and after 2, depending on distance_from_middle)
    will be given.  A pattern shorter than 2*distance_from_middle is padded with N on both sides.

    If average_both_sides is True, bases from the two sides around the middle will be averaged together
    (after reverse-complementing one side, of course): CA|GG will be converted to |GG and |TG (rev-compl of CA|)
    and the GG and TG treated together for calculating base frequencies/counts.

    Raises ValueError if the flanking region length or the ignore_bases_pattern length is odd,
    if distance_from_middle is larger than half the flanking region length,
    or if ignore_bases_pattern is longer than 2*distance_from_middle.
    """
    # nothing to report for empty input (the old zip(*list)[0] lookup just crashed with an IndexError here)
    if not flanking_region_count_list:
        return ''
    flanking_region_length = get_all_seq_length(tuple(seq for (seq, _) in flanking_region_count_list))
    if flanking_region_length % 2:
        raise ValueError("Flanking region length must be an even number!")
    flank_length = flanking_region_length // 2
    # a too-large distance would make the slice start negative and silently grab the wrong bases
    if distance_from_middle > flank_length:
        raise ValueError("Distance_from_middle cannot be larger than half the flanking region length!")
    # grab only the flanking region length we're actually interested in, and convert the counts
    local_flanking_region_length = 2 * distance_from_middle
    window_start = flank_length - distance_from_middle
    window_end = flank_length + distance_from_middle
    local_flanking_region_count_list = [(seq[window_start:window_end], convert_counts(count))
                                        for (seq, count) in flanking_region_count_list]
    # apply ignore_bases_pattern by changing the relevant bases to N
    #  note that I'm doing it this way, instead of just skipping these positions in the final output,
    #  because that wouldn't work if average_both_sides is True: if ignore_bases_pattern is ANAN,
    #  the ignore pattern isn't symmetrical around the middle, so half the position 1 bases will be ignored
    #  (first N in ANAN), and half the position 2 bases will be ignored (second N in ANAN) -
    #  there will be no single position with all bases ignored.
    if ignore_bases_pattern is not None:
        if len(ignore_bases_pattern) % 2:
            raise ValueError("Ignore_bases_pattern length must be an even number!")
        if len(ignore_bases_pattern) > local_flanking_region_length:
            raise ValueError("Ignore_bases_pattern is longer than 2*distance_from_middle - probably error!")
        # pad the pattern out to the full window with N (= "count this base") on both sides.
        #  The pad size must be the POSITIVE difference: 'N'*negative is an empty string, which left the pattern
        #  unpadded, and the zip in mask_seq then truncated every sequence to the pattern length.
        one_side_padding = 'N' * ((local_flanking_region_length - len(ignore_bases_pattern)) // 2)
        full_pattern = one_side_padding + ignore_bases_pattern + one_side_padding

        def mask_seq(seq, mask_pattern):
            # keep the bases under an N in the pattern, change all the others to N
            return ''.join([(base if if_mask == 'N' else 'N') for (base, if_mask) in zip(seq, mask_pattern)])

        local_flanking_region_count_list = [(mask_seq(seq, full_pattern), count)
                                            for (seq, count) in local_flanking_region_count_list]
    # if average_both_sides, make a new local_flanking_region_count_list that has each half of each sequence separately
    if average_both_sides:
        new_flanking_region_count_list = []
        for flanking_region, count in local_flanking_region_count_list:
            first_half = reverse_complement(flanking_region[:distance_from_middle])
            second_half = flanking_region[distance_from_middle:]
            new_flanking_region_count_list.extend([(first_half, count), (second_half, count)])
        local_flanking_region_count_list = new_flanking_region_count_list
        local_flanking_region_length = local_flanking_region_length // 2
    base_count_list_dict = base_count_dict(local_flanking_region_count_list)
    base_fraction_list_dict = base_fraction_dict(local_flanking_region_count_list)
    # for each position in the final flanking regions, give the base fraction/count;
    #  ignore positions in which there were no non-N bases.
    output_lines = []
    for position in range(local_flanking_region_length):
        if not sum(base_count_list_dict[base][position] for base in NORMAL_DNA_BASES):
            continue
        data = ['%s %.0f%% (%s)' % (base, base_fraction_list_dict[base][position] * 100,
                                    base_count_list_dict[base][position])
                for base in NORMAL_DNA_BASES]
        # when both sides are averaged there's only one side left, so positions are simply 1..N
        if average_both_sides:
            display_pos = position + 1
        else:
            display_pos = _relative_position_vs_cut(position, distance_from_middle)
        output_lines.append(" - position %s: \t%s\n" % (display_pos, ', \t'.join(data)))
    return ''.join(output_lines)
def filter_flanking_regions_by_pattern(
    flanking_region_count_list,
    pattern,
    either_orientation=True,
    print_info=True,
    category=None,
    meaning_of_seqs="positions",
    meaning_of_counts="counts",
):
    """ Return separate lists of flanking regions that do and don't match given sequence pattern.

    flanking_region_count_list should be a list of (flanking_region, count) pairs
    (like from grab_flanking_regions_from_mutantfile); the two return values
    (flanking regions that match and don't match the pattern) are the same format.
    For an empty input list, two empty lists are returned (and nothing is printed).

    The pattern should be a sequence string (allowed letters are ACTGN).
    It's considered to be centered around the cut site; the flanking regions likewise.
    E.g. if pattern is GNAN, a fl.region of GCAC or TTGCACTT would match, but TTTTGCAC would not.
    Any "." padding characters in a flanking region are changed to N before matching (and in the returned data).

    If either_orientation is True, each flanking region will be tried against the pattern
    in both the forward and the reverse orientation (starting with a randomly chosen one, to avoid bias),
    and the returned flanking region will be in the orientation that matched -
    e.g. if pattern is GNAN, a flanking region of either TTGCACTT or TTCTCCTT would match
    (forward and rev-compl respectively), and the latter would be returned as rev-compl, AAGGAGAA.

    If print_info is True, some information will be printed about what number/percentage matched and didn't;
    category, if not None, is prepended to the printed line.  The information is given two ways:
     - by flanking region, counting each once, if meaning_of_seqs is not None,
        and meaning_of_seqs will be used as the description
     - by count, if meaning_of_counts is not None, and meaning_of_counts will be used as the description.

    Raises ValueError if the flanking region or pattern length is odd, if the pattern is longer
    than the flanking regions, or if print_info is True but both meaning_of_* arguments are None.
    """
    # keep the return shape consistent: callers unpack two lists, so empty input must give two lists as well
    if not flanking_region_count_list:
        return [], []
    flanking_region_length = get_all_seq_length(tuple(seq for (seq, _) in flanking_region_count_list))
    if flanking_region_length % 2:
        raise ValueError("Flanking region length must be an even number!")
    if len(pattern) % 2:
        raise ValueError("Pattern length must be an even number!")
    if len(pattern) > flanking_region_length:
        raise ValueError("Pattern cannot be longer than flanking regions!")
    if print_info and meaning_of_seqs is None and meaning_of_counts is None:
        raise ValueError("To get info printed, at least one of meaning_of_seqs/meaning_of_counts must be not None!")
    # pad the pattern to match the flanking region length
    orig_pattern = pattern
    if len(pattern) < flanking_region_length:
        padding_len = (flanking_region_length - len(pattern)) // 2
        pattern = "N" * padding_len + pattern + "N" * padding_len
    # go over all the flanking regions:
    flanking_region_count_list_match, flanking_region_count_list_nomatch = [], []
    for (flanking_region, count) in flanking_region_count_list:
        # if the flanking region is padded with .'s, change them to N's to make check_seq_against_pattern take it
        flanking_region = flanking_region.replace(".", "N")
        # if we're looking at both orientations, then first randomize the orientation to avoid bias
        if either_orientation and random.random() < 0.5:
            flanking_region = reverse_complement(flanking_region)
        # if it matches the pattern, save it as a match and go to the next one
        if check_seq_against_pattern(flanking_region, pattern):
            flanking_region_count_list_match.append((flanking_region, count))
            continue
        # or if its rev-compl matches the pattern and either_orientation is True,
        #  save it as a match and go on to the next one;
        if either_orientation:
            flanking_region = reverse_complement(flanking_region)
            if check_seq_against_pattern(flanking_region, pattern):
                flanking_region_count_list_match.append((flanking_region, count))
                continue
        # if it didn't match anywhere, save it as a no-match
        #  (if either_orientation is True, this is the second orientation that was tried).
        flanking_region_count_list_nomatch.append((flanking_region, count))
    if print_info:
        header = "%smatched %s: " % ("" if category is None else category + " ", orig_pattern)
        summary_parts = []
        if meaning_of_seqs is not None:
            positions_matched = len(flanking_region_count_list_match)
            positions_unmatched = len(flanking_region_count_list_nomatch)
            positions_all = positions_matched + positions_unmatched
            summary_parts.append("%s, unmatched %s/%s" % (
                general_utilities.value_and_percentages(positions_matched, [positions_all],
                                                        insert_word=meaning_of_seqs),
                positions_unmatched,
                positions_all,
            ))
        if meaning_of_counts is not None:
            # sum the counts directly instead of zip(*data)[1], which raised an IndexError
            #  whenever one of the two lists was empty (nothing matched, or everything matched)
            counts_matched = sum(count for (_, count) in flanking_region_count_list_match)
            counts_unmatched = sum(count for (_, count) in flanking_region_count_list_nomatch)
            counts_all = counts_matched + counts_unmatched
            summary_parts.append("%s, unmatched %s/%s." % (
                general_utilities.value_and_percentages(counts_matched, [counts_all],
                                                        insert_word=meaning_of_counts),
                counts_unmatched,
                counts_all,
            ))
        # joining the parts avoids a stray leading "; " when only the by-count summary is requested;
        #  the single-argument parenthesised print gives the same output under Python 2 and Python 3.
        print(header + "; ".join(summary_parts))
    return flanking_region_count_list_match, flanking_region_count_list_nomatch