def add_start_and_length_to_events(basecall_events, sampling_freq, start_time):
    """Append float "start" and "length" fields (seconds) to a basecall event
    table whose "raw_start"/"raw_length" fields are raw sample indices.

    :param basecall_events: structured array with uint64 'raw_start' and
        'raw_length' fields
    :param sampling_freq: sampling frequency of the experiment (Hz); must be > 0
    :param start_time: start time of the read (same units as raw_start); non-zero
    :return: copy of the table with added float 'start' and 'length' fields
    """
    check_numpy_table(basecall_events, req_fields=('raw_start', 'raw_length'))
    assert basecall_events["raw_start"].dtype == np.dtype('uint64'), \
        "Event raw_start should be uint64 type: {}".format(basecall_events["raw_start"].dtype)
    assert basecall_events["raw_length"].dtype == np.dtype('uint64'), \
        "Event raw_length should be uint64 type: {}".format(basecall_events["raw_length"].dtype)
    # BUG FIX: the original allowed sampling_freq == 0 (>= 0), which would
    # raise ZeroDivisionError below instead of a clear assertion error
    assert sampling_freq > 0, "Invalid sampling frequency: {}".format(sampling_freq)
    assert start_time != 0, "Invalid start time: {}".format(start_time)

    freq = float(sampling_freq)
    # comprehensions instead of named lambdas (PEP8 E731); same per-element math
    starts = [np.float64((event["raw_start"] / freq) + (start_time / freq))
              for event in basecall_events]
    lengths = [np.float64(event["raw_length"] / freq) for event in basecall_events]

    basecall_events = append_fields(basecall_events, "start", starts, usemask=False)
    basecall_events = append_fields(basecall_events, "length", lengths, usemask=False)
    return basecall_events
def create_label_from_events(mea_events):
    """Trim input MEA events to only the fields required for a label table.

    req_fields: 'reference_index', 'path_kmer', 'posterior_probability',
    'raw_start', 'raw_length'

    :param mea_events: events table with the required fields above
    :return: structured label array sorted by 'raw_start'
    """
    check_numpy_table(mea_events, req_fields=('reference_index', 'path_kmer',
                                              'posterior_probability',
                                              'raw_start', 'raw_length'))
    label = np.zeros(len(mea_events),
                     dtype=[('raw_start', int), ('raw_length', int),
                            ('reference_index', int),
                            ('posterior_probability', float), ('kmer', 'S5')])
    label['raw_start'] = mea_events["raw_start"]
    label['raw_length'] = mea_events["raw_length"]
    label['reference_index'] = mea_events["reference_index"]
    label['kmer'] = [convert_to_str(x) for x in mea_events["path_kmer"]]
    label['posterior_probability'] = mea_events["posterior_probability"]
    # BUG FIX: np.sort returns a sorted copy and does not sort in place;
    # the original discarded the result, so the table was returned unsorted
    label = np.sort(label, order='raw_start', kind='mergesort')
    return label
def get_mea_params_from_events(events):
    """Get the posterior matrix, shortest_ref_per_event and event matrix from events table

    :param events: events table with required fields
    :return: (posterior_matrix, shortest_ref_per_event, event_matrix)
    """
    check_numpy_table(events, req_fields=('contig', 'reference_index', 'reference_kmer', 'strand', 'event_index',
                                          'event_mean', 'event_noise', 'event_duration', 'aligned_kmer',
                                          'scaled_mean_current', 'scaled_noise', 'posterior_probability',
                                          'descaled_event_mean', 'ont_model_mean', 'path_kmer'))
    # get min/max args
    ref_start = min(events["reference_index"])
    ref_end = max(events["reference_index"])
    # sort events to collect the min ref position per event
    events = np.sort(events, order=['event_index'], kind='mergesort')
    event_start = events["event_index"][0]
    event_end = events["event_index"][-1]
    # check strand of the read: descending reference coordinates (after sorting
    # by event index) imply a minus-strand mapping
    minus_strand = False
    if events[0]["reference_index"] > events[-1]["reference_index"]:
        minus_strand = True
    # print("minus_strand", minus_strand)
    ref_length = int(ref_end - ref_start + 1)
    event_length = int(event_end - event_start + 1)
    # initialize data structures
    event_matrix = [[0 for _ in range(ref_length)] for _ in range(event_length)]
    posterior_matrix = np.zeros([event_length, ref_length])
    shortest_ref_per_event = [np.inf for _ in range(event_length)]
    min_shortest_ref = np.inf
    # print(ref_start, ref_end)
    # go through events backward to make sure the shortest ref per event is calculated at the same time
    for i in range(1, len(events) + 1):
        event = events[-i]
        # offsets into the (event x reference) matrices
        event_indx = event["event_index"] - event_start
        if minus_strand:
            # mirror the reference axis so matrix columns increase in read direction
            ref_indx = event["reference_index"] - ref_end
            ref_indx *= -1
        else:
            ref_indx = event["reference_index"] - ref_start
        # using full event so we can use the same matrix to assemble the training data later
        posterior_matrix[event_indx][ref_indx] = event['posterior_probability']
        event_matrix[event_indx][ref_indx] = event
        # edit shortest ref per event list
        # NOTE(review): the running minimum (min_shortest_ref) is assigned, not
        # ref_indx itself, which keeps shortest_ref_per_event monotone over the
        # backward sweep — presumably intentional; confirm against MEA consumer
        if shortest_ref_per_event[event_indx] > ref_indx:
            if min_shortest_ref > ref_indx:
                min_shortest_ref = ref_indx
            # print(event_indx, ref_indx, min_shortest_ref)
            shortest_ref_per_event[event_indx] = min_shortest_ref
    return posterior_matrix, shortest_ref_per_event, event_matrix
def index_bases_from_events(events, kmer_index=2):
    """Map basecalled sequence to events from a table with required fields

    For every base of the final called sequence, collect the raw-signal start,
    raw length, base character and p_model_state of the event that emitted it.
    The four returned lists are index-aligned with each other.

    :param kmer_index: index of kmer to create map
    :param events: original base-called events with required fields
    :return: (bases, base_raw_starts, base_raw_lengths, probs)
    """
    check_numpy_table(events, req_fields=('raw_start', 'model_state', 'p_model_state', 'raw_length', 'move'))
    assert len(events[0]['model_state']) > kmer_index, \
        "Selected too big of a kmer_index len(kmer) !> kmer_index, {} !> {} ".format(len(events[0]['model_state']),
                                                                                    kmer_index)
    probs = []
    base_raw_starts = []
    bases = []
    base_raw_lengths = []
    for i, event in enumerate(events):
        if i == 0:
            # initialize with first kmer: the first kmer_index+1 bases of the
            # sequence all come from the first event
            base_raw_starts.extend([
                event['raw_start'] for _ in event['model_state'][:kmer_index + 1]
            ])
            probs.extend([
                event['p_model_state'] for _ in event['model_state'][:kmer_index + 1]
            ])
            bases.extend(
                [chr(x) for x in event['model_state'][:kmer_index + 1]])
            base_raw_lengths.extend([
                event['raw_length'] for _ in event['model_state'][:kmer_index + 1]
            ])
        else:
            # if there was a move, gather the information for each base by index
            if event['move'] > 0:
                # the `move` newly exposed bases sit at position kmer_index of
                # the new model_state
                char_moves = bytes.decode(
                    event['model_state'][kmer_index:kmer_index + event['move']])
                for x in range(event['move']):
                    base_raw_starts.append(event['raw_start'])
                    probs.append(event['p_model_state'])
                    bases.append(char_moves[x])
                    base_raw_lengths.append(event['raw_length'])
    # gather last bases for the last event (`event` is the final row after the loop)
    base_raw_starts.extend(
        [event['raw_start'] for _ in event['model_state'][kmer_index + 1:]])
    probs.extend([
        event['p_model_state'] for _ in event['model_state'][kmer_index + 1:]
    ])
    bases.extend([chr(x) for x in event['model_state'][kmer_index + 1:]])
    base_raw_lengths.extend(
        [event['raw_length'] for _ in event['model_state'][kmer_index + 1:]])
    # the index of each corresponds to the index of the final sequence
    return bases, base_raw_starts, base_raw_lengths, probs
def fix_sa_reference_indexes(self, data):
    """Shift 'reference_index' to account for kmer length and kmer index.

    :param data: structured array with 'reference_index', 'raw_start', 'kmer'
    :return: the same array with 'reference_index' shifted in place
    """
    check_numpy_table(data, req_fields=('reference_index', 'raw_start', 'kmer'))
    k = len(data[0]["kmer"])
    # minus-strand reads index kmers from the opposite end of the kmer
    if self.aligned_signal.minus_strand:
        offset = (k - 1) - self.kmer_index
    else:
        offset = self.kmer_index
    data["reference_index"] += offset
    return data
def match_events_with_eventalign(events=None, event_detections=None, minus=False, rna=False):
    """Label raw-signal segments per kmer by joining eventalign rows with event detections.

    # RNA is sequenced 3'-5'
    # reversed for fasta/q sequence
    # if mapped to reverse strand
    # reverse reverse complement = complement
    # DNA is sequenced 5'-3'
    # if mapped to reverse strand
    # reverse complement

    :param events: events table reference_index', 'event_index', 'aligned_kmer', 'posterior_probability
    :param event_detections: event detection event table
    :param minus: boolean option to for minus strand mapping
    :param rna: boolean for RNA read
    """
    assert events is not None, "Must pass signal alignment events"
    assert event_detections is not None, "Must pass event_detections events"
    check_numpy_table(events, req_fields=('position', 'event_index', 'reference_kmer'))
    check_numpy_table(event_detections, req_fields=('start', 'length'))

    label = np.zeros(len(events),
                     dtype=[('raw_start', int), ('raw_length', int), ('reference_index', int),
                            ('posterior_probability', float), ('kmer', 'S6')])
    label['raw_start'] = [event_detections[idx]["start"] for idx in events["event_index"]]
    label['raw_length'] = [event_detections[idx]["length"] for idx in events["event_index"]]
    label['reference_index'] = events["position"]

    def convert_to_str(string):
        """Helper function to catch bytes as strings"""
        return string if type(string) is str else bytes.decode(string)

    complementer = ReverseComplement()
    # flattened strand/rna dispatch (see comment block in the docstring)
    if minus and rna:
        kmers = [complementer.complement(convert_to_str(k)) for k in events["reference_kmer"]]
    elif minus:
        kmers = [complementer.reverse_complement(convert_to_str(k)) for k in events["reference_kmer"]]
    elif rna:
        kmers = [complementer.reverse(convert_to_str(k)) for k in events["reference_kmer"]]
    else:
        kmers = events["reference_kmer"]

    label['kmer'] = kmers
    label['posterior_probability'] = np.ones(len(events))
    # np.sort(label, order='raw_start', kind='mergesort')
    return label
def check_event_table_time(event_table):
    """Verify each event starts exactly where the previous one ended.

    :param event_table: event table with "start" and "length" columns
    :return: True when start/length timing is internally consistent
    """
    check_numpy_table(event_table, req_fields=('start', 'length'))
    starts = event_table["start"]
    lengths = event_table["length"]
    # every consecutive pair must satisfy start[i] + length[i] == start[i+1]
    return all(prev_start + prev_len == next_start
               for prev_start, prev_len, next_start in zip(starts[:-1], lengths[:-1], starts[1:]))
def get_basecalled_data_by_number(self, number):
    """Get basecalled event data by the analysis number.

    :param number: integer index of the Basecall_1D analysis group
    :return: event table; 'raw_start'/'raw_length' fields are added when missing
    :raises ValueError: when no basecall data exists at the derived path
    """
    assert type(number) is int, "Number must be an integer"
    events_path = self.__default_template_1d_basecall_events__.format(number)
    try:
        events = self[events_path][()]
    # BUG FIX: was a bare `except:`, which also swallowed KeyboardInterrupt and
    # SystemExit; narrow to Exception and chain the cause for debuggability
    except Exception as exc:
        raise ValueError('Could not retrieve basecall_1D data from {}'.format(events_path)) from exc
    try:
        check_numpy_table(events, req_fields=('raw_start', 'raw_length'))
    # if events do not have raw_start or raw_lengths, derive them from timing
    except KeyError:
        events = add_raw_start_and_raw_length_to_events(events, self.sample_rate,
                                                        self.raw_attributes["start_time"])
    return events
def has_valid_event_table_format(self, oned_root_address):
    """Check if the 'start' and 'length' values are in the time scale, NOT the index scale

    :param oned_root_address: Basecalled analysis path
    :return: True when 'start' looks like times (not uint64 sample indices);
        implicitly None when no event table exists at the address
    """
    template_event_table_address = os.path.join(
        oned_root_address, "BaseCalled_template/Events")
    if template_event_table_address in self.fastFive:
        template_events = np.asarray(
            self.fastFive[template_event_table_address])
        check_numpy_table(template_events, req_fields=('start', 'length'))
        # BUG FIX: dtype comparison must use ==/!=, not `is`; identity of
        # np.dtype instances is an implementation detail of numpy
        return template_events["start"].dtype != np.dtype('uint64')
def add_label(self, label, name, label_type):
    """Add labels to class.

    :param label: label numpy array with required fields
        ['raw_start', 'raw_length', 'reference_index', 'kmer', 'posterior_probability']
    :param name: name of the label for signal
    :param label_type: type of label :['label', 'prediction', 'guide']
    """
    assert label_type in ['label', 'prediction', 'guide'], \
        "{} not in ['label', 'prediction', 'guide']: Must select an acceptable type".format(label_type)
    check_numpy_table(label, req_fields=('raw_start', 'raw_length',
                                         'reference_index', 'kmer', 'posterior_probability'))
    # label.sort(order=['raw_start'], kind='mergesort')
    # check the labels are in the correct format
    assert min(label["raw_start"]) >= 0, "Raw start cannot be less than 0"
    # BUG FIX: the failure message referenced an undefined name `row`, so a bad
    # probability raised NameError instead of the intended AssertionError
    # (now consistent with the guide-aware add_label variant)
    assert 0 <= max(label["posterior_probability"]) <= 1, \
        "posterior_probability must be between zero and one {}".format(max(label["posterior_probability"]))
    # make sure last label can actually index the signal correctly
    try:
        self.scaled_signal[label[-1]["raw_start"]:label[-1]["raw_start"] + label[-1]["raw_length"]]
    except IndexError:
        raise IndexError("labels are longer than signal")
    # sorted copy is used only to infer strand direction
    label1 = np.sort(label, order=['raw_start'], kind='mergesort')
    # infer strand alignment of read
    if label1[0]["reference_index"] >= label1[-1]["reference_index"]:
        minus_strand = True
    else:
        minus_strand = False
    if self.minus_strand is not None:
        if label[0]["raw_start"] != label[-1]["raw_start"]:
            assert self.minus_strand == minus_strand, "New label has different strand direction, check label"
    else:
        self.minus_strand = minus_strand
    # set label with the specified name
    if label_type == 'label':
        self.label[name] = label
    elif label_type == 'prediction':
        self.prediction[name] = label
    elif label_type == 'guide':
        self.guide[name] = label
def add_basecall_alignment_prediction(self, sam=None, number=None, add_mismatches=False, my_events=None, trim=None):
    """Add the original basecalled event table and add matches and missmatches to 'prediction'
    alignment labels to signal_label handle

    :param sam: correctly formatted SAM string
    :param number: integer representing which signal align predictions to plot
    :param add_mismatches: boolean option to add mismatches to labels
    :param my_events: if you want to pass the event table in directly
    :param trim: trim both sides of continuous matches to show anchor pairs
    :return: (matches, mismatches) label arrays that were added to the
        AlignedSignal internal class object
    """
    # fall back to the stored signalAlign SAM when none is supplied
    if not sam:
        sam = self.get_signalalign_events(sam=True)
    matches_name = "matches_guide_alignment"
    mismatches_name = "mismatches_guide_alignment"
    if my_events is not None:
        # caller supplied the event table directly; number only alters naming
        events = my_events
        if number is not None:
            matches_name = "matches_guide_alignment_{}".format(number)
            mismatches_name = "mismatches_guide_alignment_{}".format(number)
    else:
        if number is not None:
            events = self.get_basecalled_data_by_number(number)
            matches_name = "matches_guide_alignment_{}".format(number)
            mismatches_name = "mismatches_guide_alignment_{}".format(number)
        else:
            events = self.get_basecall_data()
    try:
        check_numpy_table(events, req_fields=('raw_start', 'raw_length'))
    # if events do not have raw_start or raw_lengths, derive them from timing
    except KeyError:
        events = add_raw_start_and_raw_length_to_events(events, self.sample_rate,
                                                        self.raw_attributes["start_time"])
    matches, mismatches, raw_starts = match_cigar_with_basecall_guide(events=events,
                                                                      sam_string=sam,
                                                                      rna=self.rna,
                                                                      kmer_index=self.kmer_index)
    if trim is not None:
        matches = trim_matches(matches, trim=trim)
    self.aligned_signal.add_label(matches, name=matches_name, label_type='prediction')
    if add_mismatches:
        self.aligned_signal.add_label(mismatches, name=mismatches_name, label_type='prediction')
    self.aligned_signal.add_raw_starts(raw_starts)
    self.has_guide_alignment = True
    return matches, mismatches
def index_to_time(basecall_events, sampling_freq=0, start_time=0):
    """Convert RNA basecall read start and length from raw indexes to seconds

    :param basecall_events: basecall events from albacore/metricore basecalled event table
    :param sampling_freq: sampling frequency of experiment
    :param start_time: start time of experiment via fast5 file
    :return: copy of the table with float 'start'/'length' columns in seconds
    """
    check_numpy_table(basecall_events, req_fields=('start', 'length'))
    # BUG FIX: dtype comparison must use ==, not `is`; the message also wrongly
    # claimed np.int32 while the check is for uint64
    assert basecall_events["start"].dtype == np.dtype('uint64'), \
        "Event start should be uint64 type: {}".format(basecall_events["start"].dtype)
    assert sampling_freq != 0, "Must set sampling frequency"
    assert start_time != 0, "Must set start time"
    event_table = change_np_field_type(basecall_events, 'start', float)
    event_table = change_np_field_type(event_table, 'length', float)
    event_table["start"] = (event_table["start"] / sampling_freq) + (start_time / sampling_freq)
    event_table["length"] = event_table["length"] / float(sampling_freq)
    return event_table
def sequence_from_events(events):
    """Reconstruct the called read sequence from an event table.

    :param events: event table with 'model_state' and 'move' fields
    :return: called sequence as a string
    """
    check_numpy_table(events, req_fields=("model_state", "move"))
    pieces = []
    for idx, row in enumerate(events):
        if idx == 0:
            # seed with the entire first kmer
            pieces.extend(chr(c) for c in row['model_state'])
        elif row['move'] > 0:
            # each move exposes that many new bases at the kmer's tail
            pieces.append(bytes.decode(row['model_state'][-row['move']:]))
    return ''.join(pieces)
def time_to_index(event_table, sampling_freq=0, start_time=0):
    """Convert start and lengths from time to raw signal indexes

    :param event_table: basecall events from albacore/metricore basecalled event table
    :param sampling_freq: sampling frequency of experiment
    :param start_time: start time of experiment via fast5 file
    :return: table with integer 'start'/'length' columns in sample indices
    """
    check_numpy_table(event_table, req_fields=('start', 'length'))
    # BUG FIX: dtype comparison must use !=, not `is not`; the message also
    # wrongly claimed np.int32 while the check concerns uint64
    assert event_table["start"].dtype != np.dtype('uint64'), \
        "Event start should not be uint64 type: {}".format(event_table["start"].dtype)
    assert sampling_freq != 0, "Must set sampling frequency"
    assert start_time != 0, "Must set start time"
    event_table["start"] = np.round((event_table["start"] - (start_time / float(sampling_freq))) * sampling_freq)
    event_table["length"] = np.round(event_table["length"] * sampling_freq)
    event_table = change_np_field_type(event_table, 'start', int)
    event_table = change_np_field_type(event_table, 'length', int)
    return event_table
def match_events_with_signalalign(sa_events=None, event_detections=None):
    """Label raw-signal segments per kmer by joining signalAlign events with event detections.

    # RNA is sequenced 3'-5'
    # reversed for fasta/q sequence
    # if mapped to minus strand
    # reverse reverse complement = complement
    # DNA is sequenced 5'-3'
    # if mapped to minus strand
    # reverse complement

    :param sa_events: events table reference_index', 'event_index', 'aligned_kmer', 'posterior_probability
    :param event_detections: event detection event table
    :return: label table sorted by 'raw_start'
    """
    assert sa_events is not None, "Must pass signal alignment events"
    assert event_detections is not None, "Must pass event_detections events"
    check_numpy_table(sa_events, req_fields=('reference_index', 'event_index',
                                             'path_kmer', 'posterior_probability'))
    check_numpy_table(event_detections, req_fields=('raw_start', 'raw_length'))
    label = np.zeros(len(sa_events),
                     dtype=[('raw_start', int), ('raw_length', int), ('reference_index', int),
                            ('posterior_probability', float), ('kmer', 'S5')])
    label['raw_start'] = [
        event_detections[x]["raw_start"] for x in sa_events["event_index"]
    ]
    label['raw_length'] = [
        event_detections[x]["raw_length"] for x in sa_events["event_index"]
    ]
    label['reference_index'] = sa_events["reference_index"]
    label['kmer'] = [convert_to_str(x) for x in sa_events["path_kmer"]]
    label['posterior_probability'] = sa_events["posterior_probability"]
    # BUG FIX: np.sort returns a sorted copy and does not sort in place;
    # the original discarded the result, so the table was returned unsorted
    label = np.sort(label, order='raw_start', kind='mergesort')
    return label
def check_strand_mapping(self, data):
    """Infer whether this alignment maps to the minus strand and record it.

    :param data: numpy table with 'reference_index' and 'raw_start' fields
    """
    check_numpy_table(data, req_fields=('reference_index', 'raw_start'))
    # descending reference coordinates imply a reverse-strand mapping
    minus_strand = data[0]["reference_index"] >= data[-1]["reference_index"]
    # rna is read 3' - 5', which flips the inference
    if self.rna:
        minus_strand = not minus_strand
    if self.minus_strand is None:
        self.minus_strand = minus_strand
    elif data[0]["raw_start"] != data[-1]["raw_start"]:
        assert self.minus_strand == minus_strand, "New label has different strand direction, check label"
def add_label(self, label, name, label_type, guide_name=None, check_strand=True):
    """Register a label table under the given category.

    :param label: label numpy array with required fields
        ['raw_start', 'raw_length', 'reference_index', 'kmer', 'posterior_probability']
    :param name: name of the label for signal
    :param label_type: type of label :['label', 'prediction', 'guide']
    :param guide_name: must pass your own label via guide_name if label_type is guide
    :param check_strand: verify strand consistency against previously added labels
    """
    assert label_type in ['label', 'prediction', 'guide'], \
        "{} not in ['label', 'prediction', 'guide']: Must select an acceptable type".format(label_type)
    check_numpy_table(label, req_fields=('raw_start', 'raw_length',
                                         'reference_index', 'kmer', 'posterior_probability'))
    # validate label contents
    assert min(label["raw_start"]) >= 0, "Raw start cannot be less than 0"
    max_prob = max(label["posterior_probability"])
    assert 0 <= max_prob <= 1, \
        "posterior_probability must be between zero and one {}".format(max_prob)
    if label_type == 'guide':
        assert guide_name is not None, "If label_type is 'guide', you must pass in a guide_name"
    # make sure last label can actually index the signal correctly
    try:
        self.scaled_signal[label[-1]["raw_start"]:label[-1]["raw_start"] + label[-1]["raw_length"]]
    except IndexError:
        raise IndexError("labels are longer than signal")
    sorted_label = np.sort(label, order=['raw_start'], kind='mergesort')
    if check_strand:
        self.check_strand_mapping(sorted_label)
    # store the sorted table under the requested category
    if label_type == 'label':
        self.label[name] = sorted_label
    elif label_type == 'prediction':
        self.prediction[name] = sorted_label
    else:
        self.guide[guide_name][name] = sorted_label
def test_check_numpy_table(self):
    """Test check_numpy_table method"""
    with captured_output() as (_, _):
        new = np.empty(3, dtype=[('reference_index', int), ('event_index', int),
                                 ('posterior_probability', float)])
        # all required fields present: must not raise
        check_numpy_table(new, req_fields=[
            "reference_index", 'event_index', 'posterior_probability'
        ])
        with self.assertRaises(KeyError):
            check_numpy_table(new, req_fields=[
                "something", 'event_index', 'posterior_probability'
            ])
        # BUG FIX: this call previously sat inside the assertRaises block above
        # and was never executed (the first call raised before it ran); give it
        # its own context so it is actually exercised
        with self.assertRaises(KeyError):
            check_numpy_table(new, req_fields=["something"])
        with self.assertRaises(TypeError):
            check_numpy_table('1', req_fields=["event_index"])
def create_anchor_kmers(new_events, old_events):
    """
    Create anchor kmers for new event table.

    Basically, grab kmer and move information from previous event table and
    pull events covering the same time span into new event table.

    :param new_events: new event table
    :param old_events: event table from Fast5 file
    :return New event table
    """
    num_old_events = len(old_events)
    check_numpy_table(new_events, req_fields=('start', 'length', 'mean', 'stdv',
                                              'model_state', 'move', 'p_model_state'))
    check_numpy_table(old_events, req_fields=('start', 'length', 'mean', 'stdv',
                                              'model_state', 'move', 'p_model_state'))
    # index of old events
    old_indx = 0
    # start index to trim new_events for those with data from old_events
    start_index = 0
    end_index = len(new_events)
    # personal tracker for dealing with how the segmentation algorithm is working
    most_moves = 0
    # tracking overlaped events
    selected_overlap = False
    check_overlap = False
    homopolymer = False
    # keep track of events passed
    last_left_over = 0
    for i, event in enumerate(new_events):
        # skip events that occur before labels from old events
        if old_events[0]["start"] <= event["start"]:
            # per-new-event accumulators: time covered, prob, move and kmer of
            # each old event overlapping this new event
            time = []
            probs = []
            moves = []
            kmers = []
            # new event's start and end (rounded to avoid float drift)
            current_event_start = round(event["start"], 7)
            current_event_end = round(current_event_start + event["length"], 7)
            # if first event or event start is after current old_event start.
            if old_indx != num_old_events:
                prev_kmer = str()
                num_loops = 0
                # consume old events that start before this new event ends
                while round(old_events[old_indx]["start"], 7) < current_event_end and old_indx != num_old_events:
                    # deal with bad event files and final event
                    if old_indx == num_old_events - 1:
                        old_event_end = round(old_events[old_indx]["start"] + old_events[old_indx]["length"], 7)
                    else:
                        old_event_end = round(old_events[old_indx + 1]["start"], 7)
                    old_event_start = round(old_events[old_indx]["start"], 7)
                    old_kmer = bytes.decode(old_events[old_indx]["model_state"])
                    # homopolymers or stays should be tracked together
                    if old_kmer == prev_kmer:
                        if len(set(old_kmer)) == 1:
                            # NOTE(review): `index` here comes from the previous
                            # loop iteration — relies on prev_kmer guaranteeing a
                            # prior assignment; confirm intended
                            if not homopolymer and selected_overlap and num_loops <= 1:
                                moves[index] = 0
                            homopolymer = True
                        else:
                            homopolymer = False
                        index = kmers.index(old_kmer)
                        probs[index] = max(probs[index], old_events[old_indx]["p_model_state"])
                        moves[index] += old_events[old_indx]["move"]
                    else:
                        # add new kmer
                        index = len(time)
                        kmers.append(old_kmer)
                        probs.append(old_events[old_indx]["p_model_state"])
                        moves.append(old_events[old_indx]["move"])
                        time.append(0)
                        homopolymer = False
                    prev_kmer = old_kmer
                    # if old event passes through current event calculate correct time in current event
                    # deal with old events ending after the new event end
                    if old_event_end > current_event_end:
                        time[index] += current_event_end - old_event_start
                        new_check_overlap = True
                        break
                    # check if entire old event is within the new event or not
                    else:
                        if old_event_start < current_event_start:
                            time[index] += old_event_end - current_event_start
                        else:
                            time[index] += old_event_end - old_event_start
                        # if old_event_end != current_event_end:
                        old_indx += 1
                        new_check_overlap = False
                    num_loops += 1
                    # break loop at end of old events
                    if old_indx == num_old_events:
                        break
            else:
                # all old events consumed: trim the tail of new_events here
                end_index = i
            num_kmers = len(kmers)
            # select index of best kmer to assign
            if num_kmers == 1:
                best_index = 0
                left_over = 0
            elif num_kmers > 1:
                # select on time in new event only
                best_index = time.index(max(time))
                # if there are several old events in a new event, track how many
                # NOTE(review): new_check_overlap may be unbound if the while
                # loop body never executed — presumably unreachable; confirm
                if new_check_overlap:
                    left_over = sum(moves[best_index + 1:-1])
                else:
                    left_over = sum(moves[best_index + 1:])
            else:
                # end of possible alignments
                end_index = i
                break
            # if previous old event overlapped into current new event
            # check if old event is going to be assigned twice
            if selected_overlap and best_index == 0 and check_overlap:
                if homopolymer:
                    move = moves[best_index]
                else:
                    move = 0
            elif selected_overlap and best_index != 0 and check_overlap:
                move = min(5, moves[best_index] + last_left_over)
            else:
                move = min(5, moves[best_index] + sum(moves[:best_index]) + last_left_over)
            if most_moves < moves[best_index] + sum(moves[:best_index]) + last_left_over:
                most_moves = moves[best_index] + sum(moves[:best_index]) + last_left_over
            # print(kmers, moves, left_over, moves[best_index], sum(moves[:best_index]), last_left_over, move)
            # if new overlap
            if new_check_overlap:
                # new overlapped event will be tracked on next new_event so we drop a left_over count
                left_over = max(0, left_over - 1)
                if most_moves < left_over - 1:
                    most_moves = left_over - 1
                # check if we currently selected an overlapping old event
                if best_index == num_kmers - 1:
                    selected_overlap = True
                else:
                    selected_overlap = False
            else:
                selected_overlap = False
            kmer = kmers[best_index]
            prob = probs[best_index]
            # assign event probs, move and model state
            event["p_model_state"] = prob
            event["move"] = move
            event["model_state"] = kmer
            check_overlap = new_check_overlap
            last_left_over = left_over
            new_check_overlap = False
            homopolymer = False
        else:
            # skip event: it starts before the first old event
            start_index = i + 1
    # print(most_moves)
    return new_events[start_index:end_index]
def create_labels_from_guide_alignment(events, sam_string, rna=False, reference_path=None, kmer_index=2,
                                       one_ref_indexing=False):
    """Create labeled signal from a guide alignment with only matches being reported

    :param events: path to fast5 file
    :param sam_string: sam alignment string
    :param rna: if read is rna, reverse again
    :param reference_path: if sam_string has MDZ field the reference sequence can be inferred, otherwise,
        it is needed
    :param kmer_index: index of the kmer to select for reference to event mapping
    :param one_ref_indexing: boolean zero or 1 based indexing for reference
    :return: list of structured label arrays, one per contiguous reference block
    """
    # test if the required fields are in structured numpy array
    check_numpy_table(events, req_fields=('raw_start', 'model_state', 'p_model_state', 'raw_length', 'move'))
    assert type(one_ref_indexing) is bool, "one_ref_indexing must be a boolean"
    psam_h = initialize_pysam_wrapper(sam_string, reference_path=reference_path)
    # create an indexed map of the events and their corresponding bases
    bases, base_raw_starts, base_raw_lengths, probs = index_bases_from_events(
        events, kmer_index=kmer_index)
    # check if string mapped to reverse strand
    if psam_h.alignment_segment.is_reverse:
        # NOTE(review): base_raw_lengths is not reversed alongside probs and
        # base_raw_starts here — confirm whether that is intentional
        probs = probs[::-1]
        base_raw_starts = base_raw_starts[::-1]
        # rna reads go 3' to 5' so we dont need to reverse if it mapped to reverse strand
        if not rna:
            bases = ReverseComplement().reverse(''.join(bases))
    # reverse if it mapped to forward strand and RNA
    elif rna:
        bases = ReverseComplement().reverse(''.join(bases))
    # all 'matches' and 'mismatches'
    matches_map = psam_h.seq_alignment.matches_map
    # zero indexed reference start
    ref_start = psam_h.alignment_segment.reference_start + one_ref_indexing
    # set labels: accumulators for the current contiguous reference block
    raw_start = []
    raw_length = []
    reference_index = []
    kmer = []
    posterior_probability = []
    cigar_labels = []
    prev = matches_map[0].reference_index
    for i, alignment in enumerate(matches_map):
        # extend the current block while reference positions stay contiguous
        if i == 0 or alignment.reference_index == prev + 1:
            raw_start.append(base_raw_starts[alignment.query_index])
            raw_length.append(base_raw_lengths[alignment.query_index])
            reference_index.append(alignment.reference_index + ref_start)
            kmer.append(alignment.reference_base)
            posterior_probability.append(probs[alignment.query_index])
        else:
            # gap in reference positions: flush the finished block
            # initialize labels
            cigar_label = np.zeros(len(raw_start),
                                   dtype=[('raw_start', int), ('raw_length', int),
                                          ('reference_index', int),
                                          ('posterior_probability', float), ('kmer', 'S5')])
            # assign labels
            cigar_label['raw_start'] = raw_start
            cigar_label['raw_length'] = raw_length
            cigar_label['reference_index'] = reference_index
            cigar_label['kmer'] = kmer
            cigar_label['posterior_probability'] = posterior_probability
            # add to other blocks
            cigar_labels.append(cigar_label)
            # reset trackers, seeding with the alignment that broke contiguity
            raw_start = [base_raw_starts[alignment.query_index]]
            raw_length = [base_raw_lengths[alignment.query_index]]
            reference_index = [alignment.reference_index + ref_start]
            kmer = [alignment.reference_base]
            posterior_probability = [probs[alignment.query_index]]
        # keep track of reference positions
        prev = alignment.reference_index
    # catch the last label
    cigar_label = np.zeros(len(raw_start),
                           dtype=[('raw_start', int), ('raw_length', int),
                                  ('reference_index', int),
                                  ('posterior_probability', float), ('kmer', 'S5')])
    # assign labels
    cigar_label['raw_start'] = raw_start
    cigar_label['raw_length'] = raw_length
    cigar_label['reference_index'] = reference_index
    cigar_label['kmer'] = kmer
    cigar_label['posterior_probability'] = posterior_probability
    # add to other blocks
    cigar_labels.append(cigar_label)
    return cigar_labels
def match_cigar_with_basecall_guide(events, sam_string, kmer_index, rna=False, reference_path=None,
                                    one_ref_indexing=False):
    """Create labeled signal from a guide alignment, reporting matches and mismatches separately.

    :param events: basecalled event table
    :param sam_string: sam alignment string
    :param kmer_index: index of the kmer to select for reference to event mapping
    :param rna: if read is rna, reverse again
    :param reference_path: if sam_string has MDZ field the reference sequence can be inferred,
        otherwise, it is needed
    :param one_ref_indexing: boolean zero or 1 based indexing for reference
    :return: (matches, mismatches, raw_starts) — matches/mismatches label tables, each
        trimmed by kmer_index rows on both ends, plus events["raw_start"]
    """
    check_numpy_table(events, req_fields=('raw_start', 'model_state', 'p_model_state', 'raw_length', 'move'))
    assert type(one_ref_indexing) is bool, "one_ref_indexing must be a boolean"
    psam_h = initialize_aligned_segment_wrapper(sam_string, reference_path=reference_path)
    # create an indexed map of the events and their corresponding bases
    _, base_raw_starts, base_raw_lengths, probs = index_bases_from_events(events, kmer_index=kmer_index)
    if rna:
        # events are 3'-5', swap to correct for alignment file
        base_raw_starts = base_raw_starts[::-1]
        base_raw_lengths = base_raw_lengths[::-1]
        probs = probs[::-1]
    # all 'matches' and 'mismatches'
    matches_map = psam_h.seq_alignment.matches_map
    ref_len = len(psam_h.get_reference_sequence())
    # zero indexed reference start
    ref_start = psam_h.alignment_segment.reference_start + one_ref_indexing
    # set labels
    matches_raw_start = []
    matches_raw_length = []
    matches_reference_index = []
    matches_kmer = []
    matches_posterior_probability = []
    mismatches_raw_start = []
    mismatches_raw_length = []
    mismatches_reference_index = []
    mismatches_kmer = []
    mismatches_posterior_probability = []
    for i, alignment in enumerate(matches_map):
        if alignment.query_base == alignment.reference_base:
            matches_raw_start.append(base_raw_starts[alignment.query_index])
            matches_raw_length.append(base_raw_lengths[alignment.query_index])
            matches_kmer.append(alignment.reference_base)
            matches_posterior_probability.append(probs[alignment.query_index])
            # reverse-strand alignments count reference positions from the far end
            if psam_h.alignment_segment.is_reverse:
                matches_reference_index.append((ref_start + ref_len - 1) - alignment.reference_index)
            else:
                matches_reference_index.append(ref_start + alignment.reference_index)
        else:
            mismatches_raw_start.append(base_raw_starts[alignment.query_index])
            mismatches_raw_length.append(base_raw_lengths[alignment.query_index])
            mismatches_kmer.append(alignment.reference_base)
            mismatches_posterior_probability.append(probs[alignment.query_index])
            if psam_h.alignment_segment.is_reverse:
                mismatches_reference_index.append((ref_start + ref_len - 1) - alignment.reference_index)
            else:
                mismatches_reference_index.append(ref_start + alignment.reference_index)
    matches = np.zeros(len(matches_raw_start),
                       dtype=[('raw_start', int), ('raw_length', int), ('reference_index', int),
                              ('posterior_probability', float), ('kmer', 'S5')])
    mismatches = np.zeros(len(mismatches_raw_start),
                          dtype=[('raw_start', int), ('raw_length', int), ('reference_index', int),
                                 ('posterior_probability', float), ('kmer', 'S5')])
    # assign labels
    matches['raw_start'] = matches_raw_start
    matches['raw_length'] = matches_raw_length
    matches['reference_index'] = matches_reference_index
    matches['kmer'] = matches_kmer
    matches['posterior_probability'] = matches_posterior_probability
    mismatches['raw_start'] = mismatches_raw_start
    mismatches['raw_length'] = mismatches_raw_length
    mismatches['reference_index'] = mismatches_reference_index
    mismatches['kmer'] = mismatches_kmer
    mismatches['posterior_probability'] = mismatches_posterior_probability
    # trim extra event alignments
    # BUG FIX: the mismatches slice previously used len(matches) as its end
    # bound, trimming the wrong number of rows from the mismatch table
    return (matches[kmer_index:len(matches) - kmer_index],
            mismatches[kmer_index:len(mismatches) - kmer_index],
            events["raw_start"])
def adaptive_banded_simple_event_align(events, model, nucleotide_seq, debug=False):
    """Generate a banded alignment between events and a nucleotide sequence using the
    adaptive banded approach

    source: https://www.biorxiv.org/content/biorxiv/early/2017/04/25/130633.full.pdf / nanopolish

    :param events: event table with required fields:
        ('start', 'length', 'mean', 'stdv', 'model_state', 'move', 'p_model_state')
    :param model: HmmModel model
    :param nucleotide_seq: nucleotide sequence to match up
    :param debug: boolean debug option (enables per-cell tracing and loop-invariant asserts)
    :return: tuple of (aligned slice of events, sum of log emission probabilities)
    """
    # initialize helper functions
    def move_down(curr_band):
        """Move location in dp matrix down one aka add one to event_index"""
        return EventKmerPair(event_idx=curr_band.event_idx + 1, kmer_idx=curr_band.kmer_idx)

    def move_right(curr_band):
        """Move location in dp matrix right one aka add one to kmer_index"""
        return EventKmerPair(event_idx=curr_band.event_idx, kmer_idx=curr_band.kmer_idx + 1)

    def event_kmer_to_band(ei, ki):
        """Map an event/kmer index pair to its anti-diagonal band index"""
        return (ei + 1) + (ki + 1)

    def band_event_to_offset(bi, ei):
        """Get event index of a specific event pair in the 'band_lower_left' pairs and
        subtract the event offset"""
        return band_lower_left[bi].event_idx - ei

    def band_kmer_to_offset(bi, ki):
        """Subtract kmer index from the specific event pair in the 'band_lower_left' pairs"""
        return ki - band_lower_left[bi].kmer_idx

    def is_offset_valid(offset1):
        """Check if the offset is at least zero and smaller than the bandwidth"""
        return 0 <= offset1 < bandwidth

    def event_at_offset(bi, offset1):
        """Get event index minus offset for a band within the 'band_lower_left' array"""
        return band_lower_left[bi].event_idx - offset1

    def kmer_at_offset(bi, offset1):
        """Get kmer index plus offset for a band within the 'band_lower_left' array"""
        return band_lower_left[bi].kmer_idx + offset1

    check_numpy_table(events, req_fields=('start', 'length', 'mean', 'stdv',
                                          'model_state', 'move', 'p_model_state'))
    assert isinstance(model, HmmModel), "Input model needs to be HmmModel"
    k = model.kmer_length
    n_events = len(events)
    n_kmers = len(nucleotide_seq) - k + 1

    # backtrack markers
    FROM_D = 0
    FROM_U = 1
    FROM_L = 2

    # qc
    min_average_log_emission = -5.0
    max_gap_threshold = 50

    # banding
    bandwidth = 100
    half_bandwidth = bandwidth // 2

    # transition penalties; a tiny skip penalty helps keep the true alignment within the
    # adaptive band -- empirically determined (from nanopolish)
    events_per_kmer = float(n_kmers) / n_events
    p_stay = 1 - (1 / (events_per_kmer + 1))
    epsilon = 1e-10
    lp_skip = np.log(epsilon)
    lp_stay = np.log(p_stay)
    lp_step = np.log(1.0 - np.exp(lp_skip) - np.exp(lp_stay))
    lp_trim = np.log(0.01)

    n_rows = n_events + 1
    n_cols = n_kmers + 1
    n_bands = n_rows + n_cols

    # bug fix: the original `np.zeros(...) * -np.infty` fills the matrix with NaN
    # (0 * inf == nan), which silently breaks the `== -np.infty` out-of-band tests and
    # the `ll < ur` band placement comparison below; use np.full instead. Plain numpy
    # arrays replace the pandas DataFrames (indexed as [offset, band_idx]).
    bands = np.full((bandwidth, n_bands), -np.infty, dtype=np.float64)
    trace = np.zeros((bandwidth, n_bands), dtype=np.int64)

    # Keep track of the event/kmer index for the lower left corner of the band
    # these indices are updated at every iteration to perform the adaptive banding
    # Only the first two bands have their coordinates initialized, the rest are computed adaptively
    EventKmerPair = namedtuple('EventKmerPair', ['event_idx', 'kmer_idx'])
    # bug fix: the original stored namedtuple *classes* and mutated class attributes;
    # store proper instances instead (entries are assigned before first use)
    band_lower_left = [None] * n_bands

    # initialize range of first two bands
    band_lower_left[0] = EventKmerPair(event_idx=half_bandwidth - 1, kmer_idx=-1 - half_bandwidth)
    band_lower_left[1] = move_down(band_lower_left[0])

    # band 0: score zero in the central cell
    start_cell_offset = band_kmer_to_offset(0, -1)
    assert is_offset_valid(start_cell_offset), \
        "Offset is outside the bounds [0, {}]: {}".format(bandwidth, start_cell_offset)
    assert band_event_to_offset(0, -1) == start_cell_offset, \
        "Event offset is not correct: {} != {}".format(band_event_to_offset(0, -1), start_cell_offset)
    bands[start_cell_offset, 0] = 0.0

    # band 1: first event is trimmed
    first_trim_offset = band_event_to_offset(1, 0)
    negative_one = kmer_at_offset(1, first_trim_offset)
    assert negative_one == -1, "Kmer offset is not correct: {} != {}".format(negative_one, -1)
    # bug fix: the original re-asserted start_cell_offset and referenced the undefined
    # name `offset` in its message; validate the trim offset itself
    assert is_offset_valid(first_trim_offset), \
        "Offset is outside the bounds [0, {}]: {}".format(bandwidth, first_trim_offset)
    bands[first_trim_offset, 1] = lp_trim
    trace[first_trim_offset, 1] = FROM_U

    fills = 0
    # fill in remaining bands
    for band_idx in range(2, n_bands):
        # Determine placement of this band according to Suzuki's adaptive algorithm
        # When both ll and ur are out-of-band (ob) we alternate movements
        # otherwise we decide based on scores
        ll = bands[0, band_idx - 1]
        ur = bands[bandwidth - 1, band_idx - 1]
        ll_ob = ll == -np.infty
        ur_ob = ur == -np.infty
        if ll_ob and ur_ob:
            right = band_idx % 2 == 1
        else:
            right = ll < ur  # Suzuki's rule

        if right:
            band_lower_left[band_idx] = move_right(band_lower_left[band_idx - 1])
        else:
            band_lower_left[band_idx] = move_down(band_lower_left[band_idx - 1])

        # If the trim state is within the band, fill it in here
        trim_offset = band_kmer_to_offset(band_idx, -1)
        if is_offset_valid(trim_offset):
            event_idx = event_at_offset(band_idx, trim_offset)
            if 0 <= event_idx < n_events:
                bands[trim_offset, band_idx] = lp_trim * (event_idx + 1)
                trace[trim_offset, band_idx] = FROM_U
            else:
                bands[trim_offset, band_idx] = -np.infty

        # Get the offsets for the first and last event and kmer
        # We restrict the inner loop to only these values
        kmer_min_offset = band_kmer_to_offset(band_idx, 0)
        kmer_max_offset = band_kmer_to_offset(band_idx, n_kmers)
        event_min_offset = band_event_to_offset(band_idx, n_events - 1)
        event_max_offset = band_event_to_offset(band_idx, -1)
        min_offset = max(kmer_min_offset, event_min_offset, 0)
        max_offset = min(kmer_max_offset, event_max_offset, bandwidth)

        for offset in range(min_offset, max_offset):
            event_idx = event_at_offset(band_idx, offset)
            kmer_idx = kmer_at_offset(band_idx, offset)

            offset_up = band_event_to_offset(band_idx - 1, event_idx - 1)
            offset_left = band_kmer_to_offset(band_idx - 1, kmer_idx - 1)
            offset_diag = band_kmer_to_offset(band_idx - 2, kmer_idx - 1)

            if debug:
                # verify loop conditions
                assert 0 <= kmer_idx < n_kmers
                assert 0 <= event_idx < n_events
                assert offset_diag == band_event_to_offset(band_idx - 2, event_idx - 1)
                assert offset_up - offset_left == 1
                assert 0 <= offset < bandwidth

            up = bands[offset_up, band_idx - 1] if is_offset_valid(offset_up) else -np.infty
            left = bands[offset_left, band_idx - 1] if is_offset_valid(offset_left) else -np.infty
            diag = bands[offset_diag, band_idx - 2] if is_offset_valid(offset_diag) else -np.infty

            event_mean = events['mean'][event_idx]
            kmer = nucleotide_seq[kmer_idx:kmer_idx + k]
            lp_emission = model.log_event_mean_gaussian_probability_match(event_mean, kmer)

            score_d = diag + lp_step + lp_emission
            score_u = up + lp_stay + lp_emission
            score_l = left + lp_skip

            max_score = score_d
            from_where = FROM_D
            if score_u > max_score:
                max_score = score_u
            if max_score == score_u:
                from_where = FROM_U
            if score_l > max_score:
                max_score = score_l
            if max_score == score_l:
                from_where = FROM_L

            if debug:
                # bug fix: the originals passed printf-style args straight to print();
                # format them explicitly
                print("[adafill] offset-up: %d offset-diag: %d offset-left: %d"
                      % (offset_up, offset_diag, offset_left))
                print("[adafill] up: %.2lf diag: %.2lf left: %.2lf" % (up, diag, left))
                print("[adafill] bi: %d o: %d e: %d k: %d s: %.2lf f: %d emit: %.2lf"
                      % (band_idx, offset, event_idx, kmer_idx, max_score, from_where, lp_emission))

            bands[offset, band_idx] = max_score
            trace[offset, band_idx] = from_where
            fills += 1

    # Backtrack to compute alignment
    sum_emission = 0
    n_aligned_events = 0
    max_score = -np.infty
    curr_event_idx = 0
    curr_kmer_idx = n_kmers - 1

    # Find best score between an event and the last k-mer, after trimming the remaining events
    for event_idx in range(n_events):
        band_idx1 = event_kmer_to_band(event_idx, curr_kmer_idx)
        offset = band_event_to_offset(band_idx1, event_idx)
        if is_offset_valid(offset):
            s = bands[offset, band_idx1] + (n_events - event_idx) * lp_trim
            if s > max_score:
                max_score = s
                curr_event_idx = event_idx

    if debug:
        print("[adaback] ei: %d ki: %d s: %.2f" % (curr_event_idx, curr_kmer_idx, max_score))

    out = []
    curr_gap = 0
    max_gap = 0
    moves = 0
    while curr_kmer_idx >= 0 and curr_event_idx >= 0:
        # emit alignment
        out.append((curr_kmer_idx, curr_event_idx))
        if debug:
            print("[adaback] ei: %d ki: %d" % (curr_event_idx, curr_kmer_idx))

        # qc stats
        event_mean = events['mean'][curr_event_idx]
        kmer = nucleotide_seq[curr_kmer_idx:curr_kmer_idx + k]
        kmer_emission = model.log_event_mean_gaussian_probability_match(event_mean, kmer)
        sum_emission += kmer_emission
        if moves == 0:
            events['model_state'][curr_event_idx] = kmer
            events['p_model_state'][curr_event_idx] = np.exp(kmer_emission)
        n_aligned_events += 1

        band_idx = event_kmer_to_band(curr_event_idx, curr_kmer_idx)
        offset = band_event_to_offset(band_idx, curr_event_idx)
        assert band_kmer_to_offset(band_idx, curr_kmer_idx) == offset

        from_where = trace[offset, band_idx]
        if from_where == FROM_D:
            moves += 1
            events['move'][curr_event_idx] = moves
            moves = 0
            curr_kmer_idx -= 1
            curr_event_idx -= 1
            curr_gap = 0
        elif from_where == FROM_U:
            # event stayed on the same kmer: no move recorded
            events['move'][curr_event_idx] = 0
            moves = 0
            curr_event_idx -= 1
            curr_gap = 0
        else:
            moves += 1
            curr_kmer_idx -= 1
            curr_gap += 1
            max_gap = max(curr_gap, max_gap)
    events['move'][0] = 0

    # QC results
    out = out[::-1]
    avg_log_emission = sum_emission / n_aligned_events
    spanned = out[0][0] == 0 and out[-1][0] == n_kmers - 1
    if avg_log_emission < min_average_log_emission or not spanned or max_gap > max_gap_threshold:
        print(spanned, avg_log_emission)
        print("We failed... f**k")
    return events[out[0][1]:out[-1][1] + 1], sum_emission
def test_event_table(data, req_fields=__default_event_table_fields__): """Wrapper function to test if event tables have required fields :param data: numpy array :param req_fields: required fields for event table """ return check_numpy_table(data, req_fields)
def simple_banded_event_align(events, model, nucleotide_seq):
    """Generate a banded alignment between events and a nucleotide sequence

    :param events: event table with required fields:
        ('start', 'length', 'mean', 'stdv', 'model_state', 'move', 'p_model_state')
    :param model: HmmModel model
    :param nucleotide_seq: nucleotide sequence to match up
    :return: tuple of (aligned slice of events, sum of log emission probabilities)
    """
    check_numpy_table(events, req_fields=('start', 'length', 'mean', 'stdv',
                                          'model_state', 'move', 'p_model_state'))
    assert isinstance(model, HmmModel), "Input model needs to be HmmModel"
    k = model.kmer_length

    # backtrack markers
    FROM_D = 0
    FROM_U = 1
    FROM_L = 2

    # qc
    min_average_log_emission = -5.0

    n_kmers = len(nucleotide_seq) - k + 1

    # banding
    bandwidth = min(1000, n_kmers + 1)
    half_band = bandwidth / 2

    # transitions
    lp_skip = np.log(0.001)
    lp_stay = np.log(0.5)
    lp_step = np.log(1.0 - np.exp(lp_skip) - np.exp(lp_stay))
    lp_trim = np.log(0.1)

    n_events = len(events)
    events_per_kmer = float(n_kmers) / n_events

    # Calculate the minimum event index that is within the band for each read kmer
    # We determine this using the expected number of events observed per kmer
    min_event_idx_by_kmer = []
    for ki in range(n_kmers):
        expected_event_idx = ki * events_per_kmer
        min_event_idx_by_kmer.append(max(int(expected_event_idx - half_band), 0))

    n_rows = bandwidth
    n_cols = n_kmers + 1

    viterbi_matrix = np.zeros((n_rows, n_cols), dtype=np.float64)
    backtrack_matrix = np.zeros((n_rows, n_cols), dtype=np.int32)
    # first row is unreachable except through the trim column
    for i in range(n_cols):
        viterbi_matrix[0][i] = -np.infty
    # column 0 is the trim state: each extra trimmed event costs lp_trim
    for i in range(bandwidth):
        viterbi_matrix[i][0] = i * lp_trim

    fills = 0
    for col in range(1, n_cols):
        kmer_idx = col - 1
        min_event_idx = min_event_idx_by_kmer[kmer_idx]
        if kmer_idx > 0:
            min_event_idx_prev_col = min_event_idx_by_kmer[kmer_idx - 1]
        else:
            min_event_idx_prev_col = 0

        for row in range(n_rows):
            event_idx = min_event_idx + row
            if event_idx >= n_events:
                viterbi_matrix[row][col] = -np.infty
                continue

            # dp update: calculate whether the event for each neighboring cell is
            # within the band and its position within the column
            row_up = event_idx - min_event_idx - 1
            row_diag = event_idx - min_event_idx_prev_col - 1
            row_left = event_idx - min_event_idx_prev_col

            up = viterbi_matrix[row_up][col] if 0 <= row_up < n_rows else -np.infty
            diag = viterbi_matrix[row_diag][col - 1] if 0 <= row_diag < n_rows else -np.infty
            left = viterbi_matrix[row_left][col - 1] if 0 <= row_left < n_rows else -np.infty

            event_mean = events['mean'][event_idx]
            kmer = nucleotide_seq[kmer_idx:kmer_idx + k]
            lp_emission = model.log_event_mean_gaussian_probability_match(event_mean, kmer)

            score_d = diag + lp_step + lp_emission
            score_u = up + lp_stay + lp_emission
            score_l = left + lp_skip

            max_score = score_d
            from_where = FROM_D
            if score_u > max_score:
                max_score = score_u
            if max_score == score_u:
                from_where = FROM_U
            if score_l > max_score:
                max_score = score_l
            if max_score == score_l:
                from_where = FROM_L

            viterbi_matrix[row][col] = max_score
            backtrack_matrix[row][col] = from_where
            fills += 1

    # Initialize by finding best alignment between an event and the last kmer
    curr_k_idx = n_kmers - 1
    curr_event_idx = 0
    max_score = -np.infty
    for row in range(n_rows):
        col = curr_k_idx + 1
        ei = row + min_event_idx_by_kmer[curr_k_idx]
        s = viterbi_matrix[row][col] + (n_events - ei - 1) * lp_trim
        if s > max_score and ei < n_events:
            max_score = s
            curr_event_idx = ei

    sum_emission = 0
    n_aligned_events = 0
    moves = 0
    out = []
    # bug fix: also guard curr_event_idx >= 0 (as adaptive_banded_simple_event_align does);
    # without it a step could drive the index negative and silently wrap around the array
    while curr_k_idx >= 0 and curr_event_idx >= 0:
        # emit alignment
        out.append((curr_k_idx, curr_event_idx))

        event_mean = events['mean'][curr_event_idx]
        kmer = nucleotide_seq[curr_k_idx:curr_k_idx + k]
        kmer_emission = model.log_event_mean_gaussian_probability_match(event_mean, kmer)
        sum_emission += kmer_emission
        if moves == 0:
            events['model_state'][curr_event_idx] = kmer
            events['p_model_state'][curr_event_idx] = np.exp(kmer_emission)
        n_aligned_events += 1

        # update indices using backtrack pointers
        row = curr_event_idx - min_event_idx_by_kmer[curr_k_idx]
        col = curr_k_idx + 1
        from_where = backtrack_matrix[row][col]
        if from_where == FROM_D:
            moves += 1
            events['move'][curr_event_idx] = moves
            curr_k_idx -= 1
            curr_event_idx -= 1
            moves = 0
        elif from_where == FROM_U:
            # event stayed on the same kmer: no move recorded
            events['move'][curr_event_idx] = 0
            curr_event_idx -= 1
            moves = 0
        else:
            moves += 1
            curr_k_idx -= 1
    events['move'][0] = 0

    # QC results
    out = out[::-1]
    avg_log_emission = sum_emission / n_aligned_events
    spanned = out[0][0] == 0 and out[-1][0] == n_kmers - 1
    if avg_log_emission < min_average_log_emission or not spanned:
        print(spanned, avg_log_emission)
        print("We failed... f**k")
    return events[out[0][1]:out[-1][1] + 1], sum_emission