예제 #1
0
def add_start_and_length_to_events(basecall_events, sampling_freq, start_time):
    """Append time-scale 'start' and 'length' columns computed from raw indexes.

    start = (raw_start / sampling_freq) + (start_time / sampling_freq)
    length = raw_length / sampling_freq

    :param basecall_events: structured array with uint64 'raw_start' and
        'raw_length' fields
    :param sampling_freq: sampling frequency of the experiment; must be > 0
        (the original check allowed 0, which divides by zero below)
    :param start_time: start time of the read in raw samples; must be non-zero
    :return: new structured array with float64 'start' and 'length' fields added
    """
    check_numpy_table(basecall_events, req_fields=('raw_start', 'raw_length'))
    assert basecall_events["raw_start"].dtype == np.dtype('uint64'), "Event raw_start should be uint64 type: {}" \
        .format(basecall_events["raw_start"].dtype)
    assert basecall_events["raw_length"].dtype == np.dtype('uint64'), "Event raw_length should be uint64 type: {}" \
        .format(basecall_events["raw_length"].dtype)
    # must be strictly positive: a zero frequency would divide by zero below
    assert sampling_freq > 0, "Invalid sampling frequency: {}".format(
        sampling_freq)
    assert start_time != 0, "Invalid start time: {}".format(start_time)

    freq = float(sampling_freq)
    # vectorized over the whole table instead of per-row Python lambdas
    starts = ((basecall_events["raw_start"] / freq) +
              (start_time / freq)).astype(np.float64)
    lengths = (basecall_events["raw_length"] / freq).astype(np.float64)
    basecall_events = append_fields(basecall_events,
                                    "start",
                                    starts,
                                    usemask=False)
    basecall_events = append_fields(basecall_events,
                                    "length",
                                    lengths,
                                    usemask=False)
    return basecall_events
예제 #2
0
def create_label_from_events(mea_events):
    """Trim input events to just required fields

        req_fields: 'reference_index'', 'path_kmer', 'posterior_probability', 'raw_start', 'raw_length'

    :param mea_events: events table reference_index', 'event_index', 'aligned_kmer', 'posterior_probability
    :return: structured label array sorted by 'raw_start'
    """
    check_numpy_table(mea_events,
                      req_fields=('reference_index', 'path_kmer',
                                  'posterior_probability', 'raw_start',
                                  'raw_length'))

    label = np.zeros(len(mea_events),
                     dtype=[('raw_start', int), ('raw_length', int),
                            ('reference_index', int),
                            ('posterior_probability', float), ('kmer', 'S5')])

    label['raw_start'] = mea_events["raw_start"]
    label['raw_length'] = mea_events["raw_length"]
    label['reference_index'] = mea_events["reference_index"]
    label['kmer'] = [convert_to_str(x) for x in mea_events["path_kmer"]]
    label['posterior_probability'] = mea_events["posterior_probability"]
    # np.sort returns a sorted copy; the original discarded the result,
    # so the label was returned unsorted
    label = np.sort(label, order='raw_start', kind='mergesort')

    return label
예제 #3
0
def get_mea_params_from_events(events):
    """Get the posterior matrix, shortest_ref_per_event and event matrix from events table

    :param events: events table with required fields
    :return: (posterior_matrix, shortest_ref_per_event, event_matrix) where
        posterior_matrix is an (n_events x n_ref_positions) array of posteriors,
        shortest_ref_per_event[i] is the smallest ref column seen from event i
        to the end of the table, and event_matrix stores the raw event row at
        each populated (event, ref) cell (0 elsewhere)
    """
    check_numpy_table(
        events,
        req_fields=('contig', 'reference_index', 'reference_kmer', 'strand',
                    'event_index', 'event_mean', 'event_noise',
                    'event_duration', 'aligned_kmer', 'scaled_mean_current',
                    'scaled_noise', 'posterior_probability',
                    'descaled_event_mean', 'ont_model_mean', 'path_kmer'))
    # get min/max args
    ref_start = min(events["reference_index"])
    ref_end = max(events["reference_index"])

    # sort events to collect the min ref position per event
    events = np.sort(events, order=['event_index'], kind='mergesort')
    event_start = events["event_index"][0]
    event_end = events["event_index"][-1]

    # check strand of the read: decreasing reference positions imply minus strand
    minus_strand = False
    if events[0]["reference_index"] > events[-1]["reference_index"]:
        minus_strand = True

    # matrix dimensions span the inclusive ref/event index ranges
    ref_length = int(ref_end - ref_start + 1)
    event_length = int(event_end - event_start + 1)

    # initialize data structures
    event_matrix = [[0 for _ in range(ref_length)]
                    for _ in range(event_length)]
    posterior_matrix = np.zeros([event_length, ref_length])
    shortest_ref_per_event = [np.inf for _ in range(event_length)]

    min_shortest_ref = np.inf
    # go through events backward to make sure the shortest ref per event is calculated at the same time
    for i in range(1, len(events) + 1):
        event = events[-i]
        event_indx = event["event_index"] - event_start
        if minus_strand:
            # mirror coordinates so column 0 corresponds to the highest
            # reference index on the minus strand
            ref_indx = event["reference_index"] - ref_end
            ref_indx *= -1
        else:
            ref_indx = event["reference_index"] - ref_start

        # using full event so we can use the same matrix to assemble the training data later
        posterior_matrix[event_indx][ref_indx] = event['posterior_probability']
        event_matrix[event_indx][ref_indx] = event
        # edit shortest ref per event list
        if shortest_ref_per_event[event_indx] > ref_indx:
            if min_shortest_ref > ref_indx:
                min_shortest_ref = ref_indx
            # running minimum: because iteration is backward, this is the
            # smallest ref column seen from this event onward
            shortest_ref_per_event[event_indx] = min_shortest_ref

    return posterior_matrix, shortest_ref_per_event, event_matrix
예제 #4
0
def index_bases_from_events(events, kmer_index=2):
    """Map basecalled sequence to events from a table with required fields

    Expands the event table into per-base parallel lists: for every base of the
    final read sequence, records the raw start/length of the event it came from
    and that event's model-state probability.

    :param kmer_index: index of kmer to create map
    :param events: original base-called events with required fields
    :return: (bases, base_raw_starts, base_raw_lengths, probs) — parallel
        lists, one entry per base of the basecalled sequence
    """

    check_numpy_table(events,
                      req_fields=('raw_start', 'model_state', 'p_model_state',
                                  'raw_length', 'move'))
    assert len(events[0]['model_state']) > kmer_index, \
        "Selected too big of a kmer_index len(kmer) !> kmer_index, {} !> {} ".format(len(events[0]['model_state']),
                                                                                     kmer_index)
    probs = []
    base_raw_starts = []
    bases = []
    base_raw_lengths = []
    for i, event in enumerate(events):
        if i == 0:
            # initialize with first kmer: every base up to and including
            # kmer_index is attributed to event 0
            base_raw_starts.extend([
                event['raw_start']
                for _ in event['model_state'][:kmer_index + 1]
            ])
            probs.extend([
                event['p_model_state']
                for _ in event['model_state'][:kmer_index + 1]
            ])
            # model_state is bytes, so iteration yields ints; chr() maps to str
            bases.extend(
                [chr(x) for x in event['model_state'][:kmer_index + 1]])
            base_raw_lengths.extend([
                event['raw_length']
                for _ in event['model_state'][:kmer_index + 1]
            ])
        else:
            # if there was a move, gather the information for each base by index
            if event['move'] > 0:
                char_moves = bytes.decode(
                    event['model_state'][kmer_index:kmer_index +
                                         event['move']])
                for x in range(event['move']):
                    base_raw_starts.append(event['raw_start'])
                    probs.append(event['p_model_state'])
                    bases.append(char_moves[x])
                    base_raw_lengths.append(event['raw_length'])
    # gather last bases for the last event
    # NOTE: relies on `event` still being bound to the final row of the loop
    base_raw_starts.extend(
        [event['raw_start'] for _ in event['model_state'][kmer_index + 1:]])
    probs.extend([
        event['p_model_state'] for _ in event['model_state'][kmer_index + 1:]
    ])
    bases.extend([chr(x) for x in event['model_state'][kmer_index + 1:]])
    base_raw_lengths.extend(
        [event['raw_length'] for _ in event['model_state'][kmer_index + 1:]])

    # the index of each corresponds to the index of the final sequence
    return bases, base_raw_starts, base_raw_lengths, probs
예제 #5
0
    def fix_sa_reference_indexes(self, data):
        """Shift reference indexes to account for kmer length and kmer index."""
        check_numpy_table(data, req_fields=('reference_index', 'raw_start', 'kmer'))
        kmer_len = len(data[0]["kmer"])

        # minus-strand alignments count the anchor base from the kmer's far end
        offset = self.kmer_index
        if self.aligned_signal.minus_strand:
            offset = (kmer_len - 1) - self.kmer_index
        data["reference_index"] += offset
        return data
예제 #6
0
def match_events_with_eventalign(events=None, event_detections=None, minus=False, rna=False):
    """Match event index with event detection data to label segments of signal for each kmer

    Kmer orientation rules:
    # RNA is sequenced 3'-5': reversed relative to the fasta/q sequence,
    #   so a minus-strand mapping needs only the complement
    #   (reverse of reverse-complement = complement)
    # DNA is sequenced 5'-3': a minus-strand mapping needs reverse complement

    :param events: events table reference_index', 'event_index', 'aligned_kmer', 'posterior_probability
    :param event_detections: event detection event table
    :param minus: boolean option to for minus strand mapping
    :param rna: boolean for RNA read
    """
    assert events is not None, "Must pass signal alignment events"
    assert event_detections is not None, "Must pass event_detections events"

    check_numpy_table(events, req_fields=('position', 'event_index',
                                          'reference_kmer'))
    check_numpy_table(event_detections, req_fields=('start', 'length'))

    label = np.zeros(len(events),
                     dtype=[('raw_start', int), ('raw_length', int),
                            ('reference_index', int),
                            ('posterior_probability', float), ('kmer', 'S6')])

    event_ids = events["event_index"]
    label['raw_start'] = [event_detections[i]["start"] for i in event_ids]
    label['raw_length'] = [event_detections[i]["length"] for i in event_ids]
    label['reference_index'] = events["position"]

    def convert_to_str(string):
        """Helper function to catch bytes as strings"""
        return string if type(string) is str else bytes.decode(string)

    flip = ReverseComplement()
    raw_kmers = events["reference_kmer"]
    if minus and rna:
        kmers = [flip.complement(convert_to_str(k)) for k in raw_kmers]
    elif minus:
        kmers = [flip.reverse_complement(convert_to_str(k)) for k in raw_kmers]
    elif rna:
        kmers = [flip.reverse(convert_to_str(k)) for k in raw_kmers]
    else:
        kmers = raw_kmers
    label['kmer'] = kmers
    # eventalign gives no posterior, so every match gets probability 1
    label['posterior_probability'] = np.ones(len(events))

    return label
예제 #7
0
def check_event_table_time(event_table):
    """Verify that each event starts exactly where the previous one ends.

    :param event_table: event table with "start" and "length" columns
    :return: True when start/length timing is consistent, False otherwise
    """
    check_numpy_table(event_table, req_fields=('start', 'length'))

    expected_start = event_table[0]["start"] + event_table[0]["length"]
    for row in event_table[1:]:
        if row["start"] != expected_start:
            return False
        expected_start = row["start"] + row["length"]
    return True
예제 #8
0
 def get_basecalled_data_by_number(self, number):
     """Get basecalled event data by the number

     :param number: integer selecting which basecall analysis to read
     :return: events table, with raw_start/raw_length added if missing
     :raises ValueError: when the basecall path cannot be read
     """
     assert type(number) is int, "Number must be an integer"
     events_path = self.__default_template_1d_basecall_events__.format(number)
     try:
         events = self[events_path][()]
     # was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit;
     # narrow to Exception and chain the cause for debuggability
     except Exception as e:
         raise ValueError('Could not retrieve basecall_1D data from {}'.format(events_path)) from e
     try:
         check_numpy_table(events, req_fields=('raw_start', 'raw_length'))
     # if events do not have raw_start or raw_lengths
     except KeyError:
         events = add_raw_start_and_raw_length_to_events(events, self.sample_rate, self.raw_attributes["start_time"])
     return events
예제 #9
0
 def has_valid_event_table_format(self, oned_root_address):
     """Check if the 'start' and 'length' values are in the time scale, NOT the index scale
     :param oned_root_address: Basecalled analysis path
     :return: boolean if start and length are correct format
     :raises KeyError: when no event table exists at the expected path
     """
     template_event_table_address = os.path.join(
         oned_root_address, "BaseCalled_template/Events")
     # the original fell through with `template_events` unbound when the path
     # was missing, raising a confusing NameError; fail loudly instead
     if template_event_table_address not in self.fastFive:
         raise KeyError("No event table found at {}".format(template_event_table_address))
     template_events = np.asarray(
         self.fastFive[template_event_table_address])
     check_numpy_table(template_events, req_fields=('start', 'length'))
     # uint64 'start' means raw-index scale, i.e. NOT valid time format;
     # compare dtypes with == (identity via `is` is an implementation detail)
     return template_events["start"].dtype != np.dtype('uint64')
예제 #10
0
    def add_label(self, label, name, label_type):
        """Add labels to class.

        :param label: label numpy array with required fields ['raw_start', 'raw_length', 'reference_index',
                                                              'kmer', 'posterior_probability']
        :param name: name of the label for signal
        :param label_type: type of label  :['label', 'prediction', 'guide']
        :raises IndexError: when the labels index past the end of the signal
        """
        assert label_type in ['label', 'prediction', 'guide'], \
            "{} not in ['label', 'prediction', 'guide']: Must select an acceptable type".format(label_type)
        check_numpy_table(label,
                          req_fields=('raw_start', 'raw_length',
                                      'reference_index', 'kmer',
                                      'posterior_probability'))

        # check the labels are in the correct format
        assert min(label["raw_start"]) >= 0, "Raw start cannot be less than 0"
        # the original formatted `row["posterior_probability"]` here, but `row`
        # was never defined — the assert itself would raise NameError
        assert 0 <= max(label["posterior_probability"]) <= 1, \
            "posterior_probability must be between zero and one {}".format(max(label["posterior_probability"]))

        # make sure last label can actually index the signal correctly
        try:
            self.scaled_signal[label[-1]["raw_start"]:label[-1]["raw_start"] +
                               label[-1]["raw_length"]]
        except IndexError:
            raise IndexError("labels are longer than signal")

        # sorted copy used only for strand inference; stored label stays as-is
        label1 = np.sort(label, order=['raw_start'], kind='mergesort')

        # infer strand alignment of read
        if label1[0]["reference_index"] >= label1[-1]["reference_index"]:
            minus_strand = True
        else:
            minus_strand = False
        if self.minus_strand is not None:
            if label[0]["raw_start"] != label[-1]["raw_start"]:
                assert self.minus_strand == minus_strand, "New label has different strand direction, check label"
        else:
            self.minus_strand = minus_strand

        # set label with the specified name
        if label_type == 'label':
            self.label[name] = label
        elif label_type == 'prediction':
            self.prediction[name] = label
        elif label_type == 'guide':
            self.guide[name] = label
예제 #11
0
    def add_basecall_alignment_prediction(self, sam=None, number=None, add_mismatches=False, my_events=None, trim=None):
        """Add the original basecalled event table and add matches and missmatches to 'prediction'
            alignment labels to signal_label handle
        :param sam: correctly formatted SAM string
        :param number: integer representing which signal align predictions to plot
        :param add_mismatches: boolean option to add mismatches to labels
        :param my_events: if you want to pass the event table in directly
        :param trim: trim both sides of continuous matches to show anchor pairs
        :return: (matches, mismatches) label arrays added to the aligned signal
        """
        # fall back to the SAM stored with the signalalign events
        if not sam:
            sam = self.get_signalalign_events(sam=True)
        matches_name = "matches_guide_alignment"
        mismatches_name = "mismatches_guide_alignment"

        if my_events is not None:
            # caller-supplied event table takes precedence over file lookup
            events = my_events
            if number is not None:
                matches_name = "matches_guide_alignment_{}".format(number)
                mismatches_name = "mismatches_guide_alignment_{}".format(number)

        else:
            if number is not None:
                events = self.get_basecalled_data_by_number(number)
                matches_name = "matches_guide_alignment_{}".format(number)
                mismatches_name = "mismatches_guide_alignment_{}".format(number)
            else:
                events = self.get_basecall_data()
        try:
            check_numpy_table(events, req_fields=('raw_start', 'raw_length'))
        # if events do not have raw_start or raw_lengths
        except KeyError:
            # derive raw_start/raw_length from the read's timing metadata
            events = add_raw_start_and_raw_length_to_events(events, self.sample_rate, self.raw_attributes["start_time"])

        matches, mismatches, raw_starts = match_cigar_with_basecall_guide(events=events, sam_string=sam, rna=self.rna,
                                                                          kmer_index=self.kmer_index)

        if trim is not None:
            matches = trim_matches(matches, trim=trim)

        self.aligned_signal.add_label(matches, name=matches_name, label_type='prediction')
        if add_mismatches:
            self.aligned_signal.add_label(mismatches, name=mismatches_name, label_type='prediction')
        self.aligned_signal.add_raw_starts(raw_starts)
        self.has_guide_alignment = True
        return matches, mismatches
예제 #12
0
def index_to_time(basecall_events, sampling_freq=0, start_time=0):
    """Convert RNA basecall read start and length from indexes to time stamps

    :param basecall_events: basecall events from albacore/metricore basecalled event table
    :param sampling_freq: sampling frequency of experiment
    :param start_time: start time of experiment via fasta5 file
    :return: event table with float 'start' and 'length' in seconds
    """
    check_numpy_table(basecall_events, req_fields=('start', 'length'))
    # compare dtypes with == (`is` relies on dtype caching); the old message
    # claimed np.int32 while the check was against uint64
    assert basecall_events["start"].dtype == np.dtype('uint64'), \
        "Event start should be uint64 type: {}".format(basecall_events["start"].dtype)
    assert sampling_freq != 0, "Must set sampling frequency"
    assert start_time != 0, "Must set start time"

    event_table = change_np_field_type(basecall_events, 'start', float)
    event_table = change_np_field_type(event_table, 'length', float)
    # time = (raw_index + start_offset) / sampling_freq
    event_table["start"] = (event_table["start"] / sampling_freq) + (start_time / sampling_freq)
    event_table["length"] = event_table["length"] / float(sampling_freq)
    return event_table
예제 #13
0
def sequence_from_events(events):
    """Reconstruct the basecalled read sequence from an event table.

    :param events: event table with 'model_state' and 'move' fields
    :return: read sequence as a string
    """
    check_numpy_table(events, req_fields=("model_state", "move"))
    pieces = []
    for idx, event in enumerate(events):
        if idx == 0:
            # seed with every base of the first kmer
            pieces.extend(chr(b) for b in event['model_state'])
        elif event['move'] > 0:
            # each move exposes that many new bases at the kmer's tail
            pieces.append(bytes.decode(event['model_state'][-event['move']:]))
    return ''.join(pieces)
예제 #14
0
def time_to_index(event_table, sampling_freq=0, start_time=0):
    """Convert start and lengths from time to raw signal indexes

    :param event_table: basecall events from albacore/metricore basecalled event table
    :param sampling_freq: sampling frequency of experiment
    :param start_time: start time of experiment via fasta5 file
    :return: event table with integer 'start' and 'length' in raw indexes
    """
    check_numpy_table(event_table, req_fields=('start', 'length'))
    # compare dtypes with != (`is not` relies on dtype caching); the old
    # message said np.int32 while the check was against uint64
    assert event_table["start"].dtype != np.dtype('uint64'), \
        "Event start should not be uint64 type: {}".format(event_table["start"].dtype)
    assert sampling_freq != 0, "Must set sampling frequency"
    assert start_time != 0, "Must set start time"

    # invert the time conversion: index = (time - start_offset) * sampling_freq
    event_table["start"] = np.round((event_table["start"] - (start_time / float(sampling_freq))) * sampling_freq)
    event_table["length"] = np.round(event_table["length"] * sampling_freq)
    event_table = change_np_field_type(event_table, 'start', int)
    event_table = change_np_field_type(event_table, 'length', int)

    return event_table
예제 #15
0
def match_events_with_signalalign(sa_events=None, event_detections=None):
    """Match event index with event detection data to label segments of signal for each kmer

    # RNA is sequenced 3'-5'
    # reversed for fasta/q sequence
    # if mapped to minus strand
    # reverse reverse complement = complement

    # DNA is sequenced 5'-3'
    # if mapped to minus strand
    # reverse complement

    :param sa_events: events table reference_index', 'event_index', 'aligned_kmer', 'posterior_probability
    :param event_detections: event detection event table
    :return: structured label array sorted by 'raw_start'
    """
    assert sa_events is not None, "Must pass signal alignment events"
    assert event_detections is not None, "Must pass event_detections events"

    check_numpy_table(sa_events,
                      req_fields=('reference_index', 'event_index',
                                  'path_kmer', 'posterior_probability'))

    check_numpy_table(event_detections, req_fields=('raw_start', 'raw_length'))

    label = np.zeros(len(sa_events),
                     dtype=[('raw_start', int), ('raw_length', int),
                            ('reference_index', int),
                            ('posterior_probability', float), ('kmer', 'S5')])

    label['raw_start'] = [
        event_detections[x]["raw_start"] for x in sa_events["event_index"]
    ]
    label['raw_length'] = [
        event_detections[x]["raw_length"] for x in sa_events["event_index"]
    ]
    label['reference_index'] = sa_events["reference_index"]

    label['kmer'] = [convert_to_str(x) for x in sa_events["path_kmer"]]
    label['posterior_probability'] = sa_events["posterior_probability"]
    # np.sort returns a sorted copy; the original discarded the result,
    # so the label was returned unsorted
    label = np.sort(label, order='raw_start', kind='mergesort')

    return label
예제 #16
0
    def check_strand_mapping(self, data):
        """Infer (and validate) whether this read aligns to the reverse strand.

        :param data: numpy table with 'reference_index' field
        """
        check_numpy_table(data, req_fields=('reference_index', 'raw_start'))
        # decreasing reference positions imply a minus-strand alignment
        minus_strand = bool(data[0]["reference_index"] >= data[-1]["reference_index"])

        # rna is read 3' - 5', so the inference flips
        if self.rna:
            minus_strand = not minus_strand

        if self.minus_strand is None:
            self.minus_strand = minus_strand
        elif data[0]["raw_start"] != data[-1]["raw_start"]:
            assert self.minus_strand == minus_strand, "New label has different strand direction, check label"
예제 #17
0
    def add_label(self, label, name, label_type, guide_name=None, check_strand=True):
        """Add labels to class.

        :param label: label numpy array with required fields ['raw_start', 'raw_length', 'reference_index',
                                                              'kmer', 'posterior_probability']
        :param name: name of the label for signal
        :param label_type: type of label  :['label', 'prediction', 'guide']
        :param guide_name: must pass your own label via guide_name if label_type is guide
        :param check_strand: validate strand consistency of the sorted labels
        """
        assert label_type in ['label', 'prediction', 'guide'], \
            "{} not in ['label', 'prediction', 'guide']: Must select an acceptable type".format(label_type)
        check_numpy_table(label, req_fields=('raw_start', 'raw_length', 'reference_index',
                                             'kmer', 'posterior_probability'))

        # sanity-check label contents before storing anything
        assert min(label["raw_start"]) >= 0, "Raw start cannot be less than 0"
        max_prob = max(label["posterior_probability"])
        assert 0 <= max_prob <= 1, \
            "posterior_probability must be between zero and one {}".format(max_prob)
        if label_type == 'guide':
            assert guide_name is not None, "If label_type is 'guide', you must pass in a guide_name"
        # make sure last label can actually index the signal correctly
        last = label[-1]
        try:
            self.scaled_signal[last["raw_start"]:last["raw_start"] + last["raw_length"]]
        except IndexError:
            raise IndexError("labels are longer than signal")

        sorted_label = np.sort(label, order=['raw_start'], kind='mergesort')
        if check_strand:
            self.check_strand_mapping(sorted_label)
        # store the sorted labels under the requested category
        if label_type == 'label':
            self.label[name] = sorted_label
        elif label_type == 'prediction':
            self.prediction[name] = sorted_label
        else:
            self.guide[guide_name][name] = sorted_label
예제 #18
0
    def test_check_numpy_table(self):
        """Test check_numpy_table method"""
        with captured_output() as (_, _):
            new = np.empty(3,
                           dtype=[('reference_index', int),
                                  ('event_index', int),
                                  ('posterior_probability', float)])
            check_numpy_table(new,
                              req_fields=[
                                  "reference_index", 'event_index',
                                  'posterior_probability'
                              ])

            with self.assertRaises(KeyError):
                check_numpy_table(new,
                                  req_fields=[
                                      "something", 'event_index',
                                      'posterior_probability'
                                  ])
            # previously this call shared the context manager above, so it
            # never executed (the first call already raised); give it its own
            with self.assertRaises(KeyError):
                check_numpy_table(new, req_fields=["something"])
            with self.assertRaises(TypeError):
                check_numpy_table('1', req_fields=["event_index"])
예제 #19
0
def create_anchor_kmers(new_events, old_events):
    """
    Create anchor kmers for new event table.

    Basically, grab kmer and move information from previous event table and
    pull events covering the same time span into new event table.

    For each new event, every old event overlapping its time window is
    collected; the kmer with the most overlap time wins and its probability /
    move data are written onto the new event in place. Events before the first
    old event and after the last are trimmed from the returned slice.
    :param new_events: new event table
    :param old_events: event table from Fast5 file
    :return New event table
    """
    num_old_events = len(old_events)
    check_numpy_table(new_events, req_fields=('start', 'length', 'mean', 'stdv', 'model_state', 'move', 'p_model_state'))
    check_numpy_table(old_events, req_fields=('start', 'length', 'mean', 'stdv', 'model_state', 'move', 'p_model_state'))
    # index of old events
    old_indx = 0
    # start index to trim new_events for those with data from old_events
    start_index = 0
    end_index = len(new_events)
    # personal tracker for dealing with how the segmentation algorithm is working
    most_moves = 0
    # tracking overlaped events
    selected_overlap = False
    check_overlap = False
    homopolymer = False
    # keep track of events passed
    last_left_over = 0
    for i, event in enumerate(new_events):
        # skip events that occur before labels from old events
        if old_events[0]["start"] <= event["start"]:
            # time of old event in new event for a given kmer
            time = []
            probs = []
            moves = []
            kmers = []
            # new event's start and end (rounded to 7 places to avoid float drift)
            current_event_start = round(event["start"], 7)
            current_event_end = round(current_event_start + event["length"], 7)
            # if first event or event start is after current old_event start.
            if old_indx != num_old_events:
                prev_kmer = str()
                num_loops = 0
                # walk every old event whose span overlaps this new event
                while round(old_events[old_indx]["start"], 7) < current_event_end and old_indx != num_old_events:
                    # deal with bad event files and final event
                    if old_indx == num_old_events-1:
                        old_event_end = round(old_events[old_indx]["start"] + old_events[old_indx]["length"], 7)
                    else:
                        old_event_end = round(old_events[old_indx+1]["start"], 7)
                    old_event_start = round(old_events[old_indx]["start"], 7)
                    old_kmer = bytes.decode(old_events[old_indx]["model_state"])
                    # homopolymers or stays should be tracked together
                    if old_kmer == prev_kmer:
                        if len(set(old_kmer)) == 1:
                            # NOTE(review): `index` here is carried over from a
                            # previous iteration (possibly the previous new
                            # event) — confirm this is intended
                            if not homopolymer and selected_overlap and num_loops <= 1:
                                moves[index] = 0
                            homopolymer = True
                        else:
                            homopolymer = False
                        # merge stay/homopolymer data into the existing kmer entry
                        index = kmers.index(old_kmer)
                        probs[index] = max(probs[index], old_events[old_indx]["p_model_state"])
                        moves[index] += old_events[old_indx]["move"]
                    else:
                        # add new kmer
                        index = len(time)
                        kmers.append(old_kmer)
                        probs.append(old_events[old_indx]["p_model_state"])
                        moves.append(old_events[old_indx]["move"])
                        time.append(0)
                        homopolymer = False
                    prev_kmer = old_kmer
                    # if old event passes through current event calculate correct time in current event
                    # deal with old events ending after the new event end
                    if old_event_end > current_event_end:
                        time[index] += current_event_end - old_event_start
                        new_check_overlap = True
                        break
                    # check if entire old event is within the new event or not
                    else:
                        if old_event_start < current_event_start:
                            time[index] += old_event_end - current_event_start
                        else:
                            time[index] += old_event_end - old_event_start
                        old_indx += 1
                        new_check_overlap = False
                    num_loops += 1
                    # break loop at end of old events
                    if old_indx == num_old_events:
                        break
            else:
                # ran out of old events: nothing past here has labels
                end_index = i
            num_kmers = len(kmers)
            # select index of best kmer to assign
            if num_kmers == 1:
                best_index = 0
                left_over = 0
            elif num_kmers > 1:
                # select on time in new event only
                best_index = time.index(max(time))
                # if there are several old events in a new event, track how many
                if new_check_overlap:
                    left_over = sum(moves[best_index+1:-1])
                else:
                    left_over = sum(moves[best_index+1:])
            else:
                # end of possible alignments
                end_index = i
                break
            # if previous old event overlapped into current new event
            # check if old event is going to be assigned twice
            if selected_overlap and best_index == 0 and check_overlap:
                if homopolymer:
                    move = moves[best_index]
                else:
                    move = 0
            elif selected_overlap and best_index != 0 and check_overlap:
                # cap move at 5
                move = min(5, moves[best_index] + last_left_over)
            else:
                move = min(5, moves[best_index]+sum(moves[:best_index])+last_left_over)
                if most_moves < moves[best_index]+sum(moves[:best_index])+last_left_over:
                    most_moves = moves[best_index]+sum(moves[:best_index])+last_left_over
            # if new overlap
            if new_check_overlap:
                # new overlapped event will be tracked on next new_event so we drop a left_over count
                left_over = max(0, left_over-1)
                if most_moves < left_over-1:
                    most_moves = left_over-1

                # check if we currently selected an overlapping old event
                if best_index == num_kmers-1:
                    selected_overlap = True
                else:
                    selected_overlap = False
            else:
                selected_overlap = False

            kmer = kmers[best_index]
            prob = probs[best_index]
            # assign event probs, move and model state
            event["p_model_state"] = prob
            event["move"] = move
            event["model_state"] = kmer
            check_overlap = new_check_overlap
            last_left_over = left_over
            new_check_overlap = False
            homopolymer = False
        else:
            # skip event: it starts before the first labeled old event
            start_index = i + 1
    return new_events[start_index:end_index]
예제 #20
0
def create_labels_from_guide_alignment(events,
                                       sam_string,
                                       rna=False,
                                       reference_path=None,
                                       kmer_index=2,
                                       one_ref_indexing=False):
    """Create labeled signal from a guide alignment with only matches being reported

    Contiguous runs of matched reference positions are grouped into blocks;
    one structured numpy array is returned per block.

    :param events: basecalled event table (structured numpy array) with fields
        'raw_start', 'model_state', 'p_model_state', 'raw_length', 'move'
    :param sam_string: sam alignment string
    :param rna: if read is rna, reverse again
    :param reference_path: if sam_string has MDZ field the reference sequence can be inferred, otherwise, it is needed
    :param kmer_index: index of the kmer to select for reference to event mapping
    :param one_ref_indexing: boolean zero or 1 based indexing for reference
    :return: list of structured numpy arrays (one per contiguous reference block)
        with fields 'raw_start', 'raw_length', 'reference_index',
        'posterior_probability', 'kmer'
    """
    def _pack_label(starts, lengths, ref_indices, kmers, post_probs):
        """Pack parallel label lists into a single structured numpy array."""
        label = np.zeros(len(starts),
                         dtype=[('raw_start', int),
                                ('raw_length', int),
                                ('reference_index', int),
                                ('posterior_probability', float),
                                ('kmer', 'S5')])
        label['raw_start'] = starts
        label['raw_length'] = lengths
        label['reference_index'] = ref_indices
        label['kmer'] = kmers
        label['posterior_probability'] = post_probs
        return label

    # test if the required fields are in structured numpy array
    check_numpy_table(events,
                      req_fields=('raw_start', 'model_state', 'p_model_state',
                                  'raw_length', 'move'))
    assert type(one_ref_indexing) is bool, "one_ref_indexing must be a boolean"

    psam_h = initialize_pysam_wrapper(sam_string,
                                      reference_path=reference_path)
    # create an indexed map of the events and their corresponding bases
    bases, base_raw_starts, base_raw_lengths, probs = index_bases_from_events(
        events, kmer_index=kmer_index)

    # check if string mapped to reverse strand
    if psam_h.alignment_segment.is_reverse:
        probs = probs[::-1]
        base_raw_starts = base_raw_starts[::-1]
        # rna reads go 3' to 5' so we dont need to reverse if it mapped to reverse strand
        if not rna:
            bases = ReverseComplement().reverse(''.join(bases))
    # reverse if it mapped to forward strand and RNA
    elif rna:
        bases = ReverseComplement().reverse(''.join(bases))

    # all 'matches' and 'mismatches'
    matches_map = psam_h.seq_alignment.matches_map
    # zero indexed reference start (shifted by one when one_ref_indexing is True)
    ref_start = psam_h.alignment_segment.reference_start + one_ref_indexing
    # accumulators for the current contiguous block of reference positions
    raw_start = []
    raw_length = []
    reference_index = []
    kmer = []
    posterior_probability = []
    cigar_labels = []
    prev = matches_map[0].reference_index
    for i, alignment in enumerate(matches_map):
        if i == 0 or alignment.reference_index == prev + 1:
            # contiguous with previous reference position: extend current block
            raw_start.append(base_raw_starts[alignment.query_index])
            raw_length.append(base_raw_lengths[alignment.query_index])
            reference_index.append(alignment.reference_index + ref_start)
            kmer.append(alignment.reference_base)
            posterior_probability.append(probs[alignment.query_index])
        else:
            # gap in reference positions: close out the block and start a new one
            cigar_labels.append(_pack_label(raw_start, raw_length,
                                            reference_index, kmer,
                                            posterior_probability))
            raw_start = [base_raw_starts[alignment.query_index]]
            raw_length = [base_raw_lengths[alignment.query_index]]
            reference_index = [alignment.reference_index + ref_start]
            kmer = [alignment.reference_base]
            posterior_probability = [probs[alignment.query_index]]
        # keep track of reference positions
        prev = alignment.reference_index

    # catch the last label
    cigar_labels.append(_pack_label(raw_start, raw_length, reference_index,
                                    kmer, posterior_probability))

    return cigar_labels
예제 #21
0
def match_cigar_with_basecall_guide(events, sam_string, kmer_index, rna=False, reference_path=None,
                                    one_ref_indexing=False):
    """Create labeled signal from a guide alignment, split into matches and mismatches

    :param events: basecalled event table (structured numpy array) with fields
        'raw_start', 'model_state', 'p_model_state', 'raw_length', 'move'
    :param sam_string: sam alignment string
    :param kmer_index: index of the kmer to select for reference to event mapping
    :param rna: if read is rna, reverse again
    :param reference_path: if sam_string has MDZ field the reference sequence can be inferred, otherwise, it is needed
    :param one_ref_indexing: boolean zero or 1 based indexing for reference
    :return: (matches, mismatches, raw_starts) where matches/mismatches are
        structured numpy arrays trimmed by kmer_index on both ends and
        raw_starts is events["raw_start"]
    """
    check_numpy_table(events, req_fields=('raw_start', 'model_state', 'p_model_state', 'raw_length', 'move'))
    assert type(one_ref_indexing) is bool, "one_ref_indexing must be a boolean"

    psam_h = initialize_aligned_segment_wrapper(sam_string, reference_path=reference_path)

    # create an indexed map of the events and their corresponding bases
    _, base_raw_starts, base_raw_lengths, probs = index_bases_from_events(events, kmer_index=kmer_index)
    if rna:
        # events are 3'-5', swap to correct for alignment file
        base_raw_starts = base_raw_starts[::-1]
        base_raw_lengths = base_raw_lengths[::-1]
        probs = probs[::-1]
    # all 'matches' and 'mismatches'
    matches_map = psam_h.seq_alignment.matches_map
    ref_len = len(psam_h.get_reference_sequence())
    # zero indexed reference start (shifted by one when one_ref_indexing is True)
    ref_start = psam_h.alignment_segment.reference_start + one_ref_indexing
    is_reverse = psam_h.alignment_segment.is_reverse

    # accumulate one (raw_start, raw_length, reference_index, prob, kmer) row per
    # aligned base, binned by whether the basecall agrees with the reference
    match_rows = []
    mismatch_rows = []
    for alignment in matches_map:
        if is_reverse:
            # flip reference coordinate for reverse-strand mappings
            ref_index = (ref_start + ref_len - 1) - alignment.reference_index
        else:
            ref_index = ref_start + alignment.reference_index
        row = (base_raw_starts[alignment.query_index],
               base_raw_lengths[alignment.query_index],
               ref_index,
               probs[alignment.query_index],
               alignment.reference_base)
        if alignment.query_base == alignment.reference_base:
            match_rows.append(row)
        else:
            mismatch_rows.append(row)

    label_dtype = [('raw_start', int), ('raw_length', int), ('reference_index', int),
                   ('posterior_probability', float), ('kmer', 'S5')]
    matches = np.array(match_rows, dtype=label_dtype)
    mismatches = np.array(mismatch_rows, dtype=label_dtype)

    # trim extra event alignments introduced by the kmer_index offset
    # BUG FIX: mismatches were previously sliced with len(matches), which trims
    # the wrong number of rows whenever the two tables differ in length
    return (matches[kmer_index:len(matches) - kmer_index],
            mismatches[kmer_index:len(mismatches) - kmer_index],
            events["raw_start"])
예제 #22
0
def adaptive_banded_simple_event_align(events, model, nucleotide_seq, debug=False):
    """Generate a banded alignment between events and a nucleotide sequence using the adapted banded approach
    source: https://www.biorxiv.org/content/biorxiv/early/2017/04/25/130633.full.pdf /  nanopolish

    Mutates 'model_state', 'p_model_state' and 'move' of the aligned events
    in place, then returns the aligned slice of the event table.

    :param events: event table with required fields: ('start', 'length', 'mean', 'stdv', 'model_state', 'move', 'p_model_state')
    :param model: HmmModel model
    :param nucleotide_seq: nucleotide sequence to match up
    :param debug: boolean debug option
    :return: (aligned slice of events, sum of log emission probabilities)
    """
    # initialize helper functions
    def move_down(curr_band):
        """Move location in dp matrix down one aka add one to event_index"""
        return EventKmerPair(event_idx=curr_band.event_idx + 1, kmer_idx=curr_band.kmer_idx)

    def move_right(curr_band):
        """Move location in dp matrix right one aka add one to kmer_index"""
        return EventKmerPair(event_idx=curr_band.event_idx, kmer_idx=curr_band.kmer_idx + 1)

    def event_kmer_to_band(ei, ki):
        """Map an (event_idx, kmer_idx) pair to its anti-diagonal band index"""
        return (ei + 1) + (ki + 1)

    def band_event_to_offset(bi, ei):
        """Get event index of a specific event pair in the 'band_lower_left' pairs and subtract the event offset"""
        return band_lower_left[bi].event_idx - ei

    def band_kmer_to_offset(bi, ki):
        """Subtract kmer index from the specific event pair in the 'band_lower_left' pairs """
        return ki - band_lower_left[bi].kmer_idx

    def is_offset_valid(offset1):
        """Check if the offset is greater than zero and smaller than the bandwidth"""
        return 0 <= offset1 < bandwidth

    def event_at_offset(bi, offset1):
        """Get kmer index minus offset for a band within the 'band_lower_left' array of event/kmer pairs"""
        return band_lower_left[bi].event_idx - offset1

    def kmer_at_offset(bi, offset1):
        """Get kmer index plus offset for a band within the 'band_lower_left' array of event/kmer pairs"""
        return band_lower_left[bi].kmer_idx + offset1

    check_numpy_table(events, req_fields=('start', 'length', 'mean', 'stdv', 'model_state', 'move', 'p_model_state'))
    assert isinstance(model, HmmModel), "Input model needs to be HmmModel"
    k = model.kmer_length
    # strand_idx = 0
    # how to deal with 2d?
    n_events = len(events)
    n_kmers = len(nucleotide_seq) - k + 1

    # backtrack markers
    FROM_D = 0
    FROM_U = 1
    FROM_L = 2

    # # // qc
    min_average_log_emission = -5.0
    max_gap_threshold = 50

    # // banding
    bandwidth = 100
    half_bandwidth = int(bandwidth / 2)
    # setting a tiny skip penalty helps keep the true alignment within the adaptive band
    # this was empirically determined (From Nanopolish)

    # transition penalties
    events_per_kmer = float(n_kmers) / n_events
    p_stay = 1 - (1 / (events_per_kmer + 1))

    # transitions
    epsilon = 1e-10
    lp_skip = np.log(epsilon)
    lp_stay = np.log(p_stay)
    lp_step = np.log(1.0 - np.exp(lp_skip) - np.exp(lp_stay))
    lp_trim = np.log(0.01)

    n_events = len(events)
    n_kmers = len(nucleotide_seq) - k + 1

    n_rows = n_events + 1
    n_cols = n_kmers + 1
    n_bands = n_rows + n_cols

    # NOTE(review): np.zeros(...) * -inf produces NaN, not -inf; downstream
    # "== -np.infty" out-of-band checks never fire on these cells — confirm
    # against the nanopolish original, which initializes the bands to -inf
    bands = pd.DataFrame(np.zeros([bandwidth, n_bands])) * -np.infty
    trace = pd.DataFrame(np.zeros([bandwidth, n_bands]))

    # Keep track of the event/kmer index for the lower left corner of the band
    # these indices are updated at every iteration to perform the adaptive banding
    # Only the first two bands have their coordinates initialized, the rest are computed adaptively
    EventKmerPair = namedtuple('EventKmerPair', ['event_idx', 'kmer_idx'])
    # NOTE(review): this builds a list of namedtuple *classes* (not instances)
    # and then mutates class attributes below — it works as a mutable pair
    # holder in CPython but is fragile; flagged for confirmation
    band_lower_left = [namedtuple('EventKmerPair', ['event_idx', 'kmer_idx']) for _ in range(n_bands)]

    # initialize range of first two bands
    band_lower_left[0].event_idx = half_bandwidth - 1
    band_lower_left[0].kmer_idx = -1 - half_bandwidth
    band_lower_left[1] = move_down(band_lower_left[0])

    # band 0: score zero in the central cell
    start_cell_offset = band_kmer_to_offset(0, -1)
    assert(is_offset_valid(start_cell_offset)), "Offset is outside the bounds [0, {}]: {}".format(bandwidth, start_cell_offset)
    assert(band_event_to_offset(0, -1) == start_cell_offset), "Event offset is not correct:" \
                                                              " {} != {}".format(band_event_to_offset(0, -1),
                                                                                 start_cell_offset)
    bands[0][start_cell_offset] = 0.0

    # band 1: first event is trimmed
    first_trim_offset = band_event_to_offset(1, 0)
    negative_one = kmer_at_offset(1, first_trim_offset)
    assert(negative_one == -1), "Kmer offset is not correct: {} != {}".format(negative_one, -1)
    # NOTE(review): this assert re-checks start_cell_offset (presumably meant
    # first_trim_offset) and its message references an undefined name `offset`,
    # which would raise NameError if the assert ever fails — verify intent
    assert(is_offset_valid(start_cell_offset)), "Offset is outside the bounds [0, {}]: {}".format(bandwidth, offset)
    bands[1][first_trim_offset] = lp_trim
    trace[1][first_trim_offset] = FROM_U

    fills = 0
    # fill in remaining bands
    for band_idx in range(2, n_bands):
        # NOTE(review): leftover debug print in the hot loop
        print(band_idx)
        # Determine placement of this band according to Suzuki's adaptive algorithm
        # When both ll and ur are out-of-band (ob) we alternate movements
        # otherwise we decide based on scores

        ll = bands[band_idx - 1][0]
        ur = bands[band_idx - 1][bandwidth - 1]
        ll_ob = ll == -np.infty
        ur_ob = ur == -np.infty

        if ll_ob and ur_ob:
            right = band_idx % 2 == 1
        else:
            right = ll < ur  # Suzuki's rule

        if right:
            band_lower_left[band_idx] = move_right(band_lower_left[band_idx - 1])
        else:
            band_lower_left[band_idx] = move_down(band_lower_left[band_idx - 1])

        # If the trim state is within the band, fill it in here
        trim_offset = band_kmer_to_offset(band_idx, -1)
        if is_offset_valid(trim_offset):
            event_idx = event_at_offset(band_idx, trim_offset)
            if 0 <= event_idx < n_events:
                bands[band_idx][trim_offset] = lp_trim * (event_idx + 1)
                trace[band_idx][trim_offset] = FROM_U
            else:
                bands[band_idx][trim_offset] = -np.infty
        else:
            # NOTE(review): bare string literal — a no-op, probably a
            # forgotten print/log statement
            "This happened!"

        # Get the offsets for the first and last event and kmer
        # We restrict the inner loop to only these values
        kmer_min_offset = band_kmer_to_offset(band_idx, 0)
        kmer_max_offset = band_kmer_to_offset(band_idx, n_kmers)
        event_min_offset = band_event_to_offset(band_idx, n_events - 1)
        event_max_offset = band_event_to_offset(band_idx, -1)

        min_offset = max(kmer_min_offset, event_min_offset)
        min_offset = max(min_offset, 0)

        max_offset = min(kmer_max_offset, event_max_offset)
        max_offset = min(max_offset, bandwidth)

        for offset in range(min_offset, max_offset):
            event_idx = event_at_offset(band_idx, offset)
            kmer_idx = kmer_at_offset(band_idx, offset)

            # kmer_rank = kmer_ranks[kmer_idx]

            # neighbor cells in the banded matrix: up = stay, left = skip,
            # diag = step (two bands back)
            offset_up   = band_event_to_offset(band_idx - 1, event_idx - 1)
            offset_left = band_kmer_to_offset(band_idx - 1, kmer_idx - 1)
            offset_diag = band_kmer_to_offset(band_idx - 2, kmer_idx - 1)

            if debug:
                # verify loop conditions
                assert(0 <= kmer_idx < n_kmers)
                assert(0 <= event_idx < n_events)
                assert(offset_diag == band_event_to_offset(band_idx - 2, event_idx - 1))
                assert(offset_up - offset_left == 1)
                assert(0 <= offset < bandwidth)

            if is_offset_valid(offset_up):
                up = bands[band_idx - 1][offset_up]
            else:
                up = -np.infty
            if is_offset_valid(offset_left):
                left = bands[band_idx - 1][offset_left]
            else:
                left = -np.infty
            if is_offset_valid(offset_diag):
                diag = bands[band_idx - 2][offset_diag]
            else:
                diag = -np.infty

            event_mean = events['mean'][event_idx]
            kmer = nucleotide_seq[kmer_idx:kmer_idx+k]
            lp_emission = model.log_event_mean_gaussian_probability_match(event_mean, kmer)

            score_d = diag + lp_step + lp_emission
            score_u = up + lp_stay + lp_emission
            score_l = left + lp_skip

            max_score = score_d
            from_where = FROM_D

            if score_u > max_score:
                max_score = score_u

            if max_score == score_u:
                from_where = FROM_U

            if score_l > max_score:
                max_score = score_l

            if max_score == score_l:
                from_where = FROM_L

            if debug:
                # NOTE(review): C-style printf arguments passed to print();
                # these emit the format string and args as separate values
                print("[adafill] offset-up: %d offset-diag: %d offset-left: %d\n", offset_up, offset_diag, offset_left)
                print("[adafill] up: %.2lf diag: %.2lf left: %.2lf\n", up, diag, left)
                print("[adafill] bi: %d o: %d e: %d k: %d s: %.2lf f: %d emit: %.2lf\n", band_idx, offset, event_idx, kmer_idx, max_score, from_where, lp_emission)

            bands[band_idx][offset] = max_score
            trace[band_idx][offset] = from_where
            fills += 1


    # Backtrack to compute alignment
    sum_emission = 0
    n_aligned_events = 0

    max_score = -np.infty
    curr_event_idx = 0
    curr_kmer_idx = n_kmers - 1
    # Find best score between an event and the last k-mer. after trimming the remaining events
    for event_idx in range(n_events):
        band_idx1 = event_kmer_to_band(event_idx, curr_kmer_idx)
        # assert(band_idx < bands.size())
        offset = band_event_to_offset(band_idx1, event_idx)
        if is_offset_valid(offset):
            s = bands[band_idx1][offset] + (n_events - event_idx) * lp_trim
            if s > max_score:
                max_score = s
                curr_event_idx = event_idx

    if debug:
        print("[adaback] ei: %d ki: %d s: %.2f\n", curr_event_idx, curr_kmer_idx, max_score)

    out = []
    curr_gap = 0
    max_gap = 0
    moves = 0
    while curr_kmer_idx >= 0 and curr_event_idx >= 0:
        # emit alignment
        out.append((curr_kmer_idx, curr_event_idx))
        if debug:
            print("[adaback] ei: %d ki: %d\n", curr_event_idx, curr_kmer_idx)
        # qc stats
        event_mean = events['mean'][curr_event_idx]
        # print(event_idx, event_mean)
        kmer = nucleotide_seq[curr_kmer_idx:curr_kmer_idx+k]

        kmer_emission = model.log_event_mean_gaussian_probability_match(event_mean, kmer)
        sum_emission += kmer_emission
        # first time we visit this event: record its aligned kmer in place
        if moves == 0:
            events['model_state'][curr_event_idx] = kmer
            events['p_model_state'][curr_event_idx] = np.exp(kmer_emission)

        n_aligned_events += 1

        band_idx = event_kmer_to_band(curr_event_idx, curr_kmer_idx)
        offset = band_event_to_offset(band_idx, curr_event_idx)
        assert(band_kmer_to_offset(band_idx, curr_kmer_idx) == offset)

        from_where = trace[band_idx][offset]

        if from_where == FROM_D:
            # diagonal step: advance both event and kmer
            moves += 1
            events['move'][curr_event_idx] = moves
            moves = 0

            curr_kmer_idx -= 1
            curr_event_idx -= 1
            curr_gap = 0

        elif from_where == FROM_U:
            # stay: same kmer emitted by another event
            events['move'][curr_event_idx] = 0
            print("Moves when skip {}".format(moves))
            moves = 0
            curr_event_idx -= 1
            curr_gap = 0

        else:
            # skip: kmer with no event; accumulates into the next event's move
            moves += 1
            curr_kmer_idx -= 1
            curr_gap += 1
            max_gap = max(curr_gap, max_gap)
    events['move'][0] = 0
    # QC results
    out = out[::-1]
    avg_log_emission = sum_emission / n_aligned_events
    # spanned: alignment covers both the first and last kmer of the sequence
    spanned = out[0][0] == 0 and out[-1][0] == n_kmers - 1
    if avg_log_emission < min_average_log_emission or not spanned or max_gap > max_gap_threshold:
        print(spanned, avg_log_emission)
        print("We failed... f**k")
    # //fprintf(stderr, "ada\t%s\t%s\t%.2lf\t%zu\t%.2lf\t%d\t%d\t%d\n", read.read_name.substr(0, 6).c_str(), failed ? "FAILED" : "OK", events_per_kmer, sequence.size(), avg_log_emission, curr_event_idx, max_gap, fills);

    return events[out[0][1]:out[-1][1]+1], sum_emission
예제 #23
0
 def test_event_table(data, req_fields=__default_event_table_fields__):
     """Wrapper function to test if event tables have required fields
     :param data: numpy array
     :param req_fields: required fields for event table """
     return check_numpy_table(data, req_fields)
예제 #24
0
def simple_banded_event_align(events, model, nucleotide_seq):
    """Generate a banded alignment between events and a nucleotide sequence

    Mutates 'model_state', 'p_model_state' and 'move' of the aligned events
    in place, then returns the aligned slice of the event table.

    :param events: event table with required fields: ('start', 'length', 'mean', 'stdv', 'model_state', 'move', 'p_model_state')
    :param model: HmmModel model
    :param nucleotide_seq: nucleotide sequence to match up
    :return: (aligned slice of events, sum of log emission probabilities)
    """
    check_numpy_table(events, req_fields=('start', 'length', 'mean', 'stdv', 'model_state', 'move', 'p_model_state'))
    assert isinstance(model, HmmModel), "Input model needs to be HmmModel"
    k = model.kmer_length

    # backtrack markers: diagonal step, up (stay), left (skip)
    FROM_D = 0
    FROM_U = 1
    FROM_L = 2

    # # // qc
    min_average_log_emission = -5.0

    n_kmers = len(nucleotide_seq) - k + 1
    # // banding
    bandwidth = min(1000, n_kmers+1)
    # NOTE(review): true division yields a float here; it is only used inside
    # int(...) below, so the result is the same as integer division
    half_band = bandwidth / 2

    # // transitions
    lp_skip = np.log(0.001)
    lp_stay = np.log(0.5)
    lp_step = np.log(1.0 - np.exp(lp_skip) - np.exp(lp_stay))
    lp_trim = np.log(0.1)

    n_events = len(events)
    events_per_kmer = float(n_kmers) / n_events
    min_event_idx_by_kmer = []
    # // Calculate the minimum event index that is within the band for each read kmer
    # // We determine this using the expected number of events observed per kmer
    for ki in range(n_kmers):
        expected_event_idx = ki * events_per_kmer
        min_event_idx_by_kmer.append(max(int(expected_event_idx - half_band), 0))

    n_rows = bandwidth
    n_cols = n_kmers + 1
    # NOTE(review): leftover debug prints
    print(min_event_idx_by_kmer)
    print("n_rows", n_rows)
    print("n_cols", n_cols)
    print("seq_len", len(nucleotide_seq))

    viterbi_matrix = np.zeros((n_rows, n_cols), dtype=np.float64)
    backtrack_matrix = np.zeros((n_rows, n_cols), dtype=np.int32)

    # column 0 of every row is trim; row 0 of every column is unreachable
    for i in range(n_cols):
        viterbi_matrix[0][i] = -np.infty
        # backtrack_matrix[0][i] = 0

    for i in range(bandwidth):
        viterbi_matrix[i][0] = i * lp_trim
        # backtrack_matrix[i][0] = 0

    fills = 0
    for col in range(1, n_cols):
        # print("New_col {}/{}".format(col, n_cols))
        kmer_idx = col - 1
        min_event_idx = min_event_idx_by_kmer[kmer_idx]
        if kmer_idx > 0:
            min_event_idx_prev_col = min_event_idx_by_kmer[kmer_idx - 1]
        else:
            min_event_idx_prev_col = 0
        # kmer_rank = alphabet->kmer_rank(sequence.substr(kmer_idx, k).c_str(), k);

        for row in range(n_rows):
            event_idx = min_event_idx + row
            if event_idx >= n_events:
                viterbi_matrix[row][col] = -np.infty
                continue
            # // dp update
            # // here we are calculating whether the event for each neighboring cell is within the band
            # // and calculating its position within the column
            row_up = event_idx - min_event_idx - 1
            row_diag = event_idx - min_event_idx_prev_col - 1
            row_left = event_idx - min_event_idx_prev_col

            # try:
            # up = stay (same column), diag = step, left = skip (previous column)
            if 0 <= row_up < n_rows:
                up = viterbi_matrix[row_up][col]
            else:
                up = -np.infty
            if 0 <= row_diag < n_rows:
                diag = viterbi_matrix[row_diag][col-1]
            else:
                diag = -np.infty
            if 0 <= row_left < n_rows:
                left = viterbi_matrix[row_left][col-1]
            else:
                left = -np.infty
            # except IndexError:
            #     print(row_up, row_diag, row_left, col)
            #     SystemExit
            # lp_emission = log_probability_match_r9(read, pore_model, kmer_rank, event_idx, strand_idx);
            event_mean = events['mean'][event_idx]
            # print(event_idx, event_mean)
            kmer = nucleotide_seq[kmer_idx:kmer_idx+k]
            lp_emission = model.log_event_mean_gaussian_probability_match(event_mean, kmer)

            score_d = diag + lp_step + lp_emission
            score_u = up + lp_stay + lp_emission
            score_l = left + lp_skip

            max_score = score_d
            from_where = FROM_D

            if score_u > max_score:
                max_score = score_u

            if max_score == score_u:
                from_where = FROM_U

            if score_l > max_score:
                max_score = score_l

            if max_score == score_l:
                from_where = FROM_L

            # //fprintf(stderr, "[orgfill] up: %.2lf diag: %.2lf left: %.2lf\n", up, diag, left)
            # fprintf(stderr, "[orgfill] e: %d k: %d s: %.2lf f: %d emit: %.2lf\n", event_idx, kmer_idx, max_score, from, lp_emission)
            viterbi_matrix[row][col] = max_score
            backtrack_matrix[row][col] = from_where
            fills += 1


    # // Initialize by finding best alignment between an event and the last kmer
    curr_k_idx = n_kmers - 1
    curr_event_idx = 0
    max_score = -np.infty
    for row in range(n_rows):
        col = curr_k_idx + 1
        ei = row + min_event_idx_by_kmer[curr_k_idx]
        # trailing events are trimmed at lp_trim each
        s = viterbi_matrix[row][col] + (n_events - ei - 1) * lp_trim
        if s > max_score and ei < n_events:
            max_score = s
            curr_event_idx = ei

    sum_emission = 0
    n_aligned_events = 0
    moves = 0
    out = []
    while curr_k_idx >= 0:
        # // emit alignment
        out.append((curr_k_idx, curr_event_idx))
        event_mean = events['mean'][curr_event_idx]
        # print(event_idx, event_mean)
        kmer = nucleotide_seq[curr_k_idx:curr_k_idx+k]

        kmer_emission = model.log_event_mean_gaussian_probability_match(event_mean, kmer)
        sum_emission += kmer_emission
        # first time we visit this event: record its aligned kmer in place
        if moves == 0:
            events['model_state'][curr_event_idx] = kmer
            events['p_model_state'][curr_event_idx] = np.exp(kmer_emission)

        # kmer_rank = alphabet->kmer_rank(sequence.substr(curr_k_idx, k).c_str(), k);
        # sum_emission += log_probability_match_r9(read, pore_model, kmer_rank, curr_event_idx, strand_idx);
        n_aligned_events += 1
        # // update indices using backtrack pointers
        row = curr_event_idx - min_event_idx_by_kmer[curr_k_idx]
        col = curr_k_idx + 1

        from_where = backtrack_matrix[row][col]
        if from_where == FROM_D:
            # diagonal step: advance both event and kmer
            moves += 1
            events['move'][curr_event_idx] = moves

            curr_k_idx -= 1
            curr_event_idx -= 1
            moves = 0
        elif from_where == FROM_U:
            # stay: same kmer emitted by another event
            events['move'][curr_event_idx] = 0
            print("Moves when skip {}".format(moves))
            curr_event_idx -= 1
            moves = 0
        else:
            # skip: kmer with no event; accumulates into the next event's move
            moves += 1
            curr_k_idx -= 1

    events['move'][0] = 0
    out = out[::-1]
    avg_log_emission = sum_emission / n_aligned_events
    # spanned: alignment covers both the first and last kmer of the sequence
    spanned = out[0][0] == 0 and out[-1][0] == n_kmers - 1
    if avg_log_emission < min_average_log_emission or not spanned:
        print(spanned, avg_log_emission)
        print("We failed... f**k")
    return events[out[0][1]:out[-1][1]+1], sum_emission