Example #1
    def test_mapping_reftosignal(self):
        """ Test the conversion from remapped path to reftosignal output

        Returns:
            None
        """
        sig = signal.Signal(dacs=np.zeros(12))
        # testing path with a single skip (over 3rd base; first "T")
        path = np.array([-1, 0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 5, 6],
                        dtype=np.int32)
        reference = 'ACTACGT'

        int_ref = signal_mapping.SignalMapping.get_integer_reference(
            reference, 'ACGT')
        sigtoref_res = signal_mapping.SignalMapping.from_remapping_path(
            path, int_ref, 1, sig).Ref_to_signal
        self.assertEqual(sigtoref_res.tolist(), [0, 2, 5, 5, 8, 10, 11, 12])

        # now test with clipped bases
        sig = signal.Signal(dacs=np.zeros(15))
        # testing path with a single skip (over 4th base; first "T")
        path = np.array([-1, -1, 1, 1, 2, 2, 2, 4, 4, 4, 5, 5, 6, 7, -1, -1],
                        dtype=np.int32)
        reference = 'AACTACGTTT'

        int_ref = signal_mapping.SignalMapping.get_integer_reference(
            reference, 'ACGT')
        sigtoref_res = signal_mapping.SignalMapping.from_remapping_path(
            path, int_ref, 1, sig).Ref_to_signal
        self.assertEqual(sigtoref_res.tolist(),
                         [-1, 1, 3, 6, 6, 9, 11, 12, 13, 16, 16])

        return
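The expected outputs follow the Ref_to_signal convention: entry i is the signal sample at which reference base i starts, and a skipped base inherits the start of the next mapped base. The standalone sketch below (an illustration, not taiyaki's implementation) reproduces the first, unclipped assertion using only numpy:

import numpy as np

# Sketch only: reproduce the first expected Ref_to_signal above without
# taiyaki. path[k] is the reference position reached at network block k;
# with stride 1 each block covers one signal sample.
path = np.array([-1, 0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 5, 6], dtype=np.int32)
stride, sig_len, ref_len = 1, 12, 7

ref_to_sig = []
for base in range(ref_len):
    # first block that has reached this base; a skipped base therefore
    # picks up the start of the next mapped base
    block = int(np.argmax(path >= base))
    ref_to_sig.append((block - 1) * stride)
ref_to_sig.append(sig_len)  # final entry: end of the last base

assert ref_to_sig == [0, 2, 5, 5, 8, 10, 11, 12]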
Example #2
    def test_mapping_reftosignal(self):
        """Test the conversion from remapped path to reftosignal output
        """
        sig = signal.Signal(dacs=np.zeros(12))
        # testing path with a single skip (over 3rd base; first "T")
        path = np.array([0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 5, 6],
                        dtype=np.int32)
        reference = 'ACTACGT'

        sigtoref_res = mapping.Mapping(
            sig, path, reference).get_reftosignal()
        self.assertEqual(sigtoref_res.tolist(),
                         [0, 2, 5, 5, 8, 10, 11, 12])

        # now test with clipped bases
        sig = signal.Signal(dacs=np.zeros(15))
        # testing path with a single skip (over 4th base; first "T")
        path = np.array([-1, 1, 1, 2, 2, 2, 4, 4, 4, 5, 5, 6, 7, -1, -1],
                        dtype=np.int32)
        reference = 'AACTACGTTT'

        sigtoref_res = mapping.Mapping(
            sig, path, reference).get_reftosignal()
        self.assertEqual(sigtoref_res.tolist(),
                         [-1, 1, 3, 6, 6, 9, 11, 12, 13, 16, 16])

        return
Example #3
def get_remapping(sig_fn, dacs, scale_params, ref_seq, stride, read_id,
                  r_to_q_poss, rl_cumsum, r_ref_pos, ref_out_info):
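    """Convert a basecall-anchored alignment into a taiyaki remapping.

    Reads channel and read attributes from the fast5 file, wraps the
    supplied DAC values in a taiyaki Signal, converts the
    reference-to-query alignment into a block-level remapping path and
    returns the taiyaki read dictionary together with a RemapResult status.
    """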
    read = fast5_interface.get_fast5_file(sig_fn, 'r').get_read(read_id)
    channel_info = dict(fast5utils.get_channel_info(read).items())
    rd_factor = channel_info['range'] / channel_info['digitisation']
    shift_from_pA = (scale_params[0] + channel_info['offset']) * rd_factor
    scale_from_pA = scale_params[1] * rd_factor
    read_attrs = dict(fast5utils.get_read_attributes(read).items())

    # prepare taiyaki signal object
    sig = tai_signal.Signal(dacs=dacs)
    sig.channel_info = channel_info
    sig.read_attributes = read_attrs
    sig.offset = channel_info['offset']
    sig.range = channel_info['range']
    sig.digitisation = channel_info['digitisation']

    path = np.full((dacs.shape[0] // stride) + 1, -1)
    # skip last value since this is where the two seqs end
    for ref_pos, q_pos in enumerate(r_to_q_poss[:-1]):
        # if the query position maps to the end of the mapping skip it
        if rl_cumsum[q_pos + r_ref_pos.q_trim_start] >= path.shape[0]:
            continue
        path[rl_cumsum[q_pos + r_ref_pos.q_trim_start]] = ref_pos
    remapping = tai_mapping.Mapping.from_remapping_path(
        sig, path, ref_seq, stride)
    try:
        remapping.add_integer_reference(ref_out_info.alphabet)
    except Exception:
        raise mh.MegaError('Invalid reference sequence encountered')

    return (remapping.get_read_dictionary(shift_from_pA, scale_from_pA,
                                          read_id),
            prepare_mapping_funcs.RemapResult.SUCCESS)
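The loop above writes, for every reference position that survives the alignment, its value into the network-output block at which that base starts; all other blocks stay at -1 (unmapped). A minimal sketch of the same idea with made-up values:

import numpy as np

# Illustration only: build a block-level remapping path of length
# n_blocks + 1, initialised to -1 (unmapped), then record the block at
# which each reference base starts (the block indices are invented here).
n_blocks = 10
path = np.full(n_blocks + 1, -1, dtype=np.int64)
ref_start_blocks = {0: 1, 1: 3, 2: 4, 3: 7}   # ref position -> start block
for ref_pos, block in ref_start_blocks.items():
    if block < path.shape[0]:                 # drop positions past the end
        path[block] = ref_pos
print(path)  # [-1  0 -1  1  2 -1 -1  3 -1 -1 -1]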
Example #4
def get_remapping(
    sig_fn,
    dacs,
    scale_params,
    ref_seq,
    stride,
    read_id,
    r_to_q_poss,
    rl_cumsum,
    r_ref_pos,
    ref_out_info,
):
    read = fast5_io.get_read(sig_fn, read_id)
    channel_info = dict(fast5utils.get_channel_info(read).items())
    read_params = {
        "trim_start": 0,
        "trim_end": 0,
        "shift": scale_params[0],
        "scale": scale_params[1],
    }
    sig = tai_signal.Signal(
        dacs=dacs,
        channel_info=channel_info,
        read_id=read_id,
        read_params=read_params,
    )

    ref_to_sig = np.empty(len(ref_seq) + 1, dtype=np.int32)
    # skip last value since this is where the two seqs end
    for ref_pos, q_pos in enumerate(r_to_q_poss):
        ref_to_sig[ref_pos] = rl_cumsum[q_pos + r_ref_pos.q_trim_start] * stride
    try:
        int_ref = tai_mapping.SignalMapping.get_integer_reference(
            ref_seq, ref_out_info.alphabet_info.alphabet
        )
    except Exception:
        raise mh.MegaError("Invalid reference sequence encountered")
    sig_mapping = tai_mapping.SignalMapping(ref_to_sig, int_ref, signalObj=sig)

    # annotate mod motifs
    if ref_out_info.ref_mods_all_motifs is not None:
        # annotate all mod base motif positions with alts
        int_ref = set_all_motif_mods(int_ref, ref_out_info.ref_mods_all_motifs)
        # set new Reference with mods annotated
        sig_mapping.Reference = int_ref

    return (
        sig_mapping.get_read_dictionary(),
        prepare_mapping_funcs.RemapResult.SUCCESS,
    )
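This variant skips the block-level path entirely: rl_cumsum (here assumed to be the cumulative number of network output blocks consumed up to each basecalled base) is indexed by the query position and multiplied by the model stride to land directly on a raw-signal sample. A hedged illustration of that conversion, with invented run lengths:

import numpy as np

# Illustration only: rl_cumsum is assumed to be the cumulative sum of
# blocks emitted per basecalled base; multiplying by the model stride
# converts a query (basecall) position into a raw-signal sample index.
stride = 5
run_lengths = np.array([3, 1, 2, 4], dtype=np.int32)   # invented values
rl_cumsum = np.concatenate([[0], np.cumsum(run_lengths)])

q_pos = 2                            # third basecalled base
sig_start = int(rl_cumsum[q_pos]) * stride
print(sig_start)                     # 20: base 2 starts 4 blocks in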
Example #5
def get_remapping(sig_fn, dacs, scale_params, ref_seq, stride, read_id,
                  r_to_q_poss, rl_cumsum, r_ref_pos, ref_out_info):
    read = fast5_interface.get_fast5_file(sig_fn, 'r').get_read(read_id)
    channel_info = dict(fast5utils.get_channel_info(read).items())
    rd_factor = channel_info['range'] / channel_info['digitisation']
    read_params = {
        'trim_start': 0,
        'trim_end': 0,
        'shift': (scale_params[0] + channel_info['offset']) * rd_factor,
        'scale': scale_params[1] * rd_factor
    }
    sig = tai_signal.Signal(dacs=dacs,
                            channel_info=channel_info,
                            read_id=read_id,
                            read_params=read_params)

    path = np.full((dacs.shape[0] // stride) + 1, -1)
    # skip last value since this is where the two seqs end
    for ref_pos, q_pos in enumerate(r_to_q_poss[:-1]):
        # if the query position maps to the end of the mapping skip it
        if rl_cumsum[q_pos + r_ref_pos.q_trim_start] >= path.shape[0]:
            continue
        path[rl_cumsum[q_pos + r_ref_pos.q_trim_start]] = ref_pos

    try:
        int_ref = tai_mapping.SignalMapping.get_integer_reference(
            ref_seq, ref_out_info.alphabet)
    except Exception:
        raise mh.MegaError('Invalid reference sequence encountered')
    sig_mapping = tai_mapping.SignalMapping.from_remapping_path(
        path, int_ref, stride, sig)

    # annotate mod motifs
    if ref_out_info.ref_mods_all_motifs is not None:
        # annotate all mod base motif positions with alts
        int_ref = set_all_motif_mods(int_ref, ref_out_info.ref_mods_all_motifs,
                                     ref_out_info.collapse_alphabet)
        # set new Reference with mods annotated
        sig_mapping.Reference = int_ref

    return (sig_mapping.get_read_dictionary(),
            prepare_mapping_funcs.RemapResult.SUCCESS)
Example #6
def oneread_remap(read_tuple, references, model, device, per_read_params_dict,
                  alphabet_info):
    """ Worker function for remapping reads using flip-flop model on raw signal
    :param read_tuple                 : read, identified by a tuple (filepath, read_id)
    :param references                 :dict mapping fast5 filenames to reference strings
    :param model                      :pytorch model (the torch data structure, not a filename)
    :param device                     :integer specifying which GPU to use for remapping, or 'cpu' to use CPU
    :param per_read_params_dict       :dictionary where keys are UUIDs, values are dicts containing keys
                                         trim_start trim_end shift scale
    :param alphabet_info              : AlphabetInfo object for basecalling

    :returns: tuple of dictionary as specified in mapped_signal_files.Read class
              and a message string indicating an error if one occured
    """
    filename, read_id = read_tuple
    try:
        with fast5_interface.get_fast5_file(filename, 'r') as f5file:
            read = f5file.get_read(read_id)
            sig = signal.Signal(read)
    except Exception:
        return None, READ_ID_INFO_NOT_FOUND_ERR_TEXT

    if read_id in references:
        read_ref = references[read_id]
    else:
        return None, NO_REF_FOUND_ERR_TEXT

    try:
        read_params_dict = per_read_params_dict[read_id]
    except KeyError:
        return None, NO_PARAMS_ERR_TEXT

    sig.set_trim_absolute(read_params_dict['trim_start'],
                          read_params_dict['trim_end'])

    try:
        # Prevents torch doing its own parallelisation on top of our imap_map
        torch.set_num_threads(1)
        # Standardise (i.e. shift/scale so that approximately mean = 0, std = 1)
        signalArray = (sig.current -
                       read_params_dict['shift']) / read_params_dict['scale']
        # Make signal into 3D tensor with shape [siglength, 1, 1] and move to
        # appropriate device (GPU number or CPU)
        signalTensor = torch.tensor(
            signalArray[:, np.newaxis, np.newaxis].astype(taiyaki_dtype),
            device=device)
        # The model must live on the same device
        modelOnDevice = model.to(device)
        # Apply the network to the signal, generating transition weight matrix, and put it back into a numpy array
        with torch.no_grad():
            transweights = modelOnDevice(signalTensor).cpu().numpy()
    except Exception:
        return None, REMAP_ERR_TEXT

    # Extra dimensions introduced by np.newaxis above removed by np.squeeze
    can_read_ref = alphabet_info.collapse_sequence(read_ref)
    remappingscore, path = flipflop_remap.flipflop_remap(
        np.squeeze(transweights),
        can_read_ref,
        alphabet=alphabet_info.can_bases,
        localpen=0.0)
    # read_ref comes out as a bytes object, so we need to convert to str
    # localpen=0.0 does local alignment

    # flipflop_remap() establishes a mapping between the network outputs and the reference.
    # What we need is a mapping between the signal and the reference.
    # To resolve this we need to know the stride of the model (how many samples for each network output)
    model_stride = helpers.guess_model_stride(model)
    remapping = mapping.Mapping.from_remapping_path(sig, path, read_ref,
                                                    model_stride)
    remapping.add_integer_reference(alphabet_info.alphabet)

    return remapping.get_read_dictionary(read_params_dict['shift'],
                                         read_params_dict['scale'],
                                         read_id), REMAP_SUCCESS_TEXT
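The signal preparation in the try-block above has two ingredients: the current is standardised with the per-read shift and scale so it is roughly zero-mean and unit-variance, and it is reshaped to the [siglength, 1, 1] layout the flip-flop network expects. A small sketch of just those two steps, with made-up values and no taiyaki dependencies:

import numpy as np
import torch

# Illustration only: per-read standardisation and the [siglength, 1, 1]
# tensor layout described in the comments above.
current = np.random.randn(1000).astype(np.float32)    # fake raw current
shift, scale = 2.0, 0.5                                # fake read params
standardized = (current - shift) / scale               # approx mean 0, std 1
sig_tensor = torch.tensor(standardized[:, np.newaxis, np.newaxis])
print(sig_tensor.shape)                                # torch.Size([1000, 1, 1])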
Example #7
def oneread_remap(read_tuple, references, model, device, per_read_params_dict,
                  alphabet, collapse_alphabet):
    """ Worker function for remapping reads using flip-flop model on raw signal
    :param read_tuple                 : read, identified by a tuple (filepath, read_id)
    :param references                 :dict mapping fast5 filenames to reference strings
    :param model                      :pytorch model (the torch data structure, not a filename)
    :param device                     :integer specifying which GPU to use for remapping, or 'cpu' to use CPU
    :param per_read_params_dict       :dictionary where keys are UUIDs, values are dicts containing keys
                                         trim_start trim_end shift scale
    :param alphabet                   : alphabet for basecalling (passed on to mapped-read file)
    :param collapse_alphabet          : collapsed alphabet for basecalling (passed on to mapped-read file)

    :returns: dictionary as specified in mapped_signal_files.Read class
    """
    filename, read_id = read_tuple
    try:
        with fast5_interface.get_fast5_file(filename, 'r') as f5file:
            read = f5file.get_read(read_id)
            sig = signal.Signal(read)
    except Exception as e:
        # We want any single failure in the batch of reads to not disrupt other reads being processed.
        sys.stderr.write(
            'No read information on read {} found in file {}.\n{}\n'.format(
                read_id, filename, repr(e)))
        return None

    if read_id in references:
        read_ref = references[read_id].decode("utf-8")
    else:
        sys.stderr.write('No fasta reference found for {}.\n'.format(read_id))
        return None

    if read_id in per_read_params_dict:
        read_params_dict = per_read_params_dict[read_id]
    else:
        return None

    sig.set_trim_absolute(read_params_dict['trim_start'],
                          read_params_dict['trim_end'])

    try:
        # Prevents torch doing its own parallelisation on top of our imap_map
        torch.set_num_threads(1)
        # Standardise (i.e. shift/scale so that approximately mean = 0, std = 1)
        signalArray = (sig.current -
                       read_params_dict['shift']) / read_params_dict['scale']
        # Make signal into 3D tensor with shape [siglength, 1, 1] and move to
        # appropriate device (GPU number or CPU)
        signalTensor = torch.tensor(
            signalArray[:, np.newaxis, np.newaxis].astype(taiyaki_dtype),
            device=device)
        # The model must live on the same device
        modelOnDevice = model.to(device)
        # Apply the network to the signal, generating transition weight matrix, and put it back into a numpy array
        with torch.no_grad():
            transweights = modelOnDevice(signalTensor).cpu().numpy()
    except Exception as e:
        sys.stderr.write(
            "Failure applying basecall network to remap read {}.\n{}\n".format(
                sig.read_id, repr(e)))
        return None

    # Extra dimensions introduced by np.newaxis above removed by np.squeeze
    remappingscore, path = flipflop_remap.flipflop_remap(
        np.squeeze(transweights), read_ref, localpen=0.0)
    # read_ref comes out as a bytes object, so we need to convert to str
    # localpen=0.0 does local alignment

    # flipflop_remap() establishes a mapping between the network outputs and the reference.
    # What we need is a mapping between the signal and the reference.
    # To resolve this we need to know the stride of the model (how many samples for each network output)
    model_stride = helpers.guess_model_stride(model, device=device)
    remapping = mapping.Mapping.from_remapping_path(sig, path, read_ref,
                                                    model_stride)

    return remapping.get_read_dictionary(read_params_dict['shift'],
                                         read_params_dict['scale'],
                                         read_id,
                                         alphabet=alphabet,
                                         collapse_alphabet=collapse_alphabet)
Example #8
def oneread_remap(read_tuple,
                  model,
                  per_read_params_dict,
                  alphabet_info,
                  max_read_length,
                  device='cpu',
                  localpen=0.0):
    """ Worker function for remapping reads using flip-flop model on raw signal

    Args:
        read_tuple (tuple) : read, identified by a tuple
                                  (filepath, read_id, read reference)
        model (pytorch Module): pytorch model
        device (int or float): integer specifying which GPU to use for
                                remapping, or 'cpu' to use CPU
        per_read_params_dict (dict) : dictionary where keys are UUIDs,
                                      values are dicts containing keys
                                      trim_start trim_end shift scale
        alphabet_info (AlphabetInfo object):  for basecalling
        max_read_length (int) : Don't attempt to remap reads with references
                                longer than this
        localpen (float): Penalty for local mapping

    Returns:
        tuple :(dict,str) containing
        1. dictionary as specified in
            signal_mapping.SignalMapping.get_read_dictionary
        2. message string indicating an error if one occured
    """
    filename, read_id, read_ref = read_tuple

    if read_ref is None:
        return None, RemapResult.NO_REF_FOUND

    if max_read_length is not None and len(read_ref) > max_read_length:
        return None, RemapResult.REF_TOO_LONG

    try:
        read_params_dict = per_read_params_dict[read_id]
    except KeyError:
        return None, RemapResult.NO_PARAMS

    try:
        with fast5_interface.get_fast5_file(filename, 'r') as f5file:
            read = f5file.get_read(read_id)
            sig = signal.Signal(read, read_params=read_params_dict)
    except Exception:
        return None, RemapResult.READ_ID_INFO_NOT_FOUND

    try:
        # Prevents torch doing its own parallelisation on top of our imap_map
        torch.set_num_threads(1)
        # Make signal into 3D tensor with shape [siglength, 1, 1] and move to
        # appropriate device (GPU number or CPU)
        signalTensor = torch.tensor(
            sig.standardized_current[:, np.newaxis,
                                     np.newaxis].astype(np.float32),
            device=device)
        # The model must live on the same device
        modelOnDevice = model.to(device)
        # Apply the network to the signal, generating transition weight matrix,
        # and put it back into a numpy array
        with torch.no_grad():
            transweights = modelOnDevice(signalTensor).cpu().numpy()
    except Exception:
        return None, RemapResult.NETWORK_ERROR

    # Extra dimensions introduced by np.newaxis above removed by np.squeeze
    can_read_ref = alphabet_info.collapse_sequence(read_ref)
    remappingscore, path = flipflop_remap.flipflop_remap(
        np.squeeze(transweights),
        can_read_ref,
        alphabet=alphabet_info.can_bases,
        localpen=localpen)
    # read_ref comes out as a bytes object, so we need to convert to str
    # localpen=0.0 does local alignment

    # flipflop_remap() establishes a mapping between the network outputs and
    # the reference.
    # What we need is a mapping between the signal and the reference.
    # To resolve this we need to know the stride of the model (how many samples
    # for each network output)
    model_stride = helpers.guess_model_stride(model)
    int_ref = signal_mapping.SignalMapping.get_integer_reference(
        read_ref, alphabet_info.alphabet)
    sig_mapping = signal_mapping.SignalMapping.from_remapping_path(
        path, int_ref, model_stride, sig)
    try:
        sig_mapping_dict = sig_mapping.get_read_dictionary()
    except signal_mapping.TaiyakiSigMapError as e:
        return None, str(e)
    return sig_mapping_dict, RemapResult.SUCCESS