Exemplo n.º 1
0
 def test_004_med_mad(self):
     """Check med_mad on a 2D array when no axis argument is given."""
     rows = [[0.5, 0.5, 0.5, 0.5],
             [0.5, 0.5, 1.0, 1.0],
             [0.0, 0.5, 0.5, 1.0]]
     median, mad = maths.med_mad(np.array(rows), factor=1)
     # Location is expected to be 0.5 and scale 0; checked in that order.
     for observed, expected in ((median, 0.5), (mad, 0)):
         self.assertTrue(np.allclose(observed, expected))
Exemplo n.º 2
0
def sample_filter_parameters(read_data,
                             number_to_sample,
                             chunk_len,
                             filter_mean_dwell,
                             filter_max_dwell,
                             chunk_len_means_sequence_len=False):
    """Estimate the median and MAD of mean dwell from a sample of chunks.

    Chunks are first drawn with sample_chunks() using filter parameters
    whose median and MAD are left as None. The mean dwell of every sampled
    chunk is then summarised with med_mad(). Note the MAD has an adjustment
    factor so that it would give the same result as the std for a normal
    distribution.

    See docstring for sample_chunks() for the parameters.

    Returns:
        FILTER_PARAMETERS holding the supplied dwell limits together with
        the calculated median and MAD of mean dwell.
    """
    # The two dwell limits are shared by the sampling-time and the returned
    # parameter sets.
    dwell_limits = dict(filter_mean_dwell=filter_mean_dwell,
                        filter_max_dwell=filter_max_dwell)
    unfiltered = FILTER_PARAMETERS(median_meandwell=None,
                                   mad_meandwell=None,
                                   **dwell_limits)
    chunks, _ = sample_chunks(
        read_data, number_to_sample, chunk_len, unfiltered,
        chunk_len_means_sequence_len=chunk_len_means_sequence_len)
    centre, spread = med_mad(list(map(get_mean_dwell, chunks)))
    return FILTER_PARAMETERS(median_meandwell=centre,
                             mad_meandwell=spread,
                             **dwell_limits)
Exemplo n.º 3
0
def sample_filter_parameters(read_data, number_to_sample, chunk_len,
                             filter_mean_dwell, filter_max_dwell,
                             filter_min_pass_fraction,
                             model_stride, path_buffer,
                             chunk_len_means_sequence_len=False):
    """ Estimate the median and MAD of mean dwell from a sample of chunks.

    Samples number_to_sample reads from read_data. Note the MAD has an
    adjustment factor so that it would give the same result as the std for a
    normal distribution.

    The parameters used while sampling leave the median, MAD, model_stride
    and path_buffer as None; the returned parameters carry the calculated
    median and MAD plus the caller's model_stride and path_buffer.

    See FILTER_PARAMETERS docstring for details.
    """
    # Settings common to the sampling-time and the returned parameter sets.
    shared = dict(filter_mean_dwell=filter_mean_dwell,
                  filter_max_dwell=filter_max_dwell,
                  filter_min_pass_fraction=filter_min_pass_fraction)
    unfiltered = FILTER_PARAMETERS(
        median_meandwell=None, mad_meandwell=None,
        model_stride=None, path_buffer=None, **shared)
    chunks, _ = sample_chunks(
        read_data, number_to_sample, chunk_len, unfiltered,
        chunk_len_means_sequence_len=chunk_len_means_sequence_len)
    dwell_means = []
    for chunk in chunks:
        dwell_means.append(chunk.mean_dwell)
    centre, spread = med_mad(dwell_means)
    return FILTER_PARAMETERS(
        median_meandwell=centre, mad_meandwell=spread,
        model_stride=model_stride, path_buffer=path_buffer, **shared)
Exemplo n.º 4
0
def one_read_shift_scale(read_tuple):
    """Read one signal from a fast5 file and compute its shift and scale.

    Args:
        read_tuple (tuple): A (filename, read_id) pair identifying the read.

    Returns:
        tuple: (read_id, shift, scale), where shift and scale are the two
            values returned by med_mad() for the read's current.

        If anything goes wrong while opening the file or building the
        signal, a message is written to stderr and (None, None, None) is
        returned.

        If the signal has zero length, shift and scale are both `np.nan`.
    """
    read_filename, read_id = read_tuple

    try:
        with fast5_interface.get_fast5_file(read_filename, 'r') as f5file:
            read = f5file.get_read(read_id)
            sig = Signal(read)

    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # skip this read rather than abort.
        sys.stderr.write(
            'Unable to obtain signal for {} from {}.\n{}\n'.format(
                read_id, read_filename, repr(e)))
        return (None, None, None)

    else:
        signal = sig.current

        if len(signal) > 0:
            shift, scale = med_mad(signal)
        else:
            # np.nan rather than np.NaN: the capitalised alias was removed
            # in NumPy 2.0 and would raise AttributeError here.
            shift, scale = np.nan, np.nan
            # Note - if signal trimmed by ub, it could be of length zero by
            # this point for short reads. These are taken out later in the
            # existing code, in the new code we'll take out ub trimming.

        return (read_id, shift, scale)
Exemplo n.º 5
0
 def test_004_med_mad(self):
     """Check med_mad when axis is left unset (so the input is flattened)."""
     data = np.array([
         [0.5, 0.5, 0.5, 0.5],
         [0.5, 0.5, 1.0, 1.0],
         [0.0, 0.5, 0.5, 1.0],
     ])
     centre, spread = maths.med_mad(data, factor=1)
     centre_ok = np.allclose(centre, 0.5)
     self.assertTrue(centre_ok)
     spread_ok = np.allclose(spread, 0)
     self.assertTrue(spread_ok)
Exemplo n.º 6
0
 def test_006_med_mad_over_axis1(self):
     """Check med_mad when called with axis=1."""
     data = np.array([
         [0.5, 0.5, 0.5, 0.5],
         [0.5, 0.5, 1.0, 1.0],
         [0.0, 0.5, 0.5, 1.0],
     ])
     centre, spread = maths.med_mad(data, factor=1, axis=1)
     # One expected value per row of the input.
     wanted_centre = [0.5, 0.75, 0.5]
     wanted_spread = [0, 0.25, 0.25]
     self.assertTrue(np.allclose(centre, wanted_centre))
     self.assertTrue(np.allclose(spread, wanted_spread))
Exemplo n.º 7
0
 def test_005_med_mad_over_axis0(self):
     """Check med_mad when called with axis=0."""
     data = np.array([
         [0.5, 0.5, 0.5, 0.5],
         [0.5, 0.5, 1.0, 1.0],
         [0.5, 1.0, 0.5, 1.0],
     ])
     centre, spread = maths.med_mad(data, factor=1, axis=0)
     # One expected value per column of the input.
     wanted_centre = [0.5, 0.5, 0.5, 1.0]
     wanted_spread = [0, 0, 0, 0]
     self.assertTrue(np.allclose(centre, wanted_centre))
     self.assertTrue(np.allclose(spread, wanted_spread))
Exemplo n.º 8
0
def sample_filter_parameters(read_data, number_to_sample, chunk_len, args,
                             log=None, chunk_log=None,
                             chunk_len_means_sequence_len=False):
    """Estimate the median and MAD of mean dwell from a sample of chunks.

    Samples number_to_sample reads from read_data via sample_chunks(),
    applying get_mean_dwell to the chunks and logging accepted chunks, then
    summarises the resulting mean dwells with med_mad(). Note the MAD has an
    adjustment factor so that it would give the same result as the std for a
    normal distribution.

    See docstring for sample_chunks() for the parameters.

    Returns:
        Whatever med_mad() returns for the sampled mean dwells.
    """
    sampling_options = dict(
        log=log,
        chunk_log=chunk_log,
        log_accepted_chunks=True,
        chunk_len_means_sequence_len=chunk_len_means_sequence_len,
    )
    dwell_means, _ = sample_chunks(
        read_data, number_to_sample, chunk_len, args, get_mean_dwell,
        **sampling_options)
    return med_mad(dwell_means)
Exemplo n.º 9
0
def med_mad_norm(x, dtype='f4'):
    """ Centre and scale a numpy array by its median and MAD

    Args:
        x (:class:`ndarray`): 1D array containing values to be normalised.
        dtype (str or :class:`dtype`): dtype of returned array.

    Returns:
        :class:`ndarray`:  Array of same shape as `x` and dtype `dtype`
            containing the normalised values.
    """
    centre, spread = med_mad(x)
    # Subtract the median, divide by the MAD, then cast to the requested dtype.
    return ((x - centre) / spread).astype(dtype)
Exemplo n.º 10
0
def one_read_shift_scale(read_tuple):
    """  Read signal from fast5 and perform medmad scaling

    Args:
        read_tuple (tuple of str and str): A filename and the read_id to read
            from it.

    Returns:
        tuple of str and float and float: read_id of the read and the
            calculated shift and scale parameters.

        If the signal is unable to be read from the file, the read_id is not
        present for example, then (None, None, None) is returned and a
        message is written to stderr.

        When a signal is read, but has zero length, the shift and scale
        returned are `np.nan`
    """
    read_filename, read_id = read_tuple

    try:
        with fast5_interface.get_fast5_file(read_filename, 'r') as f5file:
            read = f5file.get_read(read_id)
            sig = Signal(read)

    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # skip this read rather than abort.
        sys.stderr.write(
            'Unable to obtain signal for {} from {}.\n{}\n'.format(
                read_id, read_filename, repr(e)))
        return (None, None, None)

    else:
        signal = sig.current

        if len(signal) > 0:
            shift, scale = med_mad(signal)
        else:
            # np.nan rather than np.NaN: the capitalised alias was removed
            # in NumPy 2.0 and would raise AttributeError here.
            shift, scale = np.nan, np.nan
            # Note - if signal trimmed by ub, it could be of length zero by
            # this point for short reads
            # These are taken out later in the existing code, in the new code
            # we'll take out ub trimming

        return (read_id, shift, scale)
Exemplo n.º 11
0
def med_mad_norm(x, dtype='f4'):
    """ Centre and scale a numpy array by its median and MAD

    The values returned by med_mad(x) are used as the offset and the divisor;
    the result is cast to `dtype` before being returned.
    """
    centre, spread = med_mad(x)
    return ((x - centre) / spread).astype(dtype)
Exemplo n.º 12
0
def main():
    """Write a text summary of a mapped signal file.

    Command line arguments come from get_parser(). The report goes to
    args.output when that is set and to stdout otherwise; progress messages
    go to stderr.
    """
    args = get_parser().parse_args()
    if args.output is None:
        _write_summary(args, sys.stdout)
        return
    # The context manager closes the output file on every exit path,
    # including the early return taken when no chunk metrics are requested
    # and any exception raised part-way through the report.
    with open(args.output, 'w') as out_fp:
        _write_summary(args, out_fp)


def _write_summary(args, out_fp):
    """Compute each metrics section in turn and write it to out_fp."""
    sys.stderr.write('* Reading data from file\n')
    out_fp.write('*' * 10 + ' General Metrics ' + '*' * 10 + '\n')
    with MappedSignalReader(args.input) as msr:
        alphabet_info = msr.get_alphabet_information()
        out_fp.write('Alphabet: {}\n'.format(str(alphabet_info)))
        read_ids = msr.get_read_ids()
        out_fp.write('Total reads: {}\n'.format(len(read_ids)))
        if args.num_reads is not None:
            # Random subset: shuffle in place, then keep the first num_reads.
            np.random.shuffle(read_ids)
            read_ids = read_ids[:args.num_reads]
        reads = list(msr.reads(read_ids))

    sys.stderr.write('* Computing sanity check metrics\n')
    out_fp.write('\n\n' + '*' * 10 + ' Sanity Checks ' + '*' * 10 + '\n')
    current_meds = np.array([np.median(read.get_current()) for read in reads])
    out_fp.write((
        'Median of medians of normalized signal: {:.6f} (should usually ' +
        'be close to 0)\n').format(np.median(current_meds)))
    current_mads = np.array([
        np.median(np.abs(read.get_current() - r_med)) * MAD_SD_FACTOR
        for read, r_med in zip(reads, current_meds)])
    out_fp.write((
        'Median of standardized MADs of normalized signal: {:.6f} ' +
        '(should usually be close to 1.0)\n').format(np.median(current_mads)))
    all_seqs = np.concatenate([read.Reference for read in reads])
    seq_counts = np.bincount(all_seqs)
    total_bases = np.sum(seq_counts)
    out_fp.write(
        'Global sequence composition:\n{: >11}{: >11}{: >11}\n'.format(
            'base', 'count', 'percent') + '\n'.join(
                '{: >11}{:11.0f}{:11.4f}'.format(*base_metrics)
                for base_metrics in zip(
                    alphabet_info.alphabet, seq_counts,
                    100 * seq_counts / total_bases)) + '\n')

    if args.motif is not None:
        sys.stderr.write('* Computing motif metrics (this may take a ' +
                         'while to complete)\n')
        out_fp.write('\n\n' + '*' * 10 + ' Motif Metrics ' + '*' * 10 + '\n')

        def match_motif(seq, motif):
            # True when every base of seq is allowed at its motif position.
            return all(base in motif_pos_bases
                       for base, motif_pos_bases in zip(seq, motif))

        # For each requested motif: one array of allowed alphabet indices
        # per motif position, plus the integer relative position.
        motifs = [
            ([np.concatenate([np.where([m_base == a_base for a_base in
                                        alphabet_info.collapse_alphabet])[0]
                              for m_base in SINGLE_LETTER_CODE[m_raw_base]])
              for m_raw_base in motif], int(rel_pos))
            for motif, rel_pos in args.motif]
        motif_mod_bases = [[] for _ in motifs]
        for read in reads:
            ref = read.Reference
            for motif_i, (motif, rel_pos) in enumerate(motifs):
                # "+ 1" so that the final full-length window of the
                # reference is tested as well.
                for offset in range(ref.shape[0] - len(motif) + 1):
                    if match_motif(ref[offset:offset + len(motif)], motif):
                        motif_mod_bases[motif_i].append(ref[offset + rel_pos])

        for (raw_motif, rel_pos), m_mod_bases in zip(
                args.motif, motif_mod_bases):
            rel_pos = int(rel_pos)
            # Distinct name: do not rebind the list being iterated over.
            mod_counts = np.bincount(m_mod_bases)
            total_mod_bases = np.sum(mod_counts)
            out_fp.write(
                ('{} Motif Modified Base Counts:\n{: >11}{: >11}' +
                 '{: >11}\n').format(raw_motif, 'base', 'count', 'percent') +
                '\n'.join('{: >11}{:11.0f}{:11.4f}'.format(
                    raw_motif[:rel_pos] + base + raw_motif[rel_pos + 1:],
                    count, pct) for base, count, pct in zip(
                        alphabet_info.alphabet, mod_counts,
                        100 * mod_counts / total_mod_bases)
                    if count > 0) + '\n\n')

    sys.stderr.write('* Computing read metrics\n')
    out_fp.write('\n' + '*' * 10 + ' Read Metrics ' + '*' * 10 + '\n')
    read_lens = np.array([read.reflen for read in reads])
    out_fp.write(('Median read length: {}\n').format(np.median(read_lens)))
    sig_lens = np.array([read.siglen for read in reads])
    out_fp.write(('Median signal length: {}\n').format(
        np.median(sig_lens)))

    if args.num_chunks is None:
        return
    sys.stderr.write('* Computing chunk metrics\n')
    out_fp.write('\n\n' + '*' * 10 + ' Chunk Metrics ' + '*' * 10 + '\n')
    chunks, rej_res = [], []
    # Keep drawing chunks from random reads until enough have been accepted.
    while len(chunks) < args.num_chunks:
        chunk = np.random.choice(reads, 1)[0].get_chunk_with_sample_length(
            args.chunk_len)
        if chunk.accepted:
            chunks.append(chunk)
        else:
            rej_res.append(chunk.reject_reason)
    if len(rej_res) > 0:
        out_fp.write(
            'Chunk rejection reasons:\n{: >16}{: >16}\n'.format(
                'Reject Reason', 'Num. Chunks') +
            '\n'.join('{: >16}{: >16}'.format(*x)
                      for x in Counter(rej_res).most_common()) + '\n\n')
    else:
        out_fp.write('All chunks passed filters\n\n')
    mean_dwells = np.array([chunk.mean_dwell for chunk in chunks])
    max_dwells = np.array([chunk.max_dwell for chunk in chunks])
    # report in MAD units for direct use in command line parameter usage
    median_meandwell, mad_meandwell = med_mad(mean_dwells)
    # Parenthesised so the median is subtracted before dividing by the MAD;
    # without the brackets only the median would be divided.
    mean_std_units = (mean_dwells - median_meandwell) / mad_meandwell
    out_fp.write(
        ('Chunk dwell distribution (standard units for direct use with ' +
         '--filter_max_dwell and --filter_mean_dwell):\n' +
         '{: >15}{: >15}{: >15}{: >15}{: >15}\n').format(
             'percentile', 'mean_dwell', 'mean_std_units',
             'max_dwell', 'max_std_units') +
        '\n'.join('{:15.2f}{:15.2f}{:15.2f}{:15.2f}{:15.2f}'.format(
            *pctl_metrics) for pctl_metrics in zip(
                args.chunk_percentiles,
                np.percentile(mean_dwells, args.chunk_percentiles),
                np.percentile(mean_std_units, args.chunk_percentiles),
                np.percentile(max_dwells, args.chunk_percentiles),
                np.percentile(max_dwells / median_meandwell,
                              args.chunk_percentiles))) + '\n')