예제 #1
0
def filter_fraction(pm: PeakMatrix,
                    fraction_threshold: float,
                    within_classes: bool = False,
                    class_tag_type: Any = None,
                    flag_name: str = 'fraction_flag'):
    """
    Flag the peaks of a PeakMatrix by their sample fraction.

    :param pm: the target peak matrix
    :param fraction_threshold: minimum sample fraction a peak must reach to stay flagged
    :param within_classes: compute the fraction separately for each class instead of over all samples.
        Default = False
    :param class_tag_type: tag type used to select the samples of each class (e.g. "classLabel"); required when
        within_classes is True. Default = None
    :param flag_name: name of the flag added to the matrix. Default = 'fraction_flag'
    :raises KeyError: if within_classes is True but no class_tag_type is given
    :raises AttributeError: if at least one peaklist has no tag of type class_tag_type
    :rtype: PeakMatrix object

    Without classes, a peak is kept when its overall fraction reaches the threshold. With classes, a peak is kept
    when its fraction reaches the threshold in at least one class. The same matrix object is returned with the
    new flag attached.

    """
    # simple case: one fraction array over all samples
    if not within_classes:
        pm.add_flag(flag_name, pm.fraction >= fraction_threshold)
        return pm

    if class_tag_type is None:
        raise KeyError(
            'must provide class tag type for within classes filtering')

    every_sample_typed = all(
        [tags.has_tag_type(class_tag_type) for tags in pm.peaklist_tags])
    if not every_sample_typed:
        raise AttributeError('not all tags have tag type [%s]' %
                             class_tag_type)

    # start with "no peak passes" and OR in the result of every class
    keep = np.zeros(pm.shape[1])
    for class_tag in pm.tags_of(class_tag_type):
        with unmask_peakmatrix(pm, class_tag) as class_pm:
            passed_in_class = class_pm.fraction >= fraction_threshold
            keep = np.logical_or(keep, passed_in_class)

    pm.add_flag(flag_name, keep)
    return pm
예제 #2
0
def filter_rsd(pm: PeakMatrix,
               rsd_threshold: Union[int, float],
               qc_tag: Any,
               on_attr: str = 'intensity',
               flag_name: str = 'rsd_flag'):
    """
    Flag the peaks of a PeakMatrix by the RSD of the QC samples.

    :param pm: the target peak matrix
    :param rsd_threshold: maximum accepted RSD of the QC samples
    :param qc_tag: tag (label) selecting the QC samples
    :param on_attr: attribute the RSD is calculated on. Default = "intensity"
    :param flag_name: name of the flag added to the matrix. Default = 'rsd_flag'
    :rtype: PeakMatrix object

    The RSD values are obtained from pm.rsd(qc_tag, on_attr=on_attr). A peak is unflagged when its QC RSD is nan or
    larger than the threshold. A warning is logged if any nan is present. The same matrix object is returned.

    """
    qc_rsd = pm.rsd(qc_tag, on_attr=on_attr)

    contains_nan = np.any(np.isnan(qc_rsd))
    if contains_nan:
        logging.warning(
            'nan found in QC rsd values, filter might not work properly')

    # one plain bool per peak: True keeps the peak, False rejects it
    keep = []
    for value in qc_rsd:
        rejected = np.isnan(value) or value > rsd_threshold
        keep.append(not rejected)

    pm.add_flag(flag_name, keep)
    return pm
예제 #3
0
def load_peak_matrix_from_txt(filename: str, delimiter: str = '\t', samples_in_rows: bool = True, comprehensive: str = 'auto'):
    """
    Loads a peak matrix from plain text file.

    :param filename: path to an existing text-based peak matrix file
    :param delimiter: delimiter of the text lines. Default = '\t', i.e., TSV format
    :param samples_in_rows: whether or not the samples are stored in rows. Default = True
    :param comprehensive: whether the input is a 'comprehensive' or 'simple' version of the matrix. Default = 'auto',
        i.e., the matrix is treated as comprehensive when a 'flags' cell is found in its header line. Any other
        value is interpreted by its truthiness
    :raises IOError: if the file does not exist, contains no data, has lines with different numbers of cells, or is
        comprehensive but has no 'rsd_all' entry
    :rtype: PeakMatrix object

    """
    if not os.path.isfile(filename):
        raise IOError('plain text file [%s] does not exist' % filename)
    with open(filename, 'r') as f:
        # readlines() keeps the line terminators, so they must be removed before testing for blank lines
        rlns = [x for x in f.readlines() if x.strip('\r\n') != '']
    if not rlns:
        raise IOError('plain text file [%s] is empty' % filename)

    dlns = [list(map(str.strip, x.split(delimiter))) for x in rlns]
    if any(len(x) != len(dlns[0]) for x in dlns[1:]):
        raise IOError('data matrix size not match')

    if samples_in_rows: dlns = list(zip(*dlns))
    if comprehensive == 'auto': comprehensive = ('flags' in dlns[0])
    rdlns = list(zip(*dlns))  # transposed view of dlns

    def _parseflags(start):
        # collect (name, values) for every line between the 'rsd_all' line and the 'flags' line
        fgs = []
        for ln in rdlns[start + 1:]:
            if ln[0] == 'flags': break
            # SECURITY NOTE: eval() executes whatever expression is stored in the file; only load trusted files
            fgs += [(ln[0], list(map(eval, [x for x in ln[1:] if x != ''])))]
        return fgs

    # must refactor if PeakMatrix.to_str changed
    if comprehensive:
        # the 'rsd_all' line is only needed (and only expected) in the comprehensive layout
        rsdrows = [i for i, ln in enumerate(rdlns) if ln[0] == 'rsd_all']
        if not rsdrows:
            raise IOError('comprehensive peak matrix missing [rsd_all]')
        rsdrow = rsdrows[0]
        flgs = _parseflags(rsdrow)
        pcol = rsdrow + len(flgs) + 2
    else:
        flgs, pcol = [], 1
    pids = dlns[0][pcol:]

    def _parsetags(tgs):
        n = 0
        for n, ln in enumerate(dlns[2:]):  # line 1 = missing
            if not ln[0].startswith('tags_'): break
            tn, tv = ln[0][5:], ln[pcol:]
            for i, v in enumerate(_evalv(tv)):
                if v != '':
                    # 'untyped' marks tags that were stored without a tag type
                    if tn == 'untyped':
                        tgs[i].add_tag(v)
                    else:
                        tgs[i].add_tag(v, tn)
        return n, tgs
    tnum, tags = 0, [PeakList_Tags() for _ in pids]
    if comprehensive: tnum, tags = _parsetags(tags)

    # remaining lines hold the data: first column = mz, columns from pcol on = one sample each
    rlns = list(zip(*dlns[2 + tnum:]))
    mz = np.array([rlns[0]] * len(pids), dtype=float)
    ints = np.array(rlns[pcol:], dtype=float)

    pm = PeakMatrix(pids, tags, [('mz', mz), ('intensity', ints)])
    for fn, fv in flgs: pm.add_flag(fn, fv, flagged_only=False)
    return pm
예제 #4
0
def load_peak_matrix_from_hdf5(filename: str, compatibility_mode: bool = False):
    """
    Loads a peak matrix from a HDF5 file.

    :param filename: path to an existing HDF5 file
    :param compatibility_mode: read the file in the old (deprecated) layout through h5py instead of through
        ptb.open_file. A warning is logged when enabled. Default = False
    :raises IOError: if the file does not exist, is not a HDF5 file, or does not hold a PeakMatrix
    :rtype: PeakMatrix object

    """
    if not os.path.isfile(filename):
        raise IOError('HDF5 database [%s] does not exist' % filename)
    if not h5py.is_hdf5(filename):
        raise IOError('input file [%s] is not a valid HDF5 database' % filename)
    if compatibility_mode: logging.warning('DeprecationWarning: loading HDF file in the old format')

    def _loadtags(tags):
        # each stored entry is a (tag type, tag value) pair; the text 'None' stands for an untyped tag
        return PeakList_Tags(*[Tag(_eval(v), None if t == 'None' else t) for t, v in map(lambda x: x.astype(str), tags)])

    def _old_loadpm(f):
        dset = f['mz']
        if _convByteStr(dset.attrs.get('class', '')) != 'PeakMatrix':
            raise IOError('input database is not a valid PeakMatrix')
        attl = dset.attrs['attributes'].astype(str)
        pids = dset.attrs['peaklist_ids'].astype(str)
        mask = dset.attrs['mask']

        # tag attributes are named peaklist_tags_<n>; restore the sample order from <n>
        tatt = sorted([x for x in dset.attrs.keys() if x.startswith('peaklist_tags_')], key=lambda x: int(x[14:]))
        ptgs = [_loadtags(dset.attrs[x]) for x in tatt]

        flgs = [(fn, dset.attrs[fn]) for fn in dset.attrs['flag_names'].astype(str)]
        alst = [(attr, np.array(f[attr]).astype(f[attr].attrs['dtype'])) for attr in attl]
        return pids, ptgs, alst, mask, flgs

    def _loadpm(f):
        dset = f.root.mz
        if dset.attrs.data_class != 'PeakMatrix':
            raise IOError('input database is not a valid PeakMatrix')
        attl = dset.attrs.attributes
        pids = dset.attrs.peaklist_ids
        mask = dset.attrs.mask

        # tag attributes are named peaklist_tags_<n>; restore the sample order from <n>
        tatt = sorted([x for x in dset.attrs._f_list('user') if x.startswith('peaklist_tags_')], key=lambda x: int(x[14:]))
        ptgs = [_loadtags(dset.attrs[x]) for x in tatt]

        flgs = [(flg, dset.attrs[flg]) for flg in dset.attrs.flag_names]
        alst = [(attr, f.root[attr].read().astype(f.root[attr].attrs.dtype)) for attr in attl]
        return pids, ptgs, alst, mask, flgs

    f = h5py.File(filename, 'r') if compatibility_mode else ptb.open_file(filename, mode='r')
    try:
        pids, ptgs, alst, mask, flgs = (_old_loadpm if compatibility_mode else _loadpm)(f)
    finally:
        # release the file handle even when the content turns out to be invalid
        f.close()

    pm = PeakMatrix(pids, ptgs, alst)
    pm.mask = mask
    for fn, fv in flgs: pm.add_flag(fn, fv, flagged_only=False)
    return pm
예제 #5
0
def load_peak_matrix_from_hdf5(filename):
    """
    Loads a peak matrix from a HDF5 file.

    :param filename: path to an existing HDF5 file
    :raises IOError: if the file does not exist, is not a HDF5 file, has no [mz] dataset, or does not hold a
        PeakMatrix
    :rtype: PeakMatrix object

    """
    if not os.path.isfile(filename):
        raise IOError('HDF5 database [%s] does not exist' % filename)
    if not h5py.is_hdf5(filename):
        raise IOError('input file [%s] is not a valid HDF5 database' %
                      filename)

    # the context manager closes the file on success as well as on every error path
    with h5py.File(filename, 'r') as f:
        if 'mz' not in f:
            raise IOError('input database missing crucial attribute [mz]')

        dset = f['mz']
        if dset.attrs.get('class', '') != 'PeakMatrix':
            raise IOError('input database is not a valid PeakMatrix')
        attl = dset.attrs['attributes']
        pids = dset.attrs['peaklist_ids']
        mask = dset.attrs['mask']

        # tag attributes are named peaklist_tags_<n>; restore the sample order from <n>
        tatt = sorted(filter(lambda x: x.startswith('peaklist_tags_'),
                             dset.attrs.keys()),
                      key=lambda x: int(x[14:]))
        # each stored entry is a (tag type, tag value) pair; the text 'None' stands for an untyped tag
        ptgs = [
            PeakList_Tags(
                *[Tag(_eval(v), None if t == 'None' else t) for t, v in tags])
            for tags in map(lambda x: dset.attrs[x], tatt)
        ]

        flgs = [(fn, dset.attrs[fn]) for fn in dset.attrs['flag_names']]
        # flag values may be stored packed: unsigned data starting with _BOOL_HEADERS goes through _unpackBool,
        # byte strings ending with the '\xFF' marker go through _unpackMeta, anything else is used as is
        # NOTE(review): comparing a byte-string element with the text '\xFF' looks Python 2 specific - confirm
        flgs = [(fn, _unpackBool(fv) if fv.dtype.kind == 'u' and np.all(fv[:len(_BOOL_HEADERS)] == _BOOL_HEADERS) else \
                     _unpackMeta(fv) if fv.dtype.kind == 'S' and fv[-1] == '\xFF' else fv) for fn,fv in flgs]
        # np.array(...) copies every dataset into memory, so nothing depends on the file after it is closed
        alst = [(attr, np.array(f[attr]).astype(f[attr].attrs['dtype']))
                for attr in attl]

    pm = PeakMatrix(pids, ptgs, alst)
    pm.mask = mask
    for fn, fv in flgs:
        pm.add_flag(fn, fv, flagged_only=False)
    return pm
예제 #6
0
def filter_blank_peaks(pm: PeakMatrix,
                       blank_tag: Any,
                       fraction_threshold: Union[int, float] = 1,
                       fold_threshold: Union[int, float] = 1,
                       method: str = 'mean',
                       rm_blanks: bool = True,
                       flag_name: str = 'blank_flag'):
    """
    PeakMatrix blank filter.

    :param pm: the target peak matrix
    :param blank_tag: tag (label) to mask blank samples. e.g Tag("blank", "classLabel")
    :param fraction_threshold: threshold of the sample fractions. Default = 1
    :param fold_threshold: threshold of the blank sample intensity folds. Default = 1
    :param method: method to calculate blank sample intensity array. Valid values include 'mean', 'median', and 'max'.
        Default = 'mean'
    :param rm_blanks: whether to remove (not mask) blank samples after filtering
    :param flag_name: name of the new flag. Default = 'blank_flag'
    :raises ValueError: if no peaklist carries blank_tag, or method is not one of the valid values
    :rtype: PeakMatrix object

    This filter will calculate the intensity array of the blanks using the "method" (a single blank sample is used
    as is), multiply it by fold_threshold, and compare it with the intensities of the other samples. A peak is
    unflagged when its blank intensity is larger than 0 and fewer than fraction_threshold x (number of non-blank
    samples) intensity values reach the scaled blank intensity.

    """
    if not any([blank_tag in x for x in pm.peaklist_tags]):
        raise ValueError('blank tag [%s] does not exist' % blank_tag)
    if method not in ('mean', 'median', 'max'):
        raise ValueError('filter method must be mean, median or max')

    with unmask_peakmatrix(pm, blank_tag) as m:
        if m.shape[0] == 1:
            ints = m.intensity_matrix[0]
        elif method == 'max':
            ints = np.max(m.intensity_matrix, axis=0)
        else:
            # method is 'mean' or 'median' here: apply the numpy function of that name to every peak column
            ints = np.array([getattr(np, method)(x) for x in m.intensity_matrix.T])
        # out-of-place multiply: never writes through the row view taken above and does not fail when
        # integer intensities are scaled by a float fold threshold
        ints = ints * fold_threshold

    with mask_peakmatrix(pm, blank_tag) as m:
        # True where too few non-blank samples reach the scaled blank intensity
        faild_int = np.sum(m.intensity_matrix >= ints,
                           axis=0) < (fraction_threshold * m.shape[0])
        m.add_flag(flag_name, ~((ints > 0) & faild_int))

    if rm_blanks:
        pm = pm.remove_samples(
            np.where([x.has_tag(blank_tag) for x in pm.peaklist_tags])[0])
    return pm
예제 #7
0
    def _createPeakMatrix():
        """
        Build a small PeakMatrix fixture with 6 samples and 10 peaks.

        Each of the two plates contributes two 'sample' peaklists (time points 1hr and 6hr, treated with
        compound_<plate>) followed by one 'qc' peaklist. Missing values are simulated by zeroing the diagonal and
        the third column of every attribute matrix.
        """
        pids, tags = [], []
        for plate in (1, 2):
            for order, time_point in enumerate(('1hr', '6hr'), start=1):
                pids.append('sample_%d_%d' % (plate, order))
                tags.append(
                    PeakList_Tags('sample',
                                  treatment='compound_%d' % plate,
                                  time_point=time_point,
                                  plate=plate,
                                  order=order))
            pids.append('QC_%d' % plate)
            tags.append(PeakList_Tags('qc', plate=plate, order=3))
        pids, tags = tuple(pids), tuple(tags)

        n_samples, n_peaks = 6, 10
        mz_row = np.arange(n_peaks, dtype=float) * 100. + 1  # 1, 101, ..., 901
        mzs = np.tile(mz_row, (n_samples, 1))
        ints = np.arange(n_samples * n_peaks, dtype=float).reshape(
            (n_samples, n_peaks)) / 20.
        ics = np.full((n_samples, n_peaks), 2)

        # simulate missing values
        for matrix in (mzs, ints, ics):
            np.fill_diagonal(matrix, 0)
            matrix[:, 2] = 0

        attributes = [('mz', mzs), ('intensity', ints), ('intra_count', ics)]
        return PeakMatrix(pids, tags, attributes)
예제 #8
0
    def _createPeakMatrix():
        """
        Build a small PeakMatrix fixture with 8 samples and 10 peaks.

        Each of the two plates contributes two 'sample' peaklists (time points 1hr and 6hr, treated with
        compound_<plate>), then one 'qc' and one 'blank' peaklist. No missing values are simulated.
        """
        pids, tags = [], []
        for plate in (1, 2):
            for order, time_point in enumerate(('1hr', '6hr'), start=1):
                pids.append('sample_%d_%d' % (plate, order))
                tags.append(
                    PeakList_Tags('sample',
                                  treatment='compound_%d' % plate,
                                  time_point=time_point,
                                  plate=plate,
                                  order=order))
            # control samples close every plate: QC at order 3, blank at order 4
            for order, (prefix, label) in enumerate(
                    (('QC', 'qc'), ('Blank', 'blank')), start=3):
                pids.append('%s_%d' % (prefix, plate))
                tags.append(PeakList_Tags(label, plate=plate, order=order))
        pids, tags = tuple(pids), tuple(tags)

        n_samples, n_peaks = 8, 10
        mz_row = np.arange(n_peaks, dtype=float) * 100.  # 0, 100, ..., 900
        mzs = np.tile(mz_row, (n_samples, 1))
        ints = np.arange(n_samples * n_peaks, dtype=float).reshape(
            (n_samples, n_peaks)) / 20.
        ics = np.tile([1, 2], (n_samples, n_peaks // 2))  # rows of 1, 2, 1, 2, ...

        attributes = (('mz', mzs), ('intensity', ints), ('intra_count', ics))
        return PeakMatrix(pids, tags, attributes)
예제 #9
0
def align_peaks(peaks: Sequence[PeakList],
                ppm: float = 2.0,
                block_size: int = 5000,
                fixed_block: bool = True,
                edge_extend: Union[int, float] = 10,
                ncpus: Union[int, None] = None):
    """
    Cluster the peaks of several peaklists and align them into a peak matrix.

    :param peaks: list of peaklists for alignment
    :param ppm: the hierarchical clustering cutting height, i.e., ppm range for each aligned mz value. Default = 2.0
    :param block_size: number of peaks in each centre clustering block. Whether this is an exact or an approximate
        number depends on the fixed_block parameter. Default = 5000
    :param fixed_block: whether the blocks contain a fixed number of peaks. Default = True
    :param edge_extend: ppm range for the edge blocks. Default = 10
    :param ncpus: number of CPUs for parallel clustering. Default = None, indicating using as many as possible
    :raises ValueError: if every input peaklist is empty, or the peaklists do not share the same attributes
    :raises AttributeError: if the first two attributes are not ('mz', 'intensity'), or the reserved attribute
        name intra_count is already in use
    :rtype: PeakMatrix object

    .. figure::  images/alignment.png
        :align:   center

    The mz values of all input peaklists are aligned with hierarchical clustering, the alignment "width" being set
    by ppm. Because of the large number of peaks, they are split into blocks of fixed or approximate length that
    are clustered in parallel on multiple CPUs. The edge blocks, whose size is set by edge_extend, are clustered
    first so that one peak is not separated into two adjacent centre blocks; the centre blocks follow.

    After merging the clustering results, all attributes (mz, intensity, snr, etc.) are aligned into matrices.
    If multiple peaks from the same sample are clustered into one mz value, their attributes are averaged (for real
    value attributes e.g. mz and intensity) or concatenated (string, unicode, or bool attributes). The flag
    attributes are ignored. The number of these overlapping peaks is recorded in a new intra_count attribute matrix.

    Empty peaklists are dropped with a warning, and a single remaining peaklist is returned as a one-row matrix
    without clustering.

    """
    # remove empty peaklists
    is_empty = [pl.size == 0 for pl in peaks]
    if any(is_empty):
        dropped_ids = [pl.ID for pl, empty in zip(peaks, is_empty) if empty]
        logging.warning('droping empty peaklist(s) [%s]' %
                        ','.join(map(str, dropped_ids)))
        peaks = [pl for pl, empty in zip(peaks, is_empty) if not empty]
    if len(peaks) == 0:
        raise ValueError('all input peaklists for alignment are empty')

    # obtain attrs
    first = peaks[0]
    attrs = first.attributes
    if attrs[:2] != ('mz', 'intensity'):
        raise AttributeError('PANIC: peak attributes in wrong order')
    if not all([attrs == pl.attributes for pl in peaks]):
        raise ValueError('peak attributes not the same')
    if 'intra_count' in attrs:
        raise AttributeError(
            'preserved attribute name [intra_count] already exists')
    # flags should be excluded
    attrs = [name for name in attrs if name not in first.flag_attributes]

    # single peaklist: nothing to cluster, every peak counts once
    if len(peaks) == 1:
        single_row = [(name, first[name].reshape((1, -1))) for name in attrs]
        single_row.append(('intra_count', np.ones((1, first.size))))
        return PeakMatrix([first.ID], [first.tags], single_row)

    # flatten: one entry per peak, labelled with the ID of its peaklist
    flat_pids = np.hstack([[pl.ID] * pl.size for pl in peaks])
    flat_attrs = [np.hstack([pl[name] for pl in peaks]) for name in attrs]

    # order every flattened array by mz (the first attribute)
    mz_order = np.argsort(flat_attrs[0])
    sorted_pids = flat_pids[mz_order]
    sorted_attrs = [values[mz_order] for values in flat_attrs]

    # cluster
    clusters = _cluster_peaks_map(sorted_attrs[0], ppm, block_size,
                                  fixed_block, edge_extend, ncpus)
    cids = _cluster_peaks_reduce(clusters)

    # align
    aligned_pids, aligned_matrices = _align_peaks(cids, sorted_pids,
                                                  *sorted_attrs)
    attrs.append('intra_count')  # for cM

    # sort by original pid: restore the order in which the peaklists were given
    first_seen = np.unique(flat_pids, return_index=True)[1]
    pids = flat_pids[sorted(first_seen)]
    row_of = {pid: row for row, pid in enumerate(aligned_pids)}
    row_order = [row_of[pid] for pid in pids]

    # attributes for which no aligned matrix was produced (None) are left out
    named_matrices = []
    for name, matrix in zip(attrs, aligned_matrices):
        if matrix is not None:
            named_matrices.append((name, matrix[row_order]))

    return PeakMatrix(pids, [pl.tags for pl in peaks], named_matrices)