def _loadpkl(ID):
    """
    Rebuild a PeakList object from the open HDF5 file (h5py).

    :param ID: name of the HDF5 dataset holding the peaklist
    :rtype: tuple of (order, PeakList) — `order` is the stored ordering index
    :raises IOError: if the dataset is not a serialised PeakList, or its first
        two columns are not mz / intensity
    """
    dset = f[ID]  # NOTE(review): `f` is the HDF5 file handle from enclosing scope — confirm
    if dset.attrs.get('class', '') != 'PeakList':
        raise IOError('unknown object found in the database')

    # read the whole data matrix; `dset.value` was deprecated in h5py 2.1 and
    # removed in h5py 3.0 — `dset[()]` is the supported equivalent
    dm = dset[()]
    dn = dset.attrs['dtable_names']
    dt = dset.attrs['dtable_types']

    if dn[0] != 'mz' or dn[1] != 'intensity':
        raise IOError('PANIC: HDF5 dataset matrix not in order')

    # metadata entries are stored as 'metadata_<key>' attributes; strip the
    # 9-char prefix and unpack each value
    pkl = PeakList(ID, dm[0].astype(np.float64), dm[1].astype(np.float64),
                   **{k[9:]: _unpackMeta(v)
                      for k, v in dset.attrs.items() if k.startswith('metadata_')})

    # remaining columns become ordinary / flag attributes
    for n, v, t in zip(dn[2:], dm[2:], dt[2:]):
        pkl.add_attribute(n, v, t, is_flag=(n in dset.attrs['flag_attrs']), flagged_only=False)

    # tags are stored as (type, value) string pairs; 'None' marks an untyped tag
    for t, v in dset.attrs['tags']:
        pkl.tags.add_tag(_eval(v), None if t == 'None' else t)

    return dset.attrs['order'], pkl
def peaklist(self, scan_id, function_noise="median"):
    """
    Build a PeakList for a single scan of this run.

    :param scan_id: id of the scan to extract
    :param function_noise: noise estimate used for SNR ["mean", "median", "mad"]
    :rtype: PeakList object, or None if scan_id is not found
    :raises ValueError: for an unsupported function_noise
    """
    if function_noise not in ["mean", "median", "mad"]:
        raise ValueError("select a function that is available [mean, median, mad]")

    # linear search over the run for the requested scan
    for scan in self.run():
        if scan["id"] == scan_id:
            # NOTE(review): raises ValueError if scan.peaks is empty — confirm
            # empty scans cannot occur here
            mzs, ints = zip(*scan.peaks)

            scan_time = scan["MS:1000016"]  # CV term: scan start time
            tic = scan["total ion current"]
            # MS:1000927 = ion injection time; optional in the source file
            if "MS:1000927" in scan:
                ion_injection_time = scan["MS:1000927"]
            else:
                ion_injection_time = None
            header = scan['MS:1000512']  # CV term: filter string
            mz_range = mz_range_from_header(header)
            ms_level = scan['ms level']

            pl = PeakList(ID=scan["id"], mz=mzs, intensity=ints,
                          mz_range=mz_range,
                          header=header,
                          ms_level=ms_level,
                          ion_injection_time=ion_injection_time,
                          scan_time=scan_time,
                          tic=tic,
                          function_noise=function_noise)

            # NOTE(review): estimatedNoiseLevel (camelCase) is the legacy
            # pymzml (<1.0) API — verify the pinned pymzml version
            snr = np.divide(ints, scan.estimatedNoiseLevel(mode=function_noise))
            pl.add_attribute('snr', snr)
            return pl
    return None
def join_peaklists(name, pls):
    """
    Concatenate multiple peaklists into a single PeakList.

    :param name: ID of the joined peaklist
    :param pls: list of PeakList objects; all must expose the same attributes
    :rtype: PeakList object
    :raises IOError: if the peaklists do not share an identical attribute set
    """

    def _join_attributes(pls):
        # Concatenate each attribute across all peaklists, preserving order.
        attrs_out = collections.OrderedDict()
        for pl in pls:
            for atr in pl.attributes:
                attrs_out.setdefault(atr, []).extend(
                    list(pl.get_attribute(atr, flagged_only=False)))
            # In Python 3, dict.keys() is a view that never compares equal to a
            # list, so the old `list(...) != attrs_out.keys()` check raised on
            # every call; compare list-to-list instead.
            if list(pl.attributes) != list(attrs_out.keys()):
                raise IOError("Different attributes")
        return attrs_out

    def _join_meta_data(pl, pls):
        # Merge metadata: collect every source peaklist's value for each key.
        for pl_ in pls:
            for k, v in pl_.metadata.items():
                if k not in pl.metadata:
                    pl.metadata[k] = []
                if v is not None:
                    # NOTE(review): extend() assumes v is iterable — confirm
                    # metadata values are always lists here
                    pl.metadata[k].extend(v)
        return pl

    attrs = _join_attributes(pls)
    pl_j = PeakList(ID=name, mz=attrs["mz"], intensity=attrs["intensity"])
    del attrs["mz"], attrs["intensity"]  # default attributes, set by constructor

    for a in attrs:
        pl_j.add_attribute(a, attrs[a],
                           is_flag=(a in pls[0].flag_attributes),
                           flagged_only=False)

    return _join_meta_data(pl_j, pls)
def filter_mz_ranges(pl: PeakList, mz_ranges: Sequence[Tuple[float, float]], flag_name: str = 'mz_ranges_flag',
                     flagged_only: bool = False, flag_index: Union[int, None] = None):
    """
    Peaklist mz range filter.

    :param pl: the target peaklist
    :param mz_ranges: the mz ranges to remove. Must be in the format of [(mz_min1, mz_max2), (mz_min2, mz_max2), ...]
    :param flag_name: name of the new flag attribute. Default = 'mz_ranges_flag'
    :param flagged_only: whether to evaluate only the currently flagged peaks (new flag array of pl.shape[0]
        entries) or all peaks (pl.full_size entries). Default = False
    :param flag_index: index of the new flag to be inserted into the peaklist. Default = None
    :rtype: PeakList object

    This filter will unflag all the peaks whose mz values fall within any of the ranges in mz_ranges
    (both range ends inclusive).
    """
    # one flag per peak under consideration; all start flagged (True)
    if flagged_only:
        flags = np.ones(pl.shape[0], dtype=bool)
    else:
        flags = np.ones(pl.full_size, dtype=bool)

    for mzr in mz_ranges:
        if len(mzr) != 2:
            raise ValueError(
                'mzr_remove: Provide a list of "start" and "end" values for each m/z range that needs to be removed.'
            )
        if mzr[0] >= mzr[1]:
            raise ValueError(
                'mzr_remove: Start value cannot be larger then end value.')
        # unflag every peak inside [start, end]
        flags[(pl.get_attribute("mz", flagged_only) >= mzr[0]) &
              (pl.get_attribute("mz", flagged_only) <= mzr[1])] = False

    pl.add_attribute(flag_name, flags, flagged_only=flagged_only, is_flag=True, on_index=flag_index)
    return pl
def _createPeakLists(self):
    # Build one PeakList per stored m/z vector, adding tiny gaussian jitter
    # to the m/z values and attaching the matching string attribute.
    peaklists = []
    for idx, raw_mz in enumerate(self.mz):
        noisy_mz = np.array(raw_mz) + np.random.normal(0, 1e-5, len(raw_mz))
        pl = PeakList('peaklist_' + str(idx), noisy_mz, self.ints[idx])
        pl.add_attribute('str_attr', self.strs[idx])
        peaklists.append(pl)
    return peaklists
def test_peaklist_portal(self):
    # Round-trip a peaklist through the plain-text portal and check that
    # sizes, m/z values and intensities survive unchanged.
    mz_values = np.sort(np.random.uniform(100, 1200, size=100))
    int_values = np.random.normal(100, 10, size=100)
    pkl = PeakList('peaklist', mz_values, int_values)
    # flag every second peak so size (50) differs from full_size (100)
    pkl.add_attribute('odd_flag', [0, 1] * 50, is_flag=True)

    save_peaklist_as_txt(pkl, '.test_peaklist.txt')
    npkl = load_peaklist_from_txt('.test_peaklist.txt', 'peaklist')

    self.assertEqual(npkl.size, 50)
    self.assertEqual(npkl.full_size, 100)
    self.assertTrue(np.allclose(pkl.mz_all, npkl.mz_all))
    self.assertTrue(np.allclose(pkl.intensity, npkl.intensity))
def load_peaklist_from_txt(filename, ID, delimiter=',', flag_names='auto', has_flag_col=True):
    """
    Loads a peaklist from plain text file.

    :param filename: path to an exiting text-based peaklist file
    :param ID: ID of the peaklist
    :param delimiter: delimiter of the text lines. Default = ',', i.e., CSV format
    :param flag_names: names of the flag attributes. Default = 'auto', indicating all the attribute names ends with
        "_flag" will be treated as flag attibute. Provide None to indicate no flag attributes
    :param has_flag_col: whether the text file contains the overall "flags" column. If True, it's values will be
        discarded. The overall flags of the new peaklist will be calculated automatically. Default = True
    :rtype: PeakList object
    """
    if not os.path.isfile(filename):
        raise IOError('plain text file [%s] does not exist' % filename)

    # 'rU' mode was removed in Python 3.11; plain 'r' already does universal
    # newline translation in Python 3
    with open(filename, 'r') as f:
        # materialise the rows: map()/filter() are lazy in Python 3, and the
        # old code indexed and re-iterated them (dlns[0], zip(*dlns[1:])),
        # which fails on iterators
        rlns = [ln for ln in map(strip, f.readlines()) if ln != '']
    dlns = [list(map(strip, ln.split(delimiter))) for ln in rlns]
    if any(len(x) != len(dlns[0]) for x in dlns[1:]):
        raise IOError('data matrix size not match')

    hd, dm = dlns[0], list(zip(*dlns[1:]))
    if has_flag_col:
        hd, dm = hd[:-1], dm[:-1]  # flag_col must be the last one, and discarded
    if len(set(hd)) != len(hd):
        raise IOError('duplicate headers found')

    # first two cols must be mz and ints
    mzs, ints = np.array(dm[0], dtype=float), np.array(dm[1], dtype=float)
    pkl = PeakList(ID, mzs, ints)

    # normalise flag_names to a set for repeatable membership tests (the old
    # filter() iterator was consumed after its first use)
    if flag_names == 'auto':
        flag_names = {x for x in hd if x.endswith('_flag')}
    elif flag_names is None:
        flag_names = set()
    else:
        flag_names = set(flag_names)

    for n, v in zip(hd[2:], dm[2:]):
        pkl.add_attribute(n, _evalv(v), is_flag=n in flag_names, flagged_only=False)

    return pkl
def filter_ringing(pl: PeakList, threshold: float, bin_size: Union[int, float] = 1.0,
                   flag_name: str = 'ringing_flag', flag_index: Union[int, None] = None):
    """
    Peaklist ringing filter.

    :param pl: the target peaklist
    :param threshold: intensity threshold ratio, in the range [0.0, 1.0]
    :param bin_size: size (in m/z units) of the mz chunk for intensity filtering. Default = 1.0
    :param flag_name: name of the new flag attribute. Default = 'ringing_flag'
    :param flag_index: index of the new flag to be inserted into the peaklist. Default = None
    :rtype: PeakList object

    This filter will split the mz values into bin_size chunks, and search the highest intensity value for each
    chunk. All other peaks, if it's intensity is smaller than threshold x the highest intensity in that chunk,
    will be unflagged.
    """
    if not 0 <= threshold <= 1:
        # message previously said 'mzr_remove:' — a copy-paste from the
        # mz-range filter; corrected to name this filter
        raise ValueError('filter_ringing: Provide a value in the range [0.0, 1.0]')

    # assign each peak to an m/z bin of width bin_size (bins shifted by 0.5 so
    # they are centred on whole multiples of bin_size)
    inds = np.digitize(
        pl.mz,
        np.arange(np.floor(np.min(pl.mz)), np.ceil(np.max(pl.mz)) + bin_size, bin_size) - 0.5)
    blks = [(inds == i) for i in np.unique(inds)]
    # per-peak maximum intensity of its own bin; relies on pl.mz being sorted
    # so each bin's members are contiguous
    mask = np.array(
        reduce(lambda x, y: x + y, [[np.max(pl.intensity[c])] * np.sum(c) for c in blks]))
    return pl.add_attribute(flag_name, pl.intensity > (mask * threshold),
                            is_flag=True, on_index=flag_index)
def filter_attr(pl: PeakList, attr_name: str, max_threshold: Union[int, float, None] = None,
                min_threshold: Union[int, float, None] = None, flag_name: Union[str, None] = None,
                flag_index: Union[int, None] = None):
    """
    Peaklist attribute values filter.

    :param pl: the target peaklist
    :param attr_name: name of the target attribute
    :param max_threshold: maximum threshold. A peak will be unflagged if the value of it's attr_name is
        larger than the threshold. Default = None, indicating no threshold
    :param min_threshold: Minimum threshold. A peak will be unflagged if the value of it's attr_name is
        smaller than the threshold. Default = None, indicating no threshold
    :param flag_name: name of the new flag attribute. Default = None, indicating using attr_name + '_flag'
    :param flag_index: index of the new flag to be inserted into the peaklist. Default = None
    :rtype: PeakList object

    This filter accepts real value attributes only.
    """
    if min_threshold is None and max_threshold is None:
        raise ValueError('must specify minimum or maximum threshold value')
    # A missing bound contributes the scalar True, which np.logical_and
    # broadcasts across the attribute array, leaving the other bound to decide.
    flt = lambda x: np.logical_and((min_threshold <= x) if min_threshold is not None else True,
                                   (x <= max_threshold) if max_threshold is not None else True)
    if flag_name is None:
        flag_name = attr_name + '_flag'
    return pl.add_attribute(flag_name, flt(pl[attr_name]), is_flag=True, on_index=flag_index)
def _loadpkl(dset):
    """
    Rebuild a PeakList object from a PyTables table node.

    :param dset: the PyTables table holding the serialised peaklist
    :rtype: tuple of (order, PeakList) — `order` is the stored ordering index
    :raises IOError: if the node is not a serialised PeakList, or its first
        two columns are not mz / intensity
    """
    if dset.attrs.data_class != 'PeakList':
        raise IOError('unknown object found in the database')

    dn = dset.attrs.dtable_names
    dt = dset.attrs.dtable_types
    # read each stored column into a plain numpy array, in declared order
    dm = [np.array(dset.colinstances[n]) for n in dn]

    if dn[0] != 'mz' or dn[1] != 'intensity':
        raise IOError('PANIC: HDF5 dataset matrix not in order')

    # metadata entries are user attributes named 'metadata_<key>'; strip the
    # 9-char prefix and unpack each value
    pkl = PeakList(dset.name, dm[0], dm[1],
                   **{k[9:]: _unpackMeta(getattr(dset.attrs, k))
                      for k in dset.attrs._f_list('user') if k.startswith('metadata_')})

    # remaining columns become ordinary / flag attributes
    for n, v, t in zip(dn[2:], dm[2:], dt[2:]):
        pkl.add_attribute(n, v, t, is_flag = (n in dset.attrs.flag_attrs), flagged_only = False)

    # tags are stored as byte-string (type, value) pairs; decode to str first,
    # 'None' marks an untyped tag
    for t, v in map(lambda x: x.astype(str), dset.attrs.tags):
        pkl.tags.add_tag(_eval(v), None if t == 'None' else t)

    return dset.attrs.order, pkl
def peaklist(self, scan_id, function_noise="median"):
    """
    Build a PeakList for a single scan of the mzML file.

    :param scan_id: id of the scan to extract
    :param function_noise: noise estimate used for SNR ["mean", "median", "mad"]
    :rtype: PeakList object, or None if scan_id is not found
    :raises ValueError: for an unsupported function_noise
    """
    if function_noise not in ["mean", "median", "mad"]:
        raise ValueError(
            "select a function that is available [mean, median, mad]")

    run = pymzml.run.Reader(self.filename)
    try:
        for scan in run:
            if scan["id"] == scan_id:
                peaks = scan.peaks("raw")
                if len(peaks) > 0:
                    mzs, ints = list(zip(*peaks))
                else:
                    mzs, ints = [], []

                scan_time = scan["MS:1000016"]  # CV term: scan start time
                tic = scan["total ion current"]
                # MS:1000927 = ion injection time; optional in the source file
                if "MS:1000927" in scan:
                    ion_injection_time = scan["MS:1000927"]
                else:
                    ion_injection_time = None
                header = scan['MS:1000512']  # CV term: filter string
                mz_range = mz_range_from_header(header)
                ms_level = scan['ms level']

                pl = PeakList(ID=scan["id"], mz=mzs, intensity=ints,
                              mz_range=mz_range,
                              header=header,
                              ms_level=ms_level,
                              ion_injection_time=ion_injection_time,
                              scan_time=scan_time,
                              tic=tic,
                              function_noise=function_noise)
                snr = np.divide(
                    ints, scan.estimated_noise_level(mode=function_noise))
                pl.add_attribute('snr', snr)
                return pl
        return None
    finally:
        # previously the handle was only closed when the scan was found,
        # leaking the open file for unknown scan ids (and on exceptions)
        run.info["file_object"].close()
def tree2peaklist(tree_pth, adjust_mz=True, merge=True, ppm=5, ms1=True, out_pth='', name=''):
    """
    Convert msnpy annotation trees into dimspy PeakList objects.

    :param tree_pth: path to the saved msnpy trees (passed to load_trees)
    :param adjust_mz: use the annotated (molecular-formula) mass instead of the measured mz
    :param merge: additionally align and merge the peaklists of each tree
    :param ppm: ppm tolerance used when aligning peaks for the merge
    :param ms1: additionally collect all MS1 precursor ions into one peaklist
    :param out_pth: directory to write HDF5 outputs to; '' disables saving
    :param name: prefix for the output HDF5 file names
    :rtype: tuple of (pls, merged_pls, ms1_precursors_pl); merged_pls / ms1_precursors_pl
        are '' when the corresponding option is disabled
    """
    ####################################################################################################################
    # Extract peaklists from msnpy
    ####################################################################################################################
    trees = load_trees(tree_pth)

    plsd = {}
    all_ms1_precursors = {}

    # get peaklist for each header
    for tree in trees:
        plsd[tree.graph['id']] = []

        # For each tree we look at each "header" e.g. the same mass spectrometry data (processed prior by dimspy-msnpy)
        # And create a peaklist for each header. (....probably a better way of doing this perhaps iterating through
        # the tree instead?). Anyway this seems to work OK.
        its = tree.nodes.items()

        # add id to tree values
        [i[1].update({'id': i[0]}) for i in its]
        tv = [i[1] for i in its]

        # requires sorting for itertools.groupby to work properly
        tv = sorted(tv, key=lambda i: i['header'])

        for header, group in itertools.groupby(tv, key=lambda x: x['header']):
            # get mz, intensity, mass, molecular formula, adduct
            mtch = re.search('.*Full ms .*', header)
            if mtch:
                # full scan — skip, only fragmentation headers become peaklists
                continue

            precursor_detail_track = []
            mz = []
            intensity = []
            mass = []
            mf = []
            adduct = []
            metad = {'tree_id': tree.graph['id'], 'header': header, 'parent': {}}

            for d in list(group):
                # get precursor details for each level
                for n in tree.predecessors(d['id']):
                    pd = tree.nodes.get(n)
                    # check if we already have this precursor details
                    if pd['mslevel'] in precursor_detail_track:
                        continue
                    metad['parent'][pd['mslevel']] = {}
                    metad['parent'][pd['mslevel']]['mz'] = pd['mz']
                    if 'mf' in pd:
                        mf_details_p = get_mf_details(pd)
                        metad['parent'][pd['mslevel']]['mass'] = mf_details_p['mass']
                        metad['parent'][pd['mslevel']]['adduct'] = mf_details_p['adduct']
                        metad['parent'][pd['mslevel']]['mf'] = mf_details_p['mf']
                    precursor_detail_track.append(pd['mslevel'])
                    if ms1:
                        if adjust_mz:
                            # NOTE(review): mf_details_p is only bound when 'mf'
                            # was present on some earlier node — confirm adjust_mz
                            # is only used on fully annotated trees
                            all_ms1_precursors[mf_details_p['mass']] = pd['intensity']
                        else:
                            all_ms1_precursors[pd['mz']] = pd['intensity']

                mz.append(d['mz'])
                intensity.append(d['intensity'])
                if 'mf' in d:
                    mf_details = get_mf_details(d)
                    mass.append(mf_details['mass'])
                    mf.append(mf_details['mf'])
                    adduct.append(mf_details['adduct'])

            if len(mz) < 1:
                continue

            # use annotated masses instead of measured mz when requested
            if adjust_mz:
                mza = mass
            else:
                mza = mz

            # create dimspy array object; keep the companion lists aligned
            # while sorting by m/z
            if mf:
                mza, intensity, mass, mf, adduct = sort_lists(mza, intensity, mass, mf, adduct)
            else:
                mza, intensity = sort_lists(mza, intensity)

            pl = PeakList(ID='{}: {}'.format(tree.graph['id'], header),
                          mz=mza,
                          intensity=intensity,
                          **metad)
            print(pl)  # NOTE(review): debug output — consider removing/logging

            if mf:
                pl.add_attribute('mass', mass)
                pl.add_attribute('mz_original', mz)
                pl.add_attribute('mf', mf)
                pl.add_attribute('adduct', adduct)

            plsd[tree.graph['id']].append(pl)

    # flatten {tree_id: [peaklists]} into a single list
    pls = [y for x in list(plsd.values()) for y in x]

    if out_pth:
        save_peaklists_as_hdf5(pls, os.path.join(out_pth, '{}_non_merged_pls.hdf5'.format(name)))

    # Merge
    if merge:
        merged_pls = []
        # NOTE(review): this loop rebinds `pls`, so the `return pls` below
        # returns only the LAST tree's peaklists, not the flattened list built
        # above — confirm whether that is intended
        for (key, pls) in iteritems(plsd):
            if not pls:
                continue
            merged_id = "<#>".join([pl.ID for pl in pls])
            pm = align_peaks(pls, ppm=ppm)
            plm = pm.to_peaklist(ID=merged_id)
            plm.metadata['parent'] = {1: pls[0].metadata['parent'][1]}
            merged_pls.append(plm)
        if out_pth:
            save_peaklists_as_hdf5(merged_pls, os.path.join(out_pth, '{}_merged_pls.hdf5'.format(name)))
    else:
        merged_pls = ''

    if ms1:
        # single peaklist of all collected MS1 precursor ions, sorted by m/z
        mz, intensity = sort_lists(list(all_ms1_precursors.keys()), list(all_ms1_precursors.values()))
        ms1_precursors_pl = [PeakList(ID='ms1_precursors', mz=mz, intensity=intensity)]
        if out_pth:
            save_peaklists_as_hdf5(ms1_precursors_pl,
                                   os.path.join(out_pth, '{}_ms1_precursors_pl.hdf5'.format(name)))
    else:
        ms1_precursors_pl = ''

    return pls, merged_pls, ms1_precursors_pl
def peaklist(self, scan_id, function_noise="noise_packets"):
    """
    Build a PeakList for a single scan of a Thermo raw file.

    :param scan_id: scan number to extract
    :param function_noise: noise estimate used for SNR
        ["noise_packets", "mean", "median", "mad"]
    :rtype: PeakList object
    :raises ValueError: for an unsupported function_noise
    """
    if function_noise not in ["noise_packets", "mean", "median", "mad"]:
        raise ValueError("select a function that is available [noise_packets, mean, median, mad]")

    scan = self.run.GetCentroidStream(scan_id, False)
    if scan.Masses is not None:
        # SignalToNoise not available
        mz_ibn = list(zip(scan.Masses, scan.Intensities, scan.Baselines, scan.Noises))
        mz_ibn.sort()
        mzs, ints, baseline, noise = list(zip(*mz_ibn))
    else:
        mzs, ints, baseline, noise = [], [], [], []

    if function_noise == "noise_packets" and len(ints) > 0:
        snr = [p.SignalToNoise for p in scan.GetCentroids()]
    elif function_noise == "median" and len(ints) > 0:
        snr = ints / np.median(ints)
    elif function_noise == "mean" and len(ints) > 0:
        snr = ints / np.mean(ints)
    elif function_noise == "mad" and len(ints) > 0:
        snr = ints / np.median(np.abs(np.subtract(ints, np.median(ints))))
    else:
        snr = []

    scan_stats = self.run.GetScanStatsForScanNumber(scan_id)
    extra_values = list(self.run.GetTrailerExtraInformation(scan_id).Values)
    extra_labels = list(self.run.GetTrailerExtraInformation(scan_id).Labels)

    # BUG FIX: previously every iteration reset the two non-matching fields to
    # None via `else` branches, so the surviving values depended solely on the
    # last trailer label. Initialise once and assign only on a match.
    ion_injection_time = None
    scan_time = None
    micro_scans = None
    for i, label in enumerate(extra_labels):
        if label == "Ion Injection Time (ms):":
            ion_injection_time = extra_values[i]
        elif label == "Elapsed Scan Time (sec):":
            scan_time = extra_values[i]
        elif label == "Micro Scan Count:":
            micro_scans = extra_values[i]

    tic = scan_stats.TIC
    segment = scan_stats.SegmentNumber
    header = str(self.run.GetScanEventStringForScanNumber(scan_id))
    # each '@' in the filter string marks one fragmentation step
    ms_level = header.count("@") + 1

    pl = PeakList(ID=scan_id, mz=mzs, intensity=ints,
                  mz_range=mz_range_from_header(header),
                  header=header,
                  ms_level=ms_level,
                  micro_scans=micro_scans,
                  segment=segment,
                  ion_injection_time=ion_injection_time,
                  scan_time=scan_time,
                  tic=tic,
                  function_noise=function_noise)

    if len(pl.mz) > 0:
        pl.add_attribute('snr', snr)
        pl.add_attribute('noise', noise)
        pl.add_attribute('baseline', baseline)
    return pl
def _createPeakList():
    # Fixture: 10 peaks with mz 0..9, intensity mz+1, and snr = intensity/10.
    base = np.arange(10, dtype=float)
    pkl = PeakList('peaklist', base, base + 1)
    pkl.add_attribute('snr', (base + 1) / 10)
    return pkl