def _make_iter(self):
    """Open the mzML file and prepare a lazy stream of processed scans.

    The pyteomics reader for ``self.file_path`` is stored on
    ``self._reader``; ``self._iter`` becomes a generator that feeds every
    scan of that reader through ``process_mzml_scan`` together with this
    instance's ``savgol_window_length`` and ``remove_baseline`` settings.
    """
    reader = mzml.read(self.file_path)
    self._reader = reader
    # Resolve the iterator right away, as a generator expression would.
    scan_source = iter(reader)

    def _processed_scans():
        # The settings are looked up when each scan is actually consumed.
        for raw_scan in scan_source:
            yield process_mzml_scan(
                raw_scan,
                savgol_window_length=self.savgol_window_length,
                remove_baseline=self.remove_baseline)

    self._iter = _processed_scans()
def __compute_mz_axis(cls, filename):
    """
    Internal helper function used to compute one common m/z axis for a file.

    Every spectrum is visited once to find the global minimum and maximum
    m/z and the smallest spacing between neighbouring m/z values. The axis
    is ``np.arange(mz_min, mz_max, smallest_spacing)``.

    :param filename: Path of the mzML file to read.

    :returns: A single 1D numpy array with the m/z axis (empty when the file
        holds no usable spectrum).
    """
    # TODO completely refactor this to make it smartly handle profile or centroid datasets
    # TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
    # TODO: profile datasets should work as is
    # TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
    mzdiff = 10000000.0
    mz_min = 10000000.0
    mz_max = 0.0
    # Stream the spectra instead of keeping every m/z array in memory, and
    # make sure the reader is closed again.
    with mzml.read(filename) as reader:
        for spectrum in reader:
            mz = np.asarray(spectrum['m/z array'])
            if mz.size == 0:
                # An empty spectrum carries no information about the axis.
                continue
            mz_min = min(mz_min, mz.min())
            mz_max = max(mz_max, mz.max())
            if mz.size > 1:
                # A spacing only exists for two or more points.
                mzdiff = min(mzdiff, np.diff(mz).min())
    return np.arange(start=mz_min, stop=mz_max, step=mzdiff)
def mzml_reader(msconvert_file):
    """Collect every MS1 scan of an mzML file into a pandas DataFrame.

    Each MS1 spectrum becomes one row with the columns ``ind``, ``mslev``
    (always 1), ``bpmz``, ``bpint``, ``starttime``, ``mzarray`` and
    ``intarray``; the two array columns hold plain Python lists.
    """
    column_names = [
        'ind', 'mslev', 'bpmz', 'bpint', 'starttime', 'mzarray', 'intarray'
    ]
    rows = []
    with mzml.read(msconvert_file) as reader:
        for spectrum in reader:
            if spectrum['ms level'] != 1:
                continue
            first_scan = spectrum['scanList']['scan'][0]
            rows.append((
                spectrum['index'],
                1,
                spectrum['base peak m/z'],
                spectrum['base peak intensity'],
                first_scan['scan start time'],
                spectrum['m/z array'].tolist(),
                spectrum['intensity array'].tolist(),
            ))
    return pd.DataFrame(rows, columns=column_names)
def ingest_mzML(input_file):
    """Read precursor information of all MS2 scans from the sibling mzML file.

    The mzML path is derived from ``input_file`` by cutting the file name at
    its first dot and appending ``.mzML``.

    :param input_file: Path whose stem names the mzML file to read.
    :returns: pandas DataFrame with one row per MS2 scan and the columns
        ``scan``, ``z``, ``m/z`` and ``RT``.
    :raises KeyError: If an MS2 scan lacks one of the precursor fields.
    """
    import os

    # Cut the extension(s) off the file name only. Splitting the whole path
    # at the first dot mangles paths such as "./run/sample.txt".
    directory, base_name = os.path.split(input_file)
    mzml_path = os.path.join(directory, base_name.split(".", 1)[0] + ".mzML")

    parsed_scans = []
    with mzml.read(mzml_path, iterative=True) as mzml_reader:
        for each_scan in tqdm(mzml_reader):
            if each_scan['ms level'] != 2:
                continue
            selected_ion = each_scan['precursorList']['precursor'][0][
                'selectedIonList']['selectedIon'][0]
            parsed_scans.append({
                # The scan number comes from the id string; index + 1 is
                # wrong for truncated files.
                'scan': int(each_scan['id'].rsplit("=", 1)[1]),
                'z': selected_ion['charge state'],
                'm/z': selected_ion['selected ion m/z'],
                'RT': each_scan['scanList']['scan'][0]['scan start time'],
            })
    print("done reading from with file {0}".format(input_file))
    return pandas.DataFrame(parsed_scans)
def split_mzml(mzml_file):
    """
    Split the MS2 spectra of an mzML file by fragmentation method.

    The method is read from the ``filter string`` of each MS2 scan. Spectra
    whose filter string cannot be parsed are printed and skipped.

    Parameters:
    -----------------------------------------
    mzml_file: str,
        path to mzML file

    Return: dict {fragMethod: list(MS2_spectrum)}, holding only the methods
        (CID, HCD, ETD, ETciD, EThcD) for which spectra were found
    """
    ordered_ms2_spectra = {
        "CID": [],
        "HCD": [],
        "ETD": [],
        "ETciD": [],
        "EThcD": []
    }
    # [A-Za-z] instead of [A-z]: the latter also matches "[", "\", "]", "^",
    # "_" and "`".
    method_pattern = re.compile(
        r"@([A-Za-z]+)([0-9.]+)@?([A-Za-z]+)?([0-9.]+)?")
    title_prefix = os.path.split(mzml_file)[1].split('mzML')[0]

    with mzml.read(mzml_file) as mzml_reader:
        for spectrum in mzml_reader:
            if spectrum['ms level'] != 2:
                continue
            scan = spectrum['scanList']['scan'][0]
            filter_string = scan['filter string']
            match = method_pattern.search(filter_string)
            if match is None:
                # Skip the spectrum rather than classify it with the
                # groups left over from a previous spectrum.
                print(filter_string)
                continue
            groups = match.groups()

            title = title_prefix + spectrum['id']
            rt = scan['scan start time'] * 60
            precursor = spectrum['precursorList']['precursor'][0][
                'selectedIonList']['selectedIon'][0]
            pre_mz = precursor['selected ion m/z']
            pre_int = precursor.get('peak intensity', 0)
            pre_z = precursor['charge state']
            # Materialise the pairs so they can be iterated more than once
            # on Python 3 as well.
            peaks = list(zip(spectrum['m/z array'],
                             spectrum['intensity array']))
            ms2class_spectrum = MS2_spectrum(title, rt, pre_mz, pre_int,
                                             pre_z, peaks)

            if "etd" in groups:
                if "cid" in groups:
                    ordered_ms2_spectra['ETciD'].append(ms2class_spectrum)
                elif "hcd" in groups:
                    ordered_ms2_spectra['EThcD'].append(ms2class_spectrum)
                else:
                    ordered_ms2_spectra['ETD'].append(ms2class_spectrum)
            elif "cid" in groups:
                ordered_ms2_spectra['CID'].append(ms2class_spectrum)
            elif "hcd" in groups:
                ordered_ms2_spectra['HCD'].append(ms2class_spectrum)

    return {k: v for k, v in ordered_ms2_spectra.items() if len(v) > 0}
def generate_cihcd_spectra(mzml_file):
    """
    Collect all MS3 spectra of an mzML file as ``MS2_spectrum`` objects.

    The precursor m/z is the first ``<number>@`` token of the scan's filter
    string (i.e. the MS2 precursor) and is passed on as the matched text.
    Precursor intensity and charge are not available and are set to -1.

    :param mzml_file: path to the mzML file
    :return: list of ``ProteoFileReader.MS2_spectrum``
    :raises ValueError: if a filter string does not start with FT/IT or
        contains no precursor m/z
    """
    title_prefix = os.path.split(mzml_file)[1].split('.mzML')[0]
    cihcd_spectra = []
    with mzml.read(mzml_file) as mzml_reader:
        for spectrum in mzml_reader:
            if spectrum['ms level'] != 3:
                continue
            scan = spectrum['scanList']['scan'][0]
            filter_str = scan['filter string']
            precursor_mz_groups = re.findall(r"([0-9.]+)@", filter_str)
            # ValueError replaces the Python-2-only StandardError; it is a
            # StandardError subclass there, so existing handlers still work.
            if (re.search(r"^(FT|IT)", filter_str) is None
                    or not precursor_mz_groups):
                raise ValueError(
                    "filter string parse error: %s" % filter_str)

            ms2_id = spectrum['precursorList']['precursor'][0]['spectrumRef']
            title = (title_prefix + " " + spectrum['id']
                     + " ms2_scanId=" + ms2_id)
            rt = scan['scan start time'] * 60
            pre_mz = precursor_mz_groups[0]  # take ms2 precursor as precursor
            pre_int = -1
            pre_z = -1
            # A list, so the peaks survive repeated iteration on Python 3.
            peaks = list(zip(spectrum['m/z array'],
                             spectrum['intensity array']))
            cihcd_spectra.append(
                ProteoFileReader.MS2_spectrum(title, rt, pre_mz, pre_int,
                                              pre_z, peaks))
    return cihcd_spectra
def spectrum_iter(self):
    """
    Generator function that yields a position and associated spectrum for a selected datacube type.

    Each spectrum is linearly interpolated onto ``self.mz_all[0]``;
    intensities outside the measured m/z range are set to 0.

    :yield: (xidx, yidx) a tuple of ints representing x and y position in the image
    :yield: yi, a numpy 1D-array of floats containing spectral intensities at the given position
        and for the selected datacube type

    :raises ValueError: If no dataset has been selected.
    :raises KeyError: If a spectrum has no "intensity array".
    """
    # Validate before touching the file so nothing is opened needlessly.
    if self.select_dataset is None:
        raise ValueError('Select a dataset to continue!')
    mz = self.mz_all[0]
    # The context manager closes the file when the generator is exhausted
    # or closed early.
    with mzml.read(self.basename) as reader:
        for idx, spectrum in enumerate(reader):
            x = spectrum['m/z array']
            try:
                y = spectrum['intensity array']
            except KeyError:
                raise KeyError('Key "intensity array" not found in this mzml file')
            # Interpolate the data onto the new axes (profile mode).
            # TODO: centroided data would need re-histogramming instead.
            yi = np.interp(mz, x, y, 0, 0)
            xidx = np.nonzero(self.x_pos == self.coordinates[idx, 0])[0][0]
            yidx = np.nonzero(self.y_pos == self.coordinates[idx, 1])[0][0]
            yield (xidx, yidx), yi
def spectrum_iter(self):
    """
    Generator function that yields a position and associated spectrum for a selected datacube type.

    Each spectrum is linearly interpolated onto ``self.mz_all[0]``;
    intensities outside the measured m/z range are set to 0.

    :yield: (xidx, yidx) a tuple of ints representing x and y position in the image
    :yield: yi, a numpy 1D-array of floats containing spectral intensities at the given position
        and for the selected datacube type

    :raises ValueError: If no dataset has been selected.
    :raises KeyError: If a spectrum has no "intensity array".
    """
    # Validate before touching the file so nothing is opened needlessly.
    if self.select_dataset is None:
        raise ValueError('Select a dataset to continue!')
    mz = self.mz_all[0]
    # The context manager closes the file when the generator is exhausted
    # or closed early.
    with mzml.read(self.basename) as reader:
        for idx, spectrum in enumerate(reader):
            x = spectrum['m/z array']
            try:
                y = spectrum['intensity array']
            except KeyError:
                raise KeyError(
                    'Key "intensity array" not found in this mzml file')
            # Interpolate the data onto the new axes (profile mode).
            # TODO: centroided data would need re-histogramming instead.
            yi = np.interp(mz, x, y, 0, 0)
            xidx = np.nonzero(self.x_pos == self.coordinates[idx, 0])[0][0]
            yidx = np.nonzero(self.y_pos == self.coordinates[idx, 1])[0][0]
            yield (xidx, yidx), yi
def read_mzml(PATH, scanlist, event_scan, fname, output, soutput, newscanno,
              spec_outfile):
    """Export selected scans of an mzML file as MGF entries plus index rows.

    For every scan whose position in the file is a key of ``scanlist`` an
    index row is appended to ``output`` and a ``BEGIN IONS`` ... ``END IONS``
    block is written to ``soutput``. A scan that cannot be exported prints
    "Error reading mzML file" and is skipped.

    :param PATH: path of the mzML file; nothing happens if it is not a
        readable file
    :param scanlist: container of the scan positions to export
    :param event_scan: ``event_scan[fname][position]`` must provide the keys
        ``pev``, ``pseq`` and ``etype``
    :param fname: key into ``event_scan``
    :param output: list receiving the tab-separated index rows
    :param soutput: writable text stream receiving the MGF entries
    :param newscanno: running scan counter before this file
    :param spec_outfile: spectrum file name recorded in the index rows
    :return: the running scan counter after this file
    """
    if not (os.path.isfile(PATH) and os.access(PATH, os.R_OK)):
        return newscanno

    with mzml.read(PATH) as reader:
        for scanindex, spectrum in enumerate(reader):
            if scanindex not in scanlist:
                continue
            try:
                # Gather everything first so a scan with missing fields
                # leaves neither an orphan index row nor a bumped counter.
                event = event_scan[fname][scanindex]
                selected_ion = spectrum['precursorList']['precursor'][0][
                    'selectedIonList']['selectedIon'][0]
                charge = int(selected_ion['charge state'])
                mz = selected_ion['selected ion m/z']
                index_row = "%s\t%d\t%s\t%s\t%s\n" % (
                    event['pev'], newscanno, spec_outfile, event['pseq'],
                    event['etype'])
                peak_lines = "".join(
                    "%s %s\n" % (x, y)
                    for x, y in zip(spectrum['m/z array'],
                                    spectrum['intensity array']))

                output.append(index_row)
                # NOTE(review): as before, the index row records the counter
                # before the increment and the TITLE the value after it --
                # confirm this offset is intended.
                newscanno += 1
                soutput.write(
                    "BEGIN IONS\nTITLE=controllerType=0 controllerNumber=1 scan=%d\nCHARGE=%d+\nPEPMASS=%s\n"
                    % (newscanno, charge, mz))
                soutput.write(peak_lines)
                soutput.write("END IONS\n\n")
            except Exception:
                # Best effort per scan, but let KeyboardInterrupt and
                # SystemExit through.
                print("Error reading mzML file")
    return newscanno
def __compute_mz_axis(cls, filename):
    """
    Internal helper function used to compute one common m/z axis for a file.

    Every spectrum is visited once to find the global minimum and maximum
    m/z and the smallest spacing between neighbouring m/z values. The axis
    is ``np.arange(mz_min, mz_max, smallest_spacing)``.

    :param filename: Path of the mzML file to read.

    :returns: A single 1D numpy array with the m/z axis (empty when the file
        holds no usable spectrum).
    """
    # TODO completely refactor this to make it smartly handle profile or centroid datasets
    # TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
    # TODO: profile datasets should work as is
    # TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
    mzdiff = 10000000.0
    mz_min = 10000000.0
    mz_max = 0.0
    # Stream the spectra instead of keeping every m/z array in memory, and
    # make sure the reader is closed again.
    with mzml.read(filename) as reader:
        for spectrum in reader:
            mz = np.asarray(spectrum['m/z array'])
            if mz.size == 0:
                # An empty spectrum carries no information about the axis.
                continue
            mz_min = min(mz_min, mz.min())
            mz_max = max(mz_max, mz.max())
            if mz.size > 1:
                # A spacing only exists for two or more points.
                mzdiff = min(mzdiff, np.diff(mz).min())
    return np.arange(start=mz_min, stop=mz_max, step=mzdiff)
def __compute_coordinates(self, filename, num_scans):
    """
    Internal helper function used to compute the coordinates for each scan.

    The file is read as plain text. Every line that contains ``location="``
    and a ``_<digits>x_<digits>y_`` token contributes one (x, y) row, in
    file order.

    :param filename: Path of the mzML file.
    :param num_scans: Number of rows to allocate in the result.

    :returns: 2D numpy integer array of shape (numScans,2) indicating for each scan its x and y coordinate
    """
    coord_pattern = re.compile(r'_[0-9]+x_[0-9]+y_')
    coords = np.zeros(shape=(num_scans, 2), dtype='uint32')
    spectrumid = 0
    # Only the raw text is needed, so no mzML reader is opened here.
    with open(filename, 'r') as origin_file:
        for line in origin_file:
            if 'location="' not in line:
                continue
            m = coord_pattern.search(line)
            if m is None:
                continue
            # "_12x_34y_" -> ["12x", "34y"]; drop the trailing axis letter.
            x_token, y_token = m.group().strip('_').split('_')
            coords[spectrumid, 0] = int(x_token[:-1])
            coords[spectrumid, 1] = int(y_token[:-1])
            spectrumid += 1
    return coords
def parse_mzml(mzml_path, pickle_data=None, logfile=None):
    """
    Retrieve all scans from an mzML file and generate arrays of mz versus intensity at each RT.

    :param mzml_path: string pointing to the .mzml file to extract scans from
    :param pickle_data: string pointing to a pickle file to save the extracted spectra into. Default is not saved
    :param logfile: path to file to log
    :return: a dictionary with keys retention_times and ms1_scans. retention_times points to a numpy array of
        each retention time. ms1_scans points to a list of pandas dataframes with columns mz_data and
        intensity_data. The index of this list corresponds to the position in the retention_times numpy array.
    :raises AssertionError: if the file contains a scan that is not MS level 1
    """
    scan_list = []
    rt_list = []
    with mzml.read(mzml_path) as mz_reader:
        log('reading mzml file ' + mzml_path + '...', logfile)
        for scan in mz_reader:
            # Raised explicitly so the check also runs under ``python -O``;
            # the exception type matches the former ``assert``.
            if scan['ms level'] != 1:
                raise AssertionError(
                    'Your mzml file contains non-MS1 scans. '
                    'When converting your mzml file, only include MS1 scans. See the pysodist '
                    'docs for how to appropriately convert to .mzml files using msconvert.')
            scan_list.append(pd.DataFrame({
                'mz_data': scan['m/z array'],
                'intensity_data': scan['intensity array'],
            }))
            rt_list.append(
                float(scan['scanList']['scan'][0]['scan start time']))

    parsed_mz_file = {'retention_times': np.array(rt_list),
                      'ms1_scans': scan_list}
    if pickle_data is not None:
        # Close the pickle file deterministically.
        with open(pickle_data, 'wb') as pickle_file:
            pickle.dump(parsed_mz_file, pickle_file)
    return parsed_mz_file
def extract_mid_from_file(mzml_path, mz_windows, rt_window, ppm):
    """
    Pulls out MID info from mzML file, looking only in given mz_windows and rt_window.

    :param mzml_path: str
        path to mzml_file for extraction
    :param mz_windows: array
        (span+1-by-2) matrix with columns mz_min and mz_max for rows M0, M1 ...
    :param rt_window: array
        (1-by-2) array containing rt_min and rt_max in minutes for peak of interest
    :param ppm: float
        parts-per-million mass accuracy (not used by this function)
    :return out: dict w/ keys:
        m, np.array of ints, the number of heavy neutrons
        mean_mz, the intensity-weighted m/z of the m peak (0 for a peak
            without usable data points)
        total_i, the total intensity of isotopologue m
    """
    rt_min, rt_max = tuple(rt_window)
    n_rows = mz_windows.shape[0]
    m_span = range(n_rows)
    # Raw data points collected per isotopologue.
    mzs = [[] for _ in m_span]
    intensities = [[] for _ in m_span]

    with mzml.read(mzml_path) as reader:
        for spec in reader:
            try:
                rt = spec['scanList']['scan'][0]['scan start time']
            except (KeyError, IndexError):
                continue
            if not rt_min <= rt <= rt_max:
                continue
            these_mzs = spec['m/z array']
            these_intensities = spec['intensity array']
            # Positions of the window borders inside this scan.
            index_mat = np.searchsorted(these_mzs, mz_windows)
            start = index_mat[:, 0]
            stop = index_mat[:, 1]
            for m in m_span:
                # Equal borders mean the scan has no point in window m.
                if start[m] != stop[m]:
                    mzs[m].extend(these_mzs[start[m]:stop[m]])
                    intensities[m].extend(these_intensities[start[m]:stop[m]])

    mean_mz = []
    for m in m_span:
        try:
            mean_mz.append(np.average(mzs[m], weights=intensities[m]))
        except ZeroDivisionError:
            # No data (or zero total intensity) for this isotopologue only;
            # the other isotopologues keep their weighted mean.
            mean_mz.append(0)
    total_i = np.asarray([np.sum(intensities[m]) for m in m_span])
    return ({
        'm': np.asarray(m_span),
        'mean_mz': np.asarray(mean_mz),
        'total_i': total_i
    })
def __compute_mz_axis(cls, filename, mzml_filetype, scan_types, resolution):
    """
    Internal helper function used to compute the mz axis of each scantype

    For Thermo files the axis of a scan type is rebuilt whenever a spectrum
    with more points than the current axis is seen: profile spectra get a
    linear axis across the scan window with the smallest observed spacing,
    other spectra get a log-spaced axis whose length follows ``resolution``.
    For Bruker files the m/z array of the last spectrum of each scan type is
    used.

    :param filename: Path of the mzML file.
    :param mzml_filetype: One of ``cls.available_mzml_types``.
    :param scan_types: List of filter strings; a spectrum's filter string
        must be contained in it.
    :param resolution: Resolution used for the log-spaced axis.

    :returns: A list of numpy arrays, one per entry of ``scan_types``.
    :raises ValueError: If ``mzml_filetype`` is neither thermo nor bruker.
    """
    # TODO completely refactor this to make it smartly handle profile or centroid datasets
    # TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
    # TODO: profile datasets should work as is
    # TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
    if mzml_filetype == cls.available_mzml_types['thermo']:
        mz_axes = [np.array([]) for _ in scan_types]
        with mzml.read(filename) as reader:
            for spectrum in reader:
                scan = spectrum['scanList']['scan'][0]
                scantype_idx = scan_types.index(scan['filter string'])
                mz = spectrum['m/z array']
                try:
                    len_axes = len(mz_axes[scantype_idx])
                except TypeError:
                    len_axes = 1
                if len(mz) <= len_axes:
                    continue
                scan_window = scan['scanWindowList']['scanWindow'][0]
                mzmin = scan_window['scan window lower limit']
                mzmax = scan_window['scan window upper limit']
                # ``in`` replaces the Python-2-only ``has_key``.
                if 'profile spectrum' in spectrum:
                    mzdiff = np.diff(mz).min()
                    axis = np.arange(start=mzmin, stop=mzmax, step=mzdiff)
                    # arange excludes the stop value; add the upper limit.
                    mz_axes[scantype_idx] = np.append(arr=axis, values=mzmax)
                else:
                    # np.logspace needs an integer number of samples.
                    num_points = int(
                        np.ceil(1e6 * np.log(mzmax / mzmin) / resolution))
                    mz_axes[scantype_idx] = np.logspace(
                        np.log10(mzmin), np.log10(mzmax), num_points)
        return mz_axes
    # assume bruker instruments have constant m/z axis from scan to scan
    elif mzml_filetype == cls.available_mzml_types['bruker']:
        mz_axes = [np.array([]) for _ in scan_types]
        with mzml.read(filename) as reader:
            for spectrum in reader:
                scanfilt = spectrum['scanList']['scan'][0]['filter string']
                scantype_idx = scan_types.index(scanfilt)
                # grossly inefficient reassignment of m/z array at each scan
                mz_axes[scantype_idx] = spectrum['m/z array']
        return mz_axes
    else:
        raise ValueError('Unknown mzml format')
def parse_mzml_file(filepath, mz_resolution=1):
    """Bin all spectra of an mzML file onto one evenly spaced m/z axis.

    The axis is derived from the scan window of the first spectrum and
    ``mz_resolution``. Intensities that fall into the same bin are summed.

    Returns ``(mat, times, mz_array)``: the spectra-by-bins intensity
    matrix, the scan start time of each spectrum (0 where it is missing) and
    the m/z axis. All three are ``None`` if the file yields no spectrum.
    """
    spectra = read(filepath)
    spectrum_list_info = next(
        iterfind(filepath, 'indexedmzML/mzML/run/spectrumList',
                 read_schema=True, recursive=False))
    n_spectra = int(spectrum_list_info['count'])  # number of all spectra

    mzmin = mzmax = None
    mz_array = mat = times = None

    for row, sp in enumerate(spectra):
        if mat is None:
            # The first spectrum defines the axis and the output buffers.
            scan_window = sp['scanList']['scan'][0]['scanWindowList'][
                'scanWindow'][0]
            mzmin = float(scan_window['scan window lower limit'])
            mzmax = float(scan_window['scan window upper limit'])
            # recalculate the maximum m/z value
            mzmax = mzmax + mz_resolution - (mzmax - mzmin) % mz_resolution
            # make sure to have evenly spaced integers
            n_bins = int((mzmax - mzmin) / mz_resolution) + 1
            mz_array = np.linspace(mzmin, mzmax, n_bins)
            mat = np.zeros((n_spectra, mz_array.shape[0]))
            times = np.zeros(n_spectra)

        bins = (sp['m/z array'] * mz_array.shape[0] / mzmax).astype(int)
        # find duplicated bins and integrate (just sum) them
        bins, first_pos, run_len = np.unique(bins, return_index=True,
                                             return_counts=True)
        intensities = sp['intensity array']
        binned = intensities[first_pos]
        for slot in np.flatnonzero(run_len >= 2):
            begin = first_pos[slot]
            binned[slot] = intensities[begin:begin + run_len[slot]].sum()

        try:
            times[row] = float(sp['scanList']['scan'][0]['scan start time'])
        except KeyError:
            pass

        mat[row, bins] = binned

    return mat, times, mz_array
def plot_spectra(mzml_id, peptide, scan_id, mzml_dir, spec_pic_dir, psm_id):
    """Plot one annotated MS/MS spectrum and save it as an SVG file.

    The file ``<mzml_id>.mzML`` is searched below ``mzml_dir`` with ``find``.
    The scan whose index equals ``scan_id - 1`` is processed, annotated with
    the fragments of ``peptide`` and written to
    ``<spec_pic_dir>/<mzML stem>_<psm_id>.svg``.

    :param mzml_id: file name (without extension) of the mzML file
    :param peptide: peptide sequence used for the fragment annotation
    :param scan_id: 1-based scan number
    :param mzml_dir: directory searched recursively for the mzML file
    :param spec_pic_dir: output directory of the picture
    :param psm_id: identifier used in the output file name
    :raises FileNotFoundError: if no matching mzML file is found
    """
    # An argument list avoids the shell, so names with spaces or shell
    # metacharacters are safe. Only "~" expansion is kept from the shell.
    found = subprocess.check_output([
        "find", os.path.expanduser(mzml_dir), "-name",
        "{}.mzML".format(mzml_id)
    ])
    matches = found.decode().splitlines()
    if not matches:
        raise FileNotFoundError(
            "no {}.mzML below {}".format(mzml_id, mzml_dir))
    mzml_file = matches[0]
    wanted_index = int(scan_id) - 1

    with mzml.read(mzml_file) as reader:
        for scan in reader:
            if scan["index"] != wanted_index:
                continue
            if "precursorList" not in scan:
                print("no precursor list")
                return
            mz = scan['m/z array']
            intensity = scan['intensity array']
            identifier = scan['index']
            retention_time = float(
                scan['scanList']['scan'][0]["scan start time"]) * 60.0
            selected_ion = scan["precursorList"]["precursor"][0][
                "selectedIonList"]["selectedIon"][0]
            precursor_mz = selected_ion["selected ion m/z"]
            precursor_charge = int(selected_ion["charge state"])
            spec = spectrum.MsmsSpectrum(identifier,
                                         precursor_mz,
                                         precursor_charge,
                                         mz,
                                         intensity,
                                         retention_time=retention_time,
                                         peptide=peptide)

            min_mz, max_mz = 100, 1400
            fragment_tol_mass, fragment_tol_mode = 10, 'ppm'
            min_intensity, max_num_peaks = 0.05, 150
            scaling = 'root'
            ion_types = 'aby'
            spec = spec.set_mz_range(min_mz, max_mz)
            spec = spec.remove_precursor_peak(fragment_tol_mass,
                                              fragment_tol_mode)
            spec = spec.filter_intensity(min_intensity, max_num_peaks)
            spec = spec.scale_intensity(scaling)
            spec = spec.annotate_peptide_fragments(fragment_tol_mass,
                                                   fragment_tol_mode,
                                                   ion_types)

            plt.figure()
            plot.spectrum(spec, grid=False)
            mzml_id = os.path.splitext(os.path.split(mzml_file)[1])[0]
            plt.savefig("{}/{}_{}.svg".format(spec_pic_dir, mzml_id, psm_id),
                        bbox_inches='tight')
            plt.close()
            print("print")
            return
    # Reached only when no scan matched; every match returns above.
    print("Scan not found")
def __compute_filetype(cls, filename):
    """
    Internal helper function used to compute the filetype.

    Only the first spectrum is inspected: a ``spotID`` entry means thermo,
    otherwise an ``id`` entry means bruker.

    :param filename: Path of the mzML file.
    :returns: The matching value of ``cls.available_mzml_types``; the
        'unknown' value if neither key is present or the file holds no
        spectrum.
    """
    # Close the reader again instead of leaking the open file.
    with mzml.read(filename) as reader:
        spectrum = next(reader, None)
    if spectrum is None:
        # No spectrum at all: report unknown rather than leak StopIteration.
        return cls.available_mzml_types['unknown']
    if 'spotID' in spectrum:
        return cls.available_mzml_types['thermo']
    elif 'id' in spectrum:
        return cls.available_mzml_types['bruker']
    else:
        return cls.available_mzml_types['unknown']
def __compute_filetype(cls, filename):
    """
    Internal helper function used to compute the filetype.

    Only the first spectrum is inspected: a ``spotID`` entry means thermo,
    otherwise an ``id`` entry means bruker.

    :param filename: Path of the mzML file.
    :returns: The matching value of ``cls.available_mzml_types``; the
        'unknown' value if neither key is present or the file holds no
        spectrum.
    """
    # Close the reader again instead of leaking the open file.
    with mzml.read(filename) as reader:
        spectrum = next(reader, None)
    if spectrum is None:
        # No spectrum at all: report unknown rather than leak StopIteration.
        return cls.available_mzml_types['unknown']
    if 'spotID' in spectrum:
        return cls.available_mzml_types['thermo']
    elif 'id' in spectrum:
        return cls.available_mzml_types['bruker']
    else:
        return cls.available_mzml_types['unknown']
def set_retention_times(file: str):
    """Map each MS2 scan number to its shifted retention time.

    The scan start time of every MS2 spectrum has
    ``CON.RETENTION_SHIFT_INTERCEPT`` subtracted, is divided by
    ``CON.RETENTION_SHIFT_SLOPE`` and multiplied by
    ``CON.MINUTES_TO_SECONDS``. Keys are the spectrum index plus one.
    """
    times_by_scan = {}
    with mzml.read(file) as reader:
        for spectrum in reader:
            if spectrum["ms level"] != 2:
                continue
            start_time = float(
                spectrum["scanList"]["scan"][0]["scan start time"])
            shifted = ((start_time - CON.RETENTION_SHIFT_INTERCEPT)
                       / CON.RETENTION_SHIFT_SLOPE)
            times_by_scan[spectrum["index"] + 1] = (
                shifted * CON.MINUTES_TO_SECONDS)
    return times_by_scan
def read_mzml(self, file: str):
    """Feed every MS1 scan of an mzML file to ``self.process_scan``.

    The file is read twice: a first pass counts the MS1 scans so that the
    second pass can print progress as "<count> / <total> scans".

    :param file: path of the mzML file
    """
    total = 0
    print()
    print("(initializing)")
    # First pass: count the MS1 scans (needed only for progress output).
    with mzml.read(file) as f:
        for scan in f:
            if scan["ms level"] == 1:
                total += 1
    count = 0
    retention_time = None
    # Second pass: process the MS1 scans.
    with mzml.read(file) as f:
        for scan in f:
            if scan["ms level"] == 1:
                retention_time = scan["scanList"]["scan"][0][
                    "scan start time"]
                # Scaled by the project constant MINUTES_TO_SECONDS.
                retention_time *= CON.MINUTES_TO_SECONDS
                count += 1
                # Progress on the first, every 200th and the last MS1 scan.
                if count % 200 == 0 or count == 1 or count == total:
                    print(count, "/", total, "scans")
                self.process_scan(scan)
                # NOTE(review): the original indentation of this call was
                # lost; it is assumed to run once per MS1 scan. Confirm it
                # does not belong after the loop instead.
                self.set_tuple_dictionary(self.sliding_window(retention_time))
def __read_all(self):
    """
    Internal helper function used to read all data. The function directly modifies the self.data entry.
    Data is now a list of datacubes, one per entry of ``self.scan_types``.

    The file is re-read once per scan type. Spectra whose filter string
    matches the scan type are resampled onto ``self.mz_all[scan_idx]`` and
    stored at the x/y position looked up through ``self.coordinates``.
    """
    # One zero-initialised cube per scan type.
    self.data = [
        np.zeros(shape=self.shape_all_data[scan_idx], dtype=self.data_type)
        for scan_idx, scantype in enumerate(self.scan_types)
    ]
    for scan_idx, scantype in enumerate(self.scan_types):
        # NOTE(review): the reader is never closed explicitly.
        reader = mzml.read(self.basename)
        spectrumid = 0
        if not self.scan_profiled[scan_idx]:
            # Bin edges: the m/z axis plus one extra edge one mean spacing
            # past the last value.
            shift = np.diff(self.mz_all[scan_idx]).mean()
            bin_edges = np.append(self.mz_all[scan_idx],
                                  self.mz_all[scan_idx][-1] + shift)
        else:
            bin_edges = None
        for spectrum in reader:
            if spectrum['scanList']['scan'][0][
                    'filter string'] == scantype:
                x = spectrum['m/z array']
                try:
                    y = spectrum['intensity array']
                except KeyError:
                    raise KeyError
                if bin_edges is None:
                    yi = np.interp(
                        self.mz_all[scan_idx], x, y, 0,
                        0)  # Re-interpolate the data in profiled mode
                else:
                    yi, _ = np.histogram(
                        x, bins=bin_edges, weights=y
                    )  # Re-histogram the data in centroided mode
                xidx = np.nonzero(
                    self.x_pos == self.coordinates[spectrumid, 0])[0]
                yidx = np.nonzero(
                    self.y_pos == self.coordinates[spectrumid, 1])[0]
                try:
                    self.data[scan_idx][xidx, yidx, :] = yi
                except:
                    # NOTE(review): bare except -- any failure to store the
                    # spectrum is only logged at debug level.
                    log_helper.debug(__name__, spectrumid, scan_idx,
                                     scantype, self.mz_all[scan_idx].shape)
                # TODO Note if the data is expected to be of float precision then self.data_type needs to be set accordingly
                if spectrumid % 1000 == 0:
                    log_helper.info(
                        __name__,
                        'Processed data for %s spectra to datacube for scan type %s'
                        % (spectrumid, scantype))
                # NOTE(review): the original indentation was lost; the
                # counter is assumed to advance only for matching spectra.
                # Confirm against how self.coordinates is indexed.
                spectrumid += 1
def load_from_mzml(filename: str,
                   ms_level: int = 2,
                   metadata_harmonization: bool = True
                   ) -> Generator[Spectrum, None, None]:
    """Yield a ~matchms.Spectrum for each spectrum of the requested ms level.

    Spectra of other ms levels and spectra without any peak are skipped.
    Peaks are returned sorted by m/z. Use pyteomics or pymzml directly if
    more extensive parsing options are needed.

    Example:

    .. code-block:: python

        from matchms.importing import load_from_mzml

        file_mzml = "testdata.mzml"
        spectrums = list(load_from_mzml(file_mzml))

    Parameters
    ----------
    filename:
        Filename for mzml file to import.
    ms_level:
        Specify which ms level to import. Default is 2.
    metadata_harmonization : bool, optional
        Set to False if metadata harmonization to default keys is not desired.
        The default is True.
    """
    for raw in mzml.read(filename, dtype=dict):
        if "ms level" not in raw or raw["ms level"] != ms_level:
            continue

        metadata = parse_mzml_mzxml_metadata(raw)
        mz = numpy.asarray(raw["m/z array"], dtype="float")
        intensities = numpy.asarray(raw["intensity array"], dtype="float")

        if mz.shape[0] <= 0:
            continue

        # Reorder both arrays together, but only when m/z is not ascending.
        already_sorted = numpy.all(mz[:-1] <= mz[1:])
        if not already_sorted:
            order = numpy.argsort(mz)
            mz, intensities = mz[order], intensities[order]

        yield Spectrum(mz=mz,
                       intensities=intensities,
                       metadata=metadata,
                       metadata_harmonization=metadata_harmonization)
def mZML_reader(filetoopen):
    """Load the scan with id '1' from an mzML file for display in the GUI.

    Returns ``(X, Y, ave_noise, Xscalar, filetoopen)`` where X and Y are
    lists of ints: m/z divided by ``Xscalar`` and the intensities, both
    restricted to points with intensity > 0. ``ave_noise`` is the mean of
    the intensities whose integer value is <= 10.
    """
    app.queueFunction(app.setStatusbarWidth, len("Loading file..."), field=2)
    app.queueFunction(app.setStatusbar, "Loading file...", 2)
    data = mzml.read(filetoopen)
    #load only the first scan
    # NOTE(review): if no scan has id '1', X and Y stay unbound and the code
    # below fails with a NameError. The reader is not closed explicitly.
    for scan in data:
        if scan['id'] == '1':
            X = scan['m/z array']
            Y = scan['intensity array']
            # NOTE(review): original indentation was lost; the break is
            # assumed to belong to the id check.
            break
    # Smallest non-zero spacing between neighbouring m/z values, rounded to
    # two decimals.
    Xscalar = np.around(
        min([
            X[i + 1] - X[i] for i in range(0, len(X) - 1)
            if X[i + 1] - X[i] != 0.0
        ]), 2
    )  # no Xscalar is encoded in this data - and values have been filtered out - so need to guess
    if Xscalar > 1:
        Xscalar = 0.2  # fix if the estimator is way off
    try:
        XY = [(X[i] / Xscalar, Y[i]) for i in range(len(X))
              if Y[i] > 0]  # remove 0 values and adjust for scalar
        X, Y = ([int(XY[x][0]) for x in range(len(XY))],
                [int(XY[x][1]) for x in range(len(XY))])
        ave_noise = np.average([
            float(XY[j][1]) for j in range(len(XY)) if int(XY[j][1]) <= 10
        ])  # determine average noise
    except:  # if all else fails set scalar to 1 - so we can open it
        # Same computation as above, repeated with Xscalar forced to 1.
        Xscalar = 1
        XY = [(X[i] / Xscalar, Y[i]) for i in range(len(X))
              if Y[i] > 0]  # remove 0 values and adjust for scalar
        X, Y = ([int(XY[x][0]) for x in range(len(XY))],
                [int(XY[x][1]) for x in range(len(XY))])
        ave_noise = np.average([
            float(XY[j][1]) for j in range(len(XY)) if int(XY[j][1]) <= 10
        ])  # determine average noise
    # Publish the loading progress through the module-level variable.
    global progress
    progress = 50
    app.registerEvent(updatprogress)
    return (X, Y, ave_noise, Xscalar, filetoopen)
def get_data(self, ms_level):
    """Return the spectra of one ms level, intensity-filtered and m/z-sorted.

    For every spectrum of ``ms_level`` read from ``self.input_mzml_path``
    the peaks below ``self.min_intensity`` are dropped and the remaining
    peaks are ordered by m/z. The ion mobility array, if present, is kept in
    step with the other two arrays. The spectrum dictionaries are modified
    in place and returned as a list.
    """
    mobility_key = 'mean inverse reduced ion mobility array'

    def _reindex(scan, selector):
        # Apply one mask or index array to all peak-aligned arrays.
        scan['intensity array'] = scan['intensity array'][selector]
        scan['m/z array'] = scan['m/z array'][selector]
        if mobility_key in scan:
            scan[mobility_key] = scan[mobility_key][selector]

    selected = []
    for scan in mzml.read(self.input_mzml_path):
        if scan['ms level'] != ms_level:
            continue
        _reindex(scan, scan['intensity array'] >= self.min_intensity)
        _reindex(scan, np.argsort(scan['m/z array']))
        selected.append(scan)
    return selected
def main():
    """Command-line entry point: read an mzML file and report the throughput.

    Prints the structure of the first spectrum, a progress line every 1000
    spectra and, at the end, the number of spectra read, the elapsed time
    and the spectra per second.
    """
    parser = argparse.ArgumentParser(
        description='Creates an index for an MSP spectral library file')
    parser.add_argument('--mzml_file',
                        action='store',
                        help='Name of the mzML file to read')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s 0.5')
    params = parser.parse_args()
    mzml_path = params.mzml_file

    #### Ensure that mzml_file was passed
    if not mzml_path:
        print(
            'ERROR: Parameter --mzml_file must be provided. See --help for more information'
        )
        return
    if not os.path.isfile(mzml_path):
        print(f"ERROR: File '{mzml_path}' not found or not a file")
        return

    #### Read spectra from the file
    started = timeit.default_timer()
    n_read = 0
    with mzml.read(mzml_path) as reader:
        for n_read, spectrum in enumerate(reader, 1):
            if n_read == 1:
                auxiliary.print_tree(spectrum)
            #### Print progress
            if n_read % 1000 == 0:
                print(f" {n_read}")

    #### Print final timing information
    elapsed = timeit.default_timer() - started
    print(f"INFO: Read {n_read} spectra from {mzml_path}")
    print(f"INFO: Elapsed time: {elapsed}")
    print(f"INFO: Processed {n_read/elapsed} spectra per second")
def __compute_scan_types_and_indices(self, filename=None):
    """
    Internal helper function used to compute a list of unique scan types in the mzml file.
    Also computes, for every scan, the index of the scan type (datacube) it belongs to.

    :param filename: Path of the mzML file.

    :returns: Tuple ``(scantypes, scan_indices, scan_profiled)``: the unique
        filter strings in order of first appearance, a list with one
        scan-type index per spectrum, and one bool per scan type telling
        whether its first spectrum is flagged 'profile spectrum'.
    :raises AssertionError: If the number of indexed spectra differs from
        ``self.num_scans``.
    """
    scantypes = []
    scan_indices = []
    scan_profiled = []
    type_index = {}  # filter string -> position in scantypes
    with mzml.read(filename) as reader:
        for idx, spectrum in enumerate(reader):
            try:
                scanfilter = spectrum['scanList']['scan'][0]['filter string']
            except (KeyError, IndexError, TypeError):
                # Spectrum without a filter string: log it and move on.
                log_helper.debug(__name__, idx)
                continue
            if scanfilter not in type_index:
                type_index[scanfilter] = len(scantypes)
                scantypes.append(scanfilter)
                # ``in`` works on Python 2 and 3. ``has_key`` failed on
                # Python 3 after scantypes had already been extended.
                scan_profiled.append('profile spectrum' in spectrum)
            scan_indices.append(type_index[scanfilter])
    if len(scan_indices) != self.num_scans:
        raise AssertionError(
            'Indexed %d scans but expected %d'
            % (len(scan_indices), self.num_scans))
    return scantypes, scan_indices, scan_profiled
def load_data(self, path_to_file, min_peak_th=10, data_type='ups1'):
    """Loading experimental data from *.mzML file

    Builds one ``Spectrum`` record per spectrum of the file and keeps those
    whose intensity array has at least ``min_peak_th`` entries in
    ``self.spectrum_collection``.

    :param path_to_file: path of the mzML file
    :param min_peak_th: minimum length of a record's intensity array
    :param data_type: key into ``mzml_params`` selecting the expressions
        used to extract the fields of each spectrum
    """
    self.spectrum_collection = []
    # print(eval(mzml_params[data_type]['scan_id']))
    with mzml.read(path_to_file, dtype=dict) as spectra:
        # NOTE(review): the eval'd expressions run in this scope and
        # presumably refer to the local names used here (e.g. ``spectrum``,
        # ``spectrum_id``) -- do not rename them without checking
        # ``mzml_params``.
        # NOTE(review): eval executes arbitrary code; ``mzml_params`` must
        # never come from untrusted input.
        for spectrum_id, spectrum in enumerate(spectra):
            spectrum_record = Spectrum(
                path_to_file,  # path to file
                eval(mzml_params[data_type]['scan_id']),  #scan id
                eval(mzml_params[data_type]['mz_array']),  # mz array
                eval(mzml_params[data_type]
                     ['intensity_array']),  # intensity array
                eval(mzml_params[data_type]['charge']),  # charge
                eval(mzml_params[data_type]
                     ['precursor_mass']),  # precursor mass
                self.max_peak,
                self.remove_precursor_peak,
                self.remove_precursor_tolerance)
            if len(spectrum_record.intensity_array) >= min_peak_th:
                self.spectrum_collection.append(spectrum_record)
    # NOTE(review): original indentation was lost; this call is assumed to
    # run once after all spectra have been collected.
    self.set_spectrum_idx()
def is_valid_dataset(cls, name):
    """Check whether the given file or directory points to a img file.

    A directory is valid if ``cls.get_files_from_dir`` finds at least one
    file in it. A file is valid if it can be opened and iterated completely
    as mzML.

    :param name: Name of the dir or file.
    :type name: String

    :returns: Boolean indicating whether the given file or folder is a valid img file.
    """
    if os.path.isdir(name):
        # If we point to a directory, check if the dir contains an mzML file
        return len(cls.get_files_from_dir(name)) > 0
    try:
        # Try to open the file and iterate over it; the context manager
        # closes it again even when parsing fails half-way.
        with mzml.read(name) as reader:
            for _ in reader:
                pass
    except Exception:
        # Any read/parse problem means "not valid"; KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return False
    return True
def __compute_scan_types_and_indices(self, filename=None):
    """
    Internal helper function used to compute a list of unique scan types in the mzml file.
    Also computes, for every scan, the index of the scan type (datacube) it belongs to.

    :param filename: Path of the mzML file.

    :returns: Tuple ``(scantypes, scan_indices, scan_profiled)``: the unique
        filter strings in order of first appearance, a list with one
        scan-type index per spectrum, and one bool per scan type telling
        whether its first spectrum is flagged 'profile spectrum'.
    :raises AssertionError: If the number of indexed spectra differs from
        ``self.num_scans``.
    """
    scantypes = []
    scan_indices = []
    scan_profiled = []
    type_index = {}  # filter string -> position in scantypes
    with mzml.read(filename) as reader:
        for idx, spectrum in enumerate(reader):
            try:
                scanfilter = spectrum['scanList']['scan'][0]['filter string']
            except (KeyError, IndexError, TypeError):
                # Spectrum without a filter string: log it and move on.
                log_helper.debug(__name__, idx)
                continue
            if scanfilter not in type_index:
                type_index[scanfilter] = len(scantypes)
                scantypes.append(scanfilter)
                # ``in`` works on Python 2 and 3. ``has_key`` failed on
                # Python 3 after scantypes had already been extended.
                scan_profiled.append('profile spectrum' in spectrum)
            scan_indices.append(type_index[scanfilter])
    if len(scan_indices) != self.num_scans:
        raise AssertionError(
            'Indexed %d scans but expected %d'
            % (len(scan_indices), self.num_scans))
    return scantypes, scan_indices, scan_profiled
def __compute_coordinates(self, filename, num_scans):
    """
    Internal helper function used to compute the coordinates for each scan.

    The file is read as plain text. Every line that contains ``location="``
    and a ``_<digits>x_<digits>y_`` token contributes one (x, y) row, in
    file order.

    :param filename: Path of the mzML file.
    :param num_scans: Number of rows to allocate in the result.

    :returns: 2D numpy integer array of shape (numScans,2) indicating for each scan its x and y coordinate
    """
    coord_pattern = re.compile(r'_[0-9]+x_[0-9]+y_')
    coords = np.zeros(shape=(num_scans, 2), dtype='uint32')
    spectrumid = 0
    # Only the raw text is needed, so no mzML reader is opened here.
    with open(filename, 'r') as origin_file:
        for line in origin_file:
            if 'location="' not in line:
                continue
            m = coord_pattern.search(line)
            if m is None:
                continue
            # "_12x_34y_" -> ["12x", "34y"]; drop the trailing axis letter.
            x_token, y_token = m.group().strip('_').split('_')
            coords[spectrumid, 0] = int(x_token[:-1])
            coords[spectrumid, 1] = int(y_token[:-1])
            spectrumid += 1
    return coords
def __read_all(self):
    """
    Internal helper function used to read all data into a single datacube.

    Every spectrum is linearly interpolated onto ``self.mz_all[0]``
    (intensities outside the measured range become 0) and stored at the
    x/y position given by ``self.coordinates`` for that spectrum.

    :returns: numpy array of shape ``self.shape`` holding the interpolated
        spectra.
    """
    data = np.zeros(self.shape)
    # This single-cube reader has no per-scan-type loop, so ``scan_idx`` is
    # undefined here. Axis 0 is used, as in ``spectrum_iter``.
    mz_axis = self.mz_all[0]
    with mzml.read(self.basename) as reader:
        for spectrumid, spectrum in enumerate(reader):
            x = spectrum['m/z array']
            y = spectrum['intensity array']
            # Re-interpolate the data in profiled mode
            yi = np.interp(mz_axis, x, y, 0, 0)
            # TODO Note if the data is expected to be of float precision then self.data_type needs to be set accordingly
            data[self.coordinates[spectrumid, 0],
                 self.coordinates[spectrumid, 1], :] = yi
    return data
def __compute_coordinates(self):
    """
    Internal helper function used to compute the coordinates for each scan.

    For the 'thermo' file type the coordinates are parsed from the last comma-separated
    field of each spectrum's ``spotID`` entry, split at ``x``. For the 'bruker' file type
    they are the third and fourth integer found in the part of the spectrum ``id`` that
    follows ``_x002f_``. For any other file type the array is returned zero-filled.

    :returns: 2D numpy integer array of shape (numScans,2) indicating for each scan its x and y coordinate
    """
    reader = mzml.read(self.basename)
    coords = np.zeros(shape=(self.num_scans, 2), dtype='uint32')
    if self.mzml_type == self.available_mzml_types['thermo']:
        for spectrumid, spectrum in enumerate(reader):
            spotid = spectrum['spotID']
            # Build a real list: on Python 3 ``map`` returns a lazy iterator that numpy
            # cannot assign to an array slice.
            coords[spectrumid, :] = [int(value) for value in spotid.split(',')[-1].split('x')]
    elif self.mzml_type == self.available_mzml_types['bruker']:
        number_pattern = re.compile(r'\d+')
        for spectrumid, spectrum in enumerate(reader):
            spotdesc = spectrum['id'].split('_x002f_')[1]
            numbers = number_pattern.findall(spotdesc)
            coords[spectrumid, 0] = int(numbers[2])
            coords[spectrumid, 1] = int(numbers[3])
    return coords
def is_valid_dataset(cls, name):
    """Check whether the given file or directory points to a img file.

    A directory is valid when ``cls.get_files_from_dir`` finds at least one file in it.
    A file is valid when it can be opened with ``mzml.read`` and iterated to the end
    without an error.

    :param name: Name of the dir or file.
    :type name: String

    :returns: Boolean indicating whether the given file or folder is a valid img file.
    """
    if os.path.isdir(name):
        # If we point to a directory, check if the dir contains an mzML file
        return len(cls.get_files_from_dir(name)) > 0

    try:
        # Try to open the file and iterate over it
        reader = mzml.read(name)
        for _ in reader:
            pass
        del reader
        return True
    except Exception:
        # Any read or parse failure means "not valid"; KeyboardInterrupt and SystemExit
        # are deliberately allowed to propagate.
        return False
def average_ms1(input_filename, output_filename=None, bin_width=1.0, format="csv"):
    """Collect the peaks of all non-MS2 spectra of a file and bin them with ``vectorize_peaks``.

    :param input_filename: Path of a ``.mzXML`` or ``.mzML`` file (the extension is matched
        case-sensitively).
    :param output_filename: Optional path; when given, the result is appended to it as CSV.
    :param bin_width: Bin width forwarded to ``vectorize_peaks``.
    :param format: Currently unused; kept so existing callers keep working.

    :returns: The value returned by ``vectorize_peaks(peaks, 2000, bin_width)``.

    :raises ValueError: If the file extension is neither ``.mzXML`` nor ``.mzML``.
    """
    _, file_extension = os.path.splitext(input_filename)
    if file_extension == ".mzXML":
        spectra = mzxml.read(input_filename, read_schema=True)
    elif file_extension == ".mzML":
        spectra = mzml.read(input_filename, read_schema=True)
    else:
        raise ValueError(
            "Unsupported file extension %r; expected .mzXML or .mzML" % (file_extension,))

    peaks_list = []
    for element in spectra:
        # mzML spectra carry "ms level", mzXML spectra "msLevel"; "ms level" wins when both
        # exist. Looking it up per spectrum avoids reusing the level of the previous
        # spectrum; a spectrum without any level is treated as non-MS2.
        mslevel = element.get("ms level", element.get("msLevel"))
        if mslevel != 2:
            # zip() yields fresh scalar pairs, so the source arrays need not be copied first.
            peaks_list.extend(zip(element['m/z array'], element['intensity array']))

    numpy_vector = vectorize_peaks(peaks_list, 2000, bin_width)

    if output_filename is not None:
        pd.DataFrame(data=numpy_vector).to_csv(output_filename, mode='a', index=True)

    return numpy_vector
def __compute_coordinates(self):
    """
    Internal helper function used to compute the coordinates for each scan.

    For the 'thermo' file type the coordinates are parsed from the last comma-separated
    field of each spectrum's ``spotID`` entry, split at ``x``. For the 'bruker' file type
    they are the third and fourth integer found in the part of the spectrum ``id`` that
    follows ``_x002f_``. For any other file type the array is returned zero-filled.

    :returns: 2D numpy integer array of shape (numScans,2) indicating for each scan its x and y coordinate
    """
    reader = mzml.read(self.basename)
    coords = np.zeros(shape=(self.num_scans, 2), dtype='uint32')
    if self.mzml_type == self.available_mzml_types['thermo']:
        for spectrumid, spectrum in enumerate(reader):
            spotid = spectrum['spotID']
            # Build a real list: on Python 3 ``map`` returns a lazy iterator that numpy
            # cannot assign to an array slice.
            coords[spectrumid, :] = [int(value) for value in spotid.split(',')[-1].split('x')]
    elif self.mzml_type == self.available_mzml_types['bruker']:
        number_pattern = re.compile(r'\d+')
        for spectrumid, spectrum in enumerate(reader):
            spotdesc = spectrum['id'].split('_x002f_')[1]
            numbers = number_pattern.findall(spotdesc)
            coords[spectrumid, 0] = int(numbers[2])
            coords[spectrumid, 1] = int(numbers[3])
    return coords
def __compute_num_scans(filename=None):
    """
    Internal helper function used to compute the number of scans in the mzml file.

    :param filename: Path of the mzML file, passed straight to ``mzml.read``.

    :returns: Number of spectra yielded by the reader.
    """
    # Use the reader as a context manager so the file is released as soon as counting is done.
    with mzml.read(filename) as reader:
        return sum(1 for _ in reader)
def ingest_mzML(input_filename):
    """Ingest an mzML file given its name and return a dataframe with one row per spectrum.

    The following entries are copied from each spectrum when present:
    "highest observed m/z", "ms level", "total ion current", "lowest observed m/z",
    "base peak intensity" and "base peak m/z". When a "spectrum title" is present it is
    split into the columns "File", "controllerType", "controllerNumber" and "scan".

    Columns appear in the order in which they are first seen. A spectrum that lacks an
    entry gets a missing value in that column, so rows stay aligned even when spectra do
    not all carry the same entries.

    NOTE(review): earlier documentation also mentioned mzXML, but the file is always
    opened with ``mzml.read``.

    :param input_filename: Path of the file to read.

    :returns: pandas DataFrame with one row per spectrum.

    :raises IndexError, ValueError: If a "spectrum title" does not follow the layout
        shown below.
    """
    # Layout of a "spectrum title" this parser expects:
    #   exp1720-04-ds259269.3.3. File:"exp1720-04-ds259269.raw",
    #   NativeID:"controllerType=0 controllerNumber=1 scan=3"
    leading_fields = (
        ("highest observed m/z", float),
        ("ms level", int),
        ("total ion current", float),
        ("lowest observed m/z", float),
    )
    trailing_fields = (
        ("base peak intensity", float),
        ("base peak m/z", float),
    )

    columns = []  # column names in first-seen order
    rows = []

    def put(row, name, value):
        # Store one cell and remember the column the first time it shows up.
        row[name] = value
        if name not in columns:
            columns.append(name)

    with mzml.read(input_filename) as reader:
        for item in reader:
            row = {}
            for name, convert in leading_fields:
                if name in item:
                    put(row, name, convert(item[name]))

            if "spectrum title" in item:
                title = item["spectrum title"]
                put(row, "File", str(title.split("File:\"")[1].split("\",")[0]))
                put(row, "controllerType", int(title.split("controllerType=")[1].split(" ")[0]))
                put(row, "controllerNumber", int(title.split("controllerNumber=")[1].split(" ")[0]))
                put(row, "scan", int(title.split("scan=")[1].split("\"")[0]))

            for name, convert in trailing_fields:
                if name in item:
                    put(row, name, convert(item[name]))

            rows.append(row)

    return pd.DataFrame(rows, columns=columns)
def pick_peaks(mzml_file):
    """Process the first scan of an mzML file with ``process_mzml_scan``.

    :param mzml_file: Path of the mzML file, passed to ``mzml.read``.

    :returns: Whatever ``process_mzml_scan(scan, savgol_window_length=7)`` returns for the
        first scan of the file.

    :raises StopIteration: If the file contains no scans.
    """
    # Only the first scan is needed, so close the reader right after fetching it.
    with mzml.read(mzml_file) as reader:
        scan = next(reader)
    return process_mzml_scan(scan, savgol_window_length=7)
def load_mzml_file(filename, drop_ms1=False):
    """Read an mzML file and convert its MS1 and MS2 spectra into ``Spectrum`` objects.

    For every spectrum the peaks are collected as ``[m/z, intensity]`` float pairs and the
    scan number is parsed from the ``scan=`` token of the spectrum id (``-1`` when absent).
    For MS2 spectra the first precursor supplies collision energy, precursor m/z,
    intensity and charge. Precursor intensity, precursor charge and total ion current
    fall back to ``0`` when they are missing or not convertible.

    :param filename: Path of the mzML file.
    :param drop_ms1: When true, MS1 spectra are left out of the result.

    :returns: List of ``Spectrum`` objects in file order.

    :raises KeyError: If an MS2 spectrum lacks a precursor, a collision energy or a
        selected ion m/z.
    """
    output_ms1 = []
    output_ms2 = []
    # Errors that mean "optional value missing or not convertible".
    missing_value_errors = (KeyError, IndexError, TypeError, ValueError)

    for spectrum in pyteomicsmzml.read(filename):
        ms_level = spectrum["ms level"]
        index = int(spectrum["index"])
        peaks = [
            [float(mz), float(intensity)]
            for mz, intensity in zip(spectrum["m/z array"], spectrum["intensity array"])
        ]

        # Determining scan: the last id token containing "scan=" wins.
        scan = -1
        for id_split in spectrum["id"].split(" "):
            if "scan=" in id_split:
                scan = int(id_split.replace("scan=", ""))

        if ms_level == 1 and not drop_ms1:
            output_ms1.append(Spectrum(filename, scan, index, peaks, 0, 0, ms_level))

        if ms_level == 2:
            precursor = spectrum["precursorList"]["precursor"][0]
            activation = precursor["activation"]
            collision_energy = float(activation["collision energy"])
            selected_ion = precursor["selectedIonList"]["selectedIon"][0]
            precursor_mz = float(selected_ion["selected ion m/z"])

            try:
                precursor_intensity = float(selected_ion["peak intensity"])
            except missing_value_errors:
                precursor_intensity = 0

            try:
                precursor_charge = int(selected_ion["charge state"])
            except missing_value_errors:
                precursor_charge = 0

            try:
                totIonCurrent = float(spectrum["total ion current"])
            except missing_value_errors:
                totIonCurrent = 0

            if "beam-type collision-induced dissociation" in activation:
                fragmentation_method = "HCD"
            else:
                fragmentation_method = "NO_FRAG"

            output = Spectrum(
                filename,
                scan,
                index,
                peaks,
                precursor_mz,
                precursor_charge,
                ms_level,
                collision_energy=collision_energy,
                fragmentation_method=fragmentation_method,
                precursor_intensity=precursor_intensity,
                totIonCurrent=totIonCurrent
            )
            # NOTE(review): MS2 spectra are stored in output_ms1, so output_ms2 stays empty
            # and the result keeps file order. Appending to output_ms2 would reorder the
            # returned list (all MS1 first); confirm the intent before changing this.
            output_ms1.append(output)

    return output_ms1 + output_ms2
def size(cls, name, max_num_reads=1000):
    """
    Classmethod used to check the estimated size for the given file/folder.
    For mzml this is an estimate of the final size of the full 3D datacube.

    The number of scans is taken from the ``spectrumList count="..."`` attribute if it
    occurs within the first 120 lines of the file. Otherwise it is estimated as the file
    size divided by the mean number of bytes consumed per spectrum over the first
    ``max_num_reads`` spectra.

    NOTE(review): the returned value is ``num_scans * len(mz_axis)``, i.e. a number of
    values; it is not multiplied by an item size.

    :param name: Name of the dir or file.
    :type name: unicode
    :param max_num_reads: The maximum number of spectrum reads to be performed to estimate the file size
    :type max_num_reads: int

    :returns: Integer indicating the estimated size, or None if unknown (no file found,
        or no scan-size estimate could be made).
    """
    basename = None
    if os.path.isdir(name):
        # If we point to a directory, check if the dir contains an mzML file
        filelist = cls.get_files_from_dir(name)
        if len(filelist) > 0:
            basename = filelist[0]
    else:
        basename = name
    if basename is None:
        return None

    num_scans = -1
    # Try to compute the number of scans by looking at the spectrumList count entry in the
    # file. The header is read directly rather than through a shell pipeline, so unusual
    # file names are safe and no external tools are needed.
    try:
        with open(basename, 'r') as header:
            for line_number, line in enumerate(header):
                if line_number >= 120:
                    break
                if 'spectrumList count=' in line:
                    size_text = line.split('spectrumList count=')[1].split('"')[1]
                    if size_text.isdigit():
                        num_scans = int(size_text)
                    break
    except (IOError, OSError, IndexError, ValueError):
        # Best effort only: an unreadable or oddly formatted header falls through to the
        # estimate below (ValueError also covers text decoding errors).
        pass

    if num_scans < 0:
        # Estimate the number of scans by reading the first max_num_reads spectra
        prev_tell = 0
        sizes = []
        reader = mzml.read(basename)
        for index, _ in enumerate(reader):
            if index >= max_num_reads:
                break
            current_tell = reader.file.file.tell()
            sizes.append(current_tell - prev_tell)
            prev_tell = current_tell
        if not sizes:
            # No spectrum could be read, so there is nothing to base an estimate on.
            return None
        # Mean bytes per spectrum. Half the range (max - min) / 2 is not a size and is
        # zero whenever all sampled scans are equally large.
        scansize = float(np.asarray(sizes).mean())
        if scansize <= 0:
            return None
        filesize = os.stat(basename).st_size
        num_scans = int(filesize / scansize)

    mz_axis_len = cls.__compute_mz_axis(
        filename=basename,
        mzml_filetype=cls.__compute_filetype(filename=basename),
        scan_types=cls.__compute_scan_types(filename=basename)).shape[0]
    return num_scans * mz_axis_len
def yield_spectrum(mzml_path):
    """Yield every spectrum of an mzML file, one at a time.

    :param mzml_path: Path of the mzML file, passed to ``mzml.read``.

    :returns: Generator over the spectra produced by the reader.
    """
    # The context manager closes the file when the generator is exhausted or closed early.
    with mzml.read(mzml_path) as reader:
        for spectrum in reader:
            yield spectrum
def size(cls, name, max_num_reads=1000):
    """
    Classmethod used to check the estimated size for the given file/folder.
    For mzml this is an estimate of the final size of the full 3D datacube.

    The number of scans is taken from the ``spectrumList count="..."`` attribute if it
    occurs within the first 120 lines of the file. Otherwise it is estimated as the file
    size divided by the mean number of bytes consumed per spectrum over the first
    ``max_num_reads`` spectra.

    NOTE(review): the returned value is ``num_scans * len(mz_axis)``, i.e. a number of
    values; it is not multiplied by an item size.

    :param name: Name of the dir or file.
    :type name: unicode
    :param max_num_reads: The maximum number of spectrum reads to be performed to estimate the file size
    :type max_num_reads: int

    :returns: Integer indicating the estimated size, or None if unknown (no file found,
        or no scan-size estimate could be made).
    """
    basename = None
    if os.path.isdir(name):
        # If we point to a directory, check if the dir contains an mzML file
        filelist = cls.get_files_from_dir(name)
        if len(filelist) > 0:
            basename = filelist[0]
    else:
        basename = name
    if basename is None:
        return None

    num_scans = -1
    # Try to compute the number of scans by looking at the spectrumList count entry in the
    # file. The header is read directly rather than through a shell pipeline, so unusual
    # file names are safe and no external tools are needed.
    try:
        with open(basename, 'r') as header:
            for line_number, line in enumerate(header):
                if line_number >= 120:
                    break
                if 'spectrumList count=' in line:
                    size_text = line.split('spectrumList count=')[1].split('"')[1]
                    if size_text.isdigit():
                        num_scans = int(size_text)
                    break
    except (IOError, OSError, IndexError, ValueError):
        # Best effort only: an unreadable or oddly formatted header falls through to the
        # estimate below (ValueError also covers text decoding errors).
        pass

    if num_scans < 0:
        # Estimate the number of scans by reading the first max_num_reads spectra
        prev_tell = 0
        sizes = []
        reader = mzml.read(basename)
        for index, _ in enumerate(reader):
            if index >= max_num_reads:
                break
            current_tell = reader.file.file.tell()
            sizes.append(current_tell - prev_tell)
            prev_tell = current_tell
        if not sizes:
            # No spectrum could be read, so there is nothing to base an estimate on.
            return None
        # Mean bytes per spectrum. Half the range (max - min) / 2 is not a size and is
        # zero whenever all sampled scans are equally large.
        scansize = float(np.asarray(sizes).mean())
        if scansize <= 0:
            return None
        filesize = os.stat(basename).st_size
        num_scans = int(filesize / scansize)

    mz_axis_len = cls.__compute_mz_axis(
        filename=basename,
        mzml_filetype=cls.__compute_filetype(filename=basename),
        scan_types=cls.__compute_scan_types(filename=basename)).shape[0]
    return num_scans * mz_axis_len
def __compute_num_scans(filename=None):
    """
    Internal helper function used to compute the number of scans in the mzml file.

    :param filename: Path of the mzML file, passed straight to ``mzml.read``.

    :returns: Number of spectra yielded by the reader.
    """
    # Use the reader as a context manager so the file is released as soon as counting is done.
    with mzml.read(filename) as reader:
        return sum(1 for _ in reader)
def load_mzml_file(filename, drop_ms1=False):
    """Read an mzML file and convert its MS1 and MS2 spectra into ``Spectrum`` objects.

    For every spectrum the peaks are collected as ``[m/z, intensity]`` float pairs and the
    scan number is parsed from the ``scan=`` token of the spectrum id (``-1`` when absent).
    For MS2 spectra the first precursor supplies collision energy, precursor m/z,
    intensity and charge. Precursor intensity, precursor charge and total ion current
    fall back to ``0`` when they are missing or not convertible.

    :param filename: Path of the mzML file.
    :param drop_ms1: When true, MS1 spectra are left out of the result.

    :returns: List of ``Spectrum`` objects in file order.

    :raises KeyError: If an MS2 spectrum lacks a precursor, a collision energy or a
        selected ion m/z.
    """
    output_ms1 = []
    output_ms2 = []
    # Errors that mean "optional value missing or not convertible".
    missing_value_errors = (KeyError, IndexError, TypeError, ValueError)

    for spectrum in pyteomicsmzml.read(filename):
        ms_level = spectrum["ms level"]
        index = int(spectrum["index"])
        peaks = [
            [float(mz), float(intensity)]
            for mz, intensity in zip(spectrum["m/z array"], spectrum["intensity array"])
        ]

        # Determining scan: the last id token containing "scan=" wins.
        scan = -1
        for id_split in spectrum["id"].split(" "):
            if "scan=" in id_split:
                scan = int(id_split.replace("scan=", ""))

        if ms_level == 1 and not drop_ms1:
            output_ms1.append(Spectrum(filename, scan, index, peaks, 0, 0, ms_level))

        if ms_level == 2:
            precursor = spectrum["precursorList"]["precursor"][0]
            activation = precursor["activation"]
            collision_energy = float(activation["collision energy"])
            selected_ion = precursor["selectedIonList"]["selectedIon"][0]
            precursor_mz = float(selected_ion["selected ion m/z"])

            try:
                precursor_intensity = float(selected_ion["peak intensity"])
            except missing_value_errors:
                precursor_intensity = 0

            try:
                precursor_charge = int(selected_ion["charge state"])
            except missing_value_errors:
                precursor_charge = 0

            try:
                totIonCurrent = float(spectrum["total ion current"])
            except missing_value_errors:
                totIonCurrent = 0

            if "beam-type collision-induced dissociation" in activation:
                fragmentation_method = "HCD"
            else:
                fragmentation_method = "NO_FRAG"

            output = Spectrum(
                filename,
                scan,
                index,
                peaks,
                precursor_mz,
                precursor_charge,
                ms_level,
                collision_energy=collision_energy,
                fragmentation_method=fragmentation_method,
                precursor_intensity=precursor_intensity,
                totIonCurrent=totIonCurrent
            )
            # NOTE(review): MS2 spectra are stored in output_ms1, so output_ms2 stays empty
            # and the result keeps file order. Appending to output_ms2 would reorder the
            # returned list (all MS1 first); confirm the intent before changing this.
            output_ms1.append(output)

    return output_ms1 + output_ms2
4.69519356e+03, 1.55343822e+04, 5.45621612e+03, 5.53939031e+03, 9.49732490e+03, 8.05000735e+03, 2.65457068e+03, 1.36766228e+04, 2.69348480e+03, 6.71802368e+03, 4.46828571e+02, 1.39065143e+04, 4.29267365e+03, 2.73782365e+03, 1.35373492e+03, 1.17601397e+03 ] charge_array = [ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -2, -2, -3, -2, -2, -2, -3, -3, -2, -2, -3, -3, -2, -3, -3, -2, -3, -3, -3, -3, -3, -5, -4, -3, -6 ] f = writer.MzMLWriter(open(path, 'wb')) with f: f.controlled_vocabularies() with f.element('run'): f.write_spectrum(mz_array, intensity_array, charge_array, id='scanId=1', params=[{ "name": "ms level", "value": 1 }], polarity='negative scan') spec = next(mzml.read(path)) assert (all(np.abs(spec['m/z array'] - mz_array) < 1e-4))