def median_filtering(filenames, scans_count, n_median, mzs_net, mz_len):
    """Read spectra from the given files, bin them onto `mzs_net`, and
    optionally median-combine every `n_median` consecutive spectra.

    NOTE(review): assumes `scans_count` was already divided by `n_median`
    by the caller (see the hint in the original source) -- confirm.

    Returns a (scans_count.sum(), mz_len) numpy array of binned spectra.
    """
    spectra_binned = np.zeros((scans_count.sum(), mz_len))
    out_row = 0
    if n_median == 1:
        # No median filtering: bin every spectrum as-is.
        for filename in filenames:
            reader = pymz.read(filename)
            print(filename)
            for tree in reader:
                spectra_binned[out_row, :] = binning(
                    tree['m/z array'], tree['intensity array'], mzs_net, mz_len)
                out_row += 1
    else:
        # Scratch buffer holding n_median binned spectra at a time.
        median_buffer = np.zeros((n_median, mz_len))
        for file_idx, filename in enumerate(filenames):
            reader = pymz.read(filename)
            print(filename)
            for _ in range(scans_count[file_idx]):
                for slot in range(n_median):
                    tree = next(reader)
                    median_buffer[slot, :] = binning(
                        tree['m/z array'], tree['intensity array'],
                        mzs_net, mz_len)
                # Element-wise median across the n_median consecutive spectra.
                spectra_binned[out_row, :] = np.median(median_buffer, axis=0)
                out_row += 1
    return spectra_binned
def load_control_data(args):
    """Load the control dataset named by args['control'].

    Accepts a single .mzXML file, a raw vendor file (the converted .mzXML
    name is derived from it), or a directory that is walked for .mzXML
    files (raw data is converted with msconvert first when none are found).
    Returns a list of spectra; exits the program when no control file was
    specified.
    """
    # load control dataset
    if args['control'] == '':
        # Fixed: was a Python-2-only print statement.
        print("No control file specified. Exiting BLANKA.")
        sys.exit(1)
    if args['control'].endswith('.mzXML'):
        return list(pytmzxml.read(args['control']))
    # NOTE(review): endswith('d') matches ANY name ending in 'd', not only
    # Bruker '.d' directories -- confirm whether '.d' was intended.
    elif args['control'].endswith('d') or args['control'].endswith('.RAW'):
        # Raw vendor data: read the converted .mzXML file instead.
        args['control'] = args['control'].split('.')[0] + '.mzXML'
        return list(pytmzxml.read(args['control']))
    else:
        # args['control'] is a directory: gather every .mzXML beneath it.
        control_files = [
            os.path.join(dirpath, files)
            for dirpath, dirnames, filenames in os.walk(args['control'])
            for files in filenames if files.endswith('.mzXML')
        ]
        if not control_files:
            # Nothing converted yet: detect raw data and run msconvert,
            # then rescan the directory.
            control_raw_list = raw_data_detection(args, args['control'])
            msconvert(args, control_raw_list)
            control_files = [
                os.path.join(dirpath, files)
                for dirpath, dirnames, filenames in os.walk(args['control'])
                for files in filenames if files.endswith('.mzXML')
            ]
        control_data = []
        for control in control_files:
            control_data += list(pytmzxml.read(control))
        print(len(control_data))
        return control_data
def MGF_generator(df, folder_mzxml, output_filename):
    """Export the scans referenced in `df` to `<output_filename>.mgf`.

    Each row names a source mzXML file (`Original_Peaklist`) and a scan
    (`ScanmzXML`); the spectrum is fetched and written with
    TITLE/CHARGE/PEPMASS/SCANS parameters.
    """
    spectra_to_write = []
    for _, row in df.iterrows():
        source_path = folder_mzxml + row['Original_Peaklist'] + '.mzXML'
        with mzxml.read(source_path) as reader:
            scan_key = str(row['ScanmzXML'])
            scan = reader[scan_key]
            title = str('File:' + row['Original_Peaklist'] + '.' +
                        str(row['ScanmzXML']) + ' "scan=' +
                        str(row['ScanId']) + '"')
            spectra_to_write.append({
                'params': {
                    'TITLE': title,
                    'CHARGE': str('1+'),  # fixed singly-charged assumption
                    'PEPMASS': str(row['ExpMz']),
                    'SCANS': str(row['ScanId'])
                },
                'm/z array': scan['m/z array'],
                'intensity array': scan['intensity array']
            })
    output_filename = output_filename + '.mgf'
    mgf.write(spectra_to_write, output_filename)
def read_mzxml(PATH, scanlist, event_scan, fname, output, soutput, newscanno, spec_outfile):
    """Scan an mzXML file; for each scan index present in `scanlist`, append
    a metadata line to `output` and write an MGF-style "BEGIN IONS" block
    to `soutput`.

    Returns the updated running scan number `newscanno`.
    """
    if os.path.isfile(PATH) and os.access(PATH, os.R_OK):
        with mzxml.read(PATH) as reader:
            for scanindex, spectrum in enumerate(reader):
                # Fixed: dict.has_key() is Python-2-only; `in` works on both.
                if scanindex in scanlist:
                    try:
                        pev = event_scan[fname][scanindex]['pev']
                        output.append("%s\t%d\t%s\t%s\t%s\n" %
                                      (pev, newscanno, spec_outfile,
                                       event_scan[fname][scanindex]['pseq'],
                                       event_scan[fname][scanindex]['etype']))
                        newscanno += 1
                        selected_ion = (spectrum['precursorList']['precursor'][0]
                                        ['selectedIonList']['selectedIon'][0])
                        charge = int(selected_ion['charge state'])
                        mz = selected_ion['selected ion m/z']
                        soutput.write(
                            "BEGIN IONS\nTITLE=controllerType=0 controllerNumber=1 scan=%d\nCHARGE=%d+\nPEPMASS=%s\n"
                            % (newscanno, charge, mz))
                        for x, y in zip(spectrum['m/z array'],
                                        spectrum['intensity array']):
                            soutput.write("%s %s\n" % (x, y))
                        soutput.write("END IONS\n\n")
                    except Exception:
                        # Narrowed from a bare `except:`; message previously
                        # said "mzML" although the input file is mzXML.
                        print("Error reading mzXML file")
    return newscanno
def readMs1Ms2(mzXmlPath):
    """Read an mzXML file into two dicts keyed by scan number.

    Returns:
        ms1SpecDict: {scan: [(mz, intensity), ...]} for MS1 scans.
        ms2SpecDict: {scan: [neutral precursor mass, peak list,
                             precursor MS1 scan number, charge,
                             precursor m/z]} for MS2 scans.
    """
    ms1SpecDict = {}
    ms2SpecDict = {}
    lastMs1Scan = 0  # kept for reference; precursorScanNum is used instead
    with mzxml.read(mzXmlPath) as reader:
        for spectrum in reader:
            scan_no = int(spectrum['num'])
            peaks = list(zip(spectrum['m/z array'],
                             spectrum['intensity array']))
            if spectrum['msLevel'] == 1:
                ms1SpecDict[scan_no] = peaks
                lastMs1Scan = scan_no
            if spectrum['msLevel'] == 2:
                precursor = spectrum['precursorMz'][0]
                charge = precursor['precursorCharge']
                pcMz = precursor['precursorMz']
                # Neutral mass: m/z * z minus z charge-carrier masses.
                neutralPcMass = pcMz * charge - ATOM_MASS['Z'] * charge
                ms1ScanNo = int(precursor['precursorScanNum'])
                ms2SpecDict[scan_no] = [neutralPcMass, peaks, ms1ScanNo,
                                        charge, pcMz]
    return ms1SpecDict, ms2SpecDict
def create_mzxml_to_csodiaq_dict(mzxmlFile):
    """Map each scan's identifying triple to its scan number.

    Key: (first precursor m/z rounded to 2 dp, second precursor m/z rounded
    to 2 dp, compensation voltage). Value: the scan's 'num' field.
    """
    mapping = {}
    with mzxml.read(mzxmlFile) as reader:
        for spectrum in reader:
            precursors = spectrum['precursorMz']
            key = (round(precursors[0]['precursorMz'], 2),
                   round(precursors[1]['precursorMz'], 2),
                   spectrum['compensationVoltage'])
            mapping[key] = spectrum['num']
    return mapping
def readPeaksFromXML(mzXmlPath):
    """Return {scan number: [precursor m/z * charge, [(mz, intensity), ...]]}
    for every spectrum in the mzXML file.

    NOTE(review): the first element is m/z times charge with no charge-carrier
    mass subtracted -- not a true neutral mass; confirm intended.
    """
    specDict = {}
    with mzxml.read(mzXmlPath) as reader:
        for spectrum in reader:
            precursor = spectrum['precursorMz'][0]
            mass_times_charge = (precursor['precursorMz'] *
                                 precursor['precursorCharge'])
            peaks = list(zip(spectrum['m/z array'],
                             spectrum['intensity array']))
            specDict[int(spectrum['num'])] = [mass_times_charge, peaks]
    return specDict
def get_mzxml(path, cutOff=100, digits=2):
    '''Generate a sequence of rounded and trimmed spectra from individual
    runs of the instrument.

    Parameters
    ----------
    path : str
        Path to the mzXML file.
    cutOff : int
        Intensity cut-off passed to trim_spectrum; falsy disables trimming.
    digits : int
        Rounding precision forwarded to round_spec.

    Yields
    ------
    (mz, intensity) tuples, one per spectrum.
    '''
    with mzxml.read(path) as reader:
        for spectrum in reader:
            mz = spectrum['m/z array']
            intensity = spectrum['intensity array']
            if cutOff:
                mz, intensity = trim_spectrum(mz, intensity, cutOff)
            # Bug fix: `digits` was accepted but never forwarded, so rounding
            # always used round_spec's own default. Assumes round_spec's
            # third parameter is the digit count (cf. the sibling get_mzxml
            # that passes prec_digits) -- TODO confirm.
            mz, intensity = round_spec(mz, intensity, digits)
            yield (mz, intensity)
def preprocess_sample(sample):
    """Read an mzXML sample (Python-2 code) and collect MS1/MS2 information.

    Returns (peak_min, peak_max, base_peaks, msI_list): floor/ceil of the
    smallest/largest MS2 fragment m/z observed, a dict of MS2 scan records
    keyed by scan number, and a dict of MS1 scan summaries keyed by scan
    number.
    """
    print "Reading " + sample
    scans = []
    r = mzxml.read(sample)
    # Exhaust the reader; the bare except ends the loop on StopIteration but
    # also silently swallows any parse error (Py2-style r.next()).
    while True:
        try:
            scans.append(r.next())
        except:
            break
    print str(len(scans)) + " scans found in " + sample
    base_peaks = {}
    all_peaks = []
    msI_list = {}
    for scan in scans:
        # msLevel is compared as a string here ('1'/'2'); other functions in
        # this file compare integers -- presumably this reader yields text.
        if scan['msLevel'] == '1':
            num = scan['num']
            mzs_MSI = scan['m/z array']
            intensities = scan['intensity array']
            msI_TIC = scan['totIonCurrent']
            msI_list[num] = {"num": num, "mzs": mzs_MSI, "TIC": msI_TIC}
        if scan['msLevel'] == '2':
            ms1_scan_num = int(scan['precursorMz'][0]['precursorScanNum'])
            base_mz = scan['precursorMz'][0]['precursorMz']
            precursor_intensity = scan['precursorMz'][0]['precursorIntensity']
            intensity_array = scan['intensity array'].tolist()
            # NOTE(review): `scans` is 0-indexed while precursorScanNum is a
            # scan number -- possible off-by-one; confirm scan numbering.
            msI_TIC = scans[ms1_scan_num]['totIonCurrent']
            ms2_TIC = scan['totIonCurrent']
            percentComposition = precursor_intensity / msI_TIC
            # Normalise fragment intensities by the scan's total ion current.
            for x in range(0, len(intensity_array)):
                intensity_array[x] = intensity_array[x] / float(ms2_TIC)
            mzs = scan['m/z array']
            num = int(scan['num'])
            # NOTE(review): overwrites the value computed above -- the first
            # percentComposition assignment is dead, and neither value is
            # stored in base_peaks below.
            percentComposition = ms2_TIC / msI_TIC
            base_peaks[num] = {
                "num": num,
                "base_mz": base_mz,
                "intensities": intensity_array,
                "mzs": mzs,
                "precursor_intensity": precursor_intensity,
                "MSI TIC": msI_TIC,
                "MS2 TIC": ms2_TIC
            }
            all_peaks = all_peaks + mzs.tolist()
    peak_min = int(math.floor(min(all_peaks)))
    peak_max = int(math.ceil(max(all_peaks)))
    # Get rid of really big variables...
    all_peaks = None
    scans = None
    r = None
    # Returns a list of MS2 spectra organized by scan #, and the largest and
    # smallest precursor peaks across all MS2 in this file.
    return peak_min, peak_max, base_peaks, msI_list
def readMs1(mzXmlPath):
    """Return {scan number: [(mz, intensity), ...]} for every spectrum in
    the mzXML file."""
    peaksByScan = {}
    with mzxml.read(mzXmlPath) as reader:
        for spectrum in reader:
            peaksByScan[int(spectrum['num'])] = list(
                zip(spectrum['m/z array'], spectrum['intensity array']))
    return peaksByScan
def preprocess_sample(sample, antiBase_dict, adduct_titles, cutoff_percent, scan_level):
    """
    Reads mzXML files in and compares their m/z values to those from antiBase file

    Collects per-scan (m/z, retention time) entries at the requested scan
    level and returns (antiBase_dict, filtered_spectra) where
    filtered_spectra maps scan number -> [m/z values..., retention time].
    """
    print("Reading " + sample)
    scans = []
    r = mzxml.read(sample)
    while True:
        try:
            # Bug fix: r.next() is Python-2-only; next(r) works on 2 and 3.
            scans.append(next(r))
        except Exception:
            # Narrowed from a bare except; still best-effort: stop reading
            # on exhaustion or any parse error.
            break
    print(str(len(scans)) + " scans found in " + sample)
    base_peaks = {}
    all_peaks = []
    filtered_spectra = {}
    # if you choose to compare at the level of MSI, this script will only take
    # peaks which have intensities that are a significant portion of the TIC
    # (significance is decided by variable cutoff_percent)
    if scan_level == 1:
        for scan in scans:
            if scan['msLevel'] == 1:
                num = int(scan['num'])
                RT = scan['retentionTime']
                intensity_array_MSI = scan['intensity array']
                mzs_MSI = scan['m/z array']
                total_TIC = scan['totIonCurrent']
                filtered_spectra_properties_list = []
                for i in range(0, len(mzs_MSI)):
                    if intensity_array_MSI[i] / total_TIC >= cutoff_percent:
                        filtered_spectra_properties_list.append(
                            float(mzs_MSI[i]))
                # Retention time is appended last, after the m/z values.
                filtered_spectra_properties_list.append(RT)
                filtered_spectra[num] = filtered_spectra_properties_list
    # if you compare at the level of MS-II, this only takes peaks which
    # generate MS2's
    if scan_level == 2:
        for scan in scans:
            if scan['msLevel'] == 2:
                num = int(scan['num'])
                RT = scan['retentionTime']
                parent_mz = scan['precursorMz'][0]['precursorMz']
                filtered_spectra_properties_list = []
                filtered_spectra_properties_list.append(parent_mz)
                filtered_spectra_properties_list.append(RT)
                filtered_spectra[num] = filtered_spectra_properties_list
    scans = None  # release the (potentially large) scan list
    print("Scans have been scanned")
    return antiBase_dict, filtered_spectra
def readMzXML(mzml_file, msLevel=1):
    """Yield (retention time in seconds, m/z array, intensity array,
    ms level, zero-based scan index) for every scan at `msLevel`."""
    scan_counter = 0
    with mzxml.read(mzml_file) as reader:
        for scan in reader:
            level = int(scan['msLevel'])
            rt_seconds = float(scan['retentionTime']) * 60
            scan_counter += 1
            # Compare the raw value, matching the original behavior.
            if scan['msLevel'] != msLevel:
                continue
            mz_values = scan['m/z array']
            intensity_values = scan['intensity array']
            assert mz_values.shape == intensity_values.shape
            yield rt_seconds, mz_values, intensity_values, level, scan_counter - 1
def ReadInputFile(file):
    """Read an mzXML file containing one experimental spectrum and return a
    list of tuples [(mass1, prob1), ..., (massN, probN)], where each
    probability is the peak intensity divided by the total intensity.
    """
    peak_probabilities = []
    with mzxml.read(file) as reader:
        for spectrum in reader:
            total_intensity = np.sum(spectrum['intensity array'])
            for idx in range(spectrum['peaksCount']):
                probability = spectrum['intensity array'][idx] / total_intensity
                peak_probabilities.append(
                    (spectrum['m/z array'][idx], probability))
        return peak_probabilities
def load_from_mzxml(filename: str, ms_level: int = 2,
                    metadata_harmonization: bool = True
                    ) -> Generator[Spectrum, None, None]:
    """Load spectrum(s) from mzXML file.

    This function will create ~matchms.Spectrum for every spectrum of desired
    ms_level found in a given MzXML file. For more extensive parsing options
    consider using the pyteomics package.

    Example:

    .. code-block:: python

        from matchms.importing import load_from_mzxml

        file_mzxml = "testdata.mzxml"
        spectrums = list(load_from_mzxml(file_mzxml))

    Parameters
    ----------
    filename:
        Filename for mzXML file to import.
    ms_level:
        Specify which ms level to import. Default is 2.
    metadata_harmonization : bool, optional
        Set to False if metadata harmonization to default keys is not desired.
        The default is True.
    """
    for pyteomics_spectrum in mzxml.read(filename, dtype=dict):
        # Accept whichever ms-level key this file provides ("ms level" or
        # "msLevel") and keep only spectra at the requested level.
        if ("ms level" in pyteomics_spectrum
                and pyteomics_spectrum["ms level"] == ms_level
                or "msLevel" in pyteomics_spectrum
                and pyteomics_spectrum["msLevel"] == ms_level):
            metadata = parse_mzml_mzxml_metadata(pyteomics_spectrum)
            mz = numpy.asarray(pyteomics_spectrum["m/z array"], dtype="float")
            intensities = numpy.asarray(
                pyteomics_spectrum["intensity array"], dtype="float")
            # Skip empty spectra entirely.
            if mz.shape[0] > 0:
                # Sort by mz (if not sorted already)
                if not numpy.all(mz[:-1] <= mz[1:]):
                    idx_sorted = numpy.argsort(mz)
                    mz = mz[idx_sorted]
                    intensities = intensities[idx_sorted]
                yield Spectrum(mz=mz,
                               intensities=intensities,
                               metadata=metadata,
                               metadata_harmonization=metadata_harmonization)
def load_control_data(args):
    """Locate control mzXML files and pair each one's first parsed spectrum
    with its path and the complete file list.

    Searches args['output'] (or args['sample'] when no output directory is
    set) for files starting with args['control'] and ending in '.mzXML'.
    Returns [[[first spectrum, path], all paths], ...] -- one entry per file.
    """
    # load control data files
    directory = args['sample'] if args['output'] == '' else args['output']
    mzxml_list = [
        os.path.join(dirpath, name)
        for dirpath, dirnames, filenames in os.walk(directory)
        for name in filenames
        if name.startswith(args['control']) and name.endswith('.mzXML')
    ]
    return [[[list(pytmzxml.read(path))[0], path], mzxml_list]
            for path in mzxml_list]
def preprocess_sample(sample):
    """Read an mzXML sample (Python-2 code) and collect MS2 scan records
    with log-normalised intensities.

    Returns (peak_min, peak_max, base_peaks): floor/ceil of the smallest
    and largest MS2 fragment m/z observed, and a dict of MS2 scan records
    keyed by scan number.
    """
    print "Reading " + sample
    scans = []
    r = mzxml.read(sample)
    # Exhaust the reader; the bare except ends the loop on StopIteration but
    # also silently swallows any parse error (Py2-style r.next()).
    while True:
        try:
            scans.append(r.next())
        except:
            break
    print str(len(scans)) + " scans found in " + sample
    base_peaks = {}
    all_peaks = []
    for scan in scans:
        # msLevel compared as a string -- presumably this reader yields text.
        if scan['msLevel'] == '2':
            ms1_scan_num = int(scan['precursorMz'][0]['precursorScanNum'])
            base_mz = scan['precursorMz'][0]['precursorMz']
            precursor_intensity = scan['precursorMz'][0]['precursorIntensity']
            intensity_array = scan['intensity array'].tolist()
            # NOTE(review): `scans` is 0-indexed while precursorScanNum is a
            # scan number -- possible off-by-one; confirm scan numbering.
            msI_TIC = scans[ms1_scan_num]['totIonCurrent']
            ms2_TIC = scan['totIonCurrent']
            percentComposition = ms2_TIC / msI_TIC
            #Normalize and log transform peak intensities in each scan
            #intensities = normalize(np.log(1+np.asarray(scan['intensity array']).reshape(1,-1)), norm='l1')[0]
            # NOTE(review): math.log of (intensity / ms2_TIC) is negative for
            # all fractions < 1 and raises ValueError on zero intensities --
            # confirm zero peaks cannot occur here.
            for x in range(0, len(intensity_array)):
                intensity_array[x] = math.log(intensity_array[x] / ms2_TIC)
            mzs = scan['m/z array']
            num = int(scan['num'])
            # NOTE(review): recomputes the identical value assigned above.
            percentComposition = ms2_TIC / msI_TIC
            base_peaks[num] = {
                "num": num,
                "base_mz": base_mz,
                "intensities": intensity_array,
                "mzs": mzs,
                "precursor_intensity": precursor_intensity,
                "Percent of Sample": percentComposition
            }
            all_peaks = all_peaks + mzs.tolist()
    peak_min = int(math.floor(min(all_peaks)))
    peak_max = int(math.ceil(max(all_peaks)))
    #Get rid of really big variables...
    all_peaks = None
    scans = None
    r = None
    #Returns a list of MS2 spectra organized by scan #, and the largest and
    #smallest precursor peaks across all MS2 in this file.
    return peak_min, peak_max, base_peaks
def heavy_light_quantification(fragDict, libDict, mzxmlFiles, outDir, massTol,
                               minMatch, ratioType, correction, hist):
    """Compute heavy/light (SILAC-style) peptide ratios for every mzXML file
    and add them as per-file columns to the quantification output dataframe.

    Returns the dataframe with one ratio column per input file, indexed by
    (scan, peptide) rows produced by initialize_quantification_output.
    """
    finalDf = initialize_quantification_output(fragDict, libDict)

    # Default factory for the ratio dict: missing (scan, peptide) pairs
    # report NaN. A named function rather than a lambda -- presumably so the
    # defaultdict stays picklable; confirm if multiprocessing is involved.
    def initialize_ratio_dict_values():
        return np.nan

    for f in mzxmlFiles:
        ppmDiffs = []  # NOTE(review): never appended to below -- appears unused
        allSpectraMatch = QuantificationSpectraMatcher.QuantificationSpectraMatcher(
        )
        scanToNoiseIntensityCutoffDict = dict()
        with mzxml.read(f, use_index=True) as file:
            # Process exactly the scans present in the library dict.
            for scan in sorted(libDict.keys()):
                spec = file.get_by_id(scan)
                # Noise cutoff: half the mean of the 10 weakest intensities.
                scanToNoiseIntensityCutoffDict[int(scan)] = np.mean(
                    sorted(spec['intensity array'])[:10]) / 2
                expSpectrum = smf.format_spectra_for_pooling(spec,
                                                             scan,
                                                             sqrt=False)
                expSpectrum.sort()
                libSpectra = sorted(libDict[scan])
                # Fresh matcher per scan; results are accumulated into
                # allSpectraMatch.
                quantSpectraMatch = QuantificationSpectraMatcher.QuantificationSpectraMatcher(
                )
                quantSpectraMatch.compare_spectra(libSpectra, expSpectrum,
                                                  massTol, minMatch)
                allSpectraMatch.extend_all_spectra(quantSpectraMatch)
        # Optional ppm-window correction pass (-1 disables it).
        if correction != -1:
            allSpectraMatch.filter_by_corrected_ppm_window(
                correction, hist, minMatch)
        ratioDict = defaultdict(initialize_ratio_dict_values)
        if len(allSpectraMatch.libraryIntensities) != 0:
            ratioDict = allSpectraMatch.determine_ratios(
                ratioDict, scanToNoiseIntensityCutoffDict, ratioType,
                minMatch)
        # One ratio per output row; NaN where no ratio was determined.
        finalDf[f] = [
            ratioDict[(int(row['scan']), row['peptide'])]
            for index, row in finalDf.iterrows()
        ]
    smf.print_milestone('Finish SILAC Quantification')
    return finalDf
def readPeaksFromXML(mzXmlPath):
    """Return {scan number: [neutral precursor mass, peak list,
    precursor MS1 scan number, charge, precursor m/z]} for every
    spectrum in the mzXML file."""
    specDict = {}
    with mzxml.read(mzXmlPath) as reader:
        for spectrum in reader:
            precursor = spectrum['precursorMz'][0]
            charge = precursor['precursorCharge']
            pcMz = precursor['precursorMz']
            # Neutral mass: m/z * z minus z charge-carrier masses.
            neutralPcMass = pcMz * charge - ATOM_MASS['Z'] * charge
            ms1ScanNo = int(precursor['precursorScanNum'])
            peaks = list(zip(spectrum['m/z array'],
                             spectrum['intensity array']))
            specDict[int(spectrum['num'])] = [neutralPcMass, peaks,
                                              ms1ScanNo, charge, pcMz]
    return specDict
def getMS2(outDict, baseDir="C:/"):
    """Collect the MS2 spectra listed in `outDict` from their mzXML files.

    Parameters
    ----------
    outDict : dict
        {mzXML filename: {scan id: ...}} -- only listed scans are kept.
    baseDir : str
        Directory prefix for the mzXML files. Defaults to "C:/" for
        backward compatibility with the previous hard-coded path.

    Returns a dict {mzXML filename: {scan id: {'mz', 'intensity', 'rt'}}}
    after mass correction.
    """
    ms2Dict = dict()
    for mzXML in outDict.keys():
        reader = mzxml.read(baseDir + mzXML)
        ms2Dict[mzXML] = dict()
        nPSMs = 0
        # Hoisted loop-invariant: the highest wanted scan id for this file.
        maxWantedScan = int(max(outDict[mzXML].keys()))
        for spec in reader:
            scanNum = spec['id']
            if scanNum in outDict[mzXML]:
                nPSMs += 1
                ms2Dict[mzXML][scanNum] = dict()
                ms2Dict[mzXML][scanNum]['mz'] = spec['m/z array']
                ms2Dict[mzXML][scanNum]['intensity'] = spec['intensity array']
                ms2Dict[mzXML][scanNum]['rt'] = spec['retentionTime']
            elif int(scanNum) > maxWantedScan:
                # Scans come in order; nothing wanted remains past this point.
                break
            else:
                continue
        ms2Dict[mzXML] = massCorrection(ms2Dict[mzXML])
    return (ms2Dict)
def get_mzxml(path, prec_digits=2):
    """Generate a sequence of rounded and trimmed spectra from individual
    runs of the instrument.

    Parameters
    ----------
    path : str
        Path to the mzXML file containing the mass spectrum.
    prec_digits : float
        The number of digits after which the floats get rounded.

    Returns
    -------
    out : generator
        Generates tuples of numpy arrays corresponding to different runs
        of the experimental spectrum.
    """
    with mzxml.read(path) as reader:
        for spectrum in reader:
            yield round_spectrum(spectrum['m/z array'],
                                 spectrum['intensity array'],
                                 prec_digits)
def pool_scans_by_mz_windows(querySpectraFile):
    """Group MS/MS scans by their precursor isolation window.

    Returns:
        queWindowDict: {(precursor m/z, window width): [scan numbers]}
        queScanValuesDict: {scan number: {'precursorMz', 'windowWideness',
                            'peaksCount', 'CV'}} -- CV is '' when the scan
                            has no compensation voltage.
    """
    queWindowDict = defaultdict(list)
    queScanValuesDict = defaultdict(dict)
    with mzxml.read(querySpectraFile, use_index=True) as spectra:
        for spec in spectra:
            # MS1 scans carry no precursor information: skip them.
            if 'precursorMz' not in spec:
                continue
            scan = spec['num']
            precursor = spec['precursorMz'][0]
            precMz = precursor['precursorMz']
            windowWidth = precursor['windowWideness']
            queWindowDict[precMz, windowWidth].append(scan)
            scanEntry = queScanValuesDict[scan]
            scanEntry['precursorMz'] = precMz
            scanEntry['windowWideness'] = windowWidth
            scanEntry['peaksCount'] = spec['peaksCount']
            scanEntry['CV'] = (spec['compensationVoltage']
                               if 'compensationVoltage' in spec else '')
    return queWindowDict, queScanValuesDict
def test_cwt_peak_picking():
    """ This is an old implementation of peak-picking with CWT with peaks
    correction and plotting. Super long (~306 s per scan). Thrashed.

    Kept for reference only: reads a hard-coded local mzXML file, picks
    peaks on one m/z region with scipy's CWT, corrects them, and plots
    both peak sets over the raw intensities.
    """
    # NOTE(review): hard-coded absolute path -- runnable only on the
    # original author's machine.
    spectra = list(mzxml.read(
        '/Users/andreidm/ETH/projects/ms_feature_extractor/data/CsI_NaI_best_conc_mzXML/CsI_NaI_neg_08.mzXML'))
    mid_spectrum = spectra[43]  # nice point on chromatogram
    # mz_region, intensities = extract_mz_region(mid_spectrum,[200,400])
    mz_region, intensities = extract_mz_region(mid_spectrum, [200, 250])
    # peak picking
    plt.plot(mz_region, intensities, lw=1)
    start_time = time.time()
    # peak_indices = signal.find_peaks_cwt(intensities, numpy.arange(1,32), min_snr=1, noise_perc=55)
    # this pair of widths and noise percent allows identification of everything beyond 100 intensity value (visually)
    # the larger widths the less number of relevant peaks identified
    # the larger noise percent the more number of redundant peaks identified
    cwt_peak_indices = signal.find_peaks_cwt(intensities, [0.5], min_snr=1,
                                             noise_perc=5)
    # Snap CWT peaks to local maxima and drop those below min_intensity.
    corrected_peak_indices = get_corrected_peak_indices(cwt_peak_indices,
                                                        intensities,
                                                        step=3,
                                                        min_intensity=100)
    print('\n', time.time() - start_time, "seconds elapsed\n")
    # print(cwt_peak_indices, mz_region[cwt_peak_indices], intensities[cwt_peak_indices])
    print("\nTotal number of CWT peaks = ", len(cwt_peak_indices))
    print("\nTotal number of corrected peaks = ", len(corrected_peak_indices))
    # Green crosses: raw CWT picks; red dots: corrected picks.
    plt.plot(mz_region[cwt_peak_indices], intensities[cwt_peak_indices],
             'gx', lw=1)
    plt.plot(mz_region[corrected_peak_indices],
             intensities[corrected_peak_indices], 'r.', lw=1)
    plt.show()
def readXML(bsaMzXmlPath):
    """Return {scan number: m/z array} for every spectrum in the mzXML file.

    Cleanup: the original built a per-scan structured `spec` record and an
    empty `specInfo` array that were never used or returned (and the record's
    third field, declared as pcMass float, was actually filled with the
    precursor charge). That dead code is removed; the returned dict is
    unchanged.
    """
    with mzxml.read(bsaMzXmlPath) as spectra:
        specDict = {}
        for spectrum in spectra:
            specDict[int(spectrum['num'])] = spectrum['m/z array']
    return specDict
def average_ms1(input_filename, output_filename=None, bin_width=1.0, format="csv"):
    """Pool all non-MS2 peaks of a spectrum file into a binned vector.

    Reads an .mzXML or .mzML file, collects every peak from scans whose MS
    level is not 2, bins them (m/z range up to 2000, bins of `bin_width`),
    optionally appends the vector to `output_filename` as CSV, and returns
    the vector.

    Raises
    ------
    ValueError
        For unsupported file extensions. Previously the code fell through
        with `spectra` unbound and crashed later with NameError.
    """
    filename, file_extension = os.path.splitext(input_filename)
    if file_extension == ".mzXML":
        spectra = mzxml.read(input_filename, read_schema=True)  # pyteomics reader
    elif file_extension == ".mzML":
        spectra = mzml.read(input_filename, read_schema=True)  # pyteomics reader
    else:
        raise ValueError("Unsupported spectrum file extension: %s" % file_extension)
    peaks_list = []
    for element in spectra:
        # Either key may appear depending on the format; "ms level" wins if
        # both are present, matching the original check order. Scans with
        # neither key are treated as non-MS2 (previously the value leaked
        # over from the prior scan, or raised NameError on the first one).
        mslevel = None
        if "msLevel" in element:
            mslevel = element["msLevel"]
        if "ms level" in element:
            mslevel = element["ms level"]
        mlist = copy.deepcopy(element['m/z array'])
        inten = copy.deepcopy(element['intensity array'])
        if mslevel != 2:
            peaks_list += zip(mlist, inten)
    numpy_vector = vectorize_peaks(peaks_list, 2000, bin_width)
    if output_filename is not None:
        dt = pd.DataFrame(data=numpy_vector)
        # mode='a' appends when the file already exists.
        dt.to_csv(output_filename, mode='a', index=True)
    return numpy_vector
def mzxml_import_untargeted(mdf, mzxml_dir, compound_col='compound'):
    """Match MS2 spectra in mzXML files to the compound list in `mdf`.

    For each sample column of `mdf` (columns matched by a date-like
    `\\d{4,8}_` pattern) the corresponding .mzxml file is scanned; MS2
    spectra whose precursor m/z and retention time fall within the
    module-level `spectra_threshold` / `rt_thresh` of a compound's
    medMz/medRt are collected and joined back onto the compound table.
    Returns the concatenated, RT-filtered result dataframe.

    NOTE(review): depends on module-level globals `spectra_threshold` and
    `rt_thresh`; if every file fails, `group_df` is never bound and the
    final return raises NameError -- confirm intended.
    """
    # create file_list from mdf using regexp search for date followed by _
    file_list = mdf.filter(regex='\d{4,8}[_]').columns
    file_list = [s + '.mzxml' for s in file_list]
    print(file_list)
    ### below commented code will search all mzxml files in a folder
    #file_list = os.listdir(mzxml_dir)
    #file_list = [f for f in file_list if ('.mzXML' in f)]
    #print(file_list)
    dataframe_list = []
    compound = compound_col
    compound = mdf[compound_col]
    medMz = mdf['medMz']
    medRt = mdf['medRt']
    ## mdf is a matrix with the relevant info for getting list
    # (rebinds the parameter to just the three columns used below)
    mdf = pd.concat([compound, medMz, medRt], axis=1)
    for file_to_open in file_list:
        data_path_1 = os.path.join(mzxml_dir, file_to_open)
        ### Search spectra in .mzxml file for precursor ions matching Maven list (in mdf matrix)
        # Create empty list to store results
        results = []
        try:
            # Load mass spec file using mzxml - returns a generator with results
            ms_file = mzxml.read(data_path_1,
                                 read_schema=True,
                                 iterative=False,
                                 use_index=False,
                                 dtype=None)
            print(file_to_open + ' : Processing...')
            # Iterate through elements in mass spec data generator
            for i, spectra in enumerate(ms_file):
                # iterate through m/z values to look for in each mass spec peak entry
                for k, mz_target in enumerate(mdf[mdf.columns[1]]):
                    #rt_target = mdf.iloc[k,2]
                    # 'precursorMz' in keys indicate a 2nd-order mass spec result with a 2nd smaller dictionary as a value
                    if 'precursorMz' in spectra.keys():
                        mz_measured = spectra['precursorMz'][0]['precursorMz']
                        # if one of the recorded peaks is within 20ppm of the reference value,
                        # extract the information in that entry and append to results
                        # combine m/z array and intensity array (mzarray), also take relative intensity and sort descending (mzarray_norm)
                        if (abs((mz_measured - mz_target) / mz_target) <
                                spectra_threshold) and (
                                    abs(mdf.iloc[k, 2] -
                                        spectra['retentionTime']) < rt_thresh):
                            #if abs((mz_measured-mz_target)/mz_target)<spectra_threshold and abs(spectra['retentionTime']-mdf.iloc[k,2])<rt_thresh :
                            q = spectra['intensity array']
                            maxvali = np.amax(q, axis=0)
                            # Intensities relative to the strongest peak.
                            q_norm = q / maxvali
                            r = spectra['m/z array']
                            mzarray = np.array(list(zip(r, q)))
                            mzarray_norm = np.array(list(zip(r, q_norm)))
                            # Sort normalised peaks by descending intensity.
                            mzarray_norm = mzarray_norm[(
                                -mzarray_norm[:, 1]).argsort()]
                            result_dict = {
                                'precursorMz_m':
                                spectra['precursorMz'][0]['precursorMz'],
                                'retentionTime_m': spectra['retentionTime'],
                                'mzarray': mzarray,
                                'mzarray_norm': mzarray_norm,
                                'compound': mdf.iloc[k]['compound']
                            }
                            results.append(pd.Series(result_dict))
            # concatenate list of results into a single dataframe
            results = pd.DataFrame(results)
            # preview results
            #print('Done')
            #results.head()
            if len(results) == 0:
                continue
            #index mdf and results to compound and then combine
            results_c = results.set_index('compound')
            mdf_c = mdf.copy()
            mdf_c = mdf_c.set_index('compound')
            #this combines
            data = mdf_c.join(results_c)
            # look for specific fragments in each m/z array
            #for ref in ref_array:
            #    data[str(ref)] = data['mzarray_norm'].apply(check_in_array, ref_value = ref)
            data["sample"] = file_to_open
            dataframe_list.append(data)
            # Re-concatenated on every iteration so group_df always reflects
            # all files processed so far.
            group_df = pd.concat(dataframe_list)
            ## NOTE: when same compound names are included with different RTs, some spectra match to wrong RT
            ## below filter removes spectra matched with incorrect RT
            ## remove spectra that are not within rt range
            group_df['diff'] = group_df['medRt'] - group_df['retentionTime_m']
            group_df = group_df.loc[~(abs(group_df['diff']) >= 1)]
            group_df = group_df.drop(['diff'], axis=1)
        except:
            # NOTE(review): bare except -- ANY failure (not just a missing
            # file) prints 'not found' and skips the sample.
            print(file_to_open + ' not found')
            continue
    return group_df
def test_read_dtype(self):
    """Spectrum arrays honour the dtypes requested via the `dtype` argument."""
    requested = {'m/z array': np.float32, 'intensity array': np.int32}
    with read(self.path, dtype=requested) as reader:
        for spectrum in reader:
            for key, expected_dtype in requested.items():
                self.assertEqual(spectrum[key].dtype, expected_dtype)
def perform_spectra_pooling_and_analysis(querySpectraFile, outFile, lib,
                                         tolerance, maxQuerySpectraToPool,
                                         corrected, histFile):
    """Pool query MS/MS spectra by precursor m/z window, match each pool
    against the library spectra in that window, apply FDR filtering (with an
    optional ppm-correction pass), and write the results to `outFile`.
    """
    smf.print_milestone('Begin Grouping Scans by m/z Windows:')
    queWindowDict, queScanValuesDict = pool_scans_by_mz_windows(
        querySpectraFile)
    print('Number of Unpooled MS/MS Query Spectra: ' +
          str(len(queScanValuesDict)))
    print('Number of Pooled MS/MS Query Spectra/Mz Windows: ' +
          str(len(queWindowDict)), flush=True)

    # To enhance the print experience, status prints will be given at intervals tailored to the number of identified windows.
    # example: if there are 1-99 pooled query spectra, print statements are made after every pooled query spectra analysis is complete.
    # if there are 100-999, print after every 10 pooled spectra. And so on.
    printFriendlyCounter = 100
    while printFriendlyCounter < len(queWindowDict):
        printFriendlyCounter *= 10
    # NOTE(review): true division makes this a float in Python 3; the
    # modulo test below still works with a float divisor.
    printFriendlyCounter /= 100
    allLibKeys, libIdToKeyDict, libIdToDecoyDict = gather_library_metadata(lib)
    allSpectraMatches = IdentificationSpectraMatcher.IdentificationSpectraMatcher(
    )
    numWindowsAnalyzed = 0
    prevtime = timer()

    smf.print_milestone('Begin Pooled Spectra Analysis:')

    with mzxml.read(querySpectraFile, use_index=True) as spectra:
        for precMz_win, scans in queWindowDict.items():
            # Window bounds: centre m/z +/- half the window width.
            top_mz = precMz_win[0] + precMz_win[1] / 2
            bottom_mz = precMz_win[0] - precMz_win[1] / 2
            libKeys = identify_lib_spectra_in_window(top_mz, bottom_mz,
                                                     allLibKeys)
            if len(libKeys) == 0:
                continue  # no library spectra fall inside this window
            pooledLibSpectra = pool_lib_spectra(lib, libKeys)
            pooledQueSpectra = []
            for i in range(len(scans)):
                scanNumber = scans[i]
                queSpectrum = spectra.get_by_id(scanNumber)
                pooledQueSpectra += smf.format_spectra_for_pooling(
                    queSpectrum, scanNumber)
                # Compare in batches of maxQuerySpectraToPool scans, and
                # always flush the remainder at the final scan.
                if (i % maxQuerySpectraToPool == 0
                        and i != 0) or i == len(scans) - 1:
                    pooledQueSpectra.sort()
                    windowSpectraMatches = IdentificationSpectraMatcher.IdentificationSpectraMatcher(
                    )
                    windowSpectraMatches.compare_spectra(
                        pooledLibSpectra, pooledQueSpectra, tolerance,
                        libIdToDecoyDict)
                    allSpectraMatches.extend_all_spectra(windowSpectraMatches)
                    pooledQueSpectra.clear()
            numWindowsAnalyzed += 1
            if numWindowsAnalyzed % printFriendlyCounter == 0:
                time = timer()
                print('\nNumber of Pooled Experimental Spectra Analyzed: ' +
                      str(numWindowsAnalyzed))
                print('Number of Spectra in Current Pooled Spectra: ' +
                      str(len(scans)))
                print('Time Since Last Checkpoint: ' +
                      str(round(time - prevtime, 2)) + ' Seconds', flush=True)
                prevtime = time

    smf.print_milestone('Begin FDR Analysis:')
    maccCutoff = allSpectraMatches.find_score_fdr_cutoff()

    # Optional ppm-window correction followed by a second FDR pass
    # (corrected == -1 disables it).
    if corrected != -1:
        smf.print_milestone('Begin Correction Process:')
        allSpectraMatches.filter_by_corrected_ppm_window(
            corrected, maccCutoff, histFile)
        smf.print_milestone('Begin Corrected FDR Analysis:')
        maccCutoff = allSpectraMatches.find_score_fdr_cutoff()

    smf.print_milestone('\nBegin Writing to File: ')

    allSpectraMatches.write_output(outFile, querySpectraFile, maccCutoff,
                                   queScanValuesDict, libIdToKeyDict, lib)
from pyteomics import mzxml
from src.msfe import ms_operator

# Script: locate expected peaks precisely within one spectrum of a
# calibration run (CsI/NaI), using ms_operator.locate_annotated_peak.

# m/z windows with manually annotated peaks (kept for reference).
annotated_peaks = [[204.92, 204.94], [205.15, 205.17], [205.92, 205.94],
                   [207.055, 207.075], [207.092, 207.110], [208.085, 208.10],
                   [208.135, 208.150], [214.99, 215.015], [216.93, 216.95],
                   [239.055, 239.075]]

# m/z windows where peaks are expected; each is refined below.
expected_peaks = [[126.90, 126.92], [139.01, 139.03], [276.79, 276.81],
                  [271.84, 271.85], [1047.9, 1047.92], [1048.9, 1048.92]]

# NOTE(review): hard-coded absolute path -- loads the entire run into memory.
spectra = list(
    mzxml.read(
        '/Users/andreidm/ETH/projects/ms_feature_extractor/data/CsI_NaI_best_conc_mzXML/CsI_NaI_neg_08.mzXML'
    ))

mid_spectrum = spectra[43]  # nice point on chromatogram

accurate_peak_locations = []
for mz_region in expected_peaks:
    print(mz_region, "is being processed...")
    accurate_peak_locations.append(
        ms_operator.locate_annotated_peak(mz_region, mid_spectrum))

print("Done!")
print()
print(accurate_peak_locations)
from pyteomics import mzxml
import sys
from itertools import groupby
import collections
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import seaborn as sns

# Python-2 script: histogram the MS2 precursor m/z frequencies of the
# sample file named on the command line.

scans = []
sample = sys.argv[1]
r = mzxml.read(sample)
# Exhaust the reader; the bare except ends the loop on StopIteration but
# also silently swallows any parse error (Py2-style r.next()).
while True:
    try:
        scans.append(r.next())
    except:
        break

freq_list = []
print "making frequency list"
for scan in scans:
    # msLevel compared as a string -- presumably this reader yields text.
    if scan['msLevel'] == '2':
        # Scale precursor m/z to an integer key keeping 4 decimal places.
        base_mz = int(scan['precursorMz'][0]['precursorMz'] * 10000)
        precursor_intensity = scan['precursorMz'][0]['precursorIntensity']
        freq_list.append(base_mz)

freq_list = sorted(freq_list)
# NOTE(review): list.count inside a comprehension is O(n^2);
# collections.Counter (already imported) would be linear.
d = {x: freq_list.count(x) for x in freq_list}
od = sorted(d.items(), key=lambda x: x[1])
print freq_list
#sns.distplot(d.keys())
# Convert the integer keys back to m/z before plotting the histogram.
plt.hist([item / float(10000) for item in freq_list], bins=len(freq_list))
def __init__(self, mz_file):
    """Wrap an mzXML file, keeping its path and an indexed pyteomics reader.

    use_index=True asks pyteomics to index the file so spectra can be
    fetched by id (random access) rather than by sequential iteration only.
    """
    self._file_path = mz_file
    self.data = mzxml.read(mz_file, use_index=True)