def test_molecular_formula_search_db():
    """Smoke-test a first-hit molecular formula search over a whole spectrum.

    Adduct and radical ion types are switched off, the search is timed, and
    the number of assigned / unassigned peaks is printed. Nothing is asserted.
    """
    search_settings = MSParameters.molecular_search
    search_settings.isAdduct = False
    search_settings.isRadical = False

    mass_spec_obj = create_mass_spectrum()

    started = time.time()
    SearchMolecularFormulas(mass_spec_obj, first_hit=True).run_worker_mass_spectrum()
    print('searching molecular formulas took %.3f seconds' % (time.time() - started))

    assigned_count, unassigned_count = 0, 0
    # One entry per (peak, candidate formula) pair; collected but not used
    # further inside this test.
    masses, errors, abundances = [], [], []

    for mspeak in mass_spec_obj.sort_by_abundance():
        if not mspeak.is_assigned:
            unassigned_count += 1
            continue

        assigned_count += 1
        for mformula in mspeak:
            masses.append(mspeak.mz_exp)
            errors.append(mformula.mz_error)
            abundances.append(mspeak.abundance)

    print('%i peaks assigned and %i peaks not assigned' % (assigned_count, unassigned_count))
def test_run_molecular_formula_search():
    """Search molecular formulas for a single synthetic centroid peak.

    F, P and Cl are excluded from the search (atom range (0, 0) and valence
    0), adducts and radicals are disabled, and every candidate found for the
    one peak is printed together with its m/z error.
    """
    search_settings = MSParameters.molecular_search

    for atom in ('F', 'P', 'Cl'):
        search_settings.usedAtoms[atom] = (0, 0)

    search_settings.isAdduct = False
    search_settings.isRadical = False

    for atom in ('P', 'F', 'Cl'):
        search_settings.used_atom_valences[atom] = 0

    # One-peak spectrum: m/z, abundance, resolving power, signal-to-noise, name.
    mass_spectrum_obj = ms_from_array_centroid(
        [215.09269], [1], [1], [1], 'one peak')

    searcher = SearchMolecularFormulas(mass_spectrum_obj)
    searcher.run_worker_ms_peaks([mass_spectrum_obj[0]])

    ms_peak = mass_spectrum_obj[0]
    print(ms_peak.mz_exp)

    if not ms_peak.is_assigned:
        return

    for formula in ms_peak:
        print(formula.string_formated, formula.mz_error)
def search_ms1_data(icrfile: str, dict_metal_eicdata: Dict[str, EIC_Data],
                    parameters: LCMSParameters):
    '''Placeholder for parsing and searching LC FT-MS data.

    For every metal EIC apex, the matching FT-MS scans are located in the
    TIC, averaged into one mass spectrum, searched for molecular formulas
    containing exactly one atom of the metal, and exported to CSV.
    '''
    lcms_obj, parser = run_thermo(icrfile, parameters)

    tic_data, ax_tic = lcms_obj.get_tic(
        ms_type='MS !d', peak_detection=True, smooth=False, plot=True)
    plt.show()

    for metal, eic_data in dict_metal_eicdata.items():
        print(metal, eic_data.apexes)

        for peak_indexex in eic_data.apexes:
            # Map each EIC index of this peak to the nearest TIC scan index.
            ftms_scans_index = [
                find_nearest_scan(eic_data.time[eic_index], tic_data)
                for eic_index in peak_indexex
            ]
            ftms_scans = [tic_data.scans[tic_index] for tic_index in ftms_scans_index]
            ftms_times = [tic_data.time[tic_index] for tic_index in ftms_scans_index]

            # The second entry of the peak index tuple supplies the
            # retention time (presumably the apex -- TODO confirm).
            retention_time = tic_data.time[ftms_scans_index[1]]

            print(ftms_scans)
            print(ftms_times)

            chromatogram_settings = parser.chromatogram_settings
            chromatogram_settings.start_scan = ftms_scans[0]
            chromatogram_settings.end_scan = ftms_scans[-1]

            mass_spec = parser.get_average_mass_spectrum_in_scan_range(
                auto_process=False)
            mass_spec.retention_time = retention_time
            mass_spec.settings = parameters.mass_spectrum
            mass_spec.molecular_search_settings = parameters.ms1_molecular_search
            mass_spec.mspeaks_settings = parameters.ms_peak
            mass_spec.process_mass_spec()

            # Strip digits from the key (e.g. an isotope label) to get the
            # element symbol.
            metal_atom = ''.join(ch for ch in metal if not ch.isdigit())

            # Require exactly one metal atom during the search ...
            mass_spec.molecular_search_settings.usedAtoms[metal_atom] = (1, 1)
            mass_spec.plot_profile_and_noise_threshold()

            searcher = SearchMolecularFormulas(mass_spec, first_hit=False)
            searcher.run_worker_mass_spectrum()

            # ... and exclude it again afterwards.
            mass_spec.molecular_search_settings.usedAtoms[metal_atom] = (0, 0)

            mass_spec.percentile_assigned(report_error=True)
            print(metal)

            raw_name = '{}_rt{}_{}'.format(
                metal, retention_time, mass_spec.sample_name)
            filename = raw_name.replace(".", "_")
            print(filename)

            mass_spec.to_csv(filename, write_metadata=False)
def run_molecular_formula_search(mz, out, parameters_filepath):
    """Search molecular formula candidates for one m/z value and report them.

    Parameters
    ----------
    mz
        The single experimental m/z value to search.
    out
        Destination handed to ``click.echo(..., file=out)`` for the result
        table; its string form is also used as the spectrum's data name.
    parameters_filepath
        Path of the JSON parameter file loaded into the mass spectrum.

    Notes
    -----
    The min-peaks and isotopologue filters are disabled because the spectrum
    holds a single peak. An earlier revision assigned ``10`` to
    ``use_min_peaks_filter`` right after setting it to ``False``; that truthy
    value re-enabled the filter, so the assignment has been removed.
    """
    mz = [mz]
    abundance = [1]
    rp, s2n = [1], [1]
    dataname = Path(str(out))

    mass_spectrum_obj = ms_from_array_centroid(mz, abundance, rp, s2n, dataname)

    parameter_from_json.load_and_set_parameters_ms(
        mass_spectrum_obj, parameters_path=parameters_filepath)

    # Keep both filters off: there is only one peak to work with.
    mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = False
    mass_spectrum_obj.molecular_search_settings.use_isotopologue_filter = False

    click.echo('Searching for molecular formulas within %.3f and %.3f ppm' %
               (mass_spectrum_obj.molecular_search_settings.min_ppm_error,
                mass_spectrum_obj.molecular_search_settings.max_ppm_error))

    SearchMolecularFormulas(mass_spectrum_obj,
                            find_isotopologues=True).run_worker_ms_peaks(
                                [mass_spectrum_obj[0]])

    ms_peak = mass_spectrum_obj[0]

    if ms_peak:
        header = [
            'Molecular Formula', 'Calculated m/z', 'Mass Error', 'DBE',
            'Ion Type'
        ]
        results = [
            [
                formula.to_string, formula.mz_calc, formula.mz_error,
                formula.dbe, formula.ion_type
            ]
            for formula in ms_peak
        ]
        click.echo(tabulate(results,
                            headers=header,
                            floatfmt=("s", ".5f", ".5f", ".1f", "s")),
                   file=out)
    else:
        click.echo(
            "Could not find a possible molecular formula match for the m/z %.5f"
            % mz[0],
            file=out)

    # Trailing blank line after either outcome.
    click.echo('', file=out)
def search_sox(mass_spectrum_obj):
    """Run a first-hit formula search limited to S/O-containing formulas.

    Applies the resolving-power filter, then sets the global atom ranges to
    O 1-10 and S 1-3 with N and Cl excluded before searching.
    """
    filter_by_resolving_power()

    atom_ranges = (
        ('O', (1, 10)),
        ('N', (0, 0)),
        ('S', (1, 3)),
        ('Cl', (0, 0)),
    )
    for atom, limits in atom_ranges:
        MSParameters.molecular_search.usedAtoms[atom] = limits

    searcher = SearchMolecularFormulas(mass_spectrum_obj, first_hit=True)
    searcher.run_worker_mass_spectrum()
def assign_mf_pox(mass_spectrum_obj):
    """Run a first-hit formula search for P/O-containing formulas.

    Sets the global atom ranges to O 4-20 and exactly one P, excludes N, S
    and Cl, enables protonated, radical and adduct ion types, then searches
    the whole spectrum.
    """
    atom_ranges = (
        ('O', (4, 20)),
        ('N', (0, 0)),
        ('S', (0, 0)),
        ('Cl', (0, 0)),
        ('P', (1, 1)),
    )
    for atom, limits in atom_ranges:
        MSParameters.molecular_search.usedAtoms[atom] = limits

    for ion_flag in ('isProtonated', 'isRadical', 'isAdduct'):
        setattr(MSParameters.molecular_search, ion_flag, True)

    searcher = SearchMolecularFormulas(mass_spectrum_obj, first_hit=True)
    searcher.run_worker_mass_spectrum()
def run_assignment(file_location):
    """Assign CHO molecular formulas to a Thermo spectrum and plot the result.

    Loads the spectrum with ``run_thermo``, configures a +/-5 ppm CHO-only
    search for protonated ions without a database URL, runs the search,
    switches scoring to ``prob_score`` and shows the assigned/unassigned,
    m/z error and O2-class plots. Returns the mass spectrum.
    """
    # Alternative loaders: run_bruker(file_location), get_masslist(file_location)
    mass_spectrum = run_thermo(file_location)

    settings = mass_spectrum.molecular_search_settings
    settings.error_method = 'None'
    settings.min_ppm_error = -5
    settings.max_ppm_error = 5
    settings.url_database = None
    settings.min_dbe = 0
    settings.max_dbe = 50

    # Only C, H and O are allowed; every other element is pinned to zero.
    atom_ranges = (
        ('C', (1, 100)),
        ('H', (4, 200)),
        ('O', (1, 30)),
        ('N', (0, 0)),
        ('S', (0, 0)),
        ('Cl', (0, 0)),
        ('Br', (0, 0)),
        ('P', (0, 0)),
        ('Na', (0, 0)),
    )
    for atom, limits in atom_ranges:
        mass_spectrum.molecular_search_settings.usedAtoms[atom] = limits

    settings.isProtonated = True
    settings.isRadical = False
    settings.isAdduct = False

    # mass_spectrum.filter_by_max_resolving_power(15, 2)

    searcher = SearchMolecularFormulas(mass_spectrum, first_hit=False)
    searcher.run_worker_mass_spectrum()

    mass_spectrum.percentile_assigned(report_error=True)

    for score_option in ("score_method", "output_score_method"):
        setattr(mass_spectrum.molecular_search_settings, score_option, "prob_score")

    # export_calc_isotopologues(mass_spectrum, "15T_Neg_ESI_SRFA_Calc_Isotopologues")

    mass_spectrum_by_classes = HeteroatomsClassification(
        mass_spectrum, choose_molecular_formula=True)

    mass_spectrum_by_classes.plot_ms_assigned_unassigned()
    plt.show()

    mass_spectrum_by_classes.plot_mz_error()
    plt.show()

    mass_spectrum_by_classes.plot_ms_class("O2")
    plt.show()

    # dataframe = mass_spectrum_by_classes.to_dataframe()
    return mass_spectrum
def test_search_imported_ref_files():
    """Search a spectrum against formulas imported from a Bruker .ref file.

    Loads ``tests/tests_data/SRFA.ref`` relative to the current working
    directory, prints each reference's mass and class, and asserts that the
    search assigns at least one peak.
    """
    mass_spectrum_obj = get_mass_spectrum()

    data_dir = os.path.join(os.getcwd(), os.path.normcase("tests/tests_data/"))
    ref_file_location = data_dir + "SRFA.ref"

    importer = ImportMassListRef(ref_file_location)
    mf_references_list = importer.from_bruker_ref_file()

    for reference in mf_references_list:
        print(reference.mass, reference.classe)

    searcher = SearchMolecularFormulas(mass_spectrum_obj)
    ms_peaks_assigned = searcher.search_mol_formulas(
        mf_references_list, find_isotopologues=False)

    assert len(ms_peaks_assigned) > 0
def run_assignment(file_location, field_strength=12):
    """Assign molecular formulas to a Bruker spectrum and export it to CSV.

    The spectrum is loaded with ``run_bruker``, configured through
    ``set_parameters`` (negative mode), filtered by maximum resolving power
    for the given field strength and transient time, searched, and written
    to a CSV named after the sample, without metadata.
    """
    # Alternative loader: get_masslist(file_location)
    mass_spectrum, transient_time = run_bruker(file_location)

    set_parameters(mass_spectrum, field_strength=field_strength, pos=False)
    mass_spectrum.filter_by_max_resolving_power(field_strength, transient_time)

    searcher = SearchMolecularFormulas(mass_spectrum, first_hit=False)
    searcher.run_worker_mass_spectrum()

    mass_spectrum.percentile_assigned(report_error=True)

    for score_option in ("score_method", "output_score_method"):
        setattr(mass_spectrum.molecular_search_settings, score_option, "prob_score")

    mass_spectrum.to_csv(mass_spectrum.sample_name, write_metadata=False)
def search_nsox(mass_spectrum_obj):
    """Run a first-hit formula search for N/S/O-containing formulas.

    Applies the resolving-power filter, sets the global atom ranges to
    O 1-10, N 1-3 and S 1-3 with Cl excluded, limits DBE to 0-50, enables
    protonated, radical and adduct ion types, then searches the spectrum.
    """
    filter_by_resolving_power()

    atom_ranges = (
        ('O', (1, 10)),
        ('N', (1, 3)),
        ('S', (1, 3)),
        ('Cl', (0, 0)),
    )
    for atom, limits in atom_ranges:
        MSParameters.molecular_search.usedAtoms[atom] = limits

    MSParameters.molecular_search.min_dbe = 0
    MSParameters.molecular_search.max_dbe = 50

    for ion_flag in ('isProtonated', 'isRadical', 'isAdduct'):
        setattr(MSParameters.molecular_search, ion_flag, True)

    searcher = SearchMolecularFormulas(mass_spectrum_obj, first_hit=True)
    searcher.run_worker_mass_spectrum()
def find_most_abundant_formula(self, mass_spectrum_obj):
    '''
    Find the formula of the most abundant peak, ignoring abundance outliers.

    Peaks whose abundance exceeds mean + 7 * std of all abundances are
    ranked as zero when picking the most abundant peak; that peak alone is
    then searched for molecular formulas.

    Returns
    ----------
    MolecularFormula class obj
        the best molecular formula candidate of the selected peak

    Raises
    ----------
    Exception
        if the selected peak evaluates as false after the search
    '''
    # TODO: find a better way to cut off outliers
    abundances = mass_spectrum_obj.abundance
    upper_limit = average(abundances, axis=0) + 7 * std(abundances, axis=0)

    def raw_abundance(peak):
        return peak.abundance

    def capped_abundance(peak):
        # Outliers above the limit rank as zero.
        if peak.abundance <= upper_limit:
            return peak.abundance
        return 0

    print(
        "Maximum abundance limit  = %s and max abundance kendrick cluster = %s"
        % (upper_limit, max(mass_spectrum_obj, key=raw_abundance).abundance))

    mspeak_most_abundant = max(mass_spectrum_obj, key=capped_abundance)

    print("Searching molecular formulas")

    searcher = SearchMolecularFormulas(mass_spectrum_obj, self.sql_db)
    searcher.run_worker_ms_peaks([mspeak_most_abundant])

    print("Finished searching molecular formulas")

    if not mspeak_most_abundant:
        raise Exception(
            "Could not find a possible molecular formula match for the most abundant peak of m/z %.5f"
            % mspeak_most_abundant.mz_exp)

    return mspeak_most_abundant.best_molecular_formula_candidate
def search_sx(mass_spectrum_obj):
    """Run a first-hit formula search for sulfur-only heteroatom classes.

    Applies the resolving-power filter, sets the global atom ranges to
    S 1-3 with O, N and Cl excluded, limits DBE to 0-50, then searches the
    whole spectrum.
    """
    filter_by_resolving_power()

    atom_ranges = (
        ('O', (0, 0)),
        ('N', (0, 0)),
        ('S', (1, 3)),
        ('Cl', (0, 0)),
    )
    for atom, limits in atom_ranges:
        MSParameters.molecular_search.usedAtoms[atom] = limits
    # F and P ranges are left at whatever the global settings already hold.

    MSParameters.molecular_search.min_dbe = 0
    MSParameters.molecular_search.max_dbe = 50

    searcher = SearchMolecularFormulas(mass_spectrum_obj, first_hit=True)
    searcher.run_worker_mass_spectrum()
def run_assignment(file_location):
    """Assign CHO formulas to a mass list, export results and classify them.

    Loads the spectrum with ``get_masslist``, configures a +/-1 ppm CHO-only
    search for protonated ions against the configured database URL, runs
    the search, writes the spectrum and the calculated isotopologues to
    disk and plots assigned vs. unassigned peaks.

    Returns a ``(mass_spectrum, mass_spectrum_by_classes)`` tuple.
    """
    # Alternative loader: run_bruker(file_location)
    mass_spectrum = get_masslist(file_location)

    settings = mass_spectrum.molecular_search_settings
    settings.error_method = 'None'
    settings.min_ppm_error = -1
    settings.max_ppm_error = 1
    settings.url_database = "postgres://*****:*****@localhost:5432/molformula"
    settings.min_dbe = 0
    settings.max_dbe = 50

    # Only C, H and O are allowed; every other element is pinned to zero.
    atom_ranges = (
        ('C', (1, 100)),
        ('H', (4, 200)),
        ('O', (1, 22)),
        ('N', (0, 0)),
        ('S', (0, 0)),
        ('Cl', (0, 0)),
        ('Br', (0, 0)),
        ('P', (0, 0)),
        ('Na', (0, 0)),
    )
    for atom, limits in atom_ranges:
        mass_spectrum.molecular_search_settings.usedAtoms[atom] = limits

    settings.isProtonated = True
    settings.isRadical = False
    settings.isAdduct = False

    # mass_spectrum.filter_by_max_resolving_power(15, 2)

    searcher = SearchMolecularFormulas(mass_spectrum, first_hit=False)
    searcher.run_worker_mass_spectrum()

    mass_spectrum.percentile_assigned(report_error=True)

    mass_spectrum.to_csv("15T_Neg_ESI_SRFA")
    export_calc_isotopologues(mass_spectrum, "15T_Neg_ESI_SRFA_Calc_Isotopologues")

    mass_spectrum_by_classes = HeteroatomsClassification(
        mass_spectrum, choose_molecular_formula=True)

    mass_spectrum_by_classes.plot_ms_assigned_unassigned()
    # plt.show()

    # dataframe = mass_spectrum_by_classes.to_dataframe()
    return (mass_spectrum, mass_spectrum_by_classes)
def test_mspeak_search():
    """Search formulas for the most abundant peak and print its candidates.

    When the peak is assigned, the result of each candidate-selection
    method is printed, followed by the best candidate and the first
    candidate's m/z error and formatted formula.
    """
    mass_spec_obj = create_mass_spectrum()
    print("OK")

    mspeak_obj = mass_spec_obj.most_abundant_mspeak
    searcher = SearchMolecularFormulas(mass_spec_obj)
    searcher.run_worker_ms_peaks([mspeak_obj])
    print("OK2")

    if not mspeak_obj.is_assigned:
        return

    selector_names = (
        'molecular_formula_earth_filter',
        'molecular_formula_water_filter',
        'molecular_formula_air_filter',
        'cia_score_S_P_error',
        'cia_score_N_S_P_error',
    )
    for selector_name in selector_names:
        # Looked up lazily so each selector runs only after the previous
        # one has printed.
        selected_formula = getattr(mspeak_obj, selector_name)()
        print(selected_formula.string)

    print(mspeak_obj.best_molecular_formula_candidate.string)
    print(mspeak_obj[0].mz_error, mspeak_obj[0].string_formated)
def find_most_abundant_formula_test(self, mass_spectrum_obj, settings):
    """Return the best formula candidate of the most abundant peak.

    Intended for tests only. ``settings`` is accepted but not used here.

    TODO: sort by Kendrick to select the most abundant series and then the
    most abundant peak inside it, or let the user pick the reference peak
    in the GUI.

    Raises ``Exception`` when the peak evaluates as false after the search.
    """
    mspeak_most_abundant = mass_spectrum_obj.most_abundant_mspeak

    searcher = SearchMolecularFormulas(mass_spectrum_obj, self.sql_db)
    searcher.run_worker_ms_peaks([mspeak_most_abundant])

    if not mspeak_most_abundant:
        raise Exception(
            "Could not find a possible molecular formula match for the most abundant peak of m/z %.5f"
            % mspeak_most_abundant.mz_exp)

    return mspeak_most_abundant.best_molecular_formula_candidate
def assign_mf_nsox(mass_spectrum_obj):
    """Run a first-hit formula search for N/S/O-containing formulas.

    Applies the resolving-power filter, sets the global atom ranges to
    O 4-20, N 1-3 and S 1-5 with Cl excluded, limits DBE to 0-36, enables
    protonated, radical and adduct ion types, then searches the spectrum.

    The spectrum is given to ``SearchMolecularFormulas`` at construction;
    ``run_worker_mass_spectrum`` is called without arguments, as at every
    other call site in this code base (it was previously passed the
    spectrum a second time).
    """
    filter_by_resolving_power()

    search_settings = MSParameters.molecular_search

    search_settings.usedAtoms['O'] = (4, 20)
    search_settings.usedAtoms['N'] = (1, 3)
    search_settings.usedAtoms['S'] = (1, 5)
    search_settings.usedAtoms['Cl'] = (0, 0)

    search_settings.min_dbe = 0
    search_settings.max_dbe = 36

    search_settings.isProtonated = True
    search_settings.isRadical = True
    search_settings.isAdduct = True

    SearchMolecularFormulas(mass_spectrum_obj, first_hit=True).run_worker_mass_spectrum()
def run_nmdc_workflow(args):
    """Load, optionally calibrate and formula-search one spectrum.

    ``args`` unpacks to ``(file_location, ref_calibration_file,
    field_strength)``. A field strength of 21 selects ``run_thermo``; any
    other value selects ``run_bruker``.

    Returns ``("too few peaks", None)`` when the spectrum has fewer than 30
    peaks, otherwise ``("all_good", mass_spectrum)``.
    """
    file_location, ref_calibration_file, field_strength = args

    print("{} {} {}".format("processing", field_strength, file_location))
    load_spectrum = run_thermo if field_strength == 21 else run_bruker
    mass_spectrum, transient_time = load_spectrum(file_location)

    is_pos = bool(mass_spectrum.polarity > 0)

    if len(mass_spectrum) < 30:
        print("{} {}".format("too few peaks", file_location))
        return "too few peaks", None

    set_parameters(mass_spectrum, field_strength=field_strength, pos=is_pos)

    if ref_calibration_file:
        calspec(mass_spectrum, ref_calibration_file)
        # MzDomainCalibration(mass_spectrum, ref_calibration_file).run()

    # mass_spectrum.filter_by_max_resolving_power(field_strength, transient_time)

    searcher = SearchMolecularFormulas(mass_spectrum, first_hit=False)
    searcher.run_worker_mass_spectrum()

    mass_spectrum.percentile_assigned(report_error=True)

    for score_option in ("score_method", "output_score_method"):
        setattr(mass_spectrum.molecular_search_settings, score_option, "prob_score")

    return "all_good", mass_spectrum
def test_heteroatoms_classification():
    """Smoke-test HeteroatomsClassification after a CHO formula search.

    Configures a +/-10 ppm protonated-only CHO search on the global
    parameters, searches a freshly created spectrum and then exercises the
    classification's plotting and summary methods. Nothing is asserted.
    """
    search_settings = MSParameters.molecular_search

    search_settings.error_method = 'None'
    search_settings.min_ppm_error = -10
    search_settings.max_ppm_error = 10
    search_settings.mz_error_range = 1

    search_settings.isProtonated = True
    search_settings.isRadical = False
    search_settings.isAdduct = False

    for atom, limits in (('C', (1, 100)), ('H', (4, 200)), ('O', (1, 18))):
        search_settings.usedAtoms[atom] = limits

    mass_spec_obj = create_mass_spectrum()

    SearchMolecularFormulas(mass_spec_obj).run_worker_mass_spectrum()

    # Exercise the classification API.
    mass_spec_obj.percentile_assigned()

    mass_spectrum_by_classes = HeteroatomsClassification(mass_spec_obj)

    mass_spectrum_by_classes.plot_ms_assigned_unassigned()
    mass_spectrum_by_classes.atoms_ratio_all("H", "C")
    mass_spectrum_by_classes.dbe_all()
    mass_spectrum_by_classes.carbon_number_all()
    mass_spectrum_by_classes.abundance_assigned()
    mass_spectrum_by_classes.mz_exp_assigned()
    mass_spectrum_by_classes.abundance_count_percentile(Labels.unassigned)
    mass_spectrum_by_classes.peaks_count_percentile(Labels.unassigned)
def run_assignment(file_location, workflow_params):
    """Load a spectrum by file type, optionally calibrate it, and search it.

    Parameters
    ----------
    file_location
        Path of the input data. The suffix selects the loader: ``.raw``
        (Thermo, reduced profile over the configured scan range), ``.d``
        (Bruker transient) or ``.txt`` / ``.csv`` (mass list).
    workflow_params
        Workflow settings object; the attributes read here are
        ``raw_file_start_scan``, ``raw_file_final_scan``,
        ``corems_json_path``, ``polarity``, ``is_centroid``, ``calibrate``
        and ``calibration_ref_file_path``.

    Returns
    -------
    The mass spectrum object after the molecular formula search.

    Raises
    ------
    ValueError
        If the file suffix is not one of the supported types.
    """
    file_path = Path(file_location)
    suffix = file_path.suffix

    if suffix == '.raw':
        first_scan = workflow_params.raw_file_start_scan
        last_scan = workflow_params.raw_file_final_scan
        mass_spectrum = run_thermo_reduce_profile(
            file_location, workflow_params, first_scan, last_scan)

    elif suffix == '.d':
        mass_spectrum = run_bruker_transient(
            file_location, workflow_params.corems_json_path)

    # Path.suffix keeps the leading dot, so compare against '.csv'.
    elif suffix in ('.txt', '.csv'):
        mass_spectrum = get_masslist(file_location,
                                     workflow_params.corems_json_path,
                                     polarity=workflow_params.polarity,
                                     is_centroid=workflow_params.is_centroid)
        mass_spectrum.set_parameter_from_json(workflow_params.corems_json_path)

    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            "Unsupported input file type {!r} for {}; expected one of "
            ".raw, .d, .txt, .csv".format(suffix, file_location))

    if workflow_params.calibrate:
        ref_file_location = Path(workflow_params.calibration_ref_file_path)
        MzDomainCalibration(mass_spectrum, ref_file_location).run()

    # Force a single job: a daemon child cannot have child processes.
    mass_spectrum.molecular_search_settings.db_jobs = 1

    SearchMolecularFormulas(mass_spectrum, first_hit=False).run_worker_mass_spectrum()

    return mass_spectrum
list_dict = [] dirnames = get_dirnames() if dirnames: for file_location in dirnames: print(file_location) mass_spectrum = get_mass_spectrum(file_location) set_settings_for_bromothymol_blue(mass_spectrum) #set_settings_for_chlorophenol_red(mass_spectrum) SearchMolecularFormulas(mass_spectrum, first_hit=True).run_worker_mass_spectrum() #mass_error_prediction = MassErrorPrediction(mass_spectrum) #mass_error_prediction.get_results() ax = mass_spectrum.plot_mz_domain_profile() #plt.show() for mspeak in mass_spectrum: if mspeak: for mf in mspeak: ax.plot(mspeak.mz_exp,
#access the transient object bruker_transient_obj = bruker_reader.get_transient() #calculates the transient duration time T = bruker_transient_obj.transient_time #access the mass spectrum object mass_spectrum_obj = bruker_transient_obj.get_mass_spectrum(plot_result=False, auto_process=True) # - search monoisotopic molecular formulas for all mass spectral peaks # - calculate fine isotopic structure based on monoisotopic molecular formulas found and current dynamic range # - search molecular formulas of correspondent calculated isotopologues, # - settings are stored at SearchConfig.json and can be changed directly on the file or inside the framework class SearchMolecularFormulas(mass_spectrum_obj, first_hit=False).run_worker_mass_spectrum() # iterate over mass spectral peaks objs for mspeak in mass_spectrum_obj.sort_by_abundance(): # returns true if there is at least one molecular formula associated # with the mass spectral peak # same as mspeak.is_assigned -- > bool if mspeak: # get the molecular formula with the highest mass accuracy molecular_formula = mspeak.molecular_formula_lowest_error # plot mz and peak height, use mass_spectrum_obj.mz_exp to access all mz # and mass_spectrum_obj.mz_exp_profile to access mz with all available datapoints pyplot.plot(mspeak.mz_exp, mspeak.abundance, 'o', c='g')
def single_process(mf_references_dict: Dict[str, Dict[float, List[MolecularFormula]]], datapath: Path, current_mix: str, mf_results_dic: dict):
    """Run the targeted MS1 search for one dataset and collect MS1/MS2 results.

    For each EIC peak of the LC-MS object, every non-isotopologue molecular
    formula assigned to the peak's MS1 spectrum is compared against the
    precursor m/z values of the scans dependent on the apex scan. For each
    matching precursor the MS2 spectrum is loaded, searched for fragment
    formulas, annotated, saved as PNG and CSV, and a result record is
    appended to ``mf_results_dic`` under the compound name. Formulas with no
    matching precursor get a record whose MS2 fields are ``None``.

    Parameters
    ----------
    mf_references_dict
        Target formulas grouped by mix name, then by m/z.
    datapath
        Path of the raw data file; its stem and parent are used in the
        output names.
    current_mix
        Key selecting the targets inside ``mf_references_dict``.
    mf_results_dic
        Dict of result-record lists keyed by compound name; mutated in place.

    Returns
    -------
    dict
        The same ``mf_results_dic`` object.
    """
    plt.rcParams["figure.figsize"] = (16, 8)

    # get target compounds mz and molecular formulas
    dict_tarrget_mzs = mf_references_dict.get(current_mix)

    target_mzs = dict_tarrget_mzs.keys()

    lcms_obj, parser = run_thermo(datapath, target_mzs)

    target_mzs = parser.selected_mzs

    # TODO need to convert this to a lcms object
    # NOTE(review): scan_number_mass_spectrum and results_list are not used
    # again in this function.
    scan_number_mass_spectrum = {}

    results_list = []

    # mz is from calculate mz
    tic_data, ax_tic = lcms_obj.get_tic(ms_type='MS !d', peak_detection=True, smooth=True, plot=False)

    eics_data, ax_eic = lcms_obj.get_eics(tic_data, smooth=True, plot=False, legend=False, peak_detection=True, ax=ax_tic)

    lcms_obj.process_ms1(dict_tarrget_mzs)

    #_write_frame_to_new_sheet(path_to_file="HILIC NEG Results.xlsx", sheet_name='all_eic_results', data=results_list)

    # TODO: create lcms and add dependent scans based on scan number
    # Add Adducts search, right now only working for de or protonated species
    # Export function with csv files

    precision_decimals = 0

    ms_peaks_assigned = SearchMolecularFormulasLC(
        lcms_obj).run_target_worker_ms1()

    for eic_peak in lcms_obj:

        dependent_scans = parser.iRawDataPlus.GetScanDependents(
            eic_peak.apex_scan, precision_decimals)

        mass_spectcrum_obj = eic_peak.mass_spectrum

        # precursor m/z -> scan index of the dependent scan
        # NOTE(review): filled here but never read afterwards.
        percursordata = {}

        for scan_dependent_detail in dependent_scans.ScanDependentDetailArray:

            for precursor_mz in scan_dependent_detail.PrecursorMassArray:

                percursordata[precursor_mz] = scan_dependent_detail.ScanIndex

        # NOTE(review): is_assigned is never read; the commented-out code that
        # used it (MS1 annotation, PNG and CSV export) has been summarised by
        # this note. The result records below still carry the literal string
        # 'ms1_output_file' under 'MS1 Output'.
        is_assigned = False

        scan = eic_peak.apex_scan

        for peak in mass_spectcrum_obj:

            for mf in peak:

                # only monoisotopic candidates are reported
                if not mf.is_isotopologue:

                    #error = MZSearch.calc_mz_error(mf.mz_calc, precursor_mz)
                    #check_error = MZSearch.check_ppm_error(LCMSParameters.lcms_obj.eic_tolerance_ppm, error)
                    #if check_error:
                    print(scan, mass_spectcrum_obj.retention_time, mf.name, mf.mz_calc, mf.mz_error, mf.confidence_score, mf.isotopologue_similarity)

                    dependent_scans = parser.iRawDataPlus.GetScanDependents(
                        scan, precision_decimals)

                    selected_for_ms2 = False

                    for scan_dependent_detail in dependent_scans.ScanDependentDetailArray:

                        for index, precursor_mz in enumerate(
                                scan_dependent_detail.PrecursorMassArray):

                            # isolation width relative to the precursor m/z, in ppm
                            error_ppm_window = (scan_dependent_detail.IsolationWidthArray[index] / precursor_mz) * 1000000

                            error = MZSearch.calc_mz_error(
                                mf.mz_calc, precursor_mz)

                            check_error = MZSearch.check_ppm_error(
                                error_ppm_window, error)

                            if check_error:

                                selected_for_ms2 = True

                                print(
                                    precursor_mz,
                                    scan_dependent_detail.ScanIndex,
                                    scan_dependent_detail.IsolationWidthArray[index],
                                    scan_dependent_detail.FilterString)

                                parser.chromatogram_settings.start_scan = scan_dependent_detail.ScanIndex

                                parser.chromatogram_settings.end_scan = scan_dependent_detail.ScanIndex

                                ms2_mass_spec = parser.get_centroid_msms_data(
                                    scan_dependent_detail.ScanIndex)

                                ax = ms2_mass_spec.plot_mz_domain_profile()

                                # NOTE(review): the "\ " inside the title looks like a
                                # collapsed backslash line continuation -- confirm
                                # against the original file.
                                ax.set_title(
                                    "Retention Time = {:.2f}, Precursor m/z = {:.4f}, Isolation window m/z = {:.1f} \ Target Molecule = {} m/z = {:.4f} Molecular formula {}\n "
                                    .format(
                                        eic_peak.retention_time, precursor_mz,
                                        scan_dependent_detail.IsolationWidthArray[index],
                                        mf.name, mf.mz_calc,
                                        mf.string_formated),
                                    fontsize=9,
                                )

                                #ms_peaks_assigned = SearchMolecularFormulas(mass_spectcrum_obj).search_mol_formulas( mf_references_list, ion_type, find_isotopologues=True)

                                # fragment search space: capped at the precursor
                                # formula's own atom counts
                                used_atoms = {
                                    'C': (1, mf.get('C')),
                                    'H': (1, mf.get('H'))
                                }

                                for atoms, value in mf.class_dict.items():
                                    used_atoms[atoms] = (0, value)

                                print(used_atoms)

                                ms2_mass_spec.molecular_search_settings.usedAtoms = used_atoms
                                ms2_mass_spec.molecular_search_settings.min_ppm_error = -15  #parser.chromatogram_settings.eic_tolerance_ppm
                                ms2_mass_spec.molecular_search_settings.max_ppm_error = 15  #parser.chromatogram_settings.eic_tolerance_ppm

                                # filters switched off / ratio limits opened up
                                # for the fragment search
                                ms2_mass_spec.molecular_search_settings.use_min_peaks_filter = False
                                ms2_mass_spec.molecular_search_settings.use_runtime_kendrick_filter = False
                                ms2_mass_spec.molecular_search_settings.min_hc_filter = -np.inf
                                ms2_mass_spec.molecular_search_settings.max_hc_filter = np.inf
                                ms2_mass_spec.molecular_search_settings.min_oc_filter = -np.inf
                                ms2_mass_spec.molecular_search_settings.max_oc_filter = np.inf
                                ms2_mass_spec.molecular_search_settings.isRadical = False

                                SearchMolecularFormulas(
                                    ms2_mass_spec, find_isotopologues=False
                                ).run_worker_mass_spectrum()

                                fragment_mz = []
                                fragment_formulas = []
                                fragment_error = []
                                cumulative_neutral_loss = []

                                for msmspeak in ms2_mass_spec:

                                    for mf_msms in msmspeak:

                                        fragment_mz.append(
                                            round(msmspeak.mz_exp, 6))
                                        fragment_formulas.append(
                                            mf_msms.string)
                                        fragment_error.append(mf_msms.mz_error)
                                        cumulative_neutral_loss.append(
                                            mf.subtract_formula(mf_msms))

                                        annotation = "{} {:.4f}".format(
                                            mf_msms.string, mf_msms.mz_error)

                                        ax.annotate(
                                            annotation,
                                            xy=(msmspeak.mz_exp,
                                                msmspeak.abundance),
                                            xytext=(
                                                -3,
                                                np.sign(msmspeak.abundance) *
                                                -3),
                                            textcoords="offset points",
                                            horizontalalignment="left",
                                            verticalalignment="bottom"
                                            if msmspeak.abundance > 0 else "top")

                                        print(mf_msms, mf_msms.mz_error,
                                              mf.subtract_formula(mf_msms))

                                ms2_output_file = '{}_{}_{}'.format(
                                    scan_dependent_detail.ScanIndex, 'MS2',
                                    datapath.stem)

                                result = {
                                    'Mix Name': current_mix,
                                    'Data Set': datapath.stem,
                                    'Compound Name': mf.name,
                                    'MS1 Scan': scan,
                                    'Retention Time': mass_spectcrum_obj.retention_time,
                                    'm/z': peak.mz_exp,
                                    'm/z Calculated': mf.mz_calc,
                                    'Mol. Formula': mf.string,
                                    'm/z Error': mf.mz_error,
                                    'Ion Type': mf.ion_type,
                                    'Confidence Score': mf.confidence_score,
                                    'Isotopologue Score': mf.isotopologue_similarity,
                                    'm/z Precursor': precursor_mz,
                                    'Isolation Window': scan_dependent_detail.IsolationWidthArray[index],
                                    'MS2 Scan': scan_dependent_detail.ScanIndex,
                                    'MS2 m/z': fragment_mz,
                                    'MS2 Mol. Formulas': fragment_formulas,
                                    'MS2 m/z error': fragment_error,
                                    'Cumulative Neutral Loss': cumulative_neutral_loss,
                                    'MS1 Output': 'ms1_output_file',
                                    'MS2 Output': ms2_output_file
                                }

                                # output folder: the data folder path with
                                # 'RAW Files' replaced by the results folder name
                                dir = Path(
                                    str(datapath.parent).replace(
                                        'RAW Files',
                                        'Results MS2 Noise Threshould'))

                                if not dir.exists():
                                    dir.mkdir(parents=True, exist_ok=True)

                                ms2_mass_spec.to_csv(
                                    str(dir) + '/' + ms2_output_file)

                                if mf.name not in mf_results_dic.keys():
                                    mf_results_dic[mf.name] = [result]
                                else:
                                    mf_results_dic[mf.name].append(result)

                                plt.tight_layout()
                                plt.savefig(
                                    str(dir) + '/' + ms2_output_file + '.png')
                                #plt.show()
                                plt.clf()

                    # save results without the fragmentation
                    if not selected_for_ms2:

                        result = {
                            'Mix Name': current_mix,
                            'Data Set': datapath.stem,
                            'Compound Name': mf.name,
                            'MS1 Scan': scan,
                            'Retention Time': mass_spectcrum_obj.retention_time,
                            'm/z': peak.mz_exp,
                            'm/z Calculated': mf.mz_calc,
                            'Mol. Formula': mf.string,
                            'm/z Error': mf.mz_error,
                            'Ion Type': mf.ion_type,
                            'Confidence Score': mf.confidence_score,
                            'Isotopologue Score': mf.isotopologue_similarity,
                            'm/z Precursor': None,
                            'Isolation Window': None,
                            'MS2 Scan': None,
                            'MS2 m/z': None,
                            'MS2 Mol. Formulas': None,
                            'MS2 m/z error': None,
                            'Cumulative Neutral Loss': None,
                            'MS1 Output': 'ms1_output_file',
                            'MS2 Output': None
                        }

                        if mf.name not in mf_results_dic.keys():
                            mf_results_dic[mf.name] = [result]
                        else:
                            mf_results_dic[mf.name].append(result)

    return mf_results_dic

    # NOTE(review): the loop below follows the return above, so it appears to
    # be unreachable; dict_res is also not defined anywhere in this function.
    # Its placement inside the function is assumed -- confirm and remove or
    # relocate.
    for molecule_name, data in dict_res.items():
        _write_frame_to_new_sheet(path_to_file='C18 Results.xlsx', sheet_name='molecular_formula_results', data=data)
def test_old_calibration():
    '''
    Smoke-test the legacy frequency-domain calibration workflow.

    Steps:
        - create a mass spectrum object
        - find oxygen-class reference peaks with FindOxygenPeaks
        - run the linear, step-fit, quadratic and Ledford calibrations
        - tighten the search settings, filter by Kendrick clustering
        - search all molecular formula candidates and drop assignments by
          mass error

    Returns
    -------
    Nothing

        Results are stored inside the mass spectrum object.
    '''

    def apply_search_settings(options):
        # Assign each option on the global molecular-search settings, in order.
        for option_name, option_value in options.items():
            setattr(MSParameters.molecular_search, option_name, option_value)

    # Wide first pass used to locate the calibration peaks.
    apply_search_settings({
        'error_method': 'None',
        'min_ppm_error': -5,
        'max_ppm_error': 5,
        'mz_error_range': 1,
        'isProtonated': True,
        'isRadical': True,
        'usedAtoms': {'C': (1, 100), 'H': (4, 200), 'O': (1, 10)},
    })

    mass_spectrum = create_mass_spectrum()

    find_formula_thread = FindOxygenPeaks(mass_spectrum)
    find_formula_thread.run()
    # find_formula_thread.join()
    mspeaks_results = find_formula_thread.get_list_found_peaks()

    calibrate = FreqDomain_Calibration(mass_spectrum, mspeaks_results)
    calibrate.linear()
    calibrate.step_fit()
    calibrate.quadratic(iteration=True)
    calibrate.ledford_calibration()

    # Narrower second pass for the actual assignment.
    apply_search_settings({
        'error_method': 'symmetrical',
        'min_ppm_error': -3,
        'max_ppm_error': 3,
        'mz_error_range': 1,
        'mz_error_average': 0,
        'min_abun_error': -30,  # percentage
        'max_abun_error': 70,  # percentage
        'isProtonated': True,
        'isRadical': True,
        'usedAtoms': {
            'C': (1, 100),
            'H': (4, 200),
            'O': (0, 20),
            'N': (0, 1),
            'S': (0, 0),
            'P': (0, 0),
        },
    })

    ClusteringFilter().filter_kendrick(mass_spectrum)

    searcher = SearchMolecularFormulas(mass_spectrum)
    searcher.run_worker_mass_spectrum()

    ClusteringFilter().remove_assignment_by_mass_error(mass_spectrum)
def single_process(mf_references_dict: Dict[str, Dict[float, List[MolecularFormula]]], datapath: Path,
                   current_mix: str, mf_results_dic: dict):
    """Targeted MS1/MS2 search of one raw file against one mix of reference formulas.

    Workflow:
        1. Extract ion chromatograms (``run_thermo``) for every target m/z of
           ``current_mix`` and record the apex retention times, scans and
           heights; the table is written to the 'all_eic_results' sheet of
           "HILIC NEG Results.xlsx".
        2. For every apex scan, load the MS1 spectrum once and accumulate the
           reference formulas expected in that scan.
        3. Search each MS1 spectrum for its reference formulas, annotate and
           save the plot and CSV when something was assigned.
        4. For every assigned non-isotopologue formula, look for dependent
           (MS2) scans whose precursor falls inside the isolation window,
           assign fragment formulas, and save the MS2 plot and CSV.

    Parameters
    ----------
    mf_references_dict
        Mix name -> {calculated m/z -> list of reference molecular formulas}.
    datapath
        Path of the raw data file; ``datapath.stem`` names the outputs and
        ``datapath.parent`` (with 'RAW Files' replaced) locates the output
        directories.
    current_mix
        Key of the mix to process in ``mf_references_dict``.
    mf_results_dic
        Accumulator mutated in place: compound name -> list of result dicts.

    Returns
    -------
    dict
        The same ``mf_results_dic`` object that was passed in.

    Notes
    -----
    Only (de)protonated species are searched (``Labels.protonated_de_ion``).
    Besides the files it writes, this function changes
    ``plt.rcParams["figure.figsize"]`` and the parser's chromatogram scan
    range.
    """
    plt.rcParams["figure.figsize"] = (16, 8)

    # get target compounds mz and molecular formulas
    dict_tarrget_mzs = mf_references_dict.get(current_mix)
    target_mzs = dict_tarrget_mzs.keys()

    eics_data, parser = run_thermo(datapath, target_mzs)

    # TODO need to convert this to a lcms object
    # scan number -> [MS1 mass spectrum, reference formulas expected in it]
    scan_number_mass_spectrum = {}

    results_list = []
    # mz is from calculate mz
    for mz, eic_data in eics_data.items():

        # all possible m/z from the same mix, should be one per m/z as per current lib
        possible_mf = dict_tarrget_mzs.get(mz)

        if eic_data.apexes:

            dict_res = {}

            names = [mf_obj.name for mf_obj in possible_mf]
            molecular_formulae = [mf_obj.string for mf_obj in possible_mf]
            # element 1 of each apex entry is used as the index into the EIC arrays
            rts = [eic_data.time[apex[1]] for apex in eic_data.apexes]
            scans = [eic_data.scans[apex[1]] for apex in eic_data.apexes]
            peak_height = [eic_data.eic[apex[1]] for apex in eic_data.apexes]

            dict_res["Mix Name"] = current_mix
            dict_res["Dataset"] = datapath.stem
            dict_res["Compound Name"] = names[0]
            dict_res["Neutral Formula"] = molecular_formulae[0]
            dict_res["Target m/z (de)protonated"] = round(mz, 6)
            dict_res["Retention Times"] = rts
            dict_res["Scans"] = scans
            dict_res["Peak Height"] = peak_height

            results_list.append(dict_res)

            for peak_index in eic_data.apexes:

                apex_index = peak_index[1]
                retention_time = eic_data.time[apex_index]
                original_scan = eic_data.scans[apex_index]

                if original_scan in scan_number_mass_spectrum:
                    # spectrum already loaded for this scan: only add the new targets
                    scan_number_mass_spectrum[original_scan][1].extend(possible_mf)
                else:
                    parser.chromatogram_settings.start_scan = original_scan
                    parser.chromatogram_settings.end_scan = original_scan
                    mass_spec = parser.get_average_mass_spectrum_in_scan_range()
                    mass_spec.min_ppm_error = -5
                    mass_spec.max_ppm_error = 5
                    mass_spec.retention_time = retention_time
                    # copy the list so later extend() calls do not mutate the reference library
                    scan_number_mass_spectrum[original_scan] = [mass_spec, list(possible_mf)]

    _write_frame_to_new_sheet(path_to_file="HILIC NEG Results.xlsx", sheet_name='all_eic_results', data=results_list)

    # TODO: create lcms and add dependent scans based on scan number
    # Add Adducts search, right now only working for de or protonated species
    # Export function with csv files

    ion_type = Labels.protonated_de_ion

    precision_decimals = 0

    for scan, ms_mf in scan_number_mass_spectrum.items():

        dependent_scans = parser.iRawDataPlus.GetScanDependents(scan, precision_decimals)

        mass_spectcrum_obj = ms_mf[0]
        mf_references_list = ms_mf[1]

        # NOTE(review): percursordata is only consumed by debug prints that are
        # currently disabled; kept so they can be re-enabled.
        percursordata = {}

        for scan_dependent_detail in dependent_scans.ScanDependentDetailArray:
            for precursor_mz in scan_dependent_detail.PrecursorMassArray:
                percursordata[precursor_mz] = scan_dependent_detail.ScanIndex

        ms_peaks_assigned = SearchMolecularFormulas(mass_spectcrum_obj).search_mol_formulas(
            mf_references_list, ion_type, find_isotopologues=True)

        ax = mass_spectcrum_obj.plot_mz_domain_profile()

        is_assigned = False
        target_title = 'Target Molecule(s) = '
        for peak in mass_spectcrum_obj:

            for mf in peak:
                is_assigned = True

                if not mf.is_isotopologue:
                    target_title += "{}-{} m/z = {:.4f}".format(mf.name, mf.string_formated, mf.protonated_mz)

                # "m/z" was previously written with an invalid "\z" escape
                annotation = ("Mol. Form = {}\nm/z = {:.4f}\nerror = {:.4f}\n"
                              "confidence score = {:.2f}\nisotopologue score = {:.2f}").format(
                                  mf.string_formated, peak.mz_exp, mf.mz_error,
                                  mf.confidence_score, mf.isotopologue_similarity)

                ax.annotate(annotation, xy=(peak.mz_exp, peak.abundance),
                            xytext=(+3, np.sign(peak.abundance) * -40), textcoords="offset points",
                            horizontalalignment="left",
                            verticalalignment="bottom" if peak.abundance > 0 else "top")

        if is_assigned:

            ms1_output_dir = Path(str(datapath.parent).replace('RAW Files', 'Results No Mix Overlap'))
            ms1_output_dir.mkdir(parents=True, exist_ok=True)

            ms1_output_file = '{}_{}_{}'.format(scan, 'MS1', datapath.stem)
            ms1_output_path = "{}/{}".format(ms1_output_dir, ms1_output_file)

            ax.set_title("Retention Time = {:.3f} {}".format(mass_spectcrum_obj.retention_time, target_title),
                         fontsize=9,)
            plt.tight_layout()
            plt.savefig(ms1_output_path + '.png')
            plt.clf()
            mass_spectcrum_obj.to_csv(ms1_output_path)

        else:
            # nothing assigned: discard the figure without saving it
            plt.clf()

        for peak in mass_spectcrum_obj:

            for mf in peak:

                if not mf.is_isotopologue:

                    print('YEAHHHHH')
                    print(scan, mass_spectcrum_obj.retention_time, mf.name, mf.mz_calc, mf.mz_error,
                          mf.confidence_score, mf.isotopologue_similarity)

                    dependent_scans = parser.iRawDataPlus.GetScanDependents(scan, precision_decimals)

                    selected_for_ms2 = False

                    for scan_dependent_detail in dependent_scans.ScanDependentDetailArray:

                        for index, precursor_mz in enumerate(scan_dependent_detail.PrecursorMassArray):

                            # isolation width expressed in ppm of the precursor m/z
                            error_ppm_window = (scan_dependent_detail.IsolationWidthArray[index] / precursor_mz) * 1000000

                            error = MZSearch.calc_mz_error(mf.mz_calc, precursor_mz)

                            check_error = MZSearch.check_ppm_error(error_ppm_window, error)

                            if check_error:

                                selected_for_ms2 = True
                                print(precursor_mz, scan_dependent_detail.ScanIndex,
                                      scan_dependent_detail.IsolationWidthArray[index],
                                      scan_dependent_detail.FilterString)

                                parser.chromatogram_settings.start_scan = scan_dependent_detail.ScanIndex
                                parser.chromatogram_settings.end_scan = scan_dependent_detail.ScanIndex

                                ms2_mass_spec = parser.get_centroid_msms_data(scan_dependent_detail.ScanIndex)

                                ax = ms2_mass_spec.plot_mz_domain_profile()

                                # Retention time comes from the MS1 spectrum of *this* scan
                                # (previously read from a stale variable of the EIC loop).
                                ax.set_title(
                                    ("Retention Time = {:.2f}, Precursor m/z = {:.4f}, Isolation window m/z = {:.1f} "
                                     "Target Molecule = {} m/z = {:.4f} Molecular formula {}\n ").format(
                                         mass_spectcrum_obj.retention_time,
                                         precursor_mz,
                                         scan_dependent_detail.IsolationWidthArray[index],
                                         mf.name, mf.mz_calc, mf.string_formated),
                                    fontsize=9,)

                                # fragments cannot hold more atoms than the precursor formula
                                used_atoms = {'C': (1, mf.get('C')), 'H': (1, mf.get('H'))}

                                for atoms, value in mf.class_dict.items():
                                    used_atoms[atoms] = (0, value)

                                print(used_atoms)
                                ms2_settings = ms2_mass_spec.molecular_search_settings
                                ms2_settings.usedAtoms = used_atoms
                                ms2_settings.min_ppm_error = -15  # parser.chromatogram_settings.eic_tolerance_ppm
                                ms2_settings.max_ppm_error = 15  # parser.chromatogram_settings.eic_tolerance_ppm
                                ms2_settings.use_min_peaks_filter = False
                                ms2_settings.use_runtime_kendrick_filter = False
                                ms2_settings.min_hc_filter = -np.inf
                                ms2_settings.max_hc_filter = np.inf
                                ms2_settings.min_oc_filter = -np.inf
                                ms2_settings.max_oc_filter = np.inf
                                ms2_settings.isRadical = False

                                SearchMolecularFormulas(ms2_mass_spec, find_isotopologues=False).run_worker_mass_spectrum()

                                fragment_mz = []
                                fragment_formulas = []
                                fragment_error = []
                                cumulative_neutral_loss = []

                                for msmspeak in ms2_mass_spec:

                                    for mf_msms in msmspeak:

                                        fragment_mz.append(round(msmspeak.mz_exp, 6))
                                        fragment_formulas.append(mf_msms.string)
                                        fragment_error.append(mf_msms.mz_error)
                                        cumulative_neutral_loss.append(mf.subtract_formula(mf_msms))

                                        annotation = "{} {:.4f}".format(mf_msms.string, mf_msms.mz_error)

                                        ax.annotate(annotation, xy=(msmspeak.mz_exp, msmspeak.abundance),
                                                    xytext=(-3, np.sign(msmspeak.abundance) * -3),
                                                    textcoords="offset points",
                                                    horizontalalignment="left",
                                                    verticalalignment="bottom" if msmspeak.abundance > 0 else "top")

                                        print(mf_msms, mf_msms.mz_error, mf.subtract_formula(mf_msms))

                                ms2_output_file = '{}_{}_{}'.format(scan_dependent_detail.ScanIndex, 'MS2', datapath.stem)

                                result = {'Mix Name': current_mix,
                                          'Data Set': datapath.stem,
                                          'Compound Name': mf.name,
                                          'MS1 Scan': scan,
                                          'Retention Time': mass_spectcrum_obj.retention_time,
                                          'm/z': peak.mz_exp,
                                          'm/z Calculated': mf.mz_calc,
                                          'Mol. Formula': mf.string,
                                          'm/z Error': mf.mz_error,
                                          'Ion Type': mf.ion_type,
                                          'Confidence Score': mf.confidence_score,
                                          'Isotopologue Score': mf.isotopologue_similarity,
                                          'm/z Precursor': precursor_mz,
                                          'Isolation Window': scan_dependent_detail.IsolationWidthArray[index],
                                          'MS2 Scan': scan_dependent_detail.ScanIndex,
                                          'MS2 m/z': fragment_mz,
                                          'MS2 Mol. Formulas': fragment_formulas,
                                          'MS2 m/z error': fragment_error,
                                          'Cumulative Neutral Loss': cumulative_neutral_loss,
                                          'MS1 Output': ms1_output_file,
                                          'MS2 Output': ms2_output_file}

                                ms2_output_dir = Path(str(datapath.parent).replace('RAW Files', 'Results MS2 Noise Threshould'))
                                ms2_output_dir.mkdir(parents=True, exist_ok=True)
                                ms2_output_path = "{}/{}".format(ms2_output_dir, ms2_output_file)

                                ms2_mass_spec.to_csv(ms2_output_path)

                                if mf.name not in mf_results_dic:
                                    mf_results_dic[mf.name] = [result]
                                else:
                                    mf_results_dic[mf.name].append(result)

                                plt.tight_layout()
                                plt.savefig(ms2_output_path + '.png')
                                plt.clf()

                    # save results without the fragmentation
                    if not selected_for_ms2:

                        result = {'Mix Name': current_mix,
                                  'Data Set': datapath.stem,
                                  'Compound Name': mf.name,
                                  'MS1 Scan': scan,
                                  'Retention Time': mass_spectcrum_obj.retention_time,
                                  'm/z': peak.mz_exp,
                                  'm/z Calculated': mf.mz_calc,
                                  'Mol. Formula': mf.string,
                                  'm/z Error': mf.mz_error,
                                  'Ion Type': mf.ion_type,
                                  'Confidence Score': mf.confidence_score,
                                  'Isotopologue Score': mf.isotopologue_similarity,
                                  'm/z Precursor': None,
                                  'Isolation Window': None,
                                  'MS2 Scan': None,
                                  'MS2 m/z': None,
                                  'MS2 Mol. Formulas': None,
                                  'MS2 m/z error': None,
                                  'Cumulative Neutral Loss': None,
                                  'MS1 Output': ms1_output_file,
                                  'MS2 Output': None}

                        if mf.name not in mf_results_dic:
                            mf_results_dic[mf.name] = [result]
                        else:
                            mf_results_dic[mf.name].append(result)

    # NOTE(review): a loop that followed this return (writing 'C18 Results.xlsx'
    # from dict_res) could never execute and was dropped.
    return mf_results_dic
def find_series_mspeaks(self, mass_spectrum_obj, molecular_formula_obj_reference, deltamz=14):
    """Pick one peak per nominal mass of a ``deltamz`` series and search formulas for them.

    Nominal masses are generated by stepping away from the reference
    formula's nominal m/z in both directions, ``deltamz`` at a time. The
    reference nominal mass itself is not part of the series, and each
    direction takes one step past the spectrum's m/z bound before stopping.
    For every nominal mass that has peaks, the most abundant peak is kept;
    peaks whose abundance exceeds mean + 7 * std of the spectrum abundance
    are ranked as if their abundance were 0.

    Parameters
    ----------
    mass_spectrum_obj
        Mass spectrum providing ``abundance``, ``min_mz_exp``,
        ``max_mz_exp``, slicing and ``get_nominal_mz_first_last_indexes``.
    molecular_formula_obj_reference
        Reference formula; only its ``mz_nominal_calc`` is read.
    deltamz
        Spacing of the series in nominal mass units (default 14).

    Returns
    -------
    list
        The selected peaks that evaluate as truthy after the molecular
        formula search (presumably those that received an assignment;
        confirm against the peak class).
    """
    intensities = mass_spectrum_obj.abundance
    mean_abundance = average(intensities, axis=0)
    abundance_spread = std(intensities, axis=0)
    abundance_cap = mean_abundance + 7 * abundance_spread

    lowest_mz = mass_spectrum_obj.min_mz_exp
    highest_mz = mass_spectrum_obj.max_mz_exp

    reference_mass = molecular_formula_obj_reference.mz_nominal_calc

    def walk_series(advance, within_bounds):
        # Step first, then emit: the starting mass is never yielded and the
        # last emitted value lies one step beyond the bound.
        mass = reference_mass
        while within_bounds(mass):
            mass = advance(mass)
            yield mass

    def capped_abundance(peak):
        # Outliers above the cap are ranked as zero abundance.
        return peak.abundance if peak.abundance <= abundance_cap else 0

    upward = walk_series(lambda m: m + deltamz, lambda m: m <= highest_mz)
    downward = walk_series(lambda m: m - deltamz, lambda m: m >= lowest_mz)
    nominal_masses = sorted([*upward, *downward])

    selected_peaks = []
    for nominal_mass in nominal_masses:
        start, stop = mass_spectrum_obj.get_nominal_mz_first_last_indexes(nominal_mass)
        peaks_in_window = mass_spectrum_obj[start:stop]
        if not peaks_in_window:
            continue
        selected_peaks.append(max(peaks_in_window, key=capped_abundance))

    print('Start molecular formula search')
    searcher = SearchMolecularFormulas(mass_spectrum_obj, self.sql_db)
    searcher.run_worker_ms_peaks(selected_peaks)
    print('Done molecular formula search')

    return list(filter(None, selected_peaks))