def smart_roi_evaluation(param_dict):
    """Simulate a TopN SmartROI run and return its coverage score.

    Args:
        param_dict: dict of settings; must contain pickled inputs
            ('mass_spec_file', 'params_file'), all TopN_SmartRoiController
            constructor arguments, the RT window ('min_rt', 'max_rt'),
            'save_file_name' for the simulation output, plus 'box_file' and
            'half_isolation_window' for the coverage evaluation.

    Returns:
        The coverage value computed by run_coverage_evaluation.
    """
    p = param_dict
    mass_spec = load_obj(p['mass_spec_file'])
    controller_params = load_obj(p['params_file'])
    controller = TopN_SmartRoiController(
        p['ionisation_mode'], p['isolation_width'], p['mz_tol'],
        p['min_ms1_intensity'], p['min_roi_intensity'], p['min_roi_length'],
        N=p['N'], rt_tol=p['rt_tol'],
        min_roi_length_for_fragmentation=p['min_roi_length_for_fragmentation'],
        reset_length_seconds=p['reset_length_seconds'],
        intensity_increase_factor=p['intensity_increase_factor'],
        drop_perc=p['drop_perc'], ms1_shift=p['ms1_shift'],
        params=controller_params)
    # run the simulated acquisition, then score the written output
    run_env(mass_spec, controller, p['min_rt'], p['max_rt'],
            p['save_file_name'])
    return run_coverage_evaluation(p['box_file'], p['save_file_name'],
                                   p['half_isolation_window'])
def run_experiment(self, idx):
    """Run (or skip) one scheduled experiment and return its mzML path.

    Looks up the sample at row *idx* of self.controller_schedule; if a
    matching .mzML file does not already exist in self.base_dir, builds the
    controller via the parent class, simulates the run, and writes the mzML
    (optionally pickling the controller when self.write_env is set).

    Returns:
        (mzml_file, controller_name): path to the output mzML and the
        sample ID it was named after. Note the path is returned even when
        the method/dataset were None and no file was written — callers
        should be prepared for a missing file in that case.
    """
    controller_name = self.controller_schedule['Sample ID'][idx]
    # scan the output directory for already-completed runs
    mzml_files = glob.glob(os.path.join(self.base_dir, '*.mzML'))
    if controller_name + '.mzML' not in [
            os.path.basename(file) for file in mzml_files
    ]:
        controller, ms_params = super().run_experiment(idx)
        # load data and set up MS
        logger.info(self.controller_schedule.iloc[[idx]].to_dict())
        method = self.controller_schedule['Controller Method'][idx]
        dataset = self.controller_schedule['Dataset'][idx]
        if method is not None and dataset is not None:
            # the schedule stores a path to a pickled dataset
            dataset = load_obj(self.controller_schedule['Dataset'][idx])
            mass_spec = IndependentMassSpectrometer(
                ms_params['ionisation_mode'], dataset)
            # Run sample over the first (and presumably only) RT range
            env = Environment(mass_spec, controller, self.rt_range[0][0],
                              self.rt_range[0][1],
                              progress_bar=self.progress_bar)
            env.run()
            env.write_mzML(self.base_dir, controller_name + '.mzML')
            if self.write_env:
                # keep the full controller state for later inspection
                save_obj(
                    controller,
                    os.path.join(self.base_dir, controller_name + '.p'))
    else:
        logger.info('Experiment already completed. Skipping...')
    mzml_file = os.path.join(self.base_dir, controller_name + '.mzML')
    return mzml_file, controller_name
def load_controller(results_dir, experiment_name, N, rt_tol):
    """Load a pickled controller for one experiment configuration.

    The pickle is expected at
    <results_dir>/experiment_<name>_N_<N>_rttol_<rt_tol>.p.

    Returns:
        The unpickled controller, or None when the file does not exist.
    """
    analysis_name = 'experiment_%s_N_%d_rttol_%d' % (experiment_name, N,
                                                     rt_tol)
    pickle_in = '%s/%s.p' % (results_dir, analysis_name)
    print('Loading %s' % analysis_name)
    controller = None
    try:
        controller = load_obj(pickle_in)
    except FileNotFoundError:
        # missing result file is an expected case, not an error
        pass
    return controller
def weighted_dew_evaluation(param_dict):
    """Simulate a WeightedDEW run and return its coverage score.

    Args:
        param_dict: dict of settings; must contain pickled inputs
            ('mass_spec_file', 'params_file'), the WeightedDEWController
            constructor arguments, the RT window ('min_rt', 'max_rt'),
            'save_file_name', plus 'box_file' and 'half_isolation_window'
            for the coverage evaluation.

    Returns:
        The coverage value computed by run_coverage_evaluation.
    """
    p = param_dict
    mass_spec = load_obj(p['mass_spec_file'])
    controller_params = load_obj(p['params_file'])
    controller = WeightedDEWController(
        p['ionisation_mode'], p['N'], p['isolation_width'], p['mz_tol'],
        p['rt_tol'], p['min_ms1_intensity'],
        exclusion_t_0=p['exclusion_t_0'],
        log_intensity=p['log_intensity'],
        params=controller_params)
    # run the simulated acquisition, then score the written output
    run_env(mass_spec, controller, p['min_rt'], p['max_rt'],
            p['save_file_name'])
    return run_coverage_evaluation(p['box_file'], p['save_file_name'],
                                   p['half_isolation_window'])
def _load_ROI_file(self, file_index, roi_rt_range=None):
    """Load the pickled ROI file addressed by a global index.

    *file_index* indexes the concatenation of the '*.p' files found in each
    directory of self.ROI_sources, in order.

    Args:
        file_index: zero-based index across all ROI sources.
        roi_rt_range: optional RT range passed to self._filter_ROI.

    Returns:
        The loaded (and optionally filtered) ROI object, or None when
        file_index is beyond the total number of ROI files.
    """
    num_ROI = 0
    for i in range(len(self.ROI_sources)):
        ROI_files = list(Path(self.ROI_sources[i]).glob('*.p'))
        len_ROI = len(ROI_files)
        # BUG FIX: the original condition was `len_ROI > file_index`, which
        # ignores the files already skipped in earlier sources (num_ROI).
        # For any file_index beyond the first source's count the function
        # silently fell through and returned None. The index must fall
        # inside [num_ROI, num_ROI + len_ROI).
        if num_ROI + len_ROI > file_index:
            ROI_file = ROI_files[file_index - num_ROI]
            ROI = load_obj(ROI_file)
            # logger.debug("Loaded {}".format(ROI_file))
            if roi_rt_range is not None:
                ROI = self._filter_ROI(ROI, roi_rt_range)
            return ROI
        num_ROI += len_ROI
def top_n_evaluation(param_dict):
    """Simulate a TopN run and return either coverage or chemical coverage.

    Args:
        param_dict: dict of settings; must contain pickled inputs
            ('mass_spec_file', 'params_file'), the TopNController
            constructor arguments, the RT window, 'save_file_name',
            'box_file', 'half_isolation_window', and 'coverage_type'
            ('coverage' selects the box coverage, anything else selects
            the chemical coverage reported by run_env).

    Returns:
        coverage or chemical_coverage depending on 'coverage_type'.
    """
    p = param_dict
    mass_spec = load_obj(p['mass_spec_file'])
    controller_params = load_obj(p['params_file'])
    controller = TopNController(p['ionisation_mode'], p['N'],
                                p['isolation_width'], p['mz_tol'],
                                p['rt_tol'], p['min_ms1_intensity'],
                                params=controller_params)
    chemical_coverage = run_env(mass_spec, controller, p['min_rt'],
                                p['max_rt'], p['save_file_name'])
    coverage = run_coverage_evaluation(p['box_file'], p['save_file_name'],
                                       p['half_isolation_window'])
    print('coverage', coverage)
    print('chemical_coverage', chemical_coverage)
    return coverage if p['coverage_type'] == 'coverage' else chemical_coverage
def make_queries_from_chemicals(chemicals_file_name):
    """Build query Spectrum objects from a pickled chemicals list.

    For each chemical the precursor m/z is its first isotope's m/z, and one
    peak is created per MS2 child, with intensity scaled by the child's
    prop_ms2_mass against the parent's max_intensity.

    Args:
        chemicals_file_name: path to a pickled iterable of chemicals.

    Returns:
        list of Spectrum objects, one per chemical.
    """
    chemicals = load_obj(chemicals_file_name)
    query_spectra = []
    for chem in chemicals:
        parent_mz = chem.isotopes[0][0]
        fragment_peaks = [
            np.array([child.isotopes[0][0],
                      child.parent.max_intensity * child.prop_ms2_mass])
            for child in chem.children
        ]
        query_spectra.append(Spectrum(parent_mz, fragment_peaks))
    return query_spectra
def fragmentation_performance_aligned(param_dict):
    """Flag which aligned chemicals were fragmented above a threshold.

    Loads a pickled controller and an aligned-chemicals CSV, then marks each
    aligned chemical as found (1) when at least one MS2 fragmentation event
    falls inside its RT/mz peak box with intensity above
    min_acceptable_intensity.

    Args:
        param_dict: must contain 'controller_directory' (pickled controller
            path), 'min_acceptable_intensity', and
            'aligned_chemicals_location' (CSV with peak_rtmin/rtmax and
            peak_mzmin/mzmax columns).

    Returns:
        list of 0/1 flags, one per row of the aligned-chemicals CSV.
    """
    controller = load_obj(param_dict["controller_directory"])
    min_acceptable_intensity = param_dict["min_acceptable_intensity"]
    aligned_chemicals = pd.read_csv(param_dict["aligned_chemicals_location"])
    n_chemicals_aligned = len(aligned_chemicals["mzmed"])
    # hoist the repeated attribute chain out of the loops
    mass_spec = controller.environment.mass_spec
    # keep only MS2 fragmentation events
    events = np.array([
        event for event in mass_spec.fragmentation_events
        if event.ms_level == 2
    ])
    event_query_rts = np.array([event.query_rt for event in events])
    event_query_mzs = np.array([
        mass_spec._get_mz(event.chem, event.query_rt, 0, 0)
        for event in events
    ])
    # FIX: removed the dead `chemicals_found = 0` assignment that was
    # immediately shadowed by this list
    chemicals_found = [0 for i in range(n_chemicals_aligned)]
    for aligned_index in range(n_chemicals_aligned):
        rtmin = aligned_chemicals['peak_rtmin'][aligned_index]
        rtmax = aligned_chemicals['peak_rtmax'][aligned_index]
        mzmin = aligned_chemicals['peak_mzmin'][aligned_index]
        mzmax = aligned_chemicals['peak_mzmax'][aligned_index]
        # indices of events strictly inside this chemical's RT/mz box
        idx = np.nonzero((event_query_rts > rtmin) &
                         (event_query_rts < rtmax) &
                         (event_query_mzs > mzmin) &
                         (event_query_mzs < mzmax))[0]
        for i in idx:
            event = events[i]
            inten = mass_spec._get_intensity(event.chem, event.query_rt,
                                             0, 0)
            if inten > min_acceptable_intensity:
                chemicals_found[aligned_index] = 1
                break
    return chemicals_found
def run_vimms(no_injections, rt_box_size, mz_box_size):
    """Simulate repeated injections under a NonOverlapController.

    Samples 2000 chemicals from the bundled HMDB fixture once, then runs
    *no_injections* independent acquisitions that share one exclusion grid,
    collecting the fragmented-ROI boxes from each run.

    Args:
        no_injections: number of simulated injections.
        rt_box_size: RT bin size for the shared LocatorGrid.
        mz_box_size: m/z bin size for the shared LocatorGrid.

    Returns:
        list (one entry per injection) of lists of ROI boxes.
    """
    min_rt, max_rt = 0, 1440
    ionisation_mode = POSITIVE
    isolation_width = 1
    N = 10
    rt_tol = 15
    mz_tol = 10
    min_ms1_intensity = 5000
    min_roi_intensity = 500
    min_roi_length = 3
    min_roi_length_for_fragmentation = 3

    # one grid shared across injections; drift model is the identity
    grid = GridEstimator(
        LocatorGrid(min_rt, max_rt, rt_box_size, 0, 3000, mz_box_size),
        IdentityDrift())

    hmdbpath = os.path.join(os.path.abspath(os.getcwd()), "..", "..",
                            "tests", "fixtures", "hmdb_compounds.p")
    hmdb = load_obj(hmdbpath)
    formula_sampler = DatabaseFormulaSampler(hmdb, min_mz=100, max_mz=1000)
    mixture_creator = ChemicalMixtureCreator(
        formula_sampler, adduct_prior_dict={POSITIVE: {"M+H": 1}})
    chemicals = mixture_creator.sample(2000, 1)

    boxes = []
    for _ in range(no_injections):
        mass_spec = IndependentMassSpectrometer(
            POSITIVE, chemicals, mz_noise=GaussianPeakNoise(0.1))
        controller = NonOverlapController(
            ionisation_mode, isolation_width, mz_tol, min_ms1_intensity,
            min_roi_intensity, min_roi_length, N, grid, rt_tol=rt_tol,
            min_roi_length_for_fragmentation=min_roi_length_for_fragmentation)
        env = Environment(mass_spec, controller, min_rt, max_rt,
                          progress_bar=True)
        set_log_level_warning()
        env.run()
        boxes.append([r.to_box(0.01, 0.01)
                      for r in controller.roi_builder.get_rois()])
    return boxes
def fragmentation_performance_chemicals(controller_directory,
                                        min_acceptable_intensity,
                                        controller_file_spec="*.p"):
    """Count chemicals fragmented above a threshold across pickled runs.

    Loads every controller pickle matching *controller_file_spec* in
    *controller_directory* (note: chdir side effect), then uses chemical RTs
    as identity proxies to count unique fragmented chemicals, cumulatively
    over samples 0..i.

    Args:
        controller_directory: directory to chdir into and search.
        min_acceptable_intensity: intensity threshold for an MS2 event to
            count as a successful fragmentation.
        controller_file_spec: glob pattern for controller pickles.

    Returns:
        (chemicals_found_total, total_matched_chemicals): the unique RTs of
        all chemicals seen, and the cumulative unique-fragmented-chemical
        counts per sample. Also assigns the latter to the module-level
        global `total_matched_chemicals` (pre-existing interface, kept).
    """
    global total_matched_chemicals
    os.chdir(controller_directory)
    file_names = glob.glob(controller_file_spec)
    n_samples = len(file_names)
    controllers = []
    all_chemicals = []
    for controller_index in range(n_samples):
        controller = load_obj(file_names[controller_index])
        controllers.append(controller)
        all_chemicals.extend(controller.environment.mass_spec.chemicals)
    all_rts = [chem.rt for chem in all_chemicals]
    # unique RTs stand in for unique chemicals across samples
    chemicals_found_total = np.unique(all_rts)
    sample_chemical_start_rts = [[] for i in range(n_samples)]
    sample_chemical_start_rts_total = []
    for i in range(n_samples):
        # NOTE(review): chemicals are read via controller.environment.
        # mass_spec above but events via controllers[i].mass_spec here —
        # confirm both attribute paths are valid on the controller
        for event in controllers[i].mass_spec.fragmentation_events:
            if event.ms_level == 2:
                if controllers[i].mass_spec._get_intensity(
                        event.chem, event.query_rt, 0,
                        0) > min_acceptable_intensity:
                    sample_chemical_start_rts[i].append(event.chem.rt)
        sample_chemical_start_rts[i] = np.unique(
            np.array(sample_chemical_start_rts[i])).tolist()
        # at this point we have collected the RTs of the all the chemicals
        # that have been fragmented above the min_intensity threshold;
        # accumulate the unique count over samples 0..i
        flatten_rts = []
        for l in sample_chemical_start_rts[0:(i + 1)]:
            flatten_rts.extend(l)
        sample_chemical_start_rts_total.append(
            len(np.unique(np.array(flatten_rts))))
        total_matched_chemicals = sample_chemical_start_rts_total
        # BUG FIX: loguru formats positional args with str.format, so the
        # original `logger.debug("Completed Controller", i + 1)` silently
        # dropped the controller number — a placeholder is required
        logger.debug("Completed Controller {}", i + 1)
    return chemicals_found_total, total_matched_chemicals
type=str) parser.add_argument('--spike_max', dest='spike_max', default=1000, type=float) parser.add_argument('--output_swath_file', dest='output_swath_file', type=str, default=None) parser.add_argument('--print_chems', dest='print_chems', action='store_true') args = parser.parse_args() formula_database = load_obj(args.formula_database_file) logger.debug("Loaded {} formulas".format(len(formula_database))) fs = DatabaseFormulaSampler(formula_database, min_mz=args.min_mz, max_mz=args.max_mz) ri = UniformRTAndIntensitySampler( min_rt=args.min_rt, max_rt=args.max_rt, min_log_intensity=np.log(args.min_ms1_sampling_intensity), max_log_intensity=np.log(args.max_ms1_sampling_intensity)) cs = UniformMS2Sampler() cm = ChemicalMixtureCreator(fs,
import pytest
from loguru import logger
from vimms.ChemicalSamplers import UniformMZFormulaSampler, UniformRTAndIntensitySampler, \
    GaussianChromatogramSampler, EvenMZFormulaSampler, ConstantChromatogramSampler, \
    MZMLFormulaSampler, MZMLRTandIntensitySampler, MZMLChromatogramSampler
from vimms.Chemicals import ChemicalMixtureCreator, ChemicalMixtureFromMZML
from vimms.Common import load_obj, set_log_level_warning, set_log_level_debug, \
    ADDUCT_DICT_POS_MH, ScanParameters
from vimms.Roi import RoiParams

# define some useful constants
# NOTE(review): `os` and `Path` are used below but not imported in this
# chunk — presumably imported above the visible region; verify.
DIR_PATH = os.path.dirname(os.path.realpath(__file__))
BASE_DIR = os.path.abspath(Path(DIR_PATH, 'fixtures'))
# HMDB compound fixture loaded once at import time (module-level side effect)
HMDB = load_obj(Path(BASE_DIR, 'hmdb_compounds.p'))
OUT_DIR = str(Path(DIR_PATH, 'results'))
ROI_SOURCES = [str(Path(BASE_DIR, 'beer_t10_simulator_files'))]
# MIN_MS1_INTENSITY = 1.75E5
MIN_MS1_INTENSITY = 1
MZ_RANGE = [(0, 1050)]
RT_RANGE = [(0, 1200)]
CENTRE_RANGE = 600
MIN_RT = RT_RANGE[0][0]
MAX_RT = RT_RANGE[0][1]
N_CHEMS = 10
# pre-extracted beer chemicals fixture, also loaded at import time
BEER_CHEMS = load_obj(Path(BASE_DIR, 'QCB_22May19_1.p'))
BEER_MIN_BOUND = 550
def calculate_performance(params):
    """Compute fragmentation performance for one parameter configuration.

    Loads (or extracts) the chemicals and the fragmentation events named in
    *params*, then scores them under scenario 1 or 2.

    Args:
        params: dict holding file paths ('fragfile', 'fullscan_filename',
            'controller_file', 'chemicals_file'), ROI extraction settings,
            matching tolerances, peak DataFrames and the 'scenario' number.

    Returns:
        (N, rt_tol, scenario, tp, fp, fn, prec, rec, f1). The six metric
        values stay 0 when 'scenario' is neither 1 nor 2.
    """
    # get parameters
    fragfile = params['fragfile']
    N = params['N']
    rt_tol = params['rt_tol']
    roi_mz_tol = params['roi_mz_tol']
    roi_min_ms1_intensity = params['roi_min_ms1_intensity']
    fragmentation_min_ms1_intensity = params['fragmentation_min_ms1_intensity']
    min_rt = params['min_rt']
    max_rt = params['max_rt']
    roi_min_length = params['roi_min_length']
    fullscan_filename = params['fullscan_filename']
    P_peaks_df = params['P_peaks_df']
    Q_peaks_df = params['Q_peaks_df']
    matching_mz_tol = params['matching_mz_tol']
    matching_rt_tol = params['matching_rt_tol']
    scenario = params['scenario']
    controller_file = params['controller_file']
    chemicals_file = params['chemicals_file']
    # a '.p' extension means a pre-pickled chemicals list; anything else is
    # treated as raw data to extract chemicals from
    if chemicals_file.endswith('.p'):
        print('Loading chemicals')
        chemicals = load_obj(chemicals_file)
    else:
        print('Extracting chemicals')
        chemicals = get_chemicals(chemicals_file, roi_mz_tol,
                                  roi_min_ms1_intensity, min_rt, max_rt,
                                  min_length=roi_min_length)
    if type(chemicals) == list:
        chemicals = np.array(chemicals)
    # likewise, a pickled controller carries its own fragmentation events;
    # otherwise they are reconstructed from the precursor information
    if controller_file.endswith('.p'):
        print('Loading fragmentation events')
        controller = load_obj(controller_file)
        chem_to_frag_events = None
    else:
        print('Extracting fragmentation events')
        controller = None
        precursor_df = get_precursor_info(controller_file)
        chem_to_frag_events = get_chem_to_frag_events(chemicals, precursor_df)
    # compute performance under each scenario
    print('Computing performance under scenario %d' % scenario)
    tp, fp, fn, prec, rec, f1 = 0, 0, 0, 0, 0, 0
    if scenario == 1:
        tp, fp, fn, prec, rec, f1 = compute_performance_scenario_1(
            controller, chemicals, fragmentation_min_ms1_intensity,
            fullscan_filename, P_peaks_df, matching_mz_tol, matching_rt_tol,
            chem_to_frag_events=chem_to_frag_events)
    elif scenario == 2:
        fragfile_filename = os.path.basename(fragfile)
        tp, fp, fn, prec, rec, f1 = compute_performance_scenario_2(
            controller, chemicals, fragmentation_min_ms1_intensity,
            fullscan_filename, fragfile_filename, P_peaks_df, Q_peaks_df,
            matching_mz_tol, matching_rt_tol,
            chem_to_frag_events=chem_to_frag_events)
    return N, rt_tol, scenario, tp, fp, fn, prec, rec, f1
def main():
    """CLI entry point: match query MS2 spectra against spectral libraries.

    Reads MS2 spectra from one or more .mzML or .mgf files, matches each
    spectrum against every cached library pickle, and writes the hits
    (spec_id, library, hit_id, score, inchikey) to a CSV file.
    """
    global file_spectra
    parser = argparse.ArgumentParser(description='Limited dataset creation')
    parser.add_argument('input_file_names', type=str)
    parser.add_argument('library_cache', type=str)
    parser.add_argument('libraries', type=str, nargs='+')
    parser.add_argument('--score_thresh', dest='score_thresh', type=float,
                        default=0.7)
    parser.add_argument('--ms1_tol', dest='ms1_tol', type=float, default=1.)
    parser.add_argument('--ms2_tol', dest='ms2_tol', type=float, default=0.2)
    parser.add_argument('--min_matched_peaks', dest='min_matched_peaks',
                        type=int, default=1)
    parser.add_argument('--output_csv_file', dest='output_csv_file', type=str,
                        default='hits.csv')
    parser.add_argument('--log_level', dest='log_level', type=str,
                        default='warning')
    parser.add_argument('--mgf_id_field', dest='mgf_id_field', type=str,
                        default='SCANS')
    args = parser.parse_args()
    # input files may be given as a single comma-separated string
    input_file_names = args.input_file_names
    if ',' in input_file_names:
        # multiple items
        input_file_names = input_file_names.split(',')
    else:
        # single item
        input_file_names = [input_file_names]
    assert len(input_file_names) > 0
    # assume all the files have the same extension as the first one
    first = input_file_names[0]
    root, ext = os.path.splitext(first)
    if ext.lower() == '.mzml':
        query_spectra = {}
        for input_file_name in input_file_names:
            # load the ms2 scans from the .mzML
            file_spectra = load_scans_from_mzml(input_file_name)
            logger.warning("Loaded {} MS2 spectra from {}".format(
                len(file_spectra), input_file_name))
            query_spectra[input_file_name] = file_spectra
    elif ext.lower() == '.mgf':
        query_spectra = {}
        for input_file_name in input_file_names:
            # load the ms2 scans from the .mgf
            file_spectra = load_mgf(input_file_name,
                                    id_field=args.mgf_id_field, spectra={})
            logger.warning("Loaded {} MS2 spectra from {}".format(
                len(file_spectra), input_file_name))
            query_spectra[input_file_name] = file_spectra
    else:
        logger.warning("Unknown input file format -- should be .mzML or .mgf")
        sys.exit(0)
    if args.log_level == 'warning':
        set_log_level_warning()
    elif args.log_level == 'debug':
        set_log_level_debug()
    # load every requested library pickle from the cache folder;
    # any missing library (or folder) aborts the run
    libraries = args.libraries
    spec_libraries = {}
    if args.library_cache is not None:
        for library in libraries:
            # attempt to load library
            lib_file = os.path.join(args.library_cache, library + '.p')
            if os.path.isfile(lib_file):
                logger.warning("Loading {}".format(lib_file))
                spec_libraries[library] = load_obj(lib_file)
                logger.warning("Loaded {}".format(lib_file))
            else:
                logger.warning("Could not find {}".format(lib_file))
                sys.exit(0)
    else:
        logger.warning("You must supply a library folder")
        sys.exit(0)
    # match every query spectrum against every library
    all_hits = []
    for input_file_name in query_spectra.keys():
        file_spectra = query_spectra[input_file_name]
        logger.warning('Processing {}'.format(input_file_name))
        for spec_id in tqdm(file_spectra.keys()):
            for library in spec_libraries:
                hits = spec_libraries[library].spectral_match(
                    file_spectra[spec_id],
                    score_thresh=args.score_thresh,
                    ms2_tol=args.ms2_tol,
                    ms1_tol=args.ms1_tol,
                    min_match_peaks=args.min_matched_peaks)
                for hit in hits:
                    # hit tuple: (id, score, spectrum-with-metadata)
                    new_hit = [
                        spec_id, library, hit[0], hit[1],
                        hit[2].metadata['inchikey']
                    ]
                    all_hits.append(new_hit)
    if len(all_hits) == 0:
        logger.warning("No hits found!")
    else:
        logger.warning('Writing output to {}'.format(args.output_csv_file))
        with open(args.output_csv_file, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(
                ['spec_id', 'library', 'hit_id', 'score', 'inchikey'])
            for hit in all_hits:
                writer.writerow(hit)
        # summary
        s, _, t, sc, ik = zip(*all_hits)
        logger.warning("{} unique spectra got hits".format(len(set(s))))
        logger.warning("{} unique structures were hit".format(
            len(set([a.split('-')[0] for a in ik if a is not None]))))