def run_environment(env):
    """
    Run a simulation environment with logging quietened to WARNING.

    The log level is switched to WARNING before the run and switched back
    to DEBUG afterwards, including when the run raises.

    :param env: the environment to run; only its ``run()`` method is called
    :return: None
    """
    # set the log level to WARNING so we don't see too many messages when
    # the environment is running
    set_log_level_warning()
    try:
        # run the simulation
        logger.info('Running simulation')
        env.run()
        logger.info('Done')
    finally:
        # always set the log level back to DEBUG, even if env.run() fails,
        # so a failed run does not leave logging silenced for later code
        set_log_level_debug()
def test_target_creation(self):
    """Check that a non-empty target list is created from each StdMix file."""
    toxid_names = (
        'StdMix1_pHILIC_Current.csv',
        'StdMix2_pHILIC_Current.csv',
        'StdMix3_pHILIC_Current.csv',
    )
    targets = None
    for toxid_name in toxid_names:
        targets = create_targets_from_toxid(Path(BASE_DIR, toxid_name))
        assert len(targets) > 0

    # log the m/z of the last target created from the last file
    set_log_level_debug()
    logger.debug(targets[-1].mz)
def test_TopNDEW_agent(self):
    """
    Reuse one TopNDEWAgent across three consecutive controllers.

    Each run builds a fresh controller, mass spec and environment around the
    same agent and dataset, then checks the scans and the written mzML file.
    """
    set_log_level_debug()
    mixture_creator = ChemicalMixtureCreator(
        UniformMZFormulaSampler(),
        rt_and_intensity_sampler=UniformRTAndIntensitySampler(min_rt=0,
                                                              max_rt=80),
        chromatogram_sampler=GaussianChromatogramSampler(sigma=1),
        ms2_sampler=FixedMS2Sampler())
    dataset = mixture_creator.sample(500, 2)
    ionisation_mode = POSITIVE

    # Example shows how the same Agent object can be used in consecutive
    # controllers
    agent = TopNDEWAgent(ionisation_mode, 10, 0.7, 10, 15, 1500)
    spike_noise = UniformSpikeNoise(0.1, 1000)

    # (output file name, whether MS2 scans are checked for this run)
    runs = (
        ('shell.mzML', True),
        ('shell2.mzML', True),
        # ms2 scans have been exhausted at this point
        ('shell3.mzML', False),
    )
    for mzml_name, expect_ms2 in runs:
        controller = AgentBasedController(agent)
        mass_spec = IndependentMassSpectrometer(ionisation_mode, dataset,
                                                spike_noise=spike_noise)
        env = Environment(mass_spec, controller, 0, 100, progress_bar=True)
        set_log_level_warning()
        env.run()

        check_non_empty_MS1(controller)
        if expect_ms2:
            check_non_empty_MS2(controller)
        check_mzML(env, OUT_DIR, mzml_name)
def run_WeightedDEW(chems, scan_duration, params, out_dir):
    """
    Simulate the WeightedDEW controller for every (t0, rt_tol) combination.

    One parameter set is built per combination of ``params['t0_values']`` and
    ``params['rt_tol_values']`` and handed to ``run_single_WeightedDEW``.
    The runs are attempted in parallel through ipyparallel; if ipyparallel is
    not installed, the cluster cannot be reached, or the request times out,
    the runs are executed serially instead.

    :param chems: a list of UnknownChemicals present in the injection
    :param scan_duration: stored under ``'scan_duration'`` in every parameter
        set passed to ``run_single_WeightedDEW``
    :param params: a dictionary of parameters; must contain ``'t0_values'``
        and ``'rt_tol_values'``
    :param out_dir: output directory
    :return: None
    """
    logger.info('Running WeightedDEW simulation')
    logger.info(params)

    warn_handler_id = set_log_level_warning()
    try:
        params_list = []
        for t0 in params['t0_values']:
            for r in params['rt_tol_values']:
                # copy params and add additional attributes we need
                copy_params = dict(params)
                copy_params.update({
                    't0': t0,
                    'r': r,
                    'chems': chems,
                    'scan_duration': scan_duration,
                    'out_dir': out_dir,
                })
                params_list.append(copy_params)

        # Try to run the controllers in parallel. If fails, then run it
        # serially. The flag must start as False: previously it was only
        # assigned in the except branches, so a successful parallel run
        # crashed with UnboundLocalError at the check below.
        run_serial = False
        logger.warning('Running controllers in parallel, please wait ...')
        try:
            import ipyparallel as ipp
        except ImportError:  # ipyparallel is not installed
            run_serial = True
        else:
            try:
                rc = ipp.Client()
                dview = rc[:]  # use all engines
                with dview.sync_imports():
                    pass
                dview.map_sync(run_single_WeightedDEW, params_list)
            except OSError:  # cluster has not been started
                run_serial = True
            except ipp.error.TimeoutError:  # takes too long to run
                run_serial = True

        if run_serial:  # if any exception from above, run it serially
            logger.warning(
                'IPython cluster not found, running controllers in serial mode')
            for copy_params in params_list:
                run_single_WeightedDEW(copy_params)
    finally:
        # restore DEBUG logging even if one of the runs raised
        set_log_level_debug(remove_id=warn_handler_id)
def run_experiment(param):
    """
    Run a single Top-N experiment unless its outputs already exist.

    If both ``param['mzml_out']`` and ``param['pickle_out']`` are existing
    files the experiment is skipped. Otherwise a TopNController is run in an
    Environment, the result is written as mzML and the controller is saved.

    :param param: a mapping holding the experimental parameters
    :return: ``param['analysis_name']``, whether the run was executed or
        skipped
    """
    analysis_name = param['analysis_name']
    mzml_out = param['mzml_out']
    pickle_out = param['pickle_out']
    N = param['N']
    rt_tol = param['rt_tol']

    outputs_exist = os.path.isfile(mzml_out) and os.path.isfile(pickle_out)
    if outputs_exist:
        logger.debug('Skipping %s' % (analysis_name))
        return analysis_name

    logger.debug('Processing %s' % (analysis_name))
    peak_sampler = param['peak_sampler']
    if peak_sampler is None:
        # extract density from the fragmentation file
        # NOTE(review): the resulting sampler is not used again in this
        # function -- confirm whether this call is still needed
        peak_sampler = get_peak_sampler(param['mzml_path'],
                                        param['fragfiles'][(N, rt_tol)],
                                        param['min_rt'],
                                        param['max_rt'])

    ionisation_mode = param['ionisation_mode']
    mass_spec = IndependentMassSpectrometer(ionisation_mode, param['data'])
    controller = TopNController(ionisation_mode, N,
                                param['isolation_width'], param['mz_tol'],
                                rt_tol, param['min_ms1_intensity'])

    # create an environment to run both the mass spec and controller
    env = Environment(mass_spec, controller,
                      param['min_rt'], param['max_rt'],
                      progress_bar=param['pbar'])

    # keep logging quiet while the environment runs
    set_log_level_warning()
    env.run()
    set_log_level_debug()

    env.write_mzML(None, mzml_out)
    save_obj(controller, pickle_out)
    return analysis_name
def test_targeted(self):
    """
    Run a TargetedController on two chemicals and count MS2 scans per target.

    Every MS2 scan must match exactly one target, and every (target,
    collision energy) pair must be acquired ``n_replicates`` times.
    """
    mixture_creator = ChemicalMixtureCreator(
        EvenMZFormulaSampler(),
        rt_and_intensity_sampler=UniformRTAndIntensitySampler(min_rt=0,
                                                              max_rt=10),
        chromatogram_sampler=ConstantChromatogramSampler(),
        ms2_sampler=FixedMS2Sampler())
    dataset = mixture_creator.sample(2, 2)  # sample chems with m/z = 100 and 200

    ionisation_mode = POSITIVE
    targets = [
        Target(101, 100, 102, 10, 20, adduct='M+H'),
        Target(201, 200, 202, 10, 20, metadata={'a': 1}),
    ]
    ce_values = [10, 20, 30]
    n_replicates = 4

    controller = TargetedController(targets, ce_values,
                                    n_replicates=n_replicates,
                                    limit_acquisition=True)
    mass_spec = IndependentMassSpectrometer(ionisation_mode, dataset)
    env = Environment(mass_spec, controller, 5, 25, progress_bar=True)
    set_log_level_warning()
    env.run()

    # check that we got all the scans we wanted
    for level_scans in controller.scans.values():
        assert len(level_scans) > 0
    set_log_level_debug()

    # one counter per target and collision energy
    target_counts = {
        target: {energy: 0 for energy in ce_values} for target in targets
    }
    for scan in controller.scans[2]:
        scan_params = scan.scan_params
        precursor_mz = scan_params.get(
            ScanParameters.PRECURSOR_MZ)[0].precursor_mz
        # targets whose RT and m/z windows both contain this scan
        matching = [
            candidate for candidate in targets
            if (candidate.from_rt <= scan.rt <= candidate.to_rt)
            and (candidate.from_mz <= precursor_mz <= candidate.to_mz)
        ]
        assert len(matching) == 1
        energy = scan_params.get(ScanParameters.COLLISION_ENERGY)
        target_counts[matching[0]][energy] += 1

    for counts_by_energy in target_counts.values():
        for count in counts_by_energy.values():
            assert count == n_replicates
def test_chem_edges(self, ten_chems):
    """
    Build scan/chemical edges for ``ten_chems`` and print the matching size.

    :param ten_chems: the chemicals passed to ``make_edges_chems``
        (presumably a pytest fixture -- confirm against the test module)
    """
    set_log_level_debug()
    min_rt, max_rt = 200, 300
    min_ms1_intensity = 1e3
    N = 10
    scan_duration_dict = {1: 0.6, 2: 0.2}

    scan_levels, scan_start_times = setup_scans(scan_duration_dict, N,
                                                min_rt, max_rt)
    edges = make_edges_chems(ten_chems, scan_start_times, scan_levels,
                             min_ms1_intensity)

    # each edge is a triple; only its first two entries are used here
    scan_names, box_names, _ = zip(*edges)
    unweighted_edges = list(zip(scan_names, box_names))
    _, size = reducedUnweightedMaxMatchingFromLists(
        set(scan_names), set(box_names), unweighted_edges)
    print("The matching has size: {}".format(size))
def test_AIF_controller_with_beer_chems(self):
    """
    Run the AIF controller on the QC beer chemicals without noise.

    The simulated output is checked through
    ``check_mzML(env, OUT_DIR, 'AIF_qcbeer_chems_no_noise.mzML')``.
    """
    # the message previously said "Top-N controller", but this test
    # exercises the AIF controller
    logger.info('Testing AIF controller with QC beer chemicals')

    ionisation_mode = POSITIVE
    min_mz = 100
    max_mz = 500

    # create a simulated mass spec without noise and an AIF controller
    scan_time_dict = {1: 0.124, 2: 0.124}
    mass_spec = IndependentMassSpectrometer(ionisation_mode, BEER_CHEMS,
                                            scan_duration=scan_time_dict)
    params = AdvancedParams(default_ms1_scan_window=[min_mz, max_mz])
    ms1_source_cid_energy = 30
    controller = AIF(ms1_source_cid_energy, params=params)

    # create an environment to run both the mass spec and controller
    env = Environment(mass_spec, controller, BEER_MIN_BOUND, BEER_MAX_BOUND,
                      progress_bar=True)

    # set the log level to WARNING so we don't see too many messages
    # when environment is running
    set_log_level_warning()
    try:
        # run the simulation
        env.run()
    finally:
        # set the log level back to DEBUG, even when the run fails, so a
        # failure here does not silence logging for later tests
        set_log_level_debug()

    # write simulated output to mzML file
    filename = 'AIF_qcbeer_chems_no_noise.mzML'
    check_mzML(env, OUT_DIR, filename)
def test_AIF_controller_with_simulated_chems(self, fragscan_dataset):
    """
    Run the AIF controller on simulated chemicals, without and with noise.

    Both scenarios use the same controller settings and RT bounds; they
    differ only in the noise arguments given to the mass spectrometer and in
    the name of the mzML file that is checked.

    :param fragscan_dataset: the simulated chemicals; must contain exactly
        ``N_CHEMS`` entries
    """
    # the message previously said "Top-N controller", but this test
    # exercises the AIF controller
    logger.info('Testing AIF controller with simulated chemicals')

    # create some chemical object
    assert len(fragscan_dataset) == N_CHEMS

    ionisation_mode = POSITIVE
    min_mz = 100
    max_mz = 500
    scan_time_dict = {1: 0.12, 2: 0.06}
    ms1_source_cid_energy = 30

    # (log label, whether MS2 noise is added, output file name)
    scenarios = (
        ('Without noise', False, 'AIF_simulated_chems_no_noise.mzML'),
        ('With noise', True, 'AIF_simulated_chems_with_noise.mzML'),
    )
    for label, with_noise, filename in scenarios:
        logger.info(label)

        # the noise samplers are only passed in the noisy scenario, so the
        # noise-free mass spec keeps its constructor defaults
        noise_kwargs = {}
        if with_noise:
            noise_kwargs['mz_noise'] = GaussianPeakNoiseLevelSpecific(
                {2: 0.01})
            noise_kwargs['intensity_noise'] = GaussianPeakNoiseLevelSpecific(
                {2: 1000.})

        # create a simulated mass spec and an AIF controller
        mass_spec = IndependentMassSpectrometer(ionisation_mode,
                                                fragscan_dataset,
                                                scan_duration=scan_time_dict,
                                                **noise_kwargs)
        params = AdvancedParams(default_ms1_scan_window=[min_mz, max_mz])
        controller = AIF(ms1_source_cid_energy, params=params)

        # create an environment to run both the mass spec and controller
        min_bound, max_bound = get_rt_bounds(fragscan_dataset, CENTRE_RANGE)
        env = Environment(mass_spec, controller, min_bound, max_bound,
                          progress_bar=True)

        # set the log level to WARNING so we don't see too many messages
        # when environment is running
        set_log_level_warning()
        try:
            # run the simulation
            env.run()
        finally:
            # set the log level back to DEBUG, even when the run fails
            set_log_level_debug()

        # write simulated output to mzML file
        check_mzML(env, OUT_DIR, filename)
def test_ms2_matching(self):
    """
    Run Top-N on several mixtures, then compare MS2 matching results.

    The exclusion items collected by each run seed the controller of the
    next run. Afterwards ``ms2_main`` is called once on the first mzML file
    and once on the whole output folder, and the two results are compared.
    """
    rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=10, max_rt=20)
    formula_sampler = UniformMZFormulaSampler()
    mixture_creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        adduct_prior_dict={POSITIVE: {'M+H': 1}})
    dataset = mixture_creator.sample(300, 2)

    group_list = ['control', 'control', 'case', 'case']
    group_dict = {
        'control': {
            'missing_probability': 0.0,
            'changing_probability': 0.0
        },
        'case': {
            'missing_probability': 0.0,
            'changing_probability': 1.0
        },
    }
    chemical_lists = MultipleMixtureCreator(
        dataset, group_list, group_dict).generate_chemical_lists()

    # Top-N controller settings
    N = 10
    isolation_width = 0.7
    mz_tol = 0.001
    rt_tol = 30
    min_ms1_intensity = 0

    set_log_level_warning()
    output_folder = os.path.join(OUT_DIR, 'ms2_matching')
    write_msp(dataset, 'mmm.msp', out_dir=output_folder)

    initial_exclusion_list = []
    for run_idx, chem_list in enumerate(chemical_lists):
        controller = TopNController(
            POSITIVE, N, isolation_width, mz_tol, rt_tol, min_ms1_intensity,
            initial_exclusion_list=initial_exclusion_list)
        mass_spec = IndependentMassSpectrometer(POSITIVE, chem_list)
        env = Environment(mass_spec, controller, 10, 30, progress_bar=True)
        env.run()
        env.write_mzML(output_folder, '{}.mzML'.format(run_idx))

        # gather the distinct items stored in the m/z and RT boxes
        exclusion_list = controller.exclusion.exclusion_list
        unique_items_mz = {
            interval.data for interval in exclusion_list.boxes_mz.items()
        }
        unique_items_rt = {
            interval.data for interval in exclusion_list.boxes_rt.items()
        }
        assert len(unique_items_mz) == len(unique_items_rt)

        # carry the exclusion items over to the next controller
        initial_exclusion_list = list(unique_items_mz)
        logger.warning(len(initial_exclusion_list))

    set_log_level_debug()
    msp_file = os.path.join(output_folder, 'mmm.msp')

    # check with just the first file
    first_file = os.path.join(output_folder, '0.mzML')
    single_first, single_second = ms2_main(first_file, msp_file, 1, 0.7)

    # check with all
    all_first, all_second = ms2_main(output_folder, msp_file, 1, 0.7)

    assert single_second == all_second
    assert all_first > single_first
def main():
    """
    Match query MS2 spectra against cached spectral libraries.

    Command-line arguments select the input files (one name or a
    comma-separated list, all assumed to share the first file's extension,
    .mzML or .mgf), the library cache folder and the library names. Every
    hit is collected and, if there are any, written to the output CSV file
    followed by a short logged summary.
    """
    global file_spectra

    parser = argparse.ArgumentParser(description='Limited dataset creation')
    parser.add_argument('input_file_names', type=str)
    parser.add_argument('library_cache', type=str)
    parser.add_argument('libraries', type=str, nargs='+')
    parser.add_argument('--score_thresh', dest='score_thresh', type=float,
                        default=0.7)
    parser.add_argument('--ms1_tol', dest='ms1_tol', type=float, default=1.)
    parser.add_argument('--ms2_tol', dest='ms2_tol', type=float, default=0.2)
    parser.add_argument('--min_matched_peaks', dest='min_matched_peaks',
                        type=int, default=1)
    parser.add_argument('--output_csv_file', dest='output_csv_file', type=str,
                        default='hits.csv')
    parser.add_argument('--log_level', dest='log_level', type=str,
                        default='warning')
    parser.add_argument('--mgf_id_field', dest='mgf_id_field', type=str,
                        default='SCANS')
    args = parser.parse_args()

    # a name without a comma splits into a one-element list, so this covers
    # both the single-item and the multiple-item case
    input_file_names = args.input_file_names.split(',')
    assert len(input_file_names) > 0

    # assume all the files have the same extension as the first one
    ext = os.path.splitext(input_file_names[0])[1].lower()
    if ext not in ('.mzml', '.mgf'):
        logger.warning("Unknown input file format -- should be .mzML or .mgf")
        sys.exit(0)

    query_spectra = {}
    for input_file_name in input_file_names:
        if ext == '.mzml':
            # load the ms2 scans from the .mzML
            file_spectra = load_scans_from_mzml(input_file_name)
        else:
            # load the ms2 scans from the .mgf
            file_spectra = load_mgf(input_file_name,
                                    id_field=args.mgf_id_field,
                                    spectra={})
        logger.warning("Loaded {} MS2 spectra from {}".format(
            len(file_spectra), input_file_name))
        query_spectra[input_file_name] = file_spectra

    # any other --log_level value leaves the current level untouched
    level_setters = {
        'warning': set_log_level_warning,
        'debug': set_log_level_debug,
    }
    set_level = level_setters.get(args.log_level)
    if set_level is not None:
        set_level()

    if args.library_cache is None:
        logger.warning("You must supply a library folder")
        sys.exit(0)

    spec_libraries = {}
    for library in args.libraries:
        # attempt to load library
        lib_file = os.path.join(args.library_cache, library + '.p')
        if not os.path.isfile(lib_file):
            logger.warning("Could not find {}".format(lib_file))
            sys.exit(0)
        logger.warning("Loading {}".format(lib_file))
        spec_libraries[library] = load_obj(lib_file)
        logger.warning("Loaded {}".format(lib_file))

    # one row per hit: spec_id, library, hit_id, score, inchikey
    all_hits = []
    for input_file_name, file_spectra in query_spectra.items():
        logger.warning('Processing {}'.format(input_file_name))
        for spec_id in tqdm(file_spectra.keys()):
            spectrum = file_spectra[spec_id]
            for library, spec_library in spec_libraries.items():
                hits = spec_library.spectral_match(
                    spectrum,
                    score_thresh=args.score_thresh,
                    ms2_tol=args.ms2_tol,
                    ms1_tol=args.ms1_tol,
                    min_match_peaks=args.min_matched_peaks)
                all_hits.extend(
                    [spec_id, library, hit[0], hit[1],
                     hit[2].metadata['inchikey']]
                    for hit in hits)

    if len(all_hits) == 0:
        logger.warning("No hits found!")
        return

    logger.warning('Writing output to {}'.format(args.output_csv_file))
    with open(args.output_csv_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['spec_id', 'library', 'hit_id', 'score', 'inchikey'])
        writer.writerows(all_hits)

    # summary
    hit_spec_ids = {row[0] for row in all_hits}
    # only the part of the inchikey before the first '-' is compared
    hit_structures = {
        row[4].split('-')[0] for row in all_hits if row[4] is not None
    }
    logger.warning("{} unique spectra got hits".format(len(hit_spec_ids)))
    logger.warning("{} unique structures were hit".format(
        len(hit_structures)))