예제 #1
0
def run_environment(env):
    """
    Run a simulation environment with reduced log output.

    The log level is switched to WARNING while ``env.run()`` executes and is
    switched back to DEBUG afterwards. The switch back also happens when
    ``env.run()`` raises, so a failed run does not leave the log level at
    WARNING for the rest of the session.

    :param env: the environment to run; only its ``run()`` method is used here
    :return: None
    """
    # set the log level to WARNING so we don't see too many messages when the
    # environment is running
    set_log_level_warning()
    try:
        # run the simulation
        logger.info('Running simulation')
        env.run()
        logger.info('Done')
    finally:
        # always set the log level back to DEBUG, even if the run failed
        set_log_level_debug()
예제 #2
0
 def test_target_creation(self):
     """Check that each listed ToxID CSV file produces a non-empty target list."""
     toxid_names = (
         'StdMix1_pHILIC_Current.csv',
         'StdMix2_pHILIC_Current.csv',
         'StdMix3_pHILIC_Current.csv',
     )
     for toxid_name in toxid_names:
         targets = create_targets_from_toxid(Path(BASE_DIR, toxid_name))
         assert len(targets) > 0
     # `targets` now holds the result for the last file in the list
     set_log_level_debug()
     logger.debug(targets[-1].mz)
예제 #3
0
    def test_TopNDEW_agent(self):
        """
        Run a single TopNDEWAgent through three consecutive controllers.

        The same agent object is handed to a fresh AgentBasedController for
        every run. MS1 scans and the mzML output are checked after each run;
        MS2 scans are only checked for the first two runs.
        """
        set_log_level_debug()
        formula_sampler = UniformMZFormulaSampler()
        rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=0, max_rt=80)
        chromatogram_sampler = GaussianChromatogramSampler(sigma=1)
        ms2_sampler = FixedMS2Sampler()
        mixture_creator = ChemicalMixtureCreator(
            formula_sampler,
            rt_and_intensity_sampler=rt_intensity_sampler,
            chromatogram_sampler=chromatogram_sampler,
            ms2_sampler=ms2_sampler)
        dataset = mixture_creator.sample(500, 2)
        ionisation_mode = POSITIVE

        # Example shows how the same Agent object can be used in consecutive
        # controllers
        agent = TopNDEWAgent(ionisation_mode, 10, 0.7, 10, 15, 1500)
        spike_noise = UniformSpikeNoise(0.1, 1000)

        # (output file name, whether MS2 scans are expected to be non-empty)
        runs = (
            ('shell.mzML', True),
            ('shell2.mzML', True),
            ('shell3.mzML', False),  # ms2 scans have been exhausted at this point
        )
        for mzml_name, expect_ms2 in runs:
            controller = AgentBasedController(agent)
            mass_spec = IndependentMassSpectrometer(ionisation_mode,
                                                    dataset,
                                                    spike_noise=spike_noise)
            env = Environment(mass_spec, controller, 0, 100, progress_bar=True)
            set_log_level_warning()
            env.run()

            check_non_empty_MS1(controller)
            if expect_ms2:
                check_non_empty_MS2(controller)
            check_mzML(env, OUT_DIR, mzml_name)
예제 #4
0
def run_WeightedDEW(chems, scan_duration, params, out_dir):
    """
    Simulate the WeightedDEW controller over a grid of parameters.

    One parameter dictionary is built for every combination of
    ``params['t0_values']`` and ``params['rt_tol_values']`` and passed to
    ``run_single_WeightedDEW``. The runs are first attempted in parallel
    through ipyparallel. If ipyparallel cannot be imported, the cluster has
    not been started, or the parallel run times out, the runs are executed
    serially instead.

    :param chems: a list of UnknownChemicals present in the injection
    :param scan_duration: stored in each parameter dictionary under
    'scan_duration'
    :param params: a dictionary of parameters; must contain 't0_values' and
    'rt_tol_values'
    :param out_dir: output directory
    :return: None
    """
    logger.info('Running WeightedDEW simulation')
    logger.info(params)
    warn_handler_id = set_log_level_warning()

    try:
        t0_values = params['t0_values']
        rt_tol_values = params['rt_tol_values']

        # copy params for each (t0, r) pair and add the additional
        # attributes we need
        params_list = [
            dict(params,
                 t0=t0,
                 r=r,
                 chems=chems,
                 scan_duration=scan_duration,
                 out_dir=out_dir)
            for t0 in t0_values
            for r in rt_tol_values
        ]

        # Try to run the controllers in parallel. If that fails, run serially.
        logger.warning('Running controllers in parallel, please wait ...')
        # must be initialised: it was previously unbound whenever the
        # parallel run succeeded
        run_serial = False
        try:
            import ipyparallel as ipp
        except ImportError:  # ipyparallel is not installed
            run_serial = True
        else:
            try:
                rc = ipp.Client()
                dview = rc[:]  # use all engines
                with dview.sync_imports():
                    pass
                dview.map_sync(run_single_WeightedDEW, params_list)
            except OSError:  # cluster has not been started
                run_serial = True
            except ipp.error.TimeoutError:  # takes too long to run
                run_serial = True

        if run_serial:  # the parallel attempt failed, so run it serially
            logger.warning(
                'IPython cluster not found, running controllers in serial mode')
            for copy_params in params_list:
                run_single_WeightedDEW(copy_params)
    finally:
        # restore the log level even when a run raises
        set_log_level_debug(remove_id=warn_handler_id)
예제 #5
0
def run_experiment(param):
    """
    Runs a Top-N experiment

    The experiment is skipped when both output files (``param['mzml_out']``
    and ``param['pickle_out']``) already exist. Otherwise a TopNController is
    simulated and the resulting mzML file and pickled controller are written.
    The log level is set to WARNING during the run and is set back to DEBUG
    afterwards, including when the run raises.

    :param param: the experimental parameters
    :return: the analysis name that has been successfully ran, or None when
    the experiment was skipped because its outputs already exist
    """
    analysis_name = param['analysis_name']
    mzml_out = param['mzml_out']
    pickle_out = param['pickle_out']
    N = param['N']
    rt_tol = param['rt_tol']

    if os.path.isfile(mzml_out) and os.path.isfile(pickle_out):
        logger.debug('Skipping %s' % (analysis_name))
        return None

    logger.debug('Processing %s' % (analysis_name))
    peak_sampler = param['peak_sampler']
    if peak_sampler is None:  # extract density from the fragmentation file
        mzml_path = param['mzml_path']
        fragfiles = param['fragfiles']
        fragfile = fragfiles[(N, rt_tol)]
        min_rt = param['min_rt']
        max_rt = param['max_rt']
        # NOTE(review): peak_sampler is not used again in this function; the
        # call is kept so that behaviour is unchanged -- confirm whether it
        # can be removed
        peak_sampler = get_peak_sampler(mzml_path, fragfile, min_rt, max_rt)

    mass_spec = IndependentMassSpectrometer(param['ionisation_mode'],
                                            param['data'])
    controller = TopNController(param['ionisation_mode'], param['N'],
                                param['isolation_width'], param['mz_tol'],
                                param['rt_tol'],
                                param['min_ms1_intensity'])
    # create an environment to run both the mass spec and controller
    env = Environment(mass_spec,
                      controller,
                      param['min_rt'],
                      param['max_rt'],
                      progress_bar=param['pbar'])
    set_log_level_warning()
    try:
        env.run()
    finally:
        # restore the log level even if the simulation fails
        set_log_level_debug()
    env.write_mzML(None, mzml_out)
    save_obj(controller, pickle_out)
    return analysis_name
예제 #6
0
    def test_targeted(self):
        """
        Run a TargetedController on two sampled chemicals and count MS2 scans.

        Every MS2 scan must fall inside exactly one target window (RT and
        m/z), and each (target, collision energy) pair must have been scanned
        ``n_replicates`` times.
        """
        formula_sampler = EvenMZFormulaSampler()
        rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=0, max_rt=10)
        chromatogram_sampler = ConstantChromatogramSampler()
        ms2_sampler = FixedMS2Sampler()
        mixture_creator = ChemicalMixtureCreator(
            formula_sampler,
            rt_and_intensity_sampler=rt_intensity_sampler,
            chromatogram_sampler=chromatogram_sampler,
            ms2_sampler=ms2_sampler)
        # sample chems with m/z = 100 and 200
        dataset = mixture_creator.sample(2, 2)
        ionisation_mode = POSITIVE
        targets = [
            Target(101, 100, 102, 10, 20, adduct='M+H'),
            Target(201, 200, 202, 10, 20, metadata={'a': 1}),
        ]
        ce_values = [10, 20, 30]
        n_replicates = 4
        controller = TargetedController(targets,
                                        ce_values,
                                        n_replicates=n_replicates,
                                        limit_acquisition=True)
        mass_spec = IndependentMassSpectrometer(ionisation_mode, dataset)
        env = Environment(mass_spec, controller, 5, 25, progress_bar=True)
        set_log_level_warning()
        env.run()

        # check that we got scans at every ms level that was recorded
        for level in controller.scans:
            level_scans = controller.scans[level]
            assert len(level_scans) > 0
        set_log_level_debug()

        # one zero-initialised counter per collision energy for every target
        target_counts = {}
        for target in targets:
            target_counts[target] = dict.fromkeys(ce_values, 0)

        for scan in controller.scans[2]:
            scan_params = scan.scan_params
            precursor = scan_params.get(ScanParameters.PRECURSOR_MZ)[0]
            precursor_mz = precursor.precursor_mz
            matching = [
                t for t in targets
                if (t.from_rt <= scan.rt <= t.to_rt)
                and (t.from_mz <= precursor_mz <= t.to_mz)
            ]
            assert len(matching) == 1
            collision_energy = scan_params.get(ScanParameters.COLLISION_ENERGY)
            target_counts[matching[0]][collision_energy] += 1

        for per_ce_counts in target_counts.values():
            for count in per_ce_counts.values():
                assert count == n_replicates
예제 #7
0
    def test_chem_edges(self, ten_chems):
        """
        Build scan/chemical edges for ``ten_chems`` and run the matching on them.

        The edges are reduced to (scan, box) pairs before being passed to
        ``reducedUnweightedMaxMatchingFromLists``; the resulting matching size
        is printed.
        """
        set_log_level_debug()
        intensity_threshold = 1e3
        rt_start, rt_end = 200, 300
        top_n = 10
        durations = {1: 0.6, 2: 0.2}
        scan_levels, scan_start_times = setup_scans(durations, top_n,
                                                    rt_start, rt_end)
        edges = make_edges_chems(ten_chems, scan_start_times, scan_levels,
                                 intensity_threshold)

        # drop the third element of every edge, keeping (scan, box) pairs
        scan_names, box_names, _ = zip(*edges)
        scan_box_pairs = list(zip(scan_names, box_names))
        _matching, size = reducedUnweightedMaxMatchingFromLists(
            set(scan_names), set(box_names), scan_box_pairs)
        print("The matching has size: {}".format(size))
예제 #8
0
    def test_AIF_controller_with_beer_chems(self):
        """
        Run the AIF controller on BEER_CHEMS without noise and check the mzML
        output.
        """
        logger.info('Testing Top-N controller with QC beer chemicals')

        # simulated mass spec without noise; both ms levels share a scan time
        mass_spec = IndependentMassSpectrometer(
            POSITIVE, BEER_CHEMS, scan_duration={1: 0.124, 2: 0.124})

        # AIF controller with an MS1 scan window of 100-500 m/z
        window_params = AdvancedParams(default_ms1_scan_window=[100, 500])
        source_cid_energy = 30
        controller = AIF(source_cid_energy, params=window_params)

        # the environment runs both the mass spec and the controller
        env = Environment(mass_spec, controller, BEER_MIN_BOUND, BEER_MAX_BOUND,
                          progress_bar=True)

        # keep the log quiet (WARNING) during the simulation, then go back
        # to DEBUG
        set_log_level_warning()
        env.run()
        set_log_level_debug()

        # write simulated output to an mzML file and check it
        check_mzML(env, OUT_DIR, 'AIF_qcbeer_chems_no_noise.mzML')
예제 #9
0
    def test_AIF_controller_with_simulated_chems(self, fragscan_dataset):
        """
        Run the AIF controller on the simulated dataset, first without noise
        and then with level-specific Gaussian noise, checking the mzML output
        of each run.
        """
        logger.info('Testing Top-N controller with simulated chemicals')

        # the fixture must provide the expected number of chemicals
        assert len(fragscan_dataset) == N_CHEMS

        ionisation_mode = POSITIVE
        ms1_window = [100, 500]
        scan_time_dict = {1: 0.12, 2: 0.06}

        def simulate_and_check(mass_spec, filename):
            # build a fresh AIF controller and environment for this mass spec
            params = AdvancedParams(default_ms1_scan_window=ms1_window)
            ms1_source_cid_energy = 30
            controller = AIF(ms1_source_cid_energy, params=params)
            min_bound, max_bound = get_rt_bounds(fragscan_dataset, CENTRE_RANGE)
            env = Environment(mass_spec, controller, min_bound, max_bound,
                              progress_bar=True)

            # keep the log quiet (WARNING) while the simulation runs, then
            # go back to DEBUG
            set_log_level_warning()
            env.run()
            set_log_level_debug()

            # write simulated output to an mzML file and check it
            check_mzML(env, OUT_DIR, filename)

        # simulated mass spec without noise
        logger.info('Without noise')
        clean_mass_spec = IndependentMassSpectrometer(
            ionisation_mode, fragscan_dataset, scan_duration=scan_time_dict)
        simulate_and_check(clean_mass_spec, 'AIF_simulated_chems_no_noise.mzML')

        # simulated mass spec with noise
        logger.info('With noise')
        mz_noise = GaussianPeakNoiseLevelSpecific({2: 0.01})
        intensity_noise = GaussianPeakNoiseLevelSpecific({2: 1000.})
        noisy_mass_spec = IndependentMassSpectrometer(
            ionisation_mode, fragscan_dataset,
            mz_noise=mz_noise,
            intensity_noise=intensity_noise,
            scan_duration=scan_time_dict)
        simulate_and_check(noisy_mass_spec,
                           'AIF_simulated_chems_with_noise.mzML')
예제 #10
0
    def test_ms2_matching(self):
        """
        Simulate Top-N runs over several chemical lists and match the MS2 output.

        Each run starts from the exclusion items collected in the previous
        run. Afterwards ``ms2_main`` is called once on the first mzML file and
        once on the whole output folder, and the two pairs of results are
        compared.
        """
        rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=10, max_rt=20)
        formula_sampler = UniformMZFormulaSampler()
        adduct_prior_dict = {POSITIVE: {'M+H': 1}}

        mixture_creator = ChemicalMixtureCreator(
            formula_sampler,
            rt_and_intensity_sampler=rt_intensity_sampler,
            adduct_prior_dict=adduct_prior_dict)
        base_chems = mixture_creator.sample(300, 2)

        group_list = ['control', 'control', 'case', 'case']
        group_dict = {
            'control': {
                'missing_probability': 0.0,
                'changing_probability': 0.0
            },
            'case': {
                'missing_probability': 0.0,
                'changing_probability': 1.0
            },
        }

        multi_creator = MultipleMixtureCreator(base_chems, group_list,
                                               group_dict)
        chem_lists = multi_creator.generate_chemical_lists()

        top_n = 10
        isolation_width = 0.7
        mz_tol = 0.001
        rt_tol = 30
        min_ms1_intensity = 0

        set_log_level_warning()

        output_folder = os.path.join(OUT_DIR, 'ms2_matching')
        write_msp(base_chems, 'mmm.msp', out_dir=output_folder)

        # exclusion items carried over from one run to the next
        carried_exclusions = []
        for file_index, chem_list in enumerate(chem_lists):
            controller = TopNController(
                POSITIVE,
                top_n,
                isolation_width,
                mz_tol,
                rt_tol,
                min_ms1_intensity,
                initial_exclusion_list=carried_exclusions)
            mass_spec = IndependentMassSpectrometer(POSITIVE, chem_list)
            env = Environment(mass_spec, controller, 10, 30, progress_bar=True)
            env.run()
            env.write_mzML(output_folder, '{}.mzML'.format(file_index))

            # collect the distinct data items stored on the mz and rt intervals
            mz_items = {
                interval.data for interval in
                controller.exclusion.exclusion_list.boxes_mz.items()
            }
            rt_items = {
                interval.data for interval in
                controller.exclusion.exclusion_list.boxes_rt.items()
            }
            assert len(mz_items) == len(rt_items)

            carried_exclusions = list(mz_items)
            logger.warning(len(carried_exclusions))

        set_log_level_debug()
        msp_file = os.path.join(output_folder, 'mmm.msp')
        # check with just the first file
        first_a, first_b = ms2_main(os.path.join(output_folder, '0.mzML'),
                                    msp_file, 1, 0.7)
        # check with all
        all_a, all_b = ms2_main(output_folder,
                                os.path.join(output_folder, 'mmm.msp'), 1, 0.7)

        assert first_b == all_b
        assert all_a > first_a
예제 #11
0
def main():
    """
    Command-line entry point: match query MS2 spectra against spectral
    libraries and write the hits to a CSV file.

    Query spectra are loaded from one or more .mzML or .mgf files, given as a
    single comma-separated argument; the extension of the first file selects
    the loader used for all of them. Each requested library is loaded from
    ``<library_cache>/<library>.p``. Every hit returned by a library's
    ``spectral_match`` is collected, written to the output CSV file, and a
    short summary is logged.

    The process exits with status 1 when the input format is unknown, no
    library folder is given, or a library file cannot be found, so that
    callers can detect the failure.

    :return: None
    """
    # kept as a global so the module-level name is still updated as before
    global file_spectra
    parser = argparse.ArgumentParser(description='Limited dataset creation')
    parser.add_argument('input_file_names', type=str)
    parser.add_argument('library_cache', type=str)
    parser.add_argument('libraries', type=str, nargs='+')
    parser.add_argument('--score_thresh', type=float, default=0.7)
    parser.add_argument('--ms1_tol', type=float, default=1.)
    parser.add_argument('--ms2_tol', type=float, default=0.2)
    parser.add_argument('--min_matched_peaks', type=int, default=1)
    parser.add_argument('--output_csv_file', type=str, default='hits.csv')
    parser.add_argument('--log_level', type=str, default='warning')
    parser.add_argument('--mgf_id_field', type=str, default='SCANS')
    args = parser.parse_args()

    # a single name without commas splits into a one-element list
    input_file_names = args.input_file_names.split(',')

    # assume all the files have the same extension as the first one
    ext = os.path.splitext(input_file_names[0])[1].lower()
    if ext == '.mzml':

        def load_spectra(file_name):
            # load the ms2 scans from the .mzML
            return load_scans_from_mzml(file_name)
    elif ext == '.mgf':

        def load_spectra(file_name):
            # load the ms2 scans from the .mgf
            return load_mgf(file_name,
                            id_field=args.mgf_id_field,
                            spectra={})
    else:
        logger.warning("Unknown input file format -- should be .mzML or .mgf")
        sys.exit(1)

    query_spectra = {}
    for input_file_name in input_file_names:
        file_spectra = load_spectra(input_file_name)
        logger.warning("Loaded {} MS2 spectra from {}".format(
            len(file_spectra), input_file_name))
        query_spectra[input_file_name] = file_spectra

    # any other --log_level value leaves the current log level unchanged
    if args.log_level == 'warning':
        set_log_level_warning()
    elif args.log_level == 'debug':
        set_log_level_debug()

    if args.library_cache is None:
        logger.warning("You must supply a library folder")
        sys.exit(1)

    spec_libraries = {}
    for library in args.libraries:
        # attempt to load library
        lib_file = os.path.join(args.library_cache, library + '.p')
        if not os.path.isfile(lib_file):
            logger.warning("Could not find {}".format(lib_file))
            sys.exit(1)
        logger.warning("Loading {}".format(lib_file))
        spec_libraries[library] = load_obj(lib_file)
        logger.warning("Loaded {}".format(lib_file))

    # each hit row is [spec_id, library, hit_id, score, inchikey]
    all_hits = []
    for input_file_name, file_spectra in query_spectra.items():
        logger.warning('Processing {}'.format(input_file_name))
        for spec_id, spectrum in tqdm(file_spectra.items()):
            for library, spec_library in spec_libraries.items():
                hits = spec_library.spectral_match(
                    spectrum,
                    score_thresh=args.score_thresh,
                    ms2_tol=args.ms2_tol,
                    ms1_tol=args.ms1_tol,
                    min_match_peaks=args.min_matched_peaks)
                for hit in hits:
                    all_hits.append([
                        spec_id, library, hit[0], hit[1],
                        hit[2].metadata['inchikey']
                    ])

    if not all_hits:
        logger.warning("No hits found!")
        return

    logger.warning('Writing output to {}'.format(args.output_csv_file))
    with open(args.output_csv_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['spec_id', 'library', 'hit_id', 'score', 'inchikey'])
        writer.writerows(all_hits)

    # summary
    matched_spec_ids = {hit[0] for hit in all_hits}
    # only the part of each inchikey before the first '-' is compared
    structure_keys = {
        hit[4].split('-')[0] for hit in all_hits if hit[4] is not None
    }
    logger.warning("{} unique spectra got hits".format(len(matched_spec_ids)))
    logger.warning("{} unique structures were hit".format(
        len(structure_keys)))