Example No. 1
def smart_roi_evaluation(param_dict):
    mass_spec = load_obj(param_dict['mass_spec_file'])
    params = load_obj(param_dict['params_file'])
    smart_roi = TopN_SmartRoiController(
        param_dict['ionisation_mode'],
        param_dict['isolation_width'],
        param_dict['mz_tol'],
        param_dict['min_ms1_intensity'],
        param_dict['min_roi_intensity'],
        param_dict['min_roi_length'],
        N=param_dict['N'],
        rt_tol=param_dict['rt_tol'],
        min_roi_length_for_fragmentation=param_dict[
            'min_roi_length_for_fragmentation'],
        reset_length_seconds=param_dict['reset_length_seconds'],
        intensity_increase_factor=param_dict['intensity_increase_factor'],
        drop_perc=param_dict['drop_perc'],
        ms1_shift=param_dict['ms1_shift'],
        params=params)
    run_env(mass_spec, smart_roi, param_dict['min_rt'], param_dict['max_rt'],
            param_dict['save_file_name'])
    coverage = run_coverage_evaluation(param_dict['box_file'],
                                       param_dict['save_file_name'],
                                       param_dict['half_isolation_window'])
    return coverage
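All of the controller settings come in through a single dictionary, so a caller only has to assemble that dictionary and pass it in. A minimal sketch of such a call follows; the file names, and the numeric values not seen elsewhere on this page (reset_length_seconds, intensity_increase_factor, drop_perc, ms1_shift, half_isolation_window), are placeholder assumptions.

# Hypothetical invocation; file names and several numeric values are
# placeholder assumptions. POSITIVE comes from vimms.Common.
param_dict = {
    'mass_spec_file': 'mass_spec.p',
    'params_file': 'params.p',
    'ionisation_mode': POSITIVE,
    'isolation_width': 1,
    'mz_tol': 10,
    'min_ms1_intensity': 5000,
    'min_roi_intensity': 500,
    'min_roi_length': 3,
    'N': 10,
    'rt_tol': 15,
    'min_roi_length_for_fragmentation': 3,
    'reset_length_seconds': 100,
    'intensity_increase_factor': 10,
    'drop_perc': 0.1,
    'ms1_shift': 0,
    'min_rt': 0,
    'max_rt': 1440,
    'save_file_name': 'smart_roi_run.mzML',
    'box_file': 'boxes.csv',
    'half_isolation_window': 0.75,
}
coverage = smart_roi_evaluation(param_dict)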
Example No. 2
 def run_experiment(self, idx):
     controller_name = self.controller_schedule['Sample ID'][idx]
     mzml_files = glob.glob(os.path.join(self.base_dir, '*.mzML'))
     if controller_name + '.mzML' not in [
             os.path.basename(file) for file in mzml_files
     ]:
         controller, ms_params = super().run_experiment(idx)
         # load data and set up MS
         logger.info(self.controller_schedule.iloc[[idx]].to_dict())
         method = self.controller_schedule['Controller Method'][idx]
         dataset = self.controller_schedule['Dataset'][idx]
         if method is not None and dataset is not None:
             dataset = load_obj(self.controller_schedule['Dataset'][idx])
             mass_spec = IndependentMassSpectrometer(
                 ms_params['ionisation_mode'], dataset)
             # Run sample
             env = Environment(mass_spec,
                               controller,
                               self.rt_range[0][0],
                               self.rt_range[0][1],
                               progress_bar=self.progress_bar)
             env.run()
             env.write_mzML(self.base_dir, controller_name + '.mzML')
             if self.write_env:
                 save_obj(
                     controller,
                     os.path.join(self.base_dir, controller_name + '.p'))
     else:
         logger.info('Experiment already completed. Skipping...')
     mzml_file = os.path.join(self.base_dir, controller_name + '.mzML')
     return mzml_file, controller_name
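A possible driver loop, assuming an instance of the enclosing class (here called experiment, a placeholder name) whose controller_schedule holds one row per sample:

# Hypothetical driver; 'experiment' is a placeholder instance of the class
# this method belongs to. Each call returns the mzML path for that row.
mzml_files = []
for idx in range(len(experiment.controller_schedule)):
    mzml_file, controller_name = experiment.run_experiment(idx)
    mzml_files.append(mzml_file)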
Example No. 3
def load_controller(results_dir, experiment_name, N, rt_tol):
    analysis_name = 'experiment_%s_N_%d_rttol_%d' % (experiment_name, N, rt_tol)
    pickle_in = '%s/%s.p' % (results_dir, analysis_name)
    print('Loading %s' % analysis_name)
    try:
        controller = load_obj(pickle_in)
    except FileNotFoundError:
        controller = None
    return controller
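Since load_controller returns None when the pickle is missing, a caller can sweep a grid of settings and keep only the runs that exist. A sketch with placeholder directory and experiment names:

# Placeholder results directory and experiment name; the N / rt_tol grid
# values are assumptions.
controllers = {}
for N in (5, 10, 20):
    for rt_tol in (15, 30, 60):
        c = load_controller('results', 'beer1', N, rt_tol)
        if c is not None:
            controllers[(N, rt_tol)] = c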
Example No. 4
def weighted_dew_evaluation(param_dict):
    mass_spec = load_obj(param_dict['mass_spec_file'])
    params = load_obj(param_dict['params_file'])
    weighted_dew = WeightedDEWController(
        param_dict['ionisation_mode'],
        param_dict['N'],
        param_dict['isolation_width'],
        param_dict['mz_tol'],
        param_dict['rt_tol'],
        param_dict['min_ms1_intensity'],
        exclusion_t_0=param_dict['exclusion_t_0'],
        log_intensity=param_dict['log_intensity'],
        params=params)
    run_env(mass_spec, weighted_dew, param_dict['min_rt'],
            param_dict['max_rt'], param_dict['save_file_name'])
    coverage = run_coverage_evaluation(param_dict['box_file'],
                                       param_dict['save_file_name'],
                                       param_dict['half_isolation_window'])
    return coverage
Example No. 5
 def _load_ROI_file(self, file_index, roi_rt_range=None):
     num_ROI = 0
     for i in range(len(self.ROI_sources)):
         ROI_files = list(Path(self.ROI_sources[i]).glob('*.p'))
         len_ROI = len(ROI_files)
         if num_ROI + len_ROI > file_index:
             # index within this source: offset by the files counted so far
             ROI_file = ROI_files[file_index - num_ROI]
             ROI = load_obj(ROI_file)
             # logger.debug("Loaded {}".format(ROI_file))
             if roi_rt_range is not None:
                 ROI = self._filter_ROI(ROI, roi_rt_range)
             return ROI
         num_ROI += len_ROI
Example No. 6
def top_n_evaluation(param_dict):
    mass_spec = load_obj(param_dict['mass_spec_file'])
    params = load_obj(param_dict['params_file'])
    topn = TopNController(param_dict['ionisation_mode'],
                          param_dict['N'],
                          param_dict['isolation_width'],
                          param_dict['mz_tol'],
                          param_dict['rt_tol'],
                          param_dict['min_ms1_intensity'],
                          params=params)
    chemical_coverage = run_env(mass_spec, topn, param_dict['min_rt'],
                                param_dict['max_rt'],
                                param_dict['save_file_name'])
    coverage = run_coverage_evaluation(param_dict['box_file'],
                                       param_dict['save_file_name'],
                                       param_dict['half_isolation_window'])
    print('coverage', coverage)
    print('chemical_coverage', chemical_coverage)
    if param_dict['coverage_type'] == 'coverage':
        return coverage
    else:
        return chemical_coverage
Example No. 7
def make_queries_from_chemicals(chemicals_file_name):
    chemicals = load_obj(chemicals_file_name)
    query_spectra = []
    for chem in chemicals:
        precursor_mz = chem.isotopes[0][0]
        peaks = []
        for child in chem.children:
            mz = child.isotopes[0][0]
            intensity = child.parent.max_intensity * child.prop_ms2_mass
            peak = np.array([mz, intensity])
            peaks.append(peak)
        new_spectrum = Spectrum(precursor_mz, peaks)
        query_spectra.append(new_spectrum)
    return query_spectra
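Assuming a pickled list of chemicals such as the QCB fixture used in Example No. 12, the helper turns it into query spectra in one call (the file name below is a placeholder):

# Hypothetical call; the chemicals file name is an assumption.
query_spectra = make_queries_from_chemicals('QCB_22May19_1.p')
print('built %d query spectra' % len(query_spectra))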
Example No. 8
def fragmentation_performance_aligned(param_dict):
    controller = load_obj(param_dict["controller_directory"])
    min_acceptable_intensity = param_dict["min_acceptable_intensity"]
    aligned_chemicals = pd.read_csv(param_dict["aligned_chemicals_location"])
    n_chemicals_aligned = len(aligned_chemicals["mzmed"])

    events = np.array([
        event
        for event in controller.environment.mass_spec.fragmentation_events
        if event.ms_level == 2
    ])
    event_query_rts = np.array([event.query_rt for event in events])
    event_query_mzs = np.array([
        controller.environment.mass_spec._get_mz(event.chem, event.query_rt, 0,
                                                 0) for event in events
    ])

    chemicals_found = [0 for i in range(n_chemicals_aligned)]

    for aligned_index in range(n_chemicals_aligned):

        rtmin = aligned_chemicals['peak_rtmin'][aligned_index]
        rtmax = aligned_chemicals['peak_rtmax'][aligned_index]
        mzmin = aligned_chemicals['peak_mzmin'][aligned_index]
        mzmax = aligned_chemicals['peak_mzmax'][aligned_index]
        rtmin_check = event_query_rts > rtmin
        rtmax_check = event_query_rts < rtmax
        mzmin_check = event_query_mzs > mzmin
        mzmax_check = event_query_mzs < mzmax
        idx = np.nonzero(rtmin_check & rtmax_check & mzmin_check
                         & mzmax_check)[0]

        for i in idx:
            event = events[i]
            inten = controller.environment.mass_spec._get_intensity(
                event.chem, event.query_rt, 0, 0)
            if inten > min_acceptable_intensity:
                chemicals_found[aligned_index] = 1
                break
    return chemicals_found
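The return value is a 0/1 indicator per aligned chemical, so the fraction of aligned peaks that received at least one sufficiently intense MS2 event follows directly. A sketch with placeholder file names:

# Placeholder file names; min_acceptable_intensity reuses the 5000
# threshold seen elsewhere on this page.
param_dict = {
    'controller_directory': 'controller.p',
    'min_acceptable_intensity': 5000,
    'aligned_chemicals_location': 'aligned_peaks.csv',
}
found = fragmentation_performance_aligned(param_dict)
print('aligned coverage: %.3f' % (sum(found) / len(found)))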
Example No. 9
def run_vimms(no_injections, rt_box_size, mz_box_size):
    rt_range = [(0, 1440)]
    min_rt, max_rt = rt_range[0]
    ionisation_mode, isolation_width = POSITIVE, 1
    N, rt_tol, mz_tol, min_ms1_intensity = 10, 15, 10, 5000
    min_roi_intensity, min_roi_length, min_roi_length_for_fragmentation = \
        500, 3, 3
    grid = GridEstimator(
        LocatorGrid(min_rt, max_rt, rt_box_size, 0, 3000, mz_box_size),
        IdentityDrift())

    hmdbpath = os.path.join(os.path.abspath(os.getcwd()), "..", "..", "tests",
                            "fixtures", "hmdb_compounds.p")
    hmdb = load_obj(hmdbpath)
    df = DatabaseFormulaSampler(hmdb, min_mz=100, max_mz=1000)
    cm = ChemicalMixtureCreator(df, adduct_prior_dict={POSITIVE: {"M+H": 1}})
    chemicals = cm.sample(2000, 1)

    boxes = []
    for i in range(no_injections):
        mz_noise = GaussianPeakNoise(0.1)
        mass_spec = IndependentMassSpectrometer(POSITIVE, chemicals,
                                                mz_noise=mz_noise)
        controller = NonOverlapController(
            ionisation_mode, isolation_width, mz_tol, min_ms1_intensity,
            min_roi_intensity,
            min_roi_length, N, grid, rt_tol=rt_tol,
            min_roi_length_for_fragmentation=min_roi_length_for_fragmentation
        )
        env = Environment(mass_spec, controller, min_rt, max_rt,
                          progress_bar=True)
        set_log_level_warning()
        env.run()
        boxes.append(
            [r.to_box(0.01, 0.01) for r in controller.roi_builder.get_rois()])
    return boxes
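A short driver; the number of injections and the exclusion-box sizes below are illustrative values only:

# Illustrative call: three injections, 30 s by 0.01 m/z exclusion boxes.
boxes = run_vimms(no_injections=3, rt_box_size=30, mz_box_size=0.01)
print([len(b) for b in boxes])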
Example No. 10
def fragmentation_performance_chemicals(controller_directory,
                                        min_acceptable_intensity,
                                        controller_file_spec="*.p"):
    global total_matched_chemicals
    os.chdir(controller_directory)
    file_names = glob.glob(controller_file_spec)
    n_samples = len(file_names)
    controllers = []
    all_chemicals = []
    for controller_index in range(n_samples):
        controller = load_obj(file_names[controller_index])
        controllers.append(controller)
        all_chemicals.extend(controller.environment.mass_spec.chemicals)
    all_rts = [chem.rt for chem in all_chemicals]
    chemicals_found_total = np.unique(all_rts)
    sample_chemical_start_rts = [[] for i in range(n_samples)]
    sample_chemical_start_rts_total = []
    for i in range(n_samples):
        for event in controllers[i].mass_spec.fragmentation_events:
            if event.ms_level == 2:
                if controllers[i].mass_spec._get_intensity(
                        event.chem, event.query_rt, 0,
                        0) > min_acceptable_intensity:
                    sample_chemical_start_rts[i].append(event.chem.rt)
        sample_chemical_start_rts[i] = np.unique(
            np.array(sample_chemical_start_rts[i])).tolist()
        # at this point we have collected the RTs of all the chemicals that
        # have been fragmented above the min_acceptable_intensity threshold
        flatten_rts = []
        for l in sample_chemical_start_rts[0:(i + 1)]:
            flatten_rts.extend(l)
        sample_chemical_start_rts_total.append(
            len(np.unique(np.array(flatten_rts))))
        total_matched_chemicals = sample_chemical_start_rts_total
        logger.debug("Completed Controller {}".format(i + 1))
    return chemicals_found_total, total_matched_chemicals
Example No. 11
                        type=str)
    parser.add_argument('--spike_max',
                        dest='spike_max',
                        default=1000,
                        type=float)
    parser.add_argument('--output_swath_file',
                        dest='output_swath_file',
                        type=str,
                        default=None)
    parser.add_argument('--print_chems',
                        dest='print_chems',
                        action='store_true')

    args = parser.parse_args()

    formula_database = load_obj(args.formula_database_file)

    logger.debug("Loaded {} formulas".format(len(formula_database)))

    fs = DatabaseFormulaSampler(formula_database,
                                min_mz=args.min_mz,
                                max_mz=args.max_mz)

    ri = UniformRTAndIntensitySampler(
        min_rt=args.min_rt,
        max_rt=args.max_rt,
        min_log_intensity=np.log(args.min_ms1_sampling_intensity),
        max_log_intensity=np.log(args.max_ms1_sampling_intensity))
    cs = UniformMS2Sampler()

    cm = ChemicalMixtureCreator(fs,
Example No. 12
import os
from pathlib import Path

import pytest
from loguru import logger

from vimms.ChemicalSamplers import UniformMZFormulaSampler, UniformRTAndIntensitySampler, \
    GaussianChromatogramSampler, EvenMZFormulaSampler, ConstantChromatogramSampler, \
    MZMLFormulaSampler, MZMLRTandIntensitySampler, MZMLChromatogramSampler
from vimms.Chemicals import ChemicalMixtureCreator, ChemicalMixtureFromMZML
from vimms.Common import load_obj, set_log_level_warning, set_log_level_debug, \
    ADDUCT_DICT_POS_MH, ScanParameters
from vimms.Roi import RoiParams

# define some useful constants

DIR_PATH = os.path.dirname(os.path.realpath(__file__))
BASE_DIR = os.path.abspath(Path(DIR_PATH, 'fixtures'))
HMDB = load_obj(Path(BASE_DIR, 'hmdb_compounds.p'))
OUT_DIR = str(Path(DIR_PATH, 'results'))

ROI_SOURCES = [str(Path(BASE_DIR, 'beer_t10_simulator_files'))]
# MIN_MS1_INTENSITY = 1.75E5
MIN_MS1_INTENSITY = 1

MZ_RANGE = [(0, 1050)]
RT_RANGE = [(0, 1200)]
CENTRE_RANGE = 600
MIN_RT = RT_RANGE[0][0]
MAX_RT = RT_RANGE[0][1]
N_CHEMS = 10

BEER_CHEMS = load_obj(Path(BASE_DIR, 'QCB_22May19_1.p'))
BEER_MIN_BOUND = 550
Example No. 13
def calculate_performance(params):
    # get parameters
    fragfile = params['fragfile']
    N = params['N']
    rt_tol = params['rt_tol']
    roi_mz_tol = params['roi_mz_tol']
    roi_min_ms1_intensity = params['roi_min_ms1_intensity']
    fragmentation_min_ms1_intensity = params['fragmentation_min_ms1_intensity']
    min_rt = params['min_rt']
    max_rt = params['max_rt']
    roi_min_length = params['roi_min_length']
    fullscan_filename = params['fullscan_filename']
    P_peaks_df = params['P_peaks_df']
    Q_peaks_df = params['Q_peaks_df']
    matching_mz_tol = params['matching_mz_tol']
    matching_rt_tol = params['matching_rt_tol']
    scenario = params['scenario']

    controller_file = params['controller_file']
    chemicals_file = params['chemicals_file']

    if chemicals_file.endswith('.p'):
        print('Loading chemicals')
        chemicals = load_obj(chemicals_file)
    else:
        print('Extracting chemicals')
        chemicals = get_chemicals(chemicals_file, roi_mz_tol, roi_min_ms1_intensity, min_rt, max_rt,
                                  min_length=roi_min_length)

    if type(chemicals) == list:
        chemicals = np.array(chemicals)

    if controller_file.endswith('.p'):
        print('Loading fragmentation events')
        controller = load_obj(controller_file)
        chem_to_frag_events = None
    else:
        print('Extracting fragmentation events')
        controller = None
        precursor_df = get_precursor_info(controller_file)
        chem_to_frag_events = get_chem_to_frag_events(chemicals, precursor_df)

    # compute performance under each scenario
    print('Computing performance under scenario %d' % scenario)
    tp, fp, fn, prec, rec, f1 = 0, 0, 0, 0, 0, 0
    if scenario == 1:
        tp, fp, fn, prec, rec, f1 = compute_performance_scenario_1(controller, chemicals,
                                                                   fragmentation_min_ms1_intensity,
                                                                   fullscan_filename, P_peaks_df,
                                                                   matching_mz_tol, matching_rt_tol,
                                                                   chem_to_frag_events=chem_to_frag_events)
    elif scenario == 2:
        fragfile_filename = os.path.basename(fragfile)
        tp, fp, fn, prec, rec, f1 = compute_performance_scenario_2(controller, chemicals,
                                                                   fragmentation_min_ms1_intensity,
                                                                   fullscan_filename, fragfile_filename,
                                                                   P_peaks_df, Q_peaks_df, matching_mz_tol,
                                                                   matching_rt_tol,
                                                                   chem_to_frag_events=chem_to_frag_events)

    return N, rt_tol, scenario, tp, fp, fn, prec, rec, f1
Example No. 14
def main():
    global file_spectra
    parser = argparse.ArgumentParser(description='Limited dataset creation')
    parser.add_argument('input_file_names', type=str)
    parser.add_argument('library_cache', type=str)
    parser.add_argument('libraries', type=str, nargs='+')
    parser.add_argument('--score_thresh',
                        dest='score_thresh',
                        type=float,
                        default=0.7)
    parser.add_argument('--ms1_tol', dest='ms1_tol', type=float, default=1.)
    parser.add_argument('--ms2_tol', dest='ms2_tol', type=float, default=0.2)
    parser.add_argument('--min_matched_peaks',
                        dest='min_matched_peaks',
                        type=int,
                        default=1)
    parser.add_argument('--output_csv_file',
                        dest='output_csv_file',
                        type=str,
                        default='hits.csv')
    parser.add_argument('--log_level',
                        dest='log_level',
                        type=str,
                        default='warning')
    parser.add_argument('--mgf_id_field',
                        dest='mgf_id_field',
                        type=str,
                        default='SCANS')
    args = parser.parse_args()
    input_file_names = args.input_file_names
    if ',' in input_file_names:  # multiple items
        input_file_names = input_file_names.split(',')
    else:  # single item
        input_file_names = [input_file_names]
    assert len(input_file_names) > 0
    # assume all the files have the same extension as the first one
    first = input_file_names[0]
    root, ext = os.path.splitext(first)
    if ext.lower() == '.mzml':
        query_spectra = {}
        for input_file_name in input_file_names:
            # load the ms2 scans from the .mzML
            file_spectra = load_scans_from_mzml(input_file_name)
            logger.warning("Loaded {} MS2 spectra from {}".format(
                len(file_spectra), input_file_name))
            query_spectra[input_file_name] = file_spectra

    elif ext.lower() == '.mgf':
        query_spectra = {}
        for input_file_name in input_file_names:
            # load the ms2 scans from the .mgf
            file_spectra = load_mgf(input_file_name,
                                    id_field=args.mgf_id_field,
                                    spectra={})
            logger.warning("Loaded {} MS2 spectra from {}".format(
                len(file_spectra), input_file_name))
            query_spectra[input_file_name] = file_spectra
    else:
        logger.warning("Unknown input file format -- should be .mzML or .mgf")
        sys.exit(0)
    if args.log_level == 'warning':
        set_log_level_warning()
    elif args.log_level == 'debug':
        set_log_level_debug()
    libraries = args.libraries
    spec_libraries = {}
    if args.library_cache is not None:
        for library in libraries:
            # attempt to load library
            lib_file = os.path.join(args.library_cache, library + '.p')
            if os.path.isfile(lib_file):
                logger.warning("Loading {}".format(lib_file))
                spec_libraries[library] = load_obj(lib_file)
                logger.warning("Loaded {}".format(lib_file))
            else:
                logger.warning("Could not find {}".format(lib_file))
                sys.exit(0)
    else:
        logger.warning("You must supply a library folder")
        sys.exit(0)
    all_hits = []
    for input_file_name in query_spectra.keys():
        file_spectra = query_spectra[input_file_name]
        logger.warning('Processing {}'.format(input_file_name))
        for spec_id in tqdm(file_spectra.keys()):
            for library in spec_libraries:
                hits = spec_libraries[library].spectral_match(
                    file_spectra[spec_id],
                    score_thresh=args.score_thresh,
                    ms2_tol=args.ms2_tol,
                    ms1_tol=args.ms1_tol,
                    min_match_peaks=args.min_matched_peaks)
                for hit in hits:
                    new_hit = [
                        spec_id, library, hit[0], hit[1],
                        hit[2].metadata['inchikey']
                    ]
                    all_hits.append(new_hit)
    if len(all_hits) == 0:
        logger.warning("No hits found!")
    else:
        logger.warning('Writing output to {}'.format(args.output_csv_file))
        with open(args.output_csv_file, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(
                ['spec_id', 'library', 'hit_id', 'score', 'inchikey'])
            for hit in all_hits:
                writer.writerow(hit)

        # summary
        s, _, t, sc, ik = zip(*all_hits)
        logger.warning("{} unique spectra got hits".format(len(set(s))))
        logger.warning("{} unique structures were hit".format(
            len(set([a.split('-')[0] for a in ik if a is not None]))))