Example #1
0
def make_stats_table(
    input_fname='',
    input_dataset=None,
    include_lcmsruns=None,
    exclude_lcmsruns=None,
    include_groups=None,
    exclude_groups=None,
    output_loc=None,
    min_peak_height=0,
    rt_tolerance=np.inf,
    ppm_tolerance=np.inf,
    min_msms_score=0,
    min_num_frag_matches=0,
    allow_no_msms=False,
    min_relative_frag_intensity=None,
    use_labels=False,
    return_all=False,
    msms_refs_loc='/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v2.tab',
    dependencies=None):
    """Build per-compound QC statistics tables from a metatlas dataset.

    For each metric ('msms_score', 'num_frag_matches', 'mz_centroid',
    'mz_ppm', 'rt_peak', 'rt_delta', 'peak_height', 'peak_area') a
    compounds x files dataframe is computed, thresholded against the given
    tolerances, and summarized (via ``DataFrame.describe``) into one wide
    stats table written to ``output_loc``.

    :param input_fname: path to a dill file; read only when input_dataset is empty
    :param input_dataset: pre-loaded metatlas dataset; takes precedence over input_fname
    :param include_lcmsruns: substring filters keeping matching lcmsruns
    :param exclude_lcmsruns: substring filters dropping matching lcmsruns
    :param include_groups: substring filters keeping matching groups
    :param exclude_groups: substring filters dropping matching groups
    :param output_loc: output directory (created if missing); required unless
        return_all is True
    :param min_peak_height: peak_height threshold for the passing masks
    :param rt_tolerance: max allowed |rt_peak - reference rt_peak|
    :param ppm_tolerance: max allowed m/z error in ppm
    :param min_msms_score: MSMS score threshold
    :param min_num_frag_matches: fragment-match-count threshold
    :param allow_no_msms: accepted for interface compatibility; unused in this body
    :param min_relative_frag_intensity: accepted for interface compatibility; unused in this body
    :param use_labels: use compound labels instead of compound names
    :param return_all: if True, return (stats_table, dfs, passing)
    :param msms_refs_loc: accepted for interface compatibility; unused in this body
    :param dependencies: metric -> list of metrics that must also pass for a
        value to survive filtering; defaults to the standard dependency map
    :return: (stats_table, dfs, passing) when return_all is True, else None
    :raises ValueError: if neither output_loc nor return_all is given
    """
    # Resolve mutable defaults per call (mutable default arguments are
    # shared across calls and a classic source of bugs).
    if dependencies is None:
        dependencies = {
            'peak_height': [],
            'peak_area': ['peak_height'],
            'rt_peak': ['peak_height', 'rt_delta'],
            'rt_delta': ['peak_height'],
            'mz_centroid': ['peak_height', 'mz_ppm'],
            'mz_ppm': ['peak_height'],
            'msms_score': ['peak_height', 'num_frag_matches'],
            'num_frag_matches': ['peak_height', 'msms_score']
        }

    # Explicit raise instead of assert: asserts are stripped under python -O.
    if output_loc is None and not return_all:
        raise ValueError('output_loc is required unless return_all is True')

    if input_dataset:
        metatlas_dataset = input_dataset
    else:
        metatlas_dataset = ma_data.get_dill_data(
            os.path.expandvars(input_fname))

    if output_loc is not None and not os.path.exists(output_loc):
        os.mkdir(output_loc)

    # Filter runs/groups from the metatlas dataset before computing metrics.
    if include_lcmsruns:
        metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_include_list(
            metatlas_dataset, 'lcmsrun', include_lcmsruns)
    if include_groups:
        metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_include_list(
            metatlas_dataset, 'group', include_groups)

    if exclude_lcmsruns:
        metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_exclude_list(
            metatlas_dataset, 'lcmsrun', exclude_lcmsruns)
    if exclude_groups:
        metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_exclude_list(
            metatlas_dataset, 'group', exclude_groups)

    file_names = ma_data.get_file_names(metatlas_dataset)
    compound_names = ma_data.get_compound_names(metatlas_dataset,
                                                use_labels=use_labels)[0]

    metrics = [
        'msms_score', 'num_frag_matches', 'mz_centroid', 'mz_ppm', 'rt_peak',
        'rt_delta', 'peak_height', 'peak_area'
    ]

    dfs = {m: None for m in metrics}
    # passing[m] is a compounds x files mask of 1.0 (pass) / 0.0 (fail);
    # failing cells are later turned into NaN so describe() ignores them.
    passing = {
        m: np.ones((len(compound_names), len(file_names))).astype(float)
        for m in metrics
    }

    for metric in ['peak_height', 'peak_area', 'rt_peak', 'mz_centroid']:
        dfs[metric] = dp.make_output_dataframe(input_dataset=metatlas_dataset,
                                               fieldname=metric,
                                               use_labels=use_labels)

    # Derived metrics start as all-NaN frames with the same index/columns.
    dfs['mz_ppm'] = dfs['peak_height'].copy()
    dfs['mz_ppm'] *= np.nan

    dfs['msms_score'] = dfs['mz_ppm'].copy()
    dfs['num_frag_matches'] = dfs['mz_ppm'].copy()
    dfs['rt_delta'] = dfs['mz_ppm'].copy()

    passing['peak_height'] = (np.nan_to_num(dfs['peak_height'].values) >=
                              min_peak_height).astype(float)

    msms_hits_df = dp.get_msms_hits(
        metatlas_dataset,
        use_labels,
        ref_index=['database', 'id', 'inchi_key', 'precursor_mz'])
    msms_hits_df.reset_index(inplace=True)

    for compound_idx, compound_name in enumerate(compound_names):
        identification = metatlas_dataset[0][compound_idx]['identification']
        ref_rt_peak = identification.rt_references[0].rt_peak
        mz_ref = identification.mz_references[0]
        ref_mz = mz_ref.mz

        dfs['rt_delta'].iloc[compound_idx] = abs(
            ref_rt_peak - dfs['rt_peak'].iloc[compound_idx])
        passing['rt_delta'][compound_idx] = (
            abs(ref_rt_peak -
                np.nan_to_num(dfs['rt_peak'].iloc[compound_idx].values)) <=
            rt_tolerance).astype(float)

        dfs['mz_ppm'].iloc[compound_idx] = 1e6 * (
            abs(ref_mz - dfs['mz_centroid'].iloc[compound_idx]) / ref_mz)
        passing['mz_ppm'][compound_idx] = (
            dfs['mz_ppm'].iloc[compound_idx].values <=
            ppm_tolerance).astype(float)

        inchi_key = identification.compound[0].inchi_key
        # Hits whose precursor m/z is within the reference's ppm tolerance.
        # This mask is file-independent, so compute it once per compound
        # instead of once per file.
        precursor_match = (
            abs(msms_hits_df['precursor_mz'].values.astype(float) - ref_mz) /
            ref_mz <= mz_ref.mz_tolerance * 1e-6)

        for file_idx, file_name in enumerate(file_names):
            rows = msms_hits_df[(msms_hits_df['inchi_key'] == inchi_key) &
                                (msms_hits_df['file_name'] == file_name) &
                                precursor_match]

            if len(rows) == 0:
                dfs['msms_score'].iat[compound_idx, file_idx] = np.nan
                dfs['num_frag_matches'].iat[compound_idx, file_idx] = np.nan
            else:
                best = rows.loc[rows['score'].idxmax()]
                dfs['msms_score'].iat[compound_idx, file_idx] = best['score']
                dfs['num_frag_matches'].iat[compound_idx,
                                            file_idx] = best['num_matches']

    passing['msms_score'] = (np.nan_to_num(dfs['msms_score'].values) >=
                             min_msms_score).astype(float)
    passing['num_frag_matches'] = (np.nan_to_num(
        dfs['num_frag_matches'].values) >= min_num_frag_matches).astype(float)

    for metric in metrics:
        passing[metric][passing[metric] == 0] = np.nan

    stats_table = []

    for metric in metrics:
        # np.prod: np.product is a deprecated alias removed in NumPy 2.0.
        # An empty dependency list yields a scalar 1.0, i.e. no extra mask.
        test = np.prod(np.array(
            [passing[dep] for dep in dependencies[metric]]),
                       axis=0)
        if output_loc is not None:
            (dfs[metric] * test).to_csv(os.path.join(
                output_loc, 'filtered_%s.tab' % metric),
                                        sep='\t')
        stats_df = (dfs[metric] * test * passing[metric]).T.describe().T
        stats_df['range'] = stats_df['max'] - stats_df['min']
        stats_df.columns = pd.MultiIndex.from_product([['filtered'], [metric],
                                                       stats_df.columns])
        stats_table.append(stats_df)

    for metric in metrics:
        if output_loc is not None:
            dfs[metric].to_csv(os.path.join(output_loc,
                                            'unfiltered_%s.tab' % metric),
                               sep='\t')
        stats_df = dfs[metric].T.describe().T
        stats_df['range'] = stats_df['max'] - stats_df['min']
        stats_df.columns = pd.MultiIndex.from_product([['unfiltered'],
                                                       [metric],
                                                       stats_df.columns])
        stats_table.append(stats_df)

    stats_table = pd.concat(stats_table, axis=1)

    if output_loc is not None:
        stats_table.to_csv(os.path.join(output_loc, 'stats_table.tab'),
                           sep='\t')

        # Record the filter parameters next to the table. An explicit
        # name/value list replaces the original eval()-based lookup.
        params = [('dependencies', dependencies),
                  ('min_peak_height', min_peak_height),
                  ('rt_tolerance', rt_tolerance),
                  ('ppm_tolerance', ppm_tolerance),
                  ('min_msms_score', min_msms_score),
                  ('min_num_frag_matches', min_num_frag_matches)]
        with open(os.path.join(output_loc, 'stats_table.readme'),
                  'w') as readme:
            for name, value in params:
                readme.write('%s\n' % name)
                try:
                    # Infinite tolerances mean "no filtering": label 'default'.
                    if np.isinf(value):
                        pprint.pprint('default', readme)
                    else:
                        pprint.pprint(value, readme)
                except TypeError:
                    # np.isinf rejects non-numeric values (e.g. the dict).
                    pprint.pprint(value, readme)
                readme.write('\n')

    if return_all:
        return stats_table, dfs, passing
Example #2
0
def make_scores_df(metatlas_dataset):
    """
    Returns pandas dataframe with columns 'max_intensity', 'median_rt_shift','median_mz_ppm', 'max_msms_score',
    'num_frag_matches', and 'max_relative_frag_intensity', rows of compounds in metatlas_dataset, and values
    of the best "score" for a given compound across all files.

    'max_intensity': highest intensity across all files for given compound
    'median_rt_shift': median shift of RT across all files for given compound to reference
    'median_mz_ppm': median ppm of mz across all files for given compound relative to reference
    'max_msms_score': highest compound dot-product score across all files for given compound relative to reference
    'num_frag_matches': number of matching mzs when calculating max_msms_score
    'max_relative_frag_intensity': ratio of second highest to first highest intensity of matching sample mzs

    :param metatlas_dataset: list (files) of lists (compounds) of dicts as
        produced by the metatlas pipeline

    :return scores_df: pandas dataframe
    """

    file_names = ma_data.get_file_names(metatlas_dataset)
    compound_names = ma_data.get_compound_names(metatlas_dataset)[0]

    scores = []

    msms_hits_df = dp.get_msms_hits(
        metatlas_dataset,
        ref_index=['database', 'id', 'inchi_key', 'precursor_mz'])
    msms_hits_df.reset_index(inplace=True)

    for compound_idx in range(len(compound_names)):
        intensities = []
        rt_shifts = []
        mz_ppms = []
        # Default to NaN when a compound has no usable MSMS hits.
        max_msms_score = np.nan
        num_frag_matches = np.nan
        max_relative_frag_intensity = np.nan

        identification = metatlas_dataset[0][compound_idx]['identification']
        compound_ref_rt_peak = identification.rt_references[0].rt_peak
        mz_ref = identification.mz_references[0]
        compound_ref_mz = mz_ref.mz
        inchi_key = identification.compound[0].inchi_key

        if len(msms_hits_df) == 0:
            comp_msms_hits = msms_hits_df
        else:
            # Hits for this compound whose precursor m/z is within the
            # reference's ppm tolerance.
            ppm_error = (
                abs(msms_hits_df['precursor_mz'].values.astype(float) -
                    compound_ref_mz) / compound_ref_mz)
            comp_msms_hits = msms_hits_df[
                (msms_hits_df['inchi_key'] == inchi_key) &
                (ppm_error <= mz_ref.mz_tolerance * 1e-6)]

        for file_idx in range(len(file_names)):
            # Files with missing or empty ms1_summary data are skipped.
            # 'except Exception' (not a bare except) so KeyboardInterrupt /
            # SystemExit still propagate.
            try:
                peak_height = metatlas_dataset[file_idx][compound_idx][
                    'data']['ms1_summary']['peak_height']
                if peak_height > 0:
                    intensities.append(peak_height)
            except Exception:
                pass

            try:
                summary = metatlas_dataset[file_idx][compound_idx]['data'][
                    'ms1_summary']
                if summary['num_ms1_datapoints'] > 0:
                    rt_shifts.append(
                        abs(compound_ref_rt_peak - summary['rt_peak']))
                    mz_ppms.append(
                        1e6 * (abs(compound_ref_mz - summary['mz_centroid']) /
                               compound_ref_mz))
            except Exception:
                pass

        if len(comp_msms_hits['score']) > 0:
            best = comp_msms_hits.loc[comp_msms_hits['score'].idxmax()]
            max_msms_score = best['score']
            num_frag_matches = best['num_matches']

            if num_frag_matches > 1:
                msv_sample_matches = sp.partition_aligned_ms_vectors(
                    best['msv_query_aligned'], best['msv_ref_aligned'])[0]
                # Sort matched fragments by intensity, ascending. (The
                # original code also produced a descending sort whose
                # result was immediately discarded; that dead work is
                # removed here.)
                by_intensity = msv_sample_matches[:,
                                                  msv_sample_matches[1].
                                                  argsort()]
                # Ratio of second-highest to highest matched intensity.
                max_relative_frag_intensity = (by_intensity[1, -2] /
                                               by_intensity[1, -1])

        # nanmax/nanmedian raise ValueError on empty inputs.
        try:
            max_intensity = np.nanmax(intensities)
        except ValueError:
            max_intensity = np.nan
        try:
            median_rt_shift = np.nanmedian(rt_shifts)
        except ValueError:
            median_rt_shift = np.nan
        try:
            median_mz_ppm = np.nanmedian(mz_ppms)
        except ValueError:
            median_mz_ppm = np.nan

        # Assemble this compound's score row.
        scores.append([
            identification.compound[0].name, inchi_key, max_intensity,
            median_rt_shift, median_mz_ppm, max_msms_score, num_frag_matches,
            max_relative_frag_intensity
        ])

    scores_df = pd.DataFrame(scores,
                             columns=[
                                 'name', 'inchi_key', 'max_intensity',
                                 'median_rt_shift', 'median_mz_ppm',
                                 'max_msms_score', 'num_frag_matches',
                                 'max_relative_frag_intensity'
                             ])

    return scores_df