def describePrecursorIntensity(tandem_spectrum_metrics_MS2: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describePrecursorIntensity calculates the descriptive statistics metrics for the precursor intensities of tandem spectra.

    From the proto-metrics on tandem spectra, the function derives the extremes
    (maximum and minimum) of the precursor intensity distribution together with
    the statistics delivered by utils.extractDistributionStats, which are
    reported as quartiles, sigma, mean, and +/-1.5*IQR outliers.

    Parameters
    ----------
    tandem_spectrum_metrics_MS2 : mzqc.QualityMetric
        Proto-metric of tandem spectra containing values for 'precursor_intensity'

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    intensities = np.array(tandem_spectrum_metrics_MS2.value['precursor_intensity'])
    q1, q2, q3, sigma, mean, outliers = utils.extractDistributionStats(intensities)

    # (metric name, metric value) pairs in the order they are reported
    named_values = [
        ("Maximum precursor intensity", max(intensities)),
        ("Minmum precursor intensity", min(intensities)),
        ("Q1, Q2, Q3 of precursor intensities", [q1, q2, q3]),
        ("Sigma of precursor intensities", sigma),
        ("Mean of precursor intensities", mean),
        ("Precursor intensity +/-1.5*IQR outlier", outliers),
    ]

    metrics: List[mzqc.QualityMetric] = [
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000", name=metric_name, value=metric_value)
        for metric_name, metric_value in named_values
    ]
    return metrics
def getPeptideLengthMetrics(identification_sequence_metrics: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    getPeptideLengthMetrics calculates the descriptive statistics metrics for identified sequences' length

    From the proto-metrics on identification sequences, the function calculates
    descriptive statistics metrics for the distribution of peptide lengths.
    Each peptide string is first stripped of parenthesised annotations and of
    every character that is not an ASCII letter; the length of the remainder is
    used. The statistics (reported as quartiles, sigma, mean, and +/-1.5*IQR
    outliers) come from utils.extractDistributionStats.

    Parameters
    ----------
    identification_sequence_metrics : mzqc.QualityMetric
        QualityMetric with 'peptide' value, filtered for final outcome

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    # Parenthesised annotations such as '(Oxidation)'
    modification_pattern = re.compile(r'(\([^\(]*\))')
    # Anything that is not a letter (e.g. the leading '.' of terminal modifications)
    non_letter_pattern = re.compile(r'([^A-Za-z])')

    # TODO test this: '.(iTRAQ4plex)M(Oxidation)C(Carbamidomethyl)HNVNR'
    def _bare_length(annotated_sequence):
        without_mods = modification_pattern.sub('', annotated_sequence)
        return len(non_letter_pattern.sub('', without_mods))

    lengths = np.array([
        _bare_length(sequence)
        for sequence in identification_sequence_metrics.value['peptide']
    ])
    q1, q2, q3, sigma, mean, outliers = utils.extractDistributionStats(lengths)

    named_values = [
        ("Identified peptide lengths Q1, Q2, Q3", [q1, q2, q3]),
        ("Identified peptide lengths sigma", sigma),
        ("Identified peptide lengths mean", mean),
        ("Identified peptide lengths +/-1.5*IQR outlier", outliers),
    ]

    metrics: List[mzqc.QualityMetric] = [
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000", name=metric_name, value=metric_value)
        for metric_name, metric_value in named_values
    ]
    return metrics
def describeMSdensity(spectrum_acquisition_metrics_MS: mzqc.QualityMetric, start_time: datetime.datetime, ms_level: int) -> List[mzqc.QualityMetric]:
    """
    describeMSdensity calculates the descriptive statistics metrics for spectra's peak density from a given level.

    From the proto-metrics on spectrum acquisition for a given MS level, the
    function calculates descriptive statistics metrics for the distribution of
    peak density (the 'peakcount' values) from all involved mass spectra. The
    statistics come from utils.extractDistributionStats and are reported as
    quartiles, sigma, mean, and +/-1.5*IQR outliers.

    Parameters
    ----------
    spectrum_acquisition_metrics_MS : mzqc.QualityMetric
        Proto-metric containing 'peakcount' values for all involved spectra
    start_time : datetime.datetime
        MS run start time. Not needed for the peak density statistics; the
        parameter is kept so existing callers remain valid.
    ms_level : int
        The MS level considered; it is inserted into the metric names

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    metrics: List[mzqc.QualityMetric] = list()

    # The absolute acquisition timestamps (start_time + RT) were previously built
    # here but never used; that per-spectrum work has been dropped.
    arr = np.array(spectrum_acquisition_metrics_MS.value['peakcount'])
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Q1, Q2, Q3 of peak density for MS level {ms_level} collection".format(ms_level=ms_level),
                           value=[q1, q2, q3])
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Sigma of peak density for MS level {ms_level} collection".format(ms_level=ms_level),
                           value=s)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Mean of peak density for MS level {ms_level} collection".format(ms_level=ms_level),
                           value=m)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Peak density for MS level {ms_level} collection +/-1.5*IQR outlier".format(ms_level=ms_level),
                           value=ol)
    )

    return metrics
def describeMSCollectionTime(trap_metrics: mzqc.QualityMetric, ms_level: int) -> List[mzqc.QualityMetric]:
    """
    describeMSCollectionTime calculates the descriptive statistics metrics for ion collection times of spectra from a given level.

    From the proto-metrics on ion collection for a given MS level, the function
    calculates descriptive statistics metrics for the distribution of ion
    collection times ('traptime') from all involved mass spectra. The statistics
    come from utils.extractDistributionStats and are reported as quartiles,
    sigma, mean, and +/-1.5*IQR outliers.

    Parameters
    ----------
    trap_metrics : mzqc.QualityMetric
        The proto-metrics on ion collection times from the respective MS level
        containing 'traptime' values. A plain mapping with a 'traptime' key is
        accepted as well.
    ms_level : int
        The MS level considered; it is inserted into the metric names

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics
    """
    metrics: List[mzqc.QualityMetric] = list()

    # The proto-metric keeps its data table in `.value`. The previous code
    # subscripted the metric object itself; fall back to the argument when it
    # has no `.value` so callers passing the raw table keep working.
    trap_values = getattr(trap_metrics, 'value', trap_metrics)
    arr = np.array(trap_values['traptime'])
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Q1, Q2, Q3 for MS level {ms_level} trap time collection".format(ms_level=ms_level),
                           value=[q1, q2, q3])
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Sigma for MS level {ms_level} trap time collection".format(ms_level=ms_level),
                           value=s)
    )
    # NOTE(review): the next two names say "frequency" although the values are
    # trap time statistics - looks like a copy/paste leftover. Left unchanged
    # because consumers may look metrics up by name; confirm before renaming.
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Mean of frequency for MS level {ms_level} collection".format(ms_level=ms_level),
                           value=m)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Frequency for MS level {ms_level} collection +/-1.5*IQR outlier".format(ms_level=ms_level),
                           value=ol)
    )

    return metrics
def getEnzymeContaminationMetrics(pep, pro, force_enzymes=False) -> List[mzqc.QualityMetric]:
    """
    getEnzymeContaminationMetrics calculates enzyme and enzyme contamination metrics from the identifications given.

    Every peptide hit (flanked by the amino acids before and after it, taken
    from its first peptide evidence) is matched against the cleavage pattern of
    the digestion enzyme found in the search parameters of the first protein
    identification. Results of the top hit per spectrum are collected in a
    table ('native_id', 'matched', 'missed'); results of all lower rank hits are
    summarised separately. Missed cleavage counts are described via
    utils.extractDistributionStats.

    Matching against digestion enzyme patterns other than the enzyme used for
    the identification process is intended to be switched on with
    'force_enzymes', but that part is currently commented out and the flag is
    not evaluated.

    Parameters
    ----------
    pep : List[oms.PeptideIdentification]
        List of PyOpenMS PeptideIdentification as from reading a common identification file
    pro : List[oms.ProteinIdentification]
        List of PyOpenMS ProteinIdentification as from reading a common identification file
    force_enzymes : bool, optional
        Intended to force checking the identified peptide sequences against
        other known digestion enzyme patterns. Currently unused. By default False

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    metrics: List[mzqc.QualityMetric] = list()

    def _missed_cleavage_stats(counts, scope):
        # Describe a collection of missed cleavage counts; `scope` names the
        # group of identifications the counts belong to.
        q1, q2, q3, sigma, mean, outliers = utils.extractDistributionStats(np.array(counts))
        named_values = [
            ("Q1, Q2, Q3 of missed clevage counts for {scope}.".format(scope=scope), [q1, q2, q3]),
            ("Sigma of missed clevage counts for {scope}.".format(scope=scope), sigma),
            ("Mean of missed clevage counts for {scope}.".format(scope=scope), mean),
            ("Missed clevage count for {scope} +/-1.5*IQR outlier".format(scope=scope), outliers),
        ]
        return [
            mzqc.QualityMetric(cvRef="QC", accession="QC:0000000", name=metric_name, value=metric_value)
            for metric_name, metric_value in named_values
        ]

    # include all psm actually does not make much sense to assess the enzyme efficiency
    search_enzyme = pro[0].getSearchParameters().digestion_enzyme
    gre = {search_enzyme.getName(): re.compile(search_enzyme.getRegEx())}
    # Only the search enzyme is known here, so its pattern is the one to match against
    enzyme_pattern = next(iter(gre.values()))

    # TODO pyopenms wrappers for DigestionEnzymeDB etc
    # li: List = list()
    # oms.DigestionEnzymeDB().getAllNames(li)
    # ore = {e: re.compile(oms.DigestionEnzymeDB().getEnzyme(e).getRegEx()) for e in li
    #        if e not in gre and e != 'no cleavage'}

    enzymematch_tab: Dict[str, List[Any]] = defaultdict(list)
    missed_ranks = list()
    matched_ranks = list()
    # alt = dict()

    for pep_index, pep_id in enumerate(pep):
        pep_id.sort()
        # Fall back to the position in the list when no spectrum reference is annotated
        if pep_id.metaValueExists('spectrum_reference'):
            spec_id = pep_id.getMetaValue('spectrum_reference')
        else:
            spec_id = pep_index

        for rank, hit in enumerate(pep_id.getHits()):
            evidence = hit.getPeptideEvidences()[0]
            flanked_sequence = evidence.getAABefore() \
                + hit.getSequence().toUnmodifiedString() \
                + evidence.getAAAfter()

            is_matched, internal_matches = matchEnzyme(enzyme_pattern, flanked_sequence)

            if rank == 0:
                # Top hit of the spectrum goes into the per-spectrum table
                enzymematch_tab['native_id'].append(spec_id)
                enzymematch_tab['matched'].append(is_matched)
                enzymematch_tab['missed'].append(internal_matches)
            else:
                missed_ranks.append(internal_matches)
                matched_ranks.append(is_matched)

            # if force_enzymes or not is_matched:
            #     oth_enz_matched = {k: matchEnzyme(v, pepseq) for k,v in ore.items()}
            #     alt[spec_id] = oth_enz_matched

    if missed_ranks:
        metrics.extend(_missed_cleavage_stats(missed_ranks, "all lower rank identifications"))

    if matched_ranks:
        # Count occurrences per match level; levels that never occur count as 0
        match_counts: Dict[int, int] = defaultdict(int)
        levels, occurrences = np.unique(np.array(matched_ranks), return_counts=True)
        match_counts.update(dict(zip(levels, occurrences)))
        metrics.append(
            mzqc.QualityMetric(
                cvRef="QC",
                accession="QC:0000000",
                name="Match/semi/none counts for all lower rank identifications.",
                value=[match_counts[2], match_counts[1], match_counts[0]]))

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Missed cleavages",
                           value=enzymematch_tab))

    metrics.extend(_missed_cleavage_stats(enzymematch_tab['missed'], "top identifications"))

    return metrics
def describeIdentifiedPrecursorIntensity(tandem_spectrum_metrics_MS2: mzqc.QualityMetric, identification_accuracy_metrics: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describeIdentifiedPrecursorIntensity calculates the descriptive statistics metrics for precursor intensities of identified tandem spectra.

    Tandem spectra are regarded as identified when their 'precursor_mz' value
    also occurs among the 'MZ' values of the identifications. For the precursor
    intensities of those spectra the function reports maximum and minimum plus
    the statistics from utils.extractDistributionStats (reported as quartiles,
    sigma, mean, and +/-1.5*IQR outliers).

    Parameters
    ----------
    tandem_spectrum_metrics_MS2 : mzqc.QualityMetric
        The proto-metrics on tandem spectra containing 'RT', 'precursor_mz',
        'precursor_intensity', 'surveyscan_intensity_sum', 'surveyscan_intensity_max' values.
    identification_accuracy_metrics : mzqc.QualityMetric
        The proto-metrics on identification accuracies containing 'RT' and 'MZ' values

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics
    """
    # Fraction of total MS2 scans identified in the first quartile of peptides sorted by MS1 intensity (sum)
    spectra = tandem_spectrum_metrics_MS2.value
    precursor_columns = ('RT',
                         'precursor_mz',
                         'precursor_intensity',
                         'surveyscan_intensity_sum',
                         'surveyscan_intensity_max')
    # One row per column name, one column per tandem spectrum
    precursor_table = np.array([spectra[column] for column in precursor_columns])

    # DS-3A reimpl.: median( (surv max / prec int) for all ident. prec )
    identifications = identification_accuracy_metrics.value
    id_coord = np.array([identifications['RT'], identifications['MZ']])

    # TODO make sure intersection is round-proof
    _, precursor_indices, _ = np.intersect1d(precursor_table[1],
                                             id_coord[1],
                                             assume_unique=False,
                                             return_indices=True)
    identified_table = precursor_table[:, precursor_indices]
    # Row 2 holds 'precursor_intensity'
    intensities = identified_table[2]

    q1, q2, q3, sigma, mean, outliers = utils.extractDistributionStats(intensities)

    named_values = [
        ("Maximum identified precursor intensity", max(intensities)),
        ("Minmum identified precursor intensity", min(intensities)),
        ("Q1, Q2, Q3 of identified precursor intensities", [q1, q2, q3]),
        ("Sigma of identified precursor intensities", sigma),
        ("Mean of identified precursor intensities", mean),
        ("Precursor identified intensity +/-1.5*IQR outlier", outliers),
    ]

    metrics: List[mzqc.QualityMetric] = [
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000", name=metric_name, value=metric_value)
        for metric_name, metric_value in named_values
    ]
    return metrics
def describeErrorRates(identification_accuracy_metrics: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describeErrorRates calculates the descriptive statistics metrics for the mass errors of identified tandem spectra.

    From the proto-metrics on identification accuracy, the function reports the
    median and mean of 'abs_error', the median of 'delta_ppm', the interquartile
    range of 'delta_ppm', and the 'delta_ppm' statistics from
    utils.extractDistributionStats (reported as quartiles, sigma, mean, and
    +/-1.5*IQR outliers).

    If the proto-metric holds no 'delta_ppm' values, a warning is issued and an
    empty list is returned.

    Parameters
    ----------
    identification_accuracy_metrics : mzqc.QualityMetric
        The proto-metrics on identification accuracies containing 'delta_ppm' and 'abs_error' values.

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics
    """
    metrics: List[mzqc.QualityMetric] = list()

    # The error columns live in the metric's `.value` table, so the presence
    # check has to look there and not at the QualityMetric object itself.
    accuracy_values = identification_accuracy_metrics.value
    if 'delta_ppm' not in accuracy_values:
        warnings.warn("No error values in given annotation, ignoring identification error rate metrics.", Warning)
        return metrics

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="MS15A",
                           value=np.median(accuracy_values['abs_error']))
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="MS15B",
                           value=np.mean(accuracy_values['abs_error']))
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="MS15C",
                           value=np.median(accuracy_values['delta_ppm']))
    )

    arr = np.array(accuracy_values['delta_ppm'])
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    # MS15D is the interquartile range of delta ppm
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="MS15D",
                           value=q3 - q1)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Delta ppm Q1, Q2, Q3",
                           value=[q1, q2, q3])
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Delta ppm sigma",
                           value=s)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Delta ppm mean",
                           value=m)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Delta ppm +/-1.5*IQR outlier",
                           value=ol)
    )

    return metrics