Пример #1
0
def filter_unique_peptides(peptides, score, ns):
    """Keep, per peptide sequence, the best-scoring element seen.

    peptides -- iterable of elements that support ``xpath``. Each one is
        handed to ``formatting.clear_el`` once it has been inspected.
    score -- 'q', 'pep', 'p' or 'svm'; selects which child element holds
        the score. For 'svm' the larger value wins, for every other
        choice the smaller value wins. Ties keep the element seen first.
    ns -- namespace mapping passed to ``xpath`` and to the helpers.

    This is a generator. Nothing is yielded until ``peptides`` has been
    consumed completely; then, for every sequence, the value returned by
    ``formatting.stringify_strip_namespace_declaration`` for the winning
    element is yielded.
    """
    score_tags = {'q': 'q_value',
                  'pep': 'pep',
                  'p': 'p_value',
                  'svm': 'svm_score'}
    best = {}
    for el in peptides:
        score_nodes = el.xpath('xmlns:%s' % score_tags[score], namespaces=ns)
        featscore = float(score_nodes[0].text)
        seq = reader.get_peptide_seq(el, ns)

        current = best.get(seq)
        if current is None:
            # First time this sequence shows up: always store it.
            is_better = True
        elif score == 'svm':
            is_better = featscore > current['score']
        else:
            is_better = featscore < current['score']

        if is_better:
            # Stringify before clearing, the element is emptied below.
            best[seq] = {
                'pep_el': formatting.stringify_strip_namespace_declaration(
                    el, ns),
                'score': featscore}
        formatting.clear_el(el)

    for entry in best.values():
        yield entry['pep_el']
Пример #2
0
def _matches_whole_protein(pepseq, protseq, pos, enforce_tryp):
    """Decide whether pepseq counts as a match to the protein protseq."""
    if pepseq not in protseq:
        return False
    if not enforce_tryp:
        return True
    # With tryptic enforcement the peptide only counts when it is reported
    # at position 0, or when both its own last residue and the residue at
    # protseq[pos - 1] are K or R.
    return pos == 0 or (pepseq[-1] in ('K', 'R')
                        and protseq[pos - 1] in ('K', 'R'))


def filter_whole_proteins(elements, protein_fasta, lookup, seqtype, ns,
                          deamidation, minpeplen, enforce_tryp):
    """Yield only the elements whose sequences do not match a whole protein.

    elements -- iterable of elements; every element is cleared, either via
        ``formatting.clear_el`` (dropped) or via
        ``formatting.string_and_clear`` (kept and yielded).
    protein_fasta -- passed to ``fasta.parse_fasta``; each record's
        sequence is stored with L replaced by I, keyed on the record id.
    lookup -- object whose ``get_protein_from_pep`` is queried with the
        first ``minpeplen`` residues of each sequence and returns
        (protein id, position) pairs.
        NOTE(review): position is presumably the peptide's start index in
        the protein -- confirm against the lookup implementation.
    seqtype, ns, deamidation -- forwarded to ``get_seqs_from_element``.
    enforce_tryp -- when true, a substring hit only counts if the peptide
        is at position 0 or flanked by K/R as described in the helper.

    Raises KeyError if the lookup reports a protein id that is absent from
    ``protein_fasta``.
    """
    # Key on the protein id directly. Building a sequence -> id mapping
    # first and inverting it drops every id but one among proteins that
    # share a sequence, and a later lookup of a dropped id fails.
    whole_proteins = {
        prot.id: str(prot.seq).replace('L', 'I')
        for prot in fasta.parse_fasta(protein_fasta)
    }
    for element in elements:
        element_seqs = get_seqs_from_element(element, seqtype, ns, deamidation)
        # any() stops at the first hit, so no further lookups are made once
        # the element is known to match.
        seq_matches_protein = any(
            _matches_whole_protein(pepseq, whole_proteins[prot_id], pos,
                                   enforce_tryp)
            for pepseq in element_seqs
            for prot_id, pos in lookup.get_protein_from_pep(
                pepseq[:minpeplen]))
        if seq_matches_protein:
            formatting.clear_el(element)
        else:
            yield formatting.string_and_clear(element, ns)
Пример #3
0
def _is_whole_protein_hit(pepseq, protseq, pos, enforce_tryp):
    """Return True when pepseq should be treated as found in protseq."""
    if pepseq not in protseq:
        return False
    if not enforce_tryp:
        return True
    if pos == 0:
        # Reported at the very start of the protein.
        return True
    # Both the peptide's last residue and the residue at protseq[pos - 1]
    # have to be K or R.
    tryptic_residues = ('K', 'R')
    return (pepseq[-1] in tryptic_residues
            and protseq[pos - 1] in tryptic_residues)


def filter_whole_proteins(elements, protein_fasta, lookup, seqtype, ns,
                          deamidation, minpeplen, enforce_tryp):
    """Drop elements whose sequences are found in a whole protein sequence.

    Protein sequences are read through ``fasta.parse_fasta(protein_fasta)``
    and stored with L replaced by I. For each element the sequences from
    ``get_seqs_from_element`` are looked up via
    ``lookup.get_protein_from_pep`` using their first ``minpeplen``
    residues; that call returns (protein id, position) pairs.
    NOTE(review): position is presumably the peptide start index within
    the protein -- confirm against the lookup implementation.

    An element with at least one hit is cleared and not yielded. All other
    elements are yielded as the return value of
    ``formatting.string_and_clear``. With ``enforce_tryp`` set, a hit also
    requires position 0 or K/R at both checked residues.

    Raises KeyError if the lookup returns a protein id that is not present
    in ``protein_fasta``.
    """
    # Map id -> sequence in one step. Going through a sequence -> id dict
    # and inverting it loses ids of proteins with identical sequences,
    # which then fail in the id lookup below.
    whole_proteins = {}
    for prot in fasta.parse_fasta(protein_fasta):
        whole_proteins[prot.id] = str(prot.seq).replace('L', 'I')

    for element in elements:
        seq_matches_protein = False
        for pepseq in get_seqs_from_element(element, seqtype, ns,
                                            deamidation):
            for prot_id, pos in lookup.get_protein_from_pep(
                    pepseq[:minpeplen]):
                if _is_whole_protein_hit(pepseq, whole_proteins[prot_id],
                                         pos, enforce_tryp):
                    seq_matches_protein = True
                    break
            if seq_matches_protein:
                # One hit settles it, skip the remaining sequences.
                break
        if seq_matches_protein:
            formatting.clear_el(element)
        else:
            yield formatting.string_and_clear(element, ns)
Пример #4
0
def filter_peptide_length(features, elementtype, ns, minlen=0, maxlen=None):
    """Yield features whose stripped sequence length is within bounds.

    features -- iterable of elements; each one is cleared, either while
        being yielded (``formatting.string_and_clear``) or when rejected
        (``formatting.clear_el``).
    elementtype, ns -- forwarded to ``get_either_seq``.
    minlen -- inclusive lower bound, converted with ``int``.
    maxlen -- inclusive upper bound, converted with ``int``; ``None``
        means no upper bound.

    The length is measured on the result of ``strip_modifications``.
    """
    shortest = int(minlen)
    longest = float('inf') if maxlen is None else int(maxlen)
    for feat in features:
        bare_seq = strip_modifications(get_either_seq(elementtype, feat, ns))
        if shortest <= len(bare_seq) <= longest:
            yield formatting.string_and_clear(feat, ns)
        else:
            formatting.clear_el(feat)
Пример #5
0
def protein_header_split_generator(elements, headers, ns):
    """Yield only elements whose protein ids all match some header.

    For each element every ``protein_id`` child (in the ``ns['xmlns']``
    namespace) is tested with ``re.search`` against the patterns in
    ``headers``. As soon as one protein id matches none of the patterns
    the element is cleared and dropped. Otherwise the result of
    ``formatting.string_and_clear`` is yielded.
    """
    for el in elements:
        protein_ids = el.findall('{%s}protein_id' % ns['xmlns'])
        every_protein_matches = all(
            any(re.search(h, protein.text) for h in headers)
            for protein in protein_ids)
        if every_protein_matches:
            yield formatting.string_and_clear(el, ns)
        else:
            formatting.clear_el(el)
Пример #6
0
def filter_peptide_length(features, elementtype, ns, minlen=0, maxlen=None):
    """Pass on features with a sequence length between minlen and maxlen.

    Both bounds are inclusive and converted with ``int``. A ``maxlen`` of
    ``None`` disables the upper bound. The sequence comes from
    ``get_either_seq(elementtype, feat, ns)`` and is measured after
    ``strip_modifications``. Accepted features are yielded as the result
    of ``formatting.string_and_clear``; rejected ones are cleared with
    ``formatting.clear_el``.
    """
    lower = int(minlen)
    if maxlen is None:
        upper = float('inf')
    else:
        upper = int(maxlen)
    for feat in features:
        raw_seq = get_either_seq(elementtype, feat, ns)
        seqlen = len(strip_modifications(raw_seq))
        if seqlen < lower or seqlen > upper:
            # Outside the window: release the element and move on.
            formatting.clear_el(feat)
            continue
        yield formatting.string_and_clear(feat, ns)
Пример #7
0
def protein_header_split_generator(elements, headers, ns):
    """Filter elements on the headers of their proteins.

    Walks the ``protein_id`` children (namespace ``ns['xmlns']``) of each
    element. The first protein id for which no pattern in ``headers``
    gives a ``re.search`` hit causes the element to be cleared and
    skipped. Elements that pass are yielded as the result of
    ``formatting.string_and_clear``.
    """
    for el in elements:
        for protein in el.findall('{%s}protein_id' % ns['xmlns']):
            if not any(re.search(h, protein.text) for h in headers):
                # One non-matching protein is enough to discard.
                formatting.clear_el(el)
                break
        else:
            # Loop ran to completion: every protein matched a header.
            yield formatting.string_and_clear(el, ns)
Пример #8
0
def generate_xmltags(fn, returntag, ignore_tags, ns=None):
    """
    Base generator over the elements that etree.iterparse produces for fn.

    The prefix returned by ``create_namespace(ns)`` is put in front of
    ``returntag`` and of every entry in ``ignore_tags``. Elements whose
    tag equals the prefixed ``returntag`` are yielded as they are, they
    are not cleared here. Elements whose tag is one of the prefixed
    ``ignore_tags`` are cleared with ``formatting.clear_el``. All other
    elements are left untouched.
    """
    prefix = create_namespace(ns)
    cleared_tags = ['{0}{1}'.format(prefix, tag) for tag in ignore_tags]
    wanted_tag = '{0}{1}'.format(prefix, returntag)
    for _event, el in etree.iterparse(fn):
        if el.tag == wanted_tag:
            yield el
            continue
        if el.tag in cleared_tags:
            formatting.clear_el(el)
Пример #9
0
def filter_known_searchspace(elements, seqtype, lookup, ns, ntermwildcards,
                             deamidation):
    """Yield the elements for which no sequence exists in the lookup.

    For every element the sequences from ``get_seqs_from_element`` are
    checked one by one with ``lookup.check_seq_exists(seq,
    ntermwildcards)``. Checking stops at the first sequence that exists;
    such an element is cleared and dropped. Elements without any known
    sequence are yielded as the result of ``formatting.string_and_clear``.
    """
    for element in elements:
        candidate_seqs = get_seqs_from_element(element, seqtype, ns,
                                               deamidation)
        if any(lookup.check_seq_exists(seq, ntermwildcards)
               for seq in candidate_seqs):
            formatting.clear_el(element)
            continue
        yield formatting.string_and_clear(element, ns)
Пример #10
0
def filter_known_searchspace(elements, seqtype, lookup, ns, ntermwildcards,
                             deamidation):
    """Pass on elements none of whose sequences are in the known lookup.

    Each sequence from ``get_seqs_from_element(element, seqtype, ns,
    deamidation)`` is tested with ``lookup.check_seq_exists``. The first
    hit makes the element known: it is cleared and not yielded. When no
    sequence gives a hit, the result of ``formatting.string_and_clear``
    for the element is yielded.
    """
    for element in elements:
        for seq in get_seqs_from_element(element, seqtype, ns, deamidation):
            if lookup.check_seq_exists(seq, ntermwildcards):
                # Known sequence: discard without checking the rest.
                formatting.clear_el(element)
                break
        else:
            yield formatting.string_and_clear(element, ns)
Пример #11
0
def mzmlfn_ms2_spectra_generator(mzmlfiles):
    """Yield (filename, info dict) for every spectrum with ms level '2'.

    Spectra come from ``mzmlfn_spectra_generator(mzmlfiles)``. Spectra
    whose 'ms level' cvParam value is not the string '2' are skipped
    without being cleared here. The info dict has the keys 'scan', 'rt',
    'iit', 'mz' and 'charge'; 'rt' and 'iit' are the first item of the
    respective fetch result. The spectrum is cleared only after the
    consumer resumes the generator.
    """
    for fn, spec, ns in mzmlfn_spectra_generator(mzmlfiles):
        cvparams = get_all_cvparams(spec, ns)
        if fetch_cvparam_value_by_name(cvparams, 'ms level') != '2':
            continue
        scannr = get_spec_scan_nr(spec)
        rt = fetch_cvparams_values_from_subel(
            spec, 'scan', ['scan start time'], ns)
        iit = fetch_cvparams_values_from_subel(
            spec, 'scan', ['ion injection time'], ns)
        precursor = fetch_cvparams_values_from_subel(
            spec, 'selectedIon', ['selected ion m/z', 'charge state'], ns)
        mz, charge = precursor
        ms2info = {
            'scan': scannr,
            'rt': rt[0],
            'iit': iit[0],
            'mz': mz,
            'charge': charge,
        }
        yield fn, ms2info
        formatting.clear_el(spec)
Пример #12
0
def protein_header_split_generator(elements, ns, can_headers, headers):
    """Yield elements selected on their protein headers.

    The ``protein_id`` children (namespace ``ns['xmlns']``) of each
    element are scanned in order. Scanning stops at the first protein id
    that matches a pattern in ``can_headers`` (a canonical match). Protein
    ids seen before that point are also tested against ``headers``.

    An element is yielded (as the result of
    ``formatting.string_and_clear``) when either
    * no canonical match was found and at least one protein id matched
      ``headers``, or
    * a canonical match was found and ``headers == can_headers``.
    Every other element is cleared with ``formatting.clear_el``.
    """
    for el in elements:
        matches_canonical = False
        matches_header = False
        for protein in el.findall('{%s}protein_id' % ns['xmlns']):
            if any(re.search(h, protein.text) for h in can_headers):
                # A canonical match ends the scan immediately.
                matches_canonical = True
                break
            # For classes other than the known one: note whether at least
            # one protein matches the requested headers. Elements that
            # also match canonical proteins are not used.
            if any(re.search(h, protein.text) for h in headers):
                matches_header = True
        if matches_canonical:
            keep = headers == can_headers
        else:
            keep = matches_header
        if keep:
            yield formatting.string_and_clear(el, ns)
        else:
            formatting.clear_el(el)
Пример #13
0
def filter_unique_peptides(peptides, score, ns):
    """Reduce peptide elements to one entry per peptide sequence.

    The score of each element is read with ``xpath`` from the child named
    by ``score`` ('q' -> q_value, 'pep' -> pep, 'p' -> p_value,
    'svm' -> svm_score) and converted to float. The sequence comes from
    ``reader.get_peptide_seq``. Per sequence the element with the best
    score is remembered: the highest for 'svm', the lowest for all other
    score types. On equal scores the earlier element is kept.

    Every element is cleared after inspection. Once all of ``peptides``
    has been read, the remembered values (results of
    ``formatting.stringify_strip_namespace_declaration``) are yielded.
    """
    scores = {'q': 'q_value', 'pep': 'pep', 'p': 'p_value', 'svm': 'svm_score'}

    def improves(new, old):
        # svm scores are better when higher, the others when lower.
        return new > old if score == 'svm' else new < old

    kept = {}
    for el in peptides:
        score_el = el.xpath('xmlns:%s' % scores[score], namespaces=ns)[0]
        featscore = float(score_el.text)
        seq = reader.get_peptide_seq(el, ns)

        if seq not in kept or improves(featscore, kept[seq]['score']):
            stringified = formatting.stringify_strip_namespace_declaration(
                el, ns)
            kept[seq] = {'pep_el': stringified, 'score': featscore}
        formatting.clear_el(el)

    for record in kept.values():
        yield record['pep_el']
Пример #14
0
def mzmlfn_ms2_spectra_generator(mzmlfiles):
    """Yield (filename, info dict) for spectra whose ms level is '2'.

    Iterates ``mzmlfn_spectra_generator(mzmlfiles)`` and skips spectra
    whose 'ms level' cvParam value differs from the string '2'. For the
    remaining spectra a dict is yielded with the keys 'specscanid' (the
    spectrum's ``id`` attribute), 'ionmob', 'rt', 'iit', 'mz' and
    'charge', filled from the 'scan' and 'selectedIon' sub elements.
    The spectrum is cleared once the consumer asks for the next item.
    """
    scan_param_names = [
        'scan start time', 'ion injection time',
        'inverse reduced ion mobility'
    ]
    precursor_param_names = ['selected ion m/z', 'charge state']
    for fn, spec, ns in mzmlfn_spectra_generator(mzmlfiles):
        cvparams = get_all_cvparams(spec, ns)
        if fetch_cvparam_value_by_name(cvparams, 'ms level') != '2':
            continue
        specscanid = spec.attrib['id']
        scan_values = fetch_cvparams_values_from_subel(
            spec, 'scan', scan_param_names, ns)
        rt, iit, ionmob = scan_values
        precursor_values = fetch_cvparams_values_from_subel(
            spec, 'selectedIon', precursor_param_names, ns)
        mz, charge = precursor_values
        ms2info = {
            'specscanid': specscanid,
            'ionmob': ionmob,
            'rt': rt,
            'iit': iit,
            'mz': mz,
            'charge': charge
        }
        yield fn, ms2info
        formatting.clear_el(spec)
Пример #15
0
def target_decoy_generator(element_generator, decoy, ns):
    """Yield the elements whose ``decoy`` attribute equals ``decoy``.

    The attribute is read under the ``ns['xmlns']`` namespace. Matching
    elements are yielded as the result of
    ``formatting.string_and_clear``; all others are cleared with
    ``formatting.clear_el``.
    """
    for el in element_generator:
        if el.attrib['{%s}decoy' % ns['xmlns']] != decoy:
            formatting.clear_el(el)
            continue
        yield formatting.string_and_clear(el, ns)
Пример #16
0
def get_score(elements, ns, scoretype='svm_score'):
    """Yield the text of the ``scoretype`` child of every element.

    The child is located with ``xpath`` using the ``xmlns`` prefix and
    the namespace mapping ``ns``; the first result is used. The element
    is cleared with ``formatting.clear_el`` before its score text is
    yielded.
    """
    score_path = 'xmlns:{0}'.format(scoretype)
    for el in elements:
        score_node = el.xpath(score_path, namespaces=ns)[0]
        score = score_node.text
        formatting.clear_el(el)
        yield score
Пример #17
0
def get_score(elements, ns, scoretype='svm_score'):
    """Generate score texts, one per element.

    For each element the first ``xpath`` hit for ``xmlns:<scoretype>``
    (resolved with ``ns``) supplies the text that is yielded. The text is
    read first, then the element is cleared, then the text is yielded.
    """
    for el in elements:
        hits = el.xpath('xmlns:{0}'.format(scoretype), namespaces=ns)
        score_text = hits[0].text
        formatting.clear_el(el)
        yield score_text
Пример #18
0
def target_decoy_generator(element_generator, decoy, ns):
    """Select elements by the value of their namespaced decoy attribute.

    An element is yielded (as the result of
    ``formatting.string_and_clear``) when its ``decoy`` attribute in the
    ``ns['xmlns']`` namespace compares equal to ``decoy``. Every other
    element is cleared with ``formatting.clear_el`` and dropped.
    """
    for el in element_generator:
        decoy_attr = '{%s}decoy' % ns['xmlns']
        is_requested = el.attrib[decoy_attr] == decoy
        if is_requested:
            yield formatting.string_and_clear(el, ns)
        else:
            formatting.clear_el(el)