Exemplo n.º 1
0
    def setUp(self):
        """Build a one-peptide ``DataSet`` fixture.

        A single Gfap protein (accession P03995) is wrapped in a
        ``Proteins`` collection, the peptide ``QEADEATLAR`` is extracted
        from it and tagged with one N-terminal TMT6plex modification.
        The data set has six channels (labels 126-131) split into the
        groups ``base`` and ``stim``; the inserted peptide has a NaN
        reading in channel 128.
        """
        gfap = data_sets.Protein(
            accession='P03995',
            gene='Gfap',
            description='Glial fibrillary acidic protein',
            full_sequence=(
                'MERRRITSARRSYASETVVRGLGPSRQLGTMPRFSLSRMTPPLPARVDFSLAG'
                'ALNAGFKETRASERAEMMELNDRFASYIEKVRFLEQQNKALAAELNQLRAKEP'
                'TKLADVYQAELRELRLRLDQLTANSARLEVERDNFAQDLGTLRQKLQDETNLR'
                'LEAENNLAAYRQEADEATLARVDLERKVESLEEEIQFLRKIYEEEVRELREQL'
                'AQQQVHVEMDVAKPDLTAALREIRTQYEAVATSNMQETEEWYRSKFADLTDAA'
                'SRNAELLRQAKHEANDYRRQLQALTCDLESLRGTNESLERQMREQEERHARES'
                'ASYQEALARLEEEGQSLKEEMARHLQEYQDLLNVKLALDIEIATYRKLLEGEE'
                'NRITIPVQTFSNLQIRETSLDTKSVSEGHLKRNIVVKTVEMRDGEVIKDSKQE'
                'HKDVVM'
            ),
        )
        self.prots = data_sets.Proteins(proteins=(gfap,))
        self.seq = data_sets.extract_sequence(self.prots, 'QEADEATLAR')

        # The only modification: a TMT6plex tag flagged as N-terminal.
        nterm_tag = data_sets.Modification(
            rel_pos=0,
            mod_type='TMT6plex',
            nterm=True,
            sequence=self.seq,
        )
        self.mods = data_sets.Modifications(mods=[nterm_tag])
        self.seq.modifications = self.mods

        # Channel name -> reporter label, in insertion order.
        channel_names = ('low1', 'low2', 'low3', 'med', 'high', 'norm')
        channel_labels = ('126', '127', '128', '129', '130', '131')
        self.channels = OrderedDict(zip(channel_names, channel_labels))

        self.groups = OrderedDict([
            ('base', ['low1', 'low2', 'low3']),
            ('stim', ['med', 'high']),
        ])

        self.data = data_sets.DataSet(
            channels=self.channels,
            groups=self.groups,
        )

        # Single peptide row; channel 128 is deliberately a missing value.
        peptide_row = {
            'Proteins': self.prots,
            'Sequence': self.seq,
            'Modifications': self.mods,
            '126': 1e4,
            '127': 1e4,
            '128': np.nan,
            '129': 4e4,
            '130': 4e4,
            '131': 1e4,
        }
        self.data.add_peptide(peptide_row)
Exemplo n.º 2
0
 def setUp(self):
     """Build a doubly phosphorylated Gsk3b peptide and its n-mers.

     The peptide ``GEPNVsyICSR`` is matched to Gsk3b (Q9WV60) at
     ``rel_pos=209`` and carries Phospho modifications at peptide
     positions 5 and 6. The n-mers generated from it are stored once
     and reused as both foreground and background.
     """
     gsk3b = data_sets.Protein(
         accession="Q9WV60",
         gene="Gsk3b",
         description="Glycogen synthase kinase-3 beta",
         full_sequence=(
             "MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQGPD"
             "RPQEVSYTDTKVIGNGSFGVVYQAKLCDSGELVAIKKVLQDKRFKNREL"
             "QIMRKLDHCNIVRLRYFFYSSGEKKDEVYLNLVLDYVPETVYRVARHYS"
             "RAKQTLPVIYVKLYMYQLFRSLAYIHSFGICHRDIKPQNLLLDPDTAVL"
             "KLCDFGSAKQLVRGEPNVSYICSRYYRAPELIFGATDYTSSIDVWSAGC"
             "VLAELLLGQPIFPGDSGVDQLVEIIKVLGTPTREQIREMNPNYTEFKFP"
             "QIKAHPWTKVFRPRTPPEAIALCSRLLEYTPTARLTPLEACAHSFFDEL"
             "RDPNVKLPNGRDTPALFNFTTQELSSNPPLATILIPPHARIQAAASPPA"
             "NATAASDTNAGDRGQTNNAASASASNST"
         ),
     )
     protein_match = data_sets.ProteinMatch(
         protein=gsk3b,
         rel_pos=209,
         exact=True,
     )
     self.sequence = data_sets.Sequence(
         pep_seq="GEPNVsyICSR",
         protein_matches=(protein_match,),
     )

     # Peptide positions 5 and 6 correspond to S215-p and Y216-p.
     phospho_sites = tuple(
         data_sets.Modification(
             rel_pos=site,
             mod_type="Phospho",
             sequence=self.sequence,
         )
         for site in (5, 6)
     )
     self.sequence.modifications = data_sets.Modifications(phospho_sites)

     self.sequences = list(motif.generate_n_mers(self.sequence))
     # Foreground and background intentionally share the same list object.
     self.foreground = self.background = self.sequences
Exemplo n.º 3
0
 def setUp(self):
     """Build a doubly phosphorylated Gsk3b peptide fixture.

     The peptide ``GEPNVsyICSR`` is matched to Gsk3b (Q9WV60) at
     ``rel_pos=209`` and carries Phospho modifications at peptide
     positions 5 and 6.
     """
     gsk3b = data_sets.Protein(
         accession='Q9WV60',
         gene='Gsk3b',
         description='Glycogen synthase kinase-3 beta',
         full_sequence=(
             'MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQGPD'
             'RPQEVSYTDTKVIGNGSFGVVYQAKLCDSGELVAIKKVLQDKRFKNREL'
             'QIMRKLDHCNIVRLRYFFYSSGEKKDEVYLNLVLDYVPETVYRVARHYS'
             'RAKQTLPVIYVKLYMYQLFRSLAYIHSFGICHRDIKPQNLLLDPDTAVL'
             'KLCDFGSAKQLVRGEPNVSYICSRYYRAPELIFGATDYTSSIDVWSAGC'
             'VLAELLLGQPIFPGDSGVDQLVEIIKVLGTPTREQIREMNPNYTEFKFP'
             'QIKAHPWTKVFRPRTPPEAIALCSRLLEYTPTARLTPLEACAHSFFDEL'
             'RDPNVKLPNGRDTPALFNFTTQELSSNPPLATILIPPHARIQAAASPPA'
             'NATAASDTNAGDRGQTNNAASASASNST'
         ),
     )
     protein_match = data_sets.ProteinMatch(
         protein=gsk3b,
         rel_pos=209,
         exact=True,
     )
     self.sequence = data_sets.Sequence(
         pep_seq='GEPNVsyICSR',
         protein_matches=(protein_match,),
     )

     # Peptide positions 5 and 6 correspond to S215-p and Y216-p.
     phospho_sites = tuple(
         data_sets.Modification(
             rel_pos=site,
             mod_type='Phospho',
             sequence=self.sequence,
         )
         for site in (5, 6)
     )
     self.sequence.modifications = data_sets.Modifications(phospho_sites)
Exemplo n.º 4
0
def _get_proteins(df, cursor, pd_version):
    if pd_version[:2] in [(1, 4)]:
        prots = cursor.execute(
            '''
            SELECT
            Peptides.PeptideID,
            ProteinAnnotations.Description,
            Proteins.Sequence

            FROM Peptides

            JOIN PeptidesProteins
            ON Peptides.PeptideID=PeptidesProteins.PeptideID

            JOIN ProteinAnnotations
            ON ProteinAnnotations.ProteinID=PeptidesProteins.ProteinID

            JOIN Proteins
            ON Proteins.ProteinID=PeptidesProteins.ProteinID
            ''',
        )
    elif pd_version[:2] in [(2, 2)]:
        prots = cursor.execute(
            '''
            SELECT
            TargetPsms.PeptideID,
            TargetProteins.FastaTitleLines,
            TargetProteins.Sequence

            FROM TargetPsms

            JOIN TargetProteinsTargetPsms
            ON TargetPsms.PeptideID=
            TargetProteinsTargetPsms.TargetPsmsPeptideID

            JOIN TargetProteins
            ON TargetProteins.UniqueSequenceID=
            TargetProteinsTargetPsms.TargetProteinsUniqueSequenceID
            ''',
        )
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    accessions = defaultdict(list)
    genes = defaultdict(list)
    descriptions = defaultdict(list)
    sequences = defaultdict(list)

    for peptide_id, prot_string, seq in prots:
        for fasta_line in prot_string.split('\n'):
            try:
                accessions[peptide_id].append(
                    pypuniprot.RE_DISCOVERER_ACCESSION.match(fasta_line).group(2)
                )
            except:
                print(fasta_line)
                raise

            gene = RE_GENE.match(prot_string)

            if gene:
                gene = gene.group(1)
            else:
                gene = RE_GENE_BACKUP.match(prot_string).group(2)

            genes[peptide_id].append(
                gene
            )
            descriptions[peptide_id].append(
                RE_DESCRIPTION.match(prot_string).group(2)
            )
            sequences[peptide_id].append(seq)

    df['Protein Descriptions'] = df.index.map(
        lambda peptide_id:
        '; '.join(descriptions[peptide_id])
    )

    df['Protein Group Accessions'] = df.index.map(
        lambda peptide_id:
        '; '.join(accessions[peptide_id])
    )

    df['Proteins'] = df.index.map(
        lambda peptide_id:
        data_sets.Proteins(
            proteins=tuple(
                data_sets.Protein(
                    accession=accession,
                    gene=gene,
                    full_sequence=seq,
                    description=desc,
                )
                for accession, gene, seq, desc in zip(
                    accessions[peptide_id],
                    genes[peptide_id],
                    sequences[peptide_id],
                    descriptions[peptide_id],
                )
            )
        )
    )

    return df
Exemplo n.º 5
0
def _get_proteins(df, cursor, pd_version):
    if pd_version[:2] in [(1, 4)] or pd_version[:2] in [(2, 1)]:
        prots = cursor.execute(
            '''
            SELECT
            Peptides.PeptideID,
            ProteinAnnotations.Description,
            Proteins.Sequence

            FROM Peptides

            JOIN PeptidesProteins
            ON Peptides.PeptideID=PeptidesProteins.PeptideID

            JOIN ProteinAnnotations
            ON ProteinAnnotations.ProteinID=PeptidesProteins.ProteinID

            JOIN Proteins
            ON Proteins.ProteinID=PeptidesProteins.ProteinID
            ''', )
    elif pd_version[:2] in [(2, 2)]:
        prots = cursor.execute(
            '''
            SELECT
            TargetPsms.PeptideID,
            TargetProteins.FastaTitleLines,
            TargetProteins.Sequence

            FROM TargetPsms

            JOIN TargetProteinsTargetPsms
            ON TargetPsms.PeptideID=
            TargetProteinsTargetPsms.TargetPsmsPeptideID

            JOIN TargetProteins
            ON TargetProteins.UniqueSequenceID=
            TargetProteinsTargetPsms.TargetProteinsUniqueSequenceID
            ''', )
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version))

    accessions = defaultdict(list)
    genes = defaultdict(list)
    descriptions = defaultdict(list)
    sequences = defaultdict(list)

    for peptide_id, prot_string, seq in prots:
        for fasta_line in prot_string.split('\n'):
            accessions[peptide_id].append(fasta_line.replace(">", ""))
            gene = fasta_line.replace(">", "")
            genes[peptide_id].append(gene)
            descriptions[peptide_id].append("")
            sequences[peptide_id].append(seq)

    df['Protein Descriptions'] = df.index.map(
        lambda peptide_id: '; '.join(descriptions[peptide_id]))

    df['Protein Group Accessions'] = df.index.map(
        lambda peptide_id: '; '.join(accessions[peptide_id]))

    df['Proteins'] = df.index.map(
        lambda peptide_id: data_sets.Proteins(proteins=tuple(
            data_sets.Protein(
                accession=accession,
                gene=gene,
                full_sequence=seq,
                description=desc,
            ) for accession, gene, seq, desc in zip(
                accessions[peptide_id],
                genes[peptide_id],
                sequences[peptide_id],
                descriptions[peptide_id],
            ))))

    return df
Exemplo n.º 6
0
def _get_proteins(df, cursor):
    prots = cursor.execute(
        """
        SELECT
        Peptides.PeptideID,
        ProteinAnnotations.Description,
        Proteins.Sequence
        FROM Peptides
        JOIN PeptidesProteins
        ON Peptides.PeptideID=PeptidesProteins.PeptideID
        JOIN ProteinAnnotations
        ON ProteinAnnotations.ProteinID=PeptidesProteins.ProteinID
        JOIN Proteins
        ON Proteins.ProteinID=PeptidesProteins.ProteinID
        """,
    )

    accessions = defaultdict(list)
    genes = defaultdict(list)
    descriptions = defaultdict(list)
    sequences = defaultdict(list)

    for peptide_id, prot_string, seq in prots:
        accessions[peptide_id].append(
            pypuniprot.RE_DISCOVERER_ACCESSION.match(prot_string).group(1)
        )

        gene = RE_GENE.match(prot_string)

        if not gene:
            gene = RE_GENE_BACKUP.match(prot_string)

        genes[peptide_id].append(
            gene.group(1)
        )
        descriptions[peptide_id].append(
            RE_DESCRIPTION.match(prot_string).group(1)
        )
        sequences[peptide_id].append(seq)

    df["Protein Descriptions"] = df.index.map(
        lambda peptide_id:
        "; ".join(descriptions[peptide_id])
    )

    df["Protein Group Accessions"] = df.index.map(
        lambda peptide_id:
        "; ".join(accessions[peptide_id])
    )

    df["Proteins"] = df.index.map(
        lambda peptide_id:
        data_sets.Proteins(
            proteins=tuple(
                data_sets.Protein(
                    accession=accession,
                    gene=gene,
                    full_sequence=seq,
                    description=desc,
                )
                for accession, gene, seq, desc in zip(
                    accessions[peptide_id],
                    genes[peptide_id],
                    sequences[peptide_id],
                    descriptions[peptide_id],
                )
            )
        )
    )

    return df