Пример #1
0
 def test_amino_acid_composition(self):
     """Check terminal-residue labelling of ``parser.amino_acid_composition``.

     For every sequence in ``self.simple_sequences`` the composition is
     computed twice (with ``term_aa=True`` and with the default settings),
     then the test verifies that:

     * the ``'nterm' + <first residue>`` entry equals 1,
     * the ``'cterm' + <last residue>`` entry equals 1 when the sequence
       has more than one residue,
     * both compositions add up to the same total count.
     """
     for sequence in self.simple_sequences:
         with_termini = parser.amino_acid_composition(
             sequence, term_aa=True, labels=uppercase)
         without_termini = parser.amino_acid_composition(
             sequence, labels=uppercase)

         n_terminal_key = 'nterm' + sequence[0]
         self.assertEqual(1, with_termini[n_terminal_key])

         # The C-terminal entry is only checked for sequences that have
         # more than one residue.
         if len(sequence) > 1:
             c_terminal_key = 'cterm' + sequence[-1]
             self.assertEqual(1, with_termini[c_terminal_key])

         default_total = sum(without_termini.values())
         termini_total = sum(with_termini.values())
         self.assertEqual(default_total, termini_total)
 def add_aa_counts(self):
     """Append one ``'<label> count'`` column per amino-acid label.

     The composition of every entry in ``self.data_frame['sequence']`` is
     computed with ``parser.amino_acid_composition``.  Labels that do not
     occur in a given sequence get a count of 0.  The resulting integer
     columns are concatenated to ``self.data_frame`` (which is rebound to
     the widened frame).
     """
     compositions = [
         parser.amino_acid_composition(sequence)
         for sequence in self.data_frame['sequence']
     ]
     # Missing labels come back as NaN (forcing float columns).  The
     # ``downcast`` keyword of ``fillna`` is deprecated in recent pandas, so
     # the integer dtype of the counts is restored explicitly instead.
     aa_counts = pd.DataFrame.from_records(compositions).fillna(0).astype(
         'int64')
     aa_counts.columns = [
         '{} count'.format(column) for column in aa_counts.columns
     ]
     # ``from_records`` yields a fresh 0..n-1 index; reuse the index of the
     # source frame so that ``concat`` pairs rows by position even when
     # ``self.data_frame`` has been filtered or re-indexed.
     aa_counts.index = self.data_frame.index
     self.data_frame = pd.concat([self.data_frame, aa_counts], axis=1)
 def add_relative_counts(self):
     """Append one ``'<label> relative_count'`` column per amino-acid label.

     For every entry in ``self.data_frame['sequence']`` each count returned
     by ``parser.amino_acid_composition`` is divided by
     ``parser.length(sequence)``.  Labels that do not occur in a sequence
     get 0.  The resulting columns are concatenated to ``self.data_frame``
     (which is rebound to the widened frame).
     """
     record_list = []
     for sequence in self.data_frame['sequence']:
         composition = parser.amino_acid_composition(sequence)
         # The length is the same for every label of this sequence, so it
         # is computed once instead of once per label.
         sequence_length = float(parser.length(sequence))
         record_list.append({
             label: count / sequence_length
             for label, count in composition.items()
         })
     # Relative counts are fractions, so the columns are kept as floats.
     # The deprecated ``downcast`` keyword of ``fillna`` is not used.
     aa_counts = pd.DataFrame.from_records(record_list).fillna(0)
     aa_counts.columns = [
         '{} relative_count'.format(column) for column in aa_counts.columns
     ]
     # ``from_records`` yields a fresh 0..n-1 index; reuse the index of the
     # source frame so that ``concat`` pairs rows by position even when
     # ``self.data_frame`` has been filtered or re-indexed.
     aa_counts.index = self.data_frame.index
     self.data_frame = pd.concat([self.data_frame, aa_counts], axis=1)
Пример #4
0
    def test_modparser(self):
        """Check ``PF.extract_modifications`` against a reference CSV.

        Reads ``data/TestModParser.csv``, passes its ``Sequence`` column
        through ``PF.remove_brackets`` and ``PF.replace_numbers``, extracts
        the modifications and compares the first element of every extracted
        entry with the ``Mods`` column.  The composition of every entry in
        the ``NewSeqs`` column is printed along the way.  The test passes
        when all rows match.
        """
        frame = pd.read_csv("data/TestModParser.csv")

        frame["Sequence"] = (frame["Sequence"]
                             .apply(PF.remove_brackets)
                             .apply(PF.replace_numbers))

        mod_dic, mods_seq = PF.extract_modifications(frame["Sequence"], True)
        frame["detected"] = [entry[0] for entry in mods_seq]
        frame["Test"] = [
            bool(expected == found)
            for expected, found in zip(frame["Mods"], frame["detected"])
        ]

        # Extra labels for the composition call: each dictionary value
        # followed by its key.
        mod_labels = [value + key for key, value in mod_dic.items()]
        for new_sequence in frame["NewSeqs"]:
            print(new_sequence)
            composition = parser.amino_acid_composition(
                new_sequence, labels=parser.std_labels + mod_labels)
            print(composition)

        # Every detected modification has to equal the expected one.
        assert frame["Test"].all()
Пример #5
0
 def test_amino_acid_composition_simple(self):
     """Check that composition counts equal plain character counts.

     For every sequence in ``self.simple_sequences`` and every distinct
     character in it, the value reported by
     ``parser.amino_acid_composition`` must equal ``str.count`` of that
     character.
     """
     for sequence in self.simple_sequences:
         composition = parser.amino_acid_composition(sequence,
                                                     labels=uppercase)
         distinct_residues = set(sequence)
         for residue in distinct_residues:
             expected = sequence.count(residue)
             self.assertEqual(expected, composition[residue])
Пример #6
0
def get_AA_matrix(sequences,
                  pos_specific=False,
                  ntermini=5,
                  lcp=1,
                  mods=0,
                  correct=False,
                  residues=parser.std_amino_acids):
    """
    Count the amino acids in peptide sequences. Counting uses the
    pyteomics ``parser.amino_acid_composition`` function, one call per
    sequence.

    With ``pos_specific=True`` additional position-specific columns are
    generated for the first ``ntermini`` residues from the N-terminus
    ("N<res><i>", i = 0..ntermini-1) and for the residues 1..ntermini-1
    counted from the C-terminus ("C<res><i>"; the very last residue, i = 0,
    is deliberately not counted). The plain per-residue counts are then
    reduced by the position-specific counts so that they only describe the
    remaining (internal) residues.

    Parameters:
    -----------------------------------
    sequences: iterable of str,
               peptide sequences, one row is generated per sequence

    pos_specific: bool,
                  if False (default) only the plain composition is
                  returned; note that ``correct`` is NOT applied in that
                  case.

    ntermini: int,
              number of terminal positions to encode. For peptides shorter
              than 2 * ntermini only floor(len / 2) positions per terminus
              are used so that both windows never overlap.

    lcp: float,
         length correction parameter, see ``correct``

    mods: int,
          currently unused, kept for backward compatibility

    correct: bool,
             if True (and pos_specific is True) every row is multiplied by
             1 + lcp * ln(len(sequence)), see the pyteomics documentation
             for details ("lcp").

    residues: iterable of str,
              residues for which position-specific columns are generated

    Notes:
    -----------------------------------
    The terminal windows index the raw sequence string character by
    character and ``len(sequence)`` is the raw string length.

    Examples:
    -----------------------------------
    #modification and termini supporting
    >>mystr = "nAAAAAAAAAAAAAAAGAAGcK"

    #just aa composition
    >>mystr = "AAAAAAAAAAAAAAAGAAGK"

    Returns:
    --------------------------------------
    df: dataframe with amino acid count columns, one row per sequence in
        input order (default 0..n-1 index)
    """
    seqs = list(sequences)

    # plain composition; labels missing in a sequence are NaN -> 0
    aa_counts = [parser.amino_acid_composition(seq) for seq in seqs]
    aa_count_df = pd.DataFrame(aa_counts).replace(np.nan, 0)
    if not pos_specific:
        return aa_count_df

    residue_set = set(residues)

    # column layout: N<res>0..N<res>(ntermini-1), C<res>1..C<res>(ntermini-1)
    n_labels = ["N" + res + str(i)
                for res in residues for i in range(0, ntermini)]
    c_labels = ["C" + res + str(i)
                for res in residues for i in range(1, ntermini)]
    columns = sorted(set(n_labels + c_labels))
    column_pos = {label: k for k, label in enumerate(columns)}

    # Collect the counts in a plain array (row = position of the sequence in
    # the input). This avoids per-cell DataFrame updates and any dependency
    # on the index labels of the input.
    counts = np.zeros((len(seqs), len(columns)))
    for row, seq in enumerate(seqs):
        # shorten the windows for short peptides so they cannot overlap
        window = min(ntermini, len(seq) // 2)
        for i in range(window):
            head = seq[i]
            if head in residue_set:
                counts[row, column_pos["N" + head + str(i)]] += 1

            # the last amino acid (i == 0) is usually K/R, so no
            # C-terminal feature is generated for it
            tail = seq[-i - 1]
            if i != 0 and tail in residue_set:
                counts[row, column_pos["C" + tail + str(i)]] += 1

    count_df = pd.DataFrame(counts, columns=columns, index=aa_count_df.index)
    new_df = aa_count_df.join(count_df)

    # correct the plain counts by subtracting the position-specific counts
    for res in residues:
        # a residue that never occurs has no plain-count column (and only
        # zero position counts), so there is nothing to correct
        if res not in new_df.columns:
            continue
        terminal = new_df.filter(regex=r"(N|C){}\d".format(res))
        new_df[res] = new_df[res] - terminal.sum(axis=1)

    # multiply each row by a length correction term, see pyteomics docu
    # for details ("lcp")
    if correct:
        lengths = np.array([len(seq) for seq in seqs], dtype=float)
        cfactor = pd.Series(1. + lcp * np.log(lengths), index=new_df.index)
        new_df = new_df.mul(cfactor, axis=0)

    return new_df.replace(np.nan, 0)
Пример #7
0
def handcrafted_features(data, tags):
    """Build the handcrafted feature table for the alpha and beta chains.

    For each chain prefix (``'tra'``, then ``'trb'``) the following frames
    are generated from ``data`` in this order:

    1. one-hot encoding of ``<chain>_vgene`` / ``<chain>_jgene``
    2. ``length`` of ``<chain>_cdr3`` (``parser.length``)
    3. amino acid counts (``<chain>_count_<label>``)
    4. ``avg_basicity``, ``avg_hydrophobicity``, ``avg_helicity``, ``pI``,
       ``avg_mutation_stability`` and ``mass``
    5. positional features relative to the sequence centre: residue
       occurrence, basicity, hydrophobicity, helicity, pI and mutation
       stability (``<chain>_pos_<position>_...``)

    Afterwards ``data['weights']``, ``data['labels_<tag>']`` for every tag
    in ``tags`` and ``data['split']`` are appended and everything is
    concatenated column-wise.

    Note that the scalar feature names of step 2 and 4 carry no chain
    prefix, so they occur once per chain in the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the columns listed above.
    tags : iterable of str
        Suffixes of the ``labels_<tag>`` columns to carry over.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of all generated frames.
    """

    #
    # DOI 10.1007/s00251-017-1023-5
    # Code from https://github.com/bittremieux/TCR-Classifier/blob/master/tcr_classifier.ipynb
    # Modified to apply handcrafted features twice, once to the alpha chain and again to the beta chain
    # Modified to handle split for training, validation, and test cohorts
    # Modified for multinomial classification
    #

    # physicochemical amino acid properties
    basicity = {
        'A': 206.4, 'B': 210.7, 'C': 206.2, 'D': 208.6, 'E': 215.6,
        'F': 212.1, 'G': 202.7, 'H': 223.7, 'I': 210.8, 'K': 221.8,
        'L': 209.6, 'M': 213.3, 'N': 212.8, 'P': 214.4, 'Q': 214.2,
        'R': 237.0, 'S': 207.6, 'T': 211.7, 'V': 208.7, 'W': 216.1,
        'X': 210.2, 'Y': 213.1, 'Z': 214.9,
    }

    hydrophobicity = {
        'A': 0.16, 'B': -3.14, 'C': 2.50, 'D': -2.49, 'E': -1.50,
        'F': 5.00, 'G': -3.31, 'H': -4.63, 'I': 4.41, 'K': -5.00,
        'L': 4.76, 'M': 3.23, 'N': -3.79, 'P': -4.92, 'Q': -2.76,
        'R': -2.77, 'S': -2.85, 'T': -1.08, 'V': 3.02, 'W': 4.88,
        'X': 4.59, 'Y': 2.00, 'Z': -2.13,
    }

    helicity = {
        'A': 1.24, 'B': 0.92, 'C': 0.79, 'D': 0.89, 'E': 0.85,
        'F': 1.26, 'G': 1.15, 'H': 0.97, 'I': 1.29, 'K': 0.88,
        'L': 1.28, 'M': 1.22, 'N': 0.94, 'P': 0.57, 'Q': 0.96,
        'R': 0.95, 'S': 1.00, 'T': 1.09, 'V': 1.27, 'W': 1.07,
        'X': 1.29, 'Y': 1.11, 'Z': 0.91,
    }

    # no entries for 'B', 'X' and 'Z' in this table
    mutation_stability = {
        'A': 13, 'C': 52, 'D': 11, 'E': 12, 'F': 32,
        'G': 27, 'H': 15, 'I': 10, 'K': 24, 'L': 34,
        'M': 6, 'N': 6, 'P': 20, 'Q': 10, 'R': 17,
        'S': 10, 'T': 11, 'V': 17, 'W': 55, 'Y': 31,
    }

    def residue_mean(table):
        # Build a function that averages the table values of a sequence
        # over its parsed length.
        def mean_of(seq):
            return sum([table[aa] for aa in seq]) / parser.length(seq)

        return mean_of

    def sequence_feature(column, func, name):
        # Apply func to every entry of data[column] and return the result
        # as a one-column frame called name.
        return data[column].apply(func).to_frame().rename(
            columns={column: name})

    # scalar features that follow the amino acid counts, in output order
    scalar_features = [
        ('avg_basicity', residue_mean(basicity)),
        ('avg_hydrophobicity', residue_mean(hydrophobicity)),
        ('avg_helicity', residue_mean(helicity)),
        ('pI', electrochem.pI),
        ('avg_mutation_stability', residue_mean(mutation_stability)),
        ('mass', mass.fast_mass),
    ]

    # per-residue positional properties: (key template, value function)
    positional_properties = [
        ('_pos_{}_basicity', basicity.__getitem__),
        ('_pos_{}_hydrophobicity', hydrophobicity.__getitem__),
        ('_pos_{}_helicity', helicity.__getitem__),
        ('_pos_{}_pI', electrochem.pI),
        ('_pos_{}_mutation_stability', mutation_stability.__getitem__),
    ]

    # feature conversion and generation
    features_list = []

    for chain in ['tra', 'trb']:
        # one-hot encoded V and J genes
        onehot_encoder = feature_extraction.DictVectorizer(sparse=False)
        gene_records = data[[chain + '_vgene',
                             chain + '_jgene']].to_dict(orient='records')
        encoded_genes = onehot_encoder.fit_transform(gene_records)
        features_list.append(
            pd.DataFrame(encoded_genes,
                         columns=onehot_encoder.feature_names_))

        cdr3_column = chain + '_cdr3'

        # sequence length
        features_list.append(
            sequence_feature(cdr3_column, parser.length, 'length'))

        # number of occurrences of each amino acid
        compositions = [
            parser.amino_acid_composition(sequence)
            for sequence in data[cdr3_column]
        ]
        aa_counts = pd.DataFrame.from_records(compositions).fillna(0)
        aa_counts.columns = [
            chain + '_count_{}'.format(column) for column in aa_counts.columns
        ]
        features_list.append(aa_counts)

        # physicochemical properties: (average) basicity, (average)
        # hydrophobicity, (average) helicity, pI, (average) mutation
        # stability, followed by the peptide mass
        for name, func in scalar_features:
            features_list.append(sequence_feature(cdr3_column, func, name))

        # positional features
        # amino acid occurrence and physicochemical properties at a given
        # position from the center
        occurrence_records = []
        property_records = [[] for _ in positional_properties]
        for sequence in data[cdr3_column]:
            length = parser.length(sequence)
            half = length // 2
            if length % 2 == 1:
                # odd length: the centre residue sits at position 0
                positions = list(range(-half, half + 1))
            else:
                # even length: there is no position 0
                positions = list(range(-half, 0)) + list(range(1, half + 1))
            placed = list(zip(positions, sequence))

            occurrence_records.append({
                chain + '_pos_{}_{}'.format(pos, aa): 1
                for pos, aa in placed
            })
            for (template, value_of), records in zip(positional_properties,
                                                     property_records):
                records.append({
                    chain + template.format(pos): value_of(aa)
                    for pos, aa in placed
                })

        features_list.append(
            pd.DataFrame.from_records(occurrence_records).fillna(0))
        for records in property_records:
            features_list.append(pd.DataFrame.from_records(records).fillna(0))

    # bookkeeping columns are carried over unchanged
    features_list.append(data['weights'])
    features_list.extend(data['labels_' + tag] for tag in tags)
    features_list.append(data['split'])

    # combine all features
    return pd.concat(features_list, axis=1)