def test_create_kmers_from_string(self):
    """Check k-mer extraction for a string longer than k and for one shorter than k."""
    # A 7-character string with k=3 is expected to yield these five overlapping 3-mers.
    expected_kmers = ("ABC", "BCD", "CDE", "DEF", "EFG")
    extracted = KmerHelper.create_kmers_from_string("ABCDEFG", 3)
    self.assertTrue(all(expected in extracted for expected in expected_kmers))
    self.assertEqual(5, len(extracted))

    # A string shorter than k is expected to yield no k-mers at all.
    too_short = KmerHelper.create_kmers_from_string("AB", 3)
    self.assertTrue(len(too_short) == 0)
def compute_tcrb_relative_abundance(sequences: np.ndarray, counts: np.ndarray, k: int) -> dict:
    """
    Computes the relative abundance of k-mers in the repertoire, where C is the template count of a given
    receptor sequence and T is the total count across all receptor sequences. The relative abundance of each
    receptor sequence is computed first; the relative abundance of a k-mer is then the maximum over the
    receptor sequences in which the k-mer appears, i.e. the abundance of the most frequent receptor sequence
    containing that k-mer:

    .. math::

        T^{TCR \\beta} = \\sum_{TCR\\beta} C^{TCR\\beta}

        RA^{TCR\\beta} = \\frac{C^{TCR\\beta}}{T^{TCR\\beta}}

        RA = \\max_{\\underset{with \\, kmer}{TCR\\beta}} {RA^{TCR \\beta}}

    For more details, please see the original publication: Ostmeyer J, Christley S, Toby IT, Cowell LG.
    Biophysicochemical motifs in T cell receptor sequences distinguish repertoires from tumor-infiltrating
    lymphocytes and adjacent healthy tissue. Cancer Res. Published online January 1, 2019:canres.2292.2018.
    `doi:10.1158/0008-5472.CAN-18-2292 <https://cancerres.aacrjournals.org/content/canres/79/7/1671.full.pdf>`_

    Arguments:

        sequences: an array of (amino acid) sequences (corresponding to a repertoire)

        counts: an array of counts for each of the sequences

        k: the length of the k-mer (in the publication referenced above, k is 4)

    Returns:

        a dictionary where keys are k-mers and values are their relative abundances in the given list of
        sequences

    """
    # Per-sequence abundance: each count divided by the sum of all counts (no guard for a zero total).
    per_sequence_abundance = counts / np.sum(counts)

    kmer_abundance = {}
    for position, sequence in enumerate(sequences):
        for kmer in KmerHelper.create_kmers_from_string(sequence, k):
            candidate = per_sequence_abundance[position]
            # Keep the stored value unless this sequence is strictly more abundant.
            if kmer in kmer_abundance and not kmer_abundance[kmer] < candidate:
                continue
            kmer_abundance[kmer] = candidate

    return kmer_abundance
def compute_relative_abundance(sequences: np.ndarray, counts: np.ndarray, k: int) -> dict:
    """
    Computes the relative abundance of k-mers in the repertoire per the following equations, where C is the
    template count, T is the total count and RA is the relative abundance (the output of the function for
    each k-mer separately):

    .. math::

        C^{kmer}=\\sum_{\\underset{with kmer}{TCR \\beta}} C^{TCR \\beta}

        T^{kmer} = \\sum_{kmer} C^{kmer}

        RA = \\frac{C^{kmer}}{T^{kmer}}

    A k-mer occurring several times within one sequence contributes that sequence's count only once.
    K-mers whose accumulated count is not positive are left out of the result.

    For more details, please see the original publication: Ostmeyer J, Christley S, Toby IT, Cowell LG.
    Biophysicochemical motifs in T cell receptor sequences distinguish repertoires from tumor-infiltrating
    lymphocytes and adjacent healthy tissue. Cancer Res. Published online January 1, 2019:canres.2292.2018.
    `doi:10.1158/0008-5472.CAN-18-2292 <https://cancerres.aacrjournals.org/content/canres/79/7/1671.full.pdf>`_

    Arguments:

        sequences: an array of (amino acid) sequences (corresponding to a repertoire)

        counts: an array of counts for each of the sequences

        k: the length of the k-mer (in the publication referenced above, k is 4)

    Returns:

        a dictionary where keys are k-mers and values are their relative abundances in the given list of
        sequences

    """
    c_kmers = Counter()
    for index, sequence in enumerate(sequences):
        # dict.fromkeys removes repeated k-mers within a sequence while keeping first-occurrence order.
        unique_kmers = dict.fromkeys(KmerHelper.create_kmers_from_string(sequence, k))
        for kmer in unique_kmers:
            updated = c_kmers[kmer] + counts[index]
            # Only the k-mers touched here can have become non-positive, so checking them alone gives the
            # same result as an in-place Counter addition without rescanning the whole counter each time.
            if updated > 0:
                c_kmers[kmer] = updated
            else:
                c_kmers.pop(kmer, None)

    t_kmers = sum(c_kmers.values())

    return {kmer: c_kmer / t_kmers for kmer, c_kmer in c_kmers.items()}