Exemplo n.º 1
0
 def trinucleotid_runs(data):
     """
     Determines the dropchance caused by trinucleotide microsatellites (repeats with a unit length of 3).
     Microsatellites are repeats of two to six units that have a higher chance to mutate than regular polymers.
     One rule is created for each of the thresholds 10, 15, ..., 100; the rule for the i-th threshold carries
     a chance of i / 1000 (0.001 up to 0.019). The dropchances are additive in this case (@shouldDrop()).
     :param data: The DNA sequence to check for microsatellites with length 3.
     :return: The dropchance based on the occurrence of microsatellites with length 3.
     """
     rules = []
     for step in range(1, 20):
         # step 1 -> threshold 10, step 19 -> threshold 100.
         threshold = 5 * step + 5
         rule = "microsatelliteLongerThanX(3," + str(threshold) + ")"
         # Integer true division is correctly rounded, so this yields exactly 0.001, 0.002, ..., 0.019.
         rules.append((rule, step / 1000))
     return shouldDrop(data, rules)
Exemplo n.º 2
0
 def ch_at_permutation(data, ch):
     """
     Determines the dropchance based on how often a single nucleotide occurs in the sequence. This is meant
     for A and T, since their chance to mutate is generally lower.
     One rule with a chance of 0.001 is created for each of the thresholds 20, 40, ..., 200.
     These errors are additive.
     :param data: The sequence to check for the number of occurrences of a nucleotide.
     :param ch: The nucleotide to check for.
     :return: The dropchance based on the number of occurrences of the given nucleotide.
     """
     # Plain concatenation (instead of formatting) keeps the TypeError for a non-string ch.
     rule_prefix = "charCountBiggerEqualThanX(" + ch
     rules = [
         (rule_prefix + "," + str(limit) + ")", 0.001)
         for limit in range(20, 201, 20)
     ]
     return shouldDrop(data, rules)
Exemplo n.º 3
0
 def random_permutations(data):
     """
     Determines the dropchance for simulated random mutations in the DNA data by passing a single rule
     with the pattern "*" and a chance of 0.02 to shouldDrop().
     :param data: The sequence to simulate random mutations for.
     :return: The dropchance based on random mutations.
     """
     mutation_rules = [("*", 0.02)]
     return shouldDrop(data, mutation_rules)
Exemplo n.º 4
0
 def illegal_symbols(data):
     """
     Checks whether the DNA data contains illegal symbols, using the rule "strContainsIllegalChars(ACGT)"
     with a dropchance of 1.0.
     :param data: The sequence to check for illegal symbols.
     :return: The dropchance based on the occurrence of illegal symbols (0.0 or 1.0).
     """
     illegal_char_rule = ("strContainsIllegalChars(ACGT)", 1.0)
     return shouldDrop(data, [illegal_char_rule])
Exemplo n.º 5
0
    def motif_regex_search(data):
        """
        Builds one "strContainsSubRegex(<motif>)" rule per motif pattern listed below and passes the rules,
        together with their chances, to shouldDrop().
        The patterns contain letters besides A, C, G and T (e.g. N, Y, K, R, W); presumably these are
        ambiguity codes that strContainsSubRegex expands - confirm against its implementation.
        :param data: The sequence to check for the motif patterns.
        :return: The result of shouldDrop() for the sequence and the motif rules.
        """
        motif_groups = [
            # Promoter recognition motif (Euk).
            (0.01, ("CANYYY", "ANCCAATCA", "KGGGCGGRRY", "KRGGCGKRRY")),
            # Promoter recognition motifs (Prok).
            (0.05, ("AAAWWTWTTTTNNNAAA",)),
            # Ribosomal binding site (Euk).
            (0.05, ("RCCACCATGG",)),
            # Ribosomal binding site (Prok).
            # NOTE(review): this pattern contains a "U" - confirm that this is intended for DNA data.
            (0.05, ("AGGAGGACAGCTAUG",)),
            # Lox sites.
            (0.01, ("ATAACTTCGTATAGTAYACATTATACGAAGTTAT",)),
        ]
        # Expand the groups into (rule, chance) tuples, keeping the order of the table.
        rules = []
        for chance, motifs in motif_groups:
            for motif in motifs:
                rules.append(("strContainsSubRegex(" + motif + ")", chance))
        return shouldDrop(data, rules)
Exemplo n.º 6
0
    def motif_search(data):
        """
        Builds one "strContainsSub(<motif>)" rule per fixed motif listed below and passes the rules,
        together with their chances, to shouldDrop().
        NOTE(review): several chances are 1.01, i.e. larger than 1.0 - presumably to force a drop;
        confirm against shouldDrop().
        :param data: The sequence to check for the motifs.
        :return: The result of shouldDrop() for the sequence and the motif rules.
        """
        motif_groups = [
            # Promoter recognition motif (Euk).
            (0.01, ("TATAAA",)),
            # Promoter recognition motifs (Prok).
            (0.05, ("TTGACA", "TGTATAATG")),
            # Polyadenylation signals (Euk).
            (0.01, ("AATAAA", "TTGTGTGTTG")),
            # Lox sites.
            (1.01, (
                "ATAACTTCGTATAGCATACATTATACGAAGTTAT",
                "ATAACTTCGTATAGCATACATTATACGAACGGTA",
                "TACCGTTCGTATAGCATACATTATACGAAGTTAT",
                "TACCGTTCGTATAGCATACATTATACGAACGGTA",
                "TACCGTTCGTATATGGTATTATATACGAAGTTAT",
                "TACCGTTCGTATATTCTATCTTATACGAAGTTAT",
                "TACCGTTCGTATAGGATACTTTATACGAAGTTAT",
                "TACCGTTCGTATATACTATACTATACGAAGTTAT",
                "TACCGTTCGTATACTATAGCCTATACGAAGTTAT",
                "ATAACTTCGTATATGGTATTATATACGAACGGTA",
                "ATAACTTCGTATAGTATACCTTATACGAAGTTAT",
            )),
            # Lox site spacers not covered by the Lox sites.
            (1.01, (
                "AGGTATGC",
                "TTGTATGG",
                "GGATAGTA",
                "GTGTATTT",
                "GGTTACGG",
                "TTTTAGGT",
                "GTACACAT",
            )),
            # Restriction enzyme recognition motifs.
            # BpiI
            (1.01, ("GAAGAC",)),
            # inverse BpiI
            (1.01, ("CTTCTG",)),
            # BsaI
            (1.01, ("GGTCTC",)),
            # inverse BsaI
            (1.01, ("CCAGAG",)),
            (0.01, ("CGTCTC", "GCGATG", "GCTCTTC")),
            # Oligo Adapters.
            (0.01, ("CTCGTAGACTGCGTACCA", "GACGATGAGTCCTGAGTA")),
            # 5' extensions.
            (0.01, (
                "GGTTCCACGTAAGCTTCC",
                "GCGATTACCCTGTACACC",
                "GCCAGTACATCAATTGCC",
            )),
            # Twister Adapters:
            # Twister 5' Adapter
            (1.0, ("GAAGTGCCATTCCGCCTGACCT",)),
            # Twister 3' Adapter
            (1.0, ("AGGCTAGGTGGAGGCTCAGTG",)),
        ]
        # Expand the groups into (rule, chance) tuples, keeping the order of the table.
        rules = []
        for chance, motifs in motif_groups:
            for motif in motifs:
                rules.append(("strContainsSub(" + motif + ")", chance))
        return shouldDrop(data, rules)