def test_random_chromosome_compostion(self):
    """Test that a default-constructed chromosome has a valid sequence.

    A ``Chromosome`` created without arguments must expose ``sequence`` as a
    non-empty ``str`` consisting only of the characters ``abcd*:/?+``.
    """
    chrom = Chromosome()
    assert isinstance(chrom.sequence, str)
    # fullmatch (rather than match) so that an invalid character anywhere in
    # the sequence fails the test, not just one in the first position.
    assert re.fullmatch('[abcd*:/?+]+', chrom.sequence)
def generate(self):
    """Write a plain-text genetics report to ``reports/_<unix time>.txt``.

    The report lists the nucleotide alphabet, the configured expected
    chromosome length, nine example random sequences, and the mean and
    sample standard deviation of the lengths of 999 further random
    chromosomes. The ``reports`` directory is created if it is missing.
    """
    dna = Chromosome()
    cfg = AppSettings()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("reports", exist_ok=True)
    report_path = "reports/_" + str(int(time.time())) + ".txt"

    # The context manager guarantees the file is closed even if sampling or
    # the statistics calls below raise.
    with open(report_path, "w") as f:
        f.write("==========================\n" +
                "==========================\n" +
                "==\n"
                "== Genetics\n\n" +
                "nucleotides: " + dna.nucleotides() + "\n" +
                "expected length: " + str(cfg.genetics.chromosome_length) + "\n\n" +
                "Examples: \n")

        # range(1, 10) -> nine example sequences, one per line.
        for _ in range(1, 10):
            f.write(Chromosome().sequence + "\n")

        # range(1, 1000) -> 999 samples for the length statistics.
        lengths = [len(Chromosome().sequence) for _ in range(1, 1000)]

        f.write(f"\n\nmean_length (standard deviation): "
                f"{statistics.mean(lengths)} "
                f"({statistics.stdev(lengths)})")
def test_insertion_length(self):
    """Test that insertion mutations add the expected length on average.

    Applies ``insertion()`` to ``reps`` fresh random chromosomes and checks
    that the mean growth in sequence length lies within four standard errors
    of ``cfg.genetics.mutation_length``.

    The standard error assumes the inserted length is negative-binomial with
    n = 1 and mean ``mutation_length`` (TODO confirm against
    ``Chromosome.insertion``).
    """
    cfg = AppSettings()
    reps = 1000

    deltas = []
    for _ in range(reps):
        dna = Chromosome()
        init_length = len(dna.sequence)
        dna.insertion()
        deltas.append(len(dna.sequence) - init_length)

    expected_delta = cfg.genetics.mutation_length
    # scipy's nbinom(n, p) has mean n * (1 - p) / p, so a mean of m with
    # n = 1 requires p = 1 / (1 + m), written here as 1 - m / (1 + m).
    # Passing m / (1 + m) instead would describe a distribution with mean 1 / m.
    p = 1 - expected_delta / (1 + expected_delta)
    var = nbinom.var(1, p)

    # Four standard errors of the sample mean around the expected value.
    conf_99 = ((var / reps) ** (1 / 2)) * 4
    observed_delta = sum(deltas) / reps
    assert (expected_delta - conf_99) < observed_delta < (expected_delta + conf_99)
def test_substitutions_changes(self):
    """Test that substitutions occur at the expected rate.

    Runs ``substitutions()`` on ``reps`` chromosomes that each start as 100
    copies of ``"a"`` and compares the mean number of changed positions with
    the value predicted from ``cfg.genetics.mutation_rate``.
    """
    cfg = AppSettings()
    reps = 1000
    seq_length = 100
    template = "a" * seq_length

    deltas = []
    for _ in range(reps):
        dna = Chromosome(template)
        dna.substitutions()
        changed = sum(1 for old, new in zip(template, dna.sequence) if old != new)
        deltas.append(changed)
    observed_delta = sum(deltas) / reps

    # A substitution can pick the character that is already there, so only a
    # fraction (1 - 1/alphabet size) of them produce a visible change.
    expected_delta = cfg.genetics.mutation_rate * seq_length * \
        (1 - 1 / len(Chromosome.nucleotides()))

    # The tolerance is widened to ten standard errors to absorb the slop from
    # synonymous substitutions and repeated hits on the same position, which
    # keeps the number of spurious failures down.
    spread = poisson.var(cfg.genetics.mutation_rate * seq_length)
    conf_99 = ((spread / reps) ** 0.5) * 10

    lower = expected_delta - conf_99
    upper = expected_delta + conf_99
    assert lower < observed_delta < upper
def test_deletion_length(self):
    """Test that deletions remove the expected length on average.

    Applies ``deletion()`` to ``reps`` fresh random chromosomes and checks
    that the mean reduction in sequence length lies within ten standard
    errors of ``cfg.genetics.mutation_length``.

    The standard error assumes the deleted length is negative-binomial with
    n = 1 and mean ``mutation_length`` (TODO confirm against
    ``Chromosome.deletion``).
    """
    cfg = AppSettings()
    reps = 1000

    deltas = []
    for _ in range(reps):
        dna = Chromosome()
        init_length = len(dna.sequence)
        dna.deletion()
        deltas.append(init_length - len(dna.sequence))

    expected_delta = cfg.genetics.mutation_length
    # scipy's nbinom(n, p) has mean n * (1 - p) / p, so a mean of m with
    # n = 1 requires p = 1 / (1 + m), written here as 1 - m / (1 + m).
    # Passing m / (1 + m) instead would describe a distribution with mean 1 / m.
    p = 1 - expected_delta / (1 + expected_delta)
    var = nbinom.var(1, p)

    # Short strings and positions near the end of the string add some slop,
    # so the tolerance is widened to ten standard errors to limit the number
    # of spurious failures.
    conf_99 = ((var / reps) ** (1 / 2)) * 10
    observed_delta = sum(deltas) / reps
    assert (expected_delta - conf_99) < observed_delta < (expected_delta + conf_99)
def test_inversion_diffs(self):
    """Test that inversions change the expected number of positions.

    Applies ``inversion()`` to ``reps`` fresh random chromosomes, counts the
    positions that differ before and after, and compares the mean count with
    an expectation built from a negative binomial model of inversion length.
    """
    cfg = AppSettings()
    reps = 1000
    mean_len = cfg.genetics.mutation_length
    alphabet_size = len(Chromosome.nucleotides())

    # Observed number of differing positions per inversion.
    deltas = []
    for _ in range(reps):
        dna = Chromosome()
        before = dna.sequence
        dna.inversion()
        deltas.append(sum(1 for a, b in zip(before, dna.sequence) if a != b))

    # Expected differences. The inversion length is assumed to follow a
    # negative binomial distribution. Walk the lengths 0, 1, 2, ... until more
    # than 99.99% of the probability mass is covered, weighting the expected
    # number of differences at each length by the probability of that length.
    mismatch_prob = 1 - 1 / alphabet_size
    pmfs = []
    expected_deltas = []
    length = 0
    while sum(pmfs) <= 0.9999:
        prob = nbinom.pmf(length, 1, (1 - mean_len / (1 + mean_len)))
        pmfs.append(prob)
        # floor(length / 2) swapped pairs, two positions per pair, each
        # differing with probability mismatch_prob.
        diffs = math.floor(length / 2) * mismatch_prob * 2
        expected_deltas.append(prob * diffs)
        length += 1
    expected_delta = sum(expected_deltas)

    # The count is treated as a binomial variable (differences at a given
    # length) multiplied by a negative binomial variable (the length), and the
    # variance is combined as
    #   var(x) * var(y) + var(x) * mean(y) + mean(x) * var(y)
    # http://www.odelama.com/data-analysis/Commonly-Used-Math-Formulas/
    # NOTE(review): the textbook product-variance formula squares the means,
    # and var_nbinom uses different (n, p) from the pmf above - confirm that
    # this approximation is intended.
    var_binom = binom.var(mean_len, 1 / (alphabet_size))
    var_nbinom = nbinom.var(mean_len, mean_len / (1 + mean_len))
    var = var_binom * var_nbinom + \
        var_binom * mean_len + \
        mean_len * var_nbinom

    observed_delta = sum(deltas) / reps
    # Five standard errors of the sample mean.
    conf_99 = ((var / reps) ** 0.5) * 5
    assert expected_delta - conf_99 < observed_delta < expected_delta + conf_99
def test_random_chromosome_length(self):
    """Test that random chromosomes have the configured average length.

    Builds ``reps`` default chromosomes and checks that their mean sequence
    length is within four standard errors of
    ``cfg.genetics.chromosome_length``, using the variance of a negative
    binomial distribution with n = 1 and that mean.
    """
    reps = 1000
    cfg = AppSettings()

    lengths = [len(Chromosome().sequence) for _ in range(reps)]
    mean_length = float(sum(lengths)) / len(lengths)

    expected_length = cfg.genetics.chromosome_length
    # Success probability that gives nbinom(1, p) a mean of expected_length.
    p = 1 - (expected_length / (1 + expected_length))
    conf_99 = (nbinom.var(1, p) / reps) ** 0.5 * 4

    lower = expected_length - conf_99
    upper = expected_length + conf_99
    assert lower <= mean_length <= upper
def test_substitutions_length(self):
    """Test that substitutions leave the sequence length unchanged."""
    original = "a" * 100
    dna = Chromosome(original)
    dna.substitutions()
    mutated_length = len(dna.sequence)
    assert mutated_length == 100
def test_nucleotides(self):
    """Test that ``Chromosome.nucleotides()`` returns the expected alphabet."""
    expected_alphabet = "abcd/:*+?"
    actual_alphabet = Chromosome.nucleotides()
    assert actual_alphabet == expected_alphabet
def test_defined_sequence(self):
    """Test that a chromosome built from a given sequence stores it verbatim."""
    given = "abcd"
    chrom = Chromosome(given)
    stored = chrom.sequence
    assert stored == "abcd"