def templates_per_base(partis_file, kmer_dict, k):
    """Enumerate every candidate template base at each mutated site.

    Keyword arguments:
    partis_file -- A partis csv describing the mutations.
    kmer_dict -- A kmer dictionary describing the references.
    k -- The minimum match length for gcv tracts.

    Returns: A data frame with one row per (mutation, candidate base)
    pair. Each row holds the germline base ("gl_base"), the candidate
    base ("template_base"), the id of the mutated sequence
    ("mutated_seq_id"), the position of the mutation within that
    sequence ("mutation_index"), and the base that was actually
    observed ("true_mutation").

    """
    records = []
    for _, mutation in process_partis(partis_file).iterrows():
        for candidate in ("A", "C", "G", "T"):
            # Work on a copy so the substituted sequence never leaks
            # back into the row being iterated over.
            trial = mutation.copy()
            trial["mutated_seq"] = make_mutated_sequence(
                trial["naive_seq"], trial["mutation_index"], candidate)
            # NOTE(review): the alignment count is computed here but is
            # not stored in the output row -- confirm whether it was
            # meant to become a column of the returned data frame.
            n_alignments_per_mutation(trial, kmer_dict, k)
            records.append({
                "gl_base": trial["gl_base"],
                "template_base": candidate,
                "mutated_seq_id": trial["mutated_seq_id"],
                "mutation_index": trial["mutation_index"],
                "true_mutation": trial["mutated_base"]
            })
    return pd.DataFrame(records)
def per_base_alignments(partis_file, kmer_dict, k, max_mutation_rate,
                        use_indel_seqs):
    """Counts the alignments for each possible base at each mutated site.

    Keyword arguments:
    partis_file -- A partis csv describing the mutations.
    kmer_dict -- A kmer dictionary describing the references.
    k -- The minimum match length for gcv tracts.
    max_mutation_rate -- Forwarded to process_partis.
    use_indel_seqs -- Forwarded to process_partis.

    Returns: A data frame with one row per mutation produced by
    process_partis. Each row keeps the original columns and gains the
    columns "A", "C", "G" and "T", holding the "n_alignments" value
    reported by n_alignments_per_mutation when that base is substituted
    into the naive sequence at the mutation index.

    """
    mutations = process_partis(partis_file,
                               max_mutation_rate=max_mutation_rate,
                               use_indel_seqs=use_indel_seqs)
    annotated = []
    for _, mutation in mutations.iterrows():
        counts_row = mutation.copy()
        for base in ("A", "C", "G", "T"):
            # Start from the untouched mutation each time so counts
            # already written to counts_row are not passed along.
            candidate = mutation.copy()
            chars = list(candidate["naive_seq"])
            chars[candidate["mutation_index"]] = base
            candidate["mutated_seq"] = "".join(chars)
            candidate["mutated_base"] = base
            # n_alignments_per_mutation is given a one-row data frame;
            # .item() pulls the single count back out.
            single = pd.DataFrame([candidate])
            result = n_alignments_per_mutation(single, kmer_dict, k)
            counts_row[base] = result["n_alignments"].item()
        annotated.append(counts_row)
    return pd.DataFrame(annotated)
def likelihood_given_gcv(partis_file, kmer_dict, k, max_mutation_rate,
                         use_indel_seqs):
    """Finds the likelihood of mutations conditional on being due to gcv

    Keyword arguments:
    partis_file -- A partis csv describing the mutations.
    kmer_dict -- A kmer dictionary describing the references.
    k -- The minimum match length for gcv tracts.
    max_mutation_rate -- Forwarded to process_partis.
    use_indel_seqs -- Forwarded to process_partis.

    Returns: The outer merge (on "query_mutation_index" and
    "query_name") of the alignment counts for the observed and the
    unobserved mutations, with an added "prob" column equal to
    n_obs / (n_obs + n_unobs), or NaN when both counts are zero.

    """
    observed = process_partis(partis_file,
                              max_mutation_rate=max_mutation_rate,
                              use_indel_seqs=use_indel_seqs)

    # For every observed mutation, build the alternatives that were not
    # seen: any base that is neither the germline nor the observed one.
    unobserved_rows = []
    for _, mutation in observed.iterrows():
        excluded = {mutation["gl_base"], mutation["mutated_base"]}
        for base in ("A", "C", "G", "T"):
            if base in excluded:
                continue
            alternative = mutation.copy()
            chars = list(alternative["mutated_seq"])
            chars[alternative["mutation_index"]] = base
            alternative["mutated_seq"] = "".join(chars)
            alternative["mutated_base"] = base
            unobserved_rows.append(alternative)
    unobserved = pd.DataFrame(unobserved_rows)

    # Count alignments for both sets. The observed counts go first in
    # the merge, so they end up in "n_alignments_x" and the unobserved
    # counts in "n_alignments_y".
    obs_counts = n_alignments_per_mutation(observed, kmer_dict, k)
    unobs_counts = n_alignments_per_mutation(unobserved, kmer_dict, k)
    merged = pd.merge(obs_counts, unobs_counts,
                      how="outer",
                      on=["query_mutation_index", "query_name"],
                      validate="one_to_one")

    def _prob_observed(entry):
        # Share of all alignments that explain the observed mutation.
        seen = entry["n_alignments_x"]
        total = seen + entry["n_alignments_y"]
        if total == 0:
            return np.nan
        return seen / (total + 0.)

    merged["prob"] = merged.apply(_prob_observed, axis=1)
    return merged
def test_process_partis(self):
    # Parse the partis test fixture into a mutation data frame.
    mut_df = process_partis("test/partis_test.csv")

    # The fixture holds exactly one mutation: position 7, germline
    # base G, mutated base A.
    self.assertEqual(mut_df.shape[0], 1)
    expected_first_row = (
        ("mutated_seq", "AAAAAAAA"),
        ("naive_seq", "AAAAAAAG"),
        ("mutated_seq_id", "s1"),
        ("mutation_index", 7),
        ("gl_base", "G"),
        ("mutated_base", "A"),
    )
    for column, value in expected_first_row:
        self.assertEqual(mut_df[column][0], value)
def motif_finder(partis_file, reference_fasta, k, kmer_dict=None,
                 reverse_complement=False, max_mutation_rate=1,
                 use_indel_seqs=True, return_dict=False,
                 unique_mutations=False):
    """Matches mutations to potential gene conversion donors.

    Keyword arguments:
    partis_file -- A file containing partis output describing the
    germline and mature sequences.
    reference_fasta -- A fasta file containing the sequences in the
    donor gene set. Only read when kmer_dict is None.
    k -- kmer size.
    kmer_dict -- An already-built kmer dictionary for the references.
    If None, one is built from reference_fasta.
    reverse_complement -- If True, also look for gene conversion
    templates on the reverse complement. Only used when the kmer
    dictionary is built here.
    max_mutation_rate -- Remove any sequences from the partis file
    that have a mutation rate above max_mutation_rate.
    use_indel_seqs -- If False, remove any sequences with indels from
    partis_file.
    return_dict -- If True, also return the kmer dictionary that was
    used, so it can be passed back in on later calls.
    unique_mutations -- Forwarded to get_n_mutations as `unique`.

    Returns: A tuple (imf_out, fraction), or (imf_out, fraction,
    kmer_dict) when return_dict is True. imf_out is the output of
    indexed_motif_finder and fraction is the number of templated
    mutations divided by the number of mutations. fraction is NaN
    when there are no mutations at all.

    """
    mutations = process_partis(partis_file,
                               max_mutation_rate=max_mutation_rate,
                               use_indel_seqs=use_indel_seqs)
    n_mutations = get_n_mutations(mutations, unique=unique_mutations)
    if kmer_dict is None:
        kmer_dict = make_kmer_dict_from_fasta(
            reference_fasta, k, reverse_complement=reverse_complement)
    imf_out = indexed_motif_finder(mutations, kmer_dict, k,
                                   all_matches=False)
    # Only the hit count is needed; the denominator comes from
    # get_n_mutations so that unique_mutations is honoured.
    hits, _ = templated_number(imf_out, dale_method=False)
    if n_mutations == 0:
        # Every sequence may have been filtered out; a fraction of
        # nothing is undefined, so report NaN instead of raising
        # ZeroDivisionError.
        fraction = float("nan")
    else:
        # Force float division so the fraction is never truncated by
        # integer division.
        fraction = hits / float(n_mutations)
    if return_dict:
        return (imf_out, fraction, kmer_dict)
    return (imf_out, fraction)
def test_unique_mutations(self):
    # The fixture is expected to give 5 mutations when counted with
    # unique=False and 3 when counted with unique=True.
    mut_df = process_partis("test/dale_test_partis_1.csv")
    total_count = get_n_mutations(mut_df, unique=False)
    unique_count = get_n_mutations(mut_df, unique=True)
    self.assertEqual(total_count, 5)
    self.assertEqual(unique_count, 3)