示例#1
0
def templates_per_base(partis_file, kmer_dict, k):
    """Enumerate every candidate base at each observed mutation site.

    Keyword arguments:
    partis_file -- A partis csv describing the mutations.
    kmer_dict -- A kmer dictionary describing the references.
    k -- The minimum match length for gcv tracts.

    Returns: A data frame with one row per (observed mutation,
    candidate base) pair. The columns are the germline base
    (gl_base), the candidate base (template_base), the id of the
    mutated sequence (mutated_seq_id), the position of the mutation
    within that sequence (mutation_index), and the base that was
    actually observed (true_mutation).

    NOTE(review): n_alignments_per_mutation is evaluated for every
    candidate base, but its result is never stored in the returned
    frame -- confirm whether a template-count column is missing.

    """
    records = []
    for _, mutation in process_partis(partis_file).iterrows():
        for candidate in ("A", "C", "G", "T"):
            # Work on a copy so the substituted sequence does not leak
            # into the next candidate base.
            variant = mutation.copy()
            variant["mutated_seq"] = make_mutated_sequence(
                variant["naive_seq"], variant["mutation_index"], candidate)
            # The alignment count is computed here but currently discarded.
            n_alignments_per_mutation(variant, kmer_dict, k)
            records.append({
                "gl_base": variant["gl_base"],
                "template_base": candidate,
                "mutated_seq_id": variant["mutated_seq_id"],
                "mutation_index": variant["mutation_index"],
                "true_mutation": variant["mutated_base"]
            })
    return pd.DataFrame(records)
示例#2
0
def per_base_alignments(partis_file, kmer_dict, k, max_mutation_rate,
                        use_indel_seqs):
    """Counts the alignments for each potential base at each mutated site.

    Keyword arguments:
    partis_file -- A partis csv describing the mutations.
    kmer_dict -- A kmer dictionary describing the references.
    k -- The minimum match length for gcv tracts.
    max_mutation_rate -- Forwarded to process_partis.
    use_indel_seqs -- Forwarded to process_partis.

    Returns: A data frame with one row per mutation returned by
    process_partis. Each row keeps the original columns and gains
    four more, named "A", "C", "G" and "T", holding the n_alignments
    value reported by n_alignments_per_mutation when that base is
    substituted into the naive sequence at the mutation index.

    """
    mut_df = process_partis(partis_file,
                            max_mutation_rate=max_mutation_rate,
                            use_indel_seqs=use_indel_seqs)
    annotated = []
    for _, mutation in mut_df.iterrows():
        counts = mutation.copy()
        for base in ("A", "C", "G", "T"):
            # Build a hypothetical mutation: the naive sequence with
            # `base` placed at the mutated position.
            candidate = mutation.copy()
            chars = list(candidate["naive_seq"])
            chars[candidate["mutation_index"]] = base
            candidate["mutated_seq"] = "".join(chars)
            candidate["mutated_base"] = base
            # n_alignments_per_mutation is given a one-row data frame,
            # so exactly one n_alignments value is expected back.
            result = n_alignments_per_mutation(pd.DataFrame([candidate]),
                                               kmer_dict, k)
            counts[base] = result["n_alignments"].item()
        annotated.append(counts)

    return pd.DataFrame(annotated)
示例#3
0
def likelihood_given_gcv(partis_file, kmer_dict, k, max_mutation_rate,
                         use_indel_seqs):
    """Finds the likelihood of mutations conditional on being due to gcv

    Keyword arguments:
    partis_file -- A partis csv describing the mutations.
    kmer_dict -- A kmer dictionary describing the references.
    k -- The minimum match length for gcv tracts.
    max_mutation_rate -- Forwarded to process_partis.
    use_indel_seqs -- Forwarded to process_partis.

    Returns: The outer merge, on query_mutation_index and query_name,
    of the alignment counts for the observed mutations
    (n_alignments_x) and for the unobserved alternatives
    (n_alignments_y), plus a "prob" column equal to
    n_alignments_x / (n_alignments_x + n_alignments_y). "prob" is NaN
    when that denominator is zero.

    """
    observed = process_partis(partis_file,
                              max_mutation_rate=max_mutation_rate,
                              use_indel_seqs=use_indel_seqs)

    # Build a data frame containing all the mutations we didn't see:
    # for every observed mutation, substitute each base that is
    # neither the germline base nor the observed one.
    alternatives = []
    for _, mutation in observed.iterrows():
        excluded = {mutation["gl_base"], mutation["mutated_base"]}
        for base in ("A", "C", "G", "T"):
            if base in excluded:
                continue
            alt = mutation.copy()
            chars = list(alt["mutated_seq"])
            chars[alt["mutation_index"]] = base
            alt["mutated_seq"] = "".join(chars)
            alt["mutated_base"] = base
            alternatives.append(alt)
    unobserved = pd.DataFrame(alternatives)

    # Run the motif finder on both sets. The observed counts go first
    # in the merge, so they receive the "_x" suffix and the
    # unobserved counts receive "_y".
    merged = pd.merge(n_alignments_per_mutation(observed, kmer_dict, k),
                      n_alignments_per_mutation(unobserved, kmer_dict, k),
                      how="outer",
                      on=["query_mutation_index", "query_name"],
                      validate="one_to_one")

    def observed_fraction(counts):
        # Share of all alignments at this site that explain the
        # observed mutation; undefined when nothing aligns.
        n_obs = counts["n_alignments_x"]
        total = n_obs + counts["n_alignments_y"]
        if total == 0:
            return np.nan
        return n_obs / (total + 0.)

    merged["prob"] = merged.apply(observed_fraction, axis=1)
    return merged
示例#4
0
 def test_process_partis(self):
     """process_partis extracts the single mutation in the fixture."""
     mut_df = process_partis("test/partis_test.csv")
     # The fixture holds exactly one mutation: position 7, germline
     # base G, mutated base A.
     self.assertEqual(mut_df.shape[0], 1)
     expected_first_row = [
         ("mutated_seq", "AAAAAAAA"),
         ("naive_seq", "AAAAAAAG"),
         ("mutated_seq_id", "s1"),
         ("mutation_index", 7),
         ("gl_base", "G"),
         ("mutated_base", "A"),
     ]
     for column, value in expected_first_row:
         self.assertEqual(mut_df[column][0], value)
示例#5
0
def motif_finder(partis_file,
                 reference_fasta,
                 k,
                 kmer_dict=None,
                 reverse_complement=False,
                 max_mutation_rate=1,
                 use_indel_seqs=True,
                 return_dict=False,
                 unique_mutations=False):
    """Matches mutations to potential gene conversion donors.

    Keyword arguments:
    partis_file -- A file containing partis output describing the
    germline and mature sequences.
    reference_fasta -- A fasta file containing the sequences in the
    donor gene set. Only read when kmer_dict is None.
    k -- kmer size.
    kmer_dict -- A prebuilt kmer dictionary. If None, one is built
    from reference_fasta with make_kmer_dict_from_fasta.
    reverse_complement -- If True, also look for gene conversion
    templates on the reverse complement.
    max_mutation_rate -- Remove any sequences from the partis file
    that have a mutation rate above max_mutation_rate.
    use_indel_seqs -- If False, remove any sequences with indels from
    partis_file.
    return_dict -- If True, the kmer dictionary is appended to the
    returned tuple.
    unique_mutations -- Passed to get_n_mutations as its `unique`
    argument when counting the mutations.

    Returns: A tuple (matches, fraction), where matches is the output
    of indexed_motif_finder and fraction is the hit count reported by
    templated_number divided by the mutation count from
    get_n_mutations. When return_dict is True the tuple is
    (matches, fraction, kmer_dict).

    """
    mut_df = process_partis(partis_file,
                            max_mutation_rate=max_mutation_rate,
                            use_indel_seqs=use_indel_seqs)
    mutation_count = get_n_mutations(mut_df, unique=unique_mutations)

    # Only build the reference index when the caller did not supply one.
    if kmer_dict is None:
        kmer_dict = make_kmer_dict_from_fasta(
            reference_fasta, k, reverse_complement=reverse_complement)

    matches = indexed_motif_finder(mut_df, kmer_dict, k, all_matches=False)
    # templated_number also returns a second value, which is not used here.
    hit_count, _ = templated_number(matches, dale_method=False)
    fraction = hit_count / mutation_count

    if return_dict:
        return (matches, fraction, kmer_dict)
    return (matches, fraction)
示例#6
0
 def test_unique_mutations(self):
     """get_n_mutations reports 5 mutations in total, 3 of them unique."""
     mut_df = process_partis("test/dale_test_partis_1.csv")
     # Count with unique=False first, then unique=True, before asserting.
     counts = {
         flag: get_n_mutations(mut_df, unique=flag)
         for flag in (False, True)
     }
     self.assertEqual(counts[False], 5)
     self.assertEqual(counts[True], 3)