예제 #1
0
 def test_multiple_refs(self):
     # Build k-mer dictionaries from two reference records at once and
     # verify that each k-mer maps to (name, sequence, position) entries
     # drawn from the correct reference.
     ref_a = SeqRecord("ATA", name="r1")
     ref_b = SeqRecord("CTA", name="r2")
     trimers = make_kmer_dictionary([ref_a, ref_b], 3)
     dimers = make_kmer_dictionary([ref_a, ref_b], 2)

     hit_a0 = (ref_a.name, ref_a.seq, 0)
     hit_a1 = (ref_a.name, ref_a.seq, 1)
     hit_b0 = (ref_b.name, ref_b.seq, 0)
     hit_b1 = (ref_b.name, ref_b.seq, 1)

     # 3-mers: each reference is exactly one 3-mer long, so ATA sits at
     # position 0 of r1 and CTA at position 0 of r2.
     self.assertEqual(len(trimers), 2)
     self.assertEqual(trimers["ATA"], {hit_a0})
     self.assertEqual(trimers["CTA"], {hit_b0})

     # 2-mers: AT starts r1, CT starts r2, and TA ends both references
     # (position 1 in each).
     self.assertEqual(len(dimers), 3)
     self.assertEqual(dimers["AT"], {hit_a0})
     self.assertEqual(dimers["TA"], {hit_a1, hit_b1})
     self.assertEqual(dimers["CT"], {hit_b0})
예제 #2
0
 def test_rc(self):
     # With reverse_complement=True the dictionary should index both the
     # record itself and its reverse complement (stored under "r1_rc").
     record = SeqRecord(Seq("ATC", IUPAC.unambiguous_dna), name="r1")
     dimers = make_kmer_dictionary([record], 2, reverse_complement=True)
     trimers = make_kmer_dictionary([record], 3, reverse_complement=True)

     forward = str(record.seq)
     backward = str(record.reverse_complement().seq)

     # Expected 2-mers: AT, TC and GA.
     self.assertEqual(len(dimers), 3)
     # AT starts the forward strand and sits at position 1 of the
     # reverse complement.
     self.assertEqual(dimers["AT"],
                      {(record.name, forward, 0), ("r1_rc", backward, 1)})
     # TC only occurs on the forward strand, at position 1.
     self.assertEqual(dimers["TC"], {(record.name, forward, 1)})
     # GA only occurs on the reverse complement, at position 0.
     self.assertEqual(dimers["GA"], {("r1_rc", backward, 0)})

     # Expected 3-mers: ATC (forward) and GAT (reverse complement).
     self.assertEqual(len(trimers), 2)
     self.assertEqual(trimers["ATC"], {(record.name, forward, 0)})
     self.assertEqual(trimers["GAT"], {("r1_rc", backward, 0)})
예제 #3
0
 def test_single_ref(self):
     # Build k-mer dictionaries with k = 2, 3, 4 for the single reference
     # sequence ATA and check both the keys and the recorded
     # (name, sequence, position) entries.
     r1 = SeqRecord(Seq("ATA"), name="r1")
     d2 = make_kmer_dictionary([r1], 2)
     d3 = make_kmer_dictionary([r1], 3)
     d4 = make_kmer_dictionary([r1], 4)
     # the 2-mer dictionary should have AT and TA
     self.assertEqual(len(d2), 2)
     # assertIn reports the missing key and the container on failure,
     # unlike comparing a membership boolean against True
     self.assertIn("AT", d2)
     self.assertIn("TA", d2)
     # the AT 2-mer was in the r1 sequence at position 0
     self.assertEqual(d2["AT"], {(r1.name, str(r1.seq), 0)})
     # the TA 2-mer was in the r1 sequence at position 1
     self.assertEqual(d2["TA"], {(r1.name, str(r1.seq), 1)})
     # the 3-mer dictionary should contain just ATA
     self.assertEqual(len(d3), 1)
     self.assertIn("ATA", d3)
     # the ATA 3-mer occurred in r1 at position 0
     self.assertEqual(d3["ATA"], {(r1.name, str(r1.seq), 0)})
     # there are no 4-mers in a sequence of size 3
     self.assertEqual(d4, {})
예제 #4
0
 def test_likelihood(self):
     # Run likelihood_given_gcv on the bundled fixtures with 3-mers and
     # check the "prob" value reported for queries s1, s2 and s3.
     fixture_csv = "test/likelihood_test_partis.csv"
     fixture_fasta = "test/likelihood_test_reference.fasta"
     kmers = make_kmer_dictionary(list(SeqIO.parse(fixture_fasta, "fasta")), 3)
     # NOTE(review): the trailing positional arguments (1, True) look like
     # max_mutation_rate and use_indel_seqs -- confirm against the signature.
     probs = likelihood_given_gcv(fixture_csv, kmers, 3, 1, True)
     prob_of = {
         query: probs.loc[probs.query_name == query, "prob"]
         for query in ("s1", "s2", "s3")
     }
     # Expected: s1 -> 1/2, s2 -> 0, s3 -> nan
     self.assertEqual(prob_of["s1"].item(), .5)
     self.assertEqual(prob_of["s2"].item(), 0)
     self.assertEqual(np.isnan(prob_of["s3"].item()), True)
예제 #5
0
 def test_two_alignments_same_reference(self):
     # One G -> A mutation at index 7; the 3-mer references are r1 (TTTT)
     # and r2 (TAAAGAAA), where r2 contains AAA twice.
     mutation = {
         "mutated_seq": "AAAAAAAA",
         "naive_seq": "AAAAAAAG",
         "mutated_seq_id": "s1",
         "mutation_index": 7,
         "gl_base": "G",
         "mutated_base": "A"
     }
     mut_df = pd.DataFrame([mutation])
     references = [
         SeqRecord("TTTT", name="r1"),
         SeqRecord("TAAAGAAA", name="r2"),
     ]
     kmer_dict = make_kmer_dictionary(references, k=3)
     nalign = n_alignments_per_mutation(mut_df, kmer_dict, k=3)
     # both templates for the mutation come from r2, so s1 should
     # report two alignments
     s1_count = nalign.loc[nalign.query_name == "s1", "n_alignments"].item()
     self.assertEqual(s1_count, 2)
예제 #6
0
 def test_one_alignment(self):
     # One G -> A mutation at index 7; the 3-mer references are r1 (TTTT)
     # and r2 (TAAA).
     mutation = {
         "mutated_seq": "AAAAAAAA",
         "naive_seq": "AAAAAAAG",
         "mutated_seq_id": "s1",
         "mutation_index": 7,
         "gl_base": "G",
         "mutated_base": "A"
     }
     mut_df = pd.DataFrame([mutation])
     references = [
         SeqRecord("TTTT", name="r1"),
         SeqRecord("TAAA", name="r2"),
     ]
     kmer_dict = make_kmer_dictionary(references, k=3)
     nalign = n_alignments_per_mutation(mut_df, kmer_dict, k=3)
     # the only AAA in the references is the one in r2, so there is
     # exactly one template for the mutation
     s1_count = nalign.loc[nalign.query_name == "s1", "n_alignments"].item()
     self.assertEqual(s1_count, 1)
예제 #7
0
 def test_no_alignments(self):
     # One G -> A mutation at index 7; the 3-mer references are r1 (TTTT)
     # and r2 (TCTC), neither of which contains AAA.
     mutation = {
         "mutated_seq": "AAAAAAAA",
         "naive_seq": "AAAAAAAG",
         "mutated_seq_id": "s1",
         "mutation_index": 7,
         "gl_base": "G",
         "mutated_base": "A"
     }
     mut_df = pd.DataFrame([mutation])
     references = [
         SeqRecord("TTTT", name="r1"),
         SeqRecord("TCTC", name="r2"),
     ]
     kmer_dict = make_kmer_dictionary(references, k=3)
     nalign = n_alignments_per_mutation(mut_df, kmer_dict, k=3)
     # with no AAA anywhere in r1 or r2, s1 should report zero
     # alignments
     s1_count = nalign.loc[nalign.query_name == "s1", "n_alignments"].item()
     self.assertEqual(s1_count, 0)
예제 #8
0
 def test_base_alignment(self):
     # Run per_base_alignments on the bundled fixtures with 3-mers and
     # compare the A/C/T/G columns of rows 0-2 against an expected table.
     fixture_csv = "test/likelihood_test_partis.csv"
     fixture_fasta = "test/likelihood_test_reference.fasta"
     kmers = make_kmer_dictionary(list(SeqIO.parse(fixture_fasta, "fasta")), 3)
     perbase = per_base_alignments(fixture_csv, kmers, 3, 1, True)
     # (row label, ((column, expected value), ...)), checked in this order
     expected_rows = [
         (0, (("A", 0), ("C", 0), ("T", 2), ("G", 2))),
         (1, (("A", 0), ("C", 0), ("T", 2), ("G", 2))),
         (2, (("A", 0), ("C", 0), ("T", 0), ("G", 0))),
     ]
     for row, expected_cells in expected_rows:
         for base, expected_value in expected_cells:
             self.assertEqual(perbase.loc[row, base].item(), expected_value)
예제 #9
0
 def test_imf(self):
     # One G -> A mutation at index 7, searched against the 2-mers of
     # r1 (ATA) and r2 (CAA).
     mutation = {
         "mutated_seq": "AAAAAAAA",
         "naive_seq": "AAAAAAAG",
         "mutated_seq_id": "s1",
         "mutation_index": 7,
         "gl_base": "G",
         "mutated_base": "A"
     }
     mut_df = pd.DataFrame([mutation])
     references = [
         SeqRecord("ATA", name="r1"),
         SeqRecord("CAA", name="r2"),
     ]
     kmer_dict = make_kmer_dictionary(references, k=2)
     imf = indexed_motif_finder(mut_df, kmer_dict, k=2)
     # the mutation table has naive sequence AAAAAAAG and mutated
     # sequence AAAAAAAA, so we should get a single hit, to r2
     self.assertEqual(imf.shape[0], 1)
     expected_hit = [
         ("reference_name", "r2"),
         ("reference_alignment", 2),
         ("query_name", "s1"),
         ("query_mutation_index", 7),
     ]
     for column, expected_value in expected_hit:
         self.assertEqual(imf[column][0], expected_value)
                    dest='max_mutation_rate',
                    type=float,
                    default=.1)
def _parse_bool(text):
    """Convert a command-line string such as "true"/"False"/"0" to a bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False") becomes True.  Raising ValueError
    makes argparse report the value as invalid.
    """
    normalized = text.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False
    raise ValueError("expected a boolean value, got %r" % (text,))


parser.add_argument('--use-indel-seqs',
                    dest='use_indel_seqs',
                    type=_parse_bool,
                    default=True)
parser.add_argument('--rc', dest='rc', type=_parse_bool, default=False)
args = parser.parse_args()

# Every entry of the input directory is treated as a partis file.
partis_files = os.listdir(args.input_directory)
refs = list(SeqIO.parse(args.references, "fasta"))
# Reference label = FASTA file name without directory or extension.
reference_name = os.path.splitext(os.path.basename(args.references))[0]
k_list = range(args.kmin, args.kmax + 1)
# Build one k-mer dictionary per k up front so it is shared by all files.
kmer_dicts = [
    make_kmer_dictionary(refs, k, reverse_complement=args.rc) for k in k_list
]
df_list = []
for f in partis_files:
    for (k, kmer_dict) in zip(k_list, kmer_dicts):
        out = likelihood_given_gcv(os.path.join(args.input_directory, f),
                                   kmer_dict,
                                   k,
                                   max_mutation_rate=args.max_mutation_rate,
                                   use_indel_seqs=args.use_indel_seqs)
        # Tag each result with the k, input file and reference it came from.
        out["k"] = k
        out["source"] = f
        out["reference"] = reference_name
        df_list.append(out)
    # Parenthesised single-argument form prints the same text on
    # Python 2 and is valid syntax on Python 3.
    print("finished " + f)
예제 #11
0
def _parse_bool(text):
    """Convert a command-line string such as "true"/"False"/"0" to a bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False") becomes True.  Raising ValueError
    makes argparse report the value as invalid.
    """
    normalized = text.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False
    raise ValueError("expected a boolean value, got %r" % (text,))


parser.add_argument('--kmax', dest='kmax', type=int, default=14)
parser.add_argument('--max-mutation-rate',
                    dest='max_mutation_rate',
                    type=float,
                    default=.1)
parser.add_argument('--use-indel-seqs',
                    dest='use_indel_seqs',
                    type=_parse_bool,
                    default=True)
args = parser.parse_args()

# Every entry of the input directory is treated as a partis file.
partis_files = os.listdir(args.input_directory)
# Parse the references and build the k-mer dictionaries exactly once;
# the original repeated these three assignments, doing the work twice.
refs = list(SeqIO.parse(args.references, "fasta"))
# Reference label = FASTA file name without directory or extension.
reference_name = os.path.splitext(os.path.basename(args.references))[0]
k_list = range(args.kmin, args.kmax + 1)
kmer_dicts = [make_kmer_dictionary(refs, k) for k in k_list]
df_list = []
for f in partis_files:
    for (k, kmer_dict) in zip(k_list, kmer_dicts):
        out = per_base_alignments(os.path.join(args.input_directory, f),
                                  kmer_dict,
                                  k,
                                  max_mutation_rate=args.max_mutation_rate,
                                  use_indel_seqs=args.use_indel_seqs)
        # Tag each result with the k, input file and reference it came from.
        out["k"] = k
        out["source"] = f
        out["reference"] = reference_name
        df_list.append(out)