def setUp(self):
    """Align a single query against an explicit reference sequence.

    The reference is 9 ``GA``, 6 ``GC`` and 15 ``GT`` dinucleotides.  The
    query replaces the ``G`` of the first four ``GA`` pairs and of the
    first ``GC`` pair with ``A``; everything else matches the reference.
    """
    reference = Seq.Seq('GA' * 9 + 'GC' * 6 + 'GT' * 15)
    query = 'AA' * 4 + 'GA' * 5 + 'AC' + 'GC' * 5 + 'GT' * 15
    records = helpers.parse_fasta('>seq1\n{0}'.format(query)).values()
    self.aln = Alignment(records, reference_sequence=reference)
    # The two candidate contexts every caller test chooses between.
    self.mutation_patterns = [mut_pattern.GA, mut_pattern.GM]
class TestBasicAnalysis(unittest.TestCase): def setUp(self): aln_string = """ >seq1 GTCAGTCAGTCAGTCACCCC >seq2 GTCAGTCAGTCAGTCACCCC >seq3 ATCAATCAGTCAATCACCCC """ self.seqs = helpers.parse_fasta(aln_string) self.aln = Alignment(self.seqs.values()) def test_consensus_reference(self): self.assertEqual(str(self.aln.reference_sequence), 'GTCAGTCAGTCAGTCACCCC') def test_hm_pos(self): for result in self.aln.analyze((old_focus_pattern, old_control_pattern)): hm_pos = result['hm_pos'] if result['sequence'] in ['seq1', 'seq2']: self.assertFalse(hm_pos) else: self.assertTrue(hm_pos) def test_hm_pos_indices(self): hm_pos_indices = [1, 5, 13] for result in self.aln.analyze((old_focus_pattern, old_control_pattern)): if result['hm_pos']: self.assertEqual(result['mut_columns'], hm_pos_indices) else: self.assertEqual(result['mut_columns'], [])
class TestBasicAnalysis(unittest.TestCase): def setUp(self): aln_string = """ >seq1 GTCAGTCAGTCAGTCACCCC >seq2 GTCAGTCAGTCAGTCACCCC >seq3 ATCAATCAGTCAATCACCCC """ self.seqs = helpers.parse_fasta(aln_string) self.aln = Alignment(self.seqs.values()) def test_consensus_reference(self): self.assertEqual(str(self.aln.reference_sequence), 'GTCAGTCAGTCAGTCACCCC') def test_hm_pos(self): for result in self.aln.analyze( (old_focus_pattern, old_control_pattern)): hm_pos = result['hm_pos'] if result['sequence'] in ['seq1', 'seq2']: self.assertFalse(hm_pos) else: self.assertTrue(hm_pos) def test_hm_pos_indices(self): hm_pos_indices = [1, 5, 13] for result in self.aln.analyze( (old_focus_pattern, old_control_pattern)): if result['hm_pos']: self.assertEqual(result['mut_columns'], hm_pos_indices) else: self.assertEqual(result['mut_columns'], [])
def setUp(self):
    """Parse three 20-column sequences and build an ``Alignment`` from them.

    No ``reference_sequence`` is passed to ``Alignment``.
    """
    # Each FASTA header must sit on its own line, so the record boundaries
    # are spelled out with explicit newlines instead of relying on the
    # layout of a triple-quoted literal.
    aln_string = (
        '>seq1\n'
        'GTCAGTCAGTCAGTCACCCC\n'
        '>seq2\n'
        'GTCAGTCAGTCAGTCACCCC\n'
        '>seq3\n'
        'ATCAATCAGTCAATCACCCC\n'
    )
    self.seqs = helpers.parse_fasta(aln_string)
    self.aln = Alignment(self.seqs.values())
def setUp(self):
    """Build a three-sequence alignment against an explicit reference.

    ``seq1`` is identical to the reference ``GGTGACGCT``.
    """
    # Each FASTA header must sit on its own line, so the record boundaries
    # are spelled out with explicit newlines instead of relying on the
    # layout of a triple-quoted literal.
    aln_string = (
        '>seq1\n'
        'GGTGACGCT\n'
        '>seq2\n'
        'AGTAACGCT\n'
        '>seq3\n'
        'GGTAACACT\n'
    )
    ref_seq = Seq.Seq('GGTGACGCT')
    self.aln = Alignment(helpers.parse_fasta(aln_string).values(),
                         reference_sequence=ref_seq)
def setUp(self):
    """Build a 32-column, three-sequence alignment for the split tests.

    Every record is written as two wrapped 16-character sequence lines.
    """
    # Each FASTA header must sit on its own line, so the record boundaries
    # are spelled out with explicit newlines instead of relying on the
    # layout of a triple-quoted literal.
    aln_string = (
        '>seq1\n'
        'GTCAGTCAGTCAGTCA\n'
        'GTCAGTCAGTCAGTCA\n'
        '>seq2\n'
        'GTCAGTCAGTCAGTCA\n'
        'GTCAGTCAGTCAGTCA\n'
        '>seq3\n'
        'ATCAATCAGTCAATCG\n'
        'ATCAATCAGTCAATCG'
    )
    self.seqs = helpers.parse_fasta_list(aln_string)
    self.aln = Alignment(self.seqs)
def setUp(self):
    """Build an alignment whose reference is the parsed ``all`` record.

    ``seq1`` equals the reference; ``seq2`` and ``seq3`` each carry four
    G-to-A changes, in the first and second half of the sequence
    respectively.
    """
    # Each FASTA header must sit on its own line, so the record boundaries
    # are spelled out with explicit newlines instead of relying on the
    # layout of a triple-quoted literal.
    ref_seq = helpers.parse_fasta(
        '>all\n'
        'GGGGGGGGGTGTGTGTGT'
    )
    aln_string = (
        '>seq1\n'
        'GGGGGGGGGTGTGTGTGT\n'
        '>seq2\n'
        'AGAGAGAGGTGTGTGTGT\n'
        '>seq3\n'
        'GGGGGGGGGTATATATAT\n'
    )
    self.aln = Alignment(helpers.parse_fasta(aln_string).values(),
                         reference_sequence=ref_seq['all'])
class TestCallPatterns(unittest.TestCase): """The following motivates these tests: BetaRat(5.5, 0.5, 11.0, 16.0) cdf: 0.00392333077990058 map: 3.27321043801743 ppf: 2.47393902966 BetaRat(4.5, 1.5, 6.0, 21.0) cdf: 0.00570220397191901 map: 3.475475215347 ppf: 1.98195901549 As you can see, different methods can make different calls.""" def setUp(self): ref_seq = Seq.Seq('GA' * 9 + 'GC' * 6 + 'GT' * 15) query_seq = 'AA' * 4 + 'GA' * 5 + 'AC' + 'GC' * 5 + 'GT' * 15 aln_string = '>seq1\n{0}'.format(query_seq) self.aln = Alignment(helpers.parse_fasta(aln_string).values(), reference_sequence=ref_seq) self.mutation_patterns = [mut_pattern.GA, mut_pattern.GM] def test_map_caller(self): for result in self.aln.multiple_context_analysis( self.mutation_patterns, caller='map'): call = result['call'] self.assertEqual(call['call_pattern'], 'GA') def test_ppf_caller(self): for result in self.aln.multiple_context_analysis( self.mutation_patterns, caller='q_0.05', quants=[0.05], pos_quants_only=False): call = result['call'] self.assertEqual(call['call_pattern'], 'GM') def test_cdf_caller(self): """Note that CDF should be evaluated as smaller => more extreme, whereas the other statistics are the inverse""" for result in self.aln.multiple_context_analysis( self.mutation_patterns, caller='cutoff_cdf'): call = result['call'] self.assertEqual(call['call_pattern'], 'GM') def test_ppf_caller_without_all_quants(self): """Should raise if we don't have the given statistic to test for call comparison.""" analysis = self.aln.multiple_context_analysis(self.mutation_patterns, caller='q_0.05') with self.assertRaises(ValueError): analysis.next() def test_not_calling_negatives(self): """We should only call sequences when they are actually positive, even if there are non-positive sequences with more extreme call statistics.""" for result in self.aln.multiple_context_analysis( self.mutation_patterns, caller='map', significance_level=0.005): call = result['call'] self.assertEqual(call['call_pattern'], 
'GM')
def split(args):
    """Split an alignment into hypermutated and non-hypermutated columns.

    The distinct integer values of the ``column`` field in the
    ``args.columns`` CSV are passed to ``Alignment.split_hypermuts``; the
    resulting ``hm_pos_aln`` and ``hm_neg_aln`` are written as FASTA to
    ``<out_dir>/<prefix>.pos.fasta`` and ``<out_dir>/<prefix>.neg.fasta``.

    Args:
        args: namespace providing the open handles ``alignment`` (FASTA)
            and ``columns`` (CSV with a ``column`` field) plus the strings
            ``out_dir`` and ``prefix``.

    Both input handles are closed before returning, including when an
    error is raised part-way through.
    """
    try:
        # De-duplicate the requested columns.
        hm_columns = list({int(row['column'])
                           for row in csv.DictReader(args.columns)})
        aln = Alignment(SeqIO.parse(args.alignment, 'fasta'))
        aln.split_hypermuts(hm_columns=hm_columns)

        fn_base = path.join(args.out_dir, args.prefix)
        # Context managers guarantee the output files are flushed and
        # closed even if writing fails.
        with open(fn_base + '.pos.fasta', 'w') as hm_pos_handle, \
                open(fn_base + '.neg.fasta', 'w') as hm_neg_handle:
            AlignIO.write(aln.hm_pos_aln, hm_pos_handle, 'fasta')
            AlignIO.write(aln.hm_neg_aln, hm_neg_handle, 'fasta')
    finally:
        for handle in [args.alignment, args.columns]:
            handle.close()
def split(args):
    """Split an alignment into hypermutated and non-hypermutated columns.

    The distinct integer values of the ``column`` field in the
    ``args.columns`` CSV are passed to ``Alignment.split_hypermuts``; the
    resulting ``hm_pos_aln`` and ``hm_neg_aln`` are written as FASTA to
    ``<out_dir>/<prefix>.pos.fasta`` and ``<out_dir>/<prefix>.neg.fasta``.

    Args:
        args: namespace providing the open handles ``alignment`` (FASTA)
            and ``columns`` (CSV with a ``column`` field) plus the strings
            ``out_dir`` and ``prefix``.

    Both input handles are closed before returning, including when an
    error is raised part-way through.
    """
    try:
        # De-duplicate the requested columns.
        hm_columns = list({int(row['column'])
                           for row in csv.DictReader(args.columns)})
        aln = Alignment(SeqIO.parse(args.alignment, 'fasta'))
        aln.split_hypermuts(hm_columns=hm_columns)

        fn_base = path.join(args.out_dir, args.prefix)
        # Context managers guarantee the output files are flushed and
        # closed even if writing fails.
        with open(fn_base + '.pos.fasta', 'w') as hm_pos_handle, \
                open(fn_base + '.neg.fasta', 'w') as hm_neg_handle:
            AlignIO.write(aln.hm_pos_aln, hm_pos_handle, 'fasta')
            AlignIO.write(aln.hm_neg_aln, hm_neg_handle, 'fasta')
    finally:
        for handle in [args.alignment, args.columns]:
            handle.close()
class TestCallPatterns(unittest.TestCase): """The following motivates these tests: BetaRat(5.5, 0.5, 11.0, 16.0) cdf: 0.00392333077990058 map: 3.27321043801743 ppf: 2.47393902966 BetaRat(4.5, 1.5, 6.0, 21.0) cdf: 0.00570220397191901 map: 3.475475215347 ppf: 1.98195901549 As you can see, different methods can make different calls.""" def setUp(self): ref_seq = Seq.Seq('GA'*9 + 'GC'*6 + 'GT'*15) query_seq = 'AA'*4 + 'GA'*5 + 'AC' + 'GC'*5 + 'GT'*15 aln_string = '>seq1\n{0}'.format(query_seq) self.aln = Alignment(helpers.parse_fasta(aln_string).values(), reference_sequence=ref_seq) self.mutation_patterns=[mut_pattern.GA, mut_pattern.GM] def test_map_caller(self): for result in self.aln.multiple_context_analysis(self.mutation_patterns, caller='map'): call = result['call'] self.assertEqual(call['call_pattern'], 'GA') def test_ppf_caller(self): for result in self.aln.multiple_context_analysis(self.mutation_patterns, caller='q_0.05', quants=[0.05], pos_quants_only=False): call = result['call'] self.assertEqual(call['call_pattern'], 'GM') def test_cdf_caller(self): """Note that CDF should be evaluated as smaller => more extreme, whereas the other statistics are the inverse""" for result in self.aln.multiple_context_analysis(self.mutation_patterns, caller='cutoff_cdf'): call = result['call'] self.assertEqual(call['call_pattern'], 'GM') def test_ppf_caller_without_all_quants(self): """Should raise if we don't have the given statistic to test for call comparison.""" analysis = self.aln.multiple_context_analysis(self.mutation_patterns, caller='q_0.05') with self.assertRaises(ValueError): analysis.next() def test_not_calling_negatives(self): """We should only call sequences when they are actually positive, even if there are non-positive sequences with more extreme call statistics.""" for result in self.aln.multiple_context_analysis(self.mutation_patterns, caller='map', significance_level=0.005): call = result['call'] self.assertEqual(call['call_pattern'], 'GM')
def setUp(self):
    """Build an alignment whose reference is the parsed ``all`` record.

    ``seq1`` equals the reference; ``seq2`` and ``seq3`` each carry four
    G-to-A changes, in the first and second half of the sequence
    respectively.
    """
    # Each FASTA header must sit on its own line, so the record boundaries
    # are spelled out with explicit newlines instead of relying on the
    # layout of a triple-quoted literal.
    ref_seq = helpers.parse_fasta(
        '>all\n'
        'GGGGGGGGGTGTGTGTGT'
    )
    aln_string = (
        '>seq1\n'
        'GGGGGGGGGTGTGTGTGT\n'
        '>seq2\n'
        'AGAGAGAGGTGTGTGTGT\n'
        '>seq3\n'
        'GGGGGGGGGTATATATAT\n'
    )
    self.aln = Alignment(helpers.parse_fasta(aln_string).values(),
                         reference_sequence=ref_seq['all'])
class TestMutCounts(unittest.TestCase): def setUp(self): aln_string = """ >seq1 GGTGACGCT >seq2 AGTAACGCT >seq3 GGTAACACT """ ref_seq = Seq.Seq('GGTGACGCT') self.aln = Alignment(helpers.parse_fasta(aln_string).values(), reference_sequence=ref_seq) def __test_counts__(self, pattern, real_counts): for result in self.aln.analyze(pattern): seq_counts = [result[x] for x in ('focus_pos', 'control_pos', 'focus_neg', 'control_neg')] seq_real_counts = real_counts[result['sequence']] self.assertEqual(seq_counts, seq_real_counts) def test_ga_counts(self): self.__test_counts__(mut_pattern.GA, dict( seq1=[0, 0, 1, 3], seq2=[1, 1, 0, 2], seq3=[1, 1, 0, 2])) def test_gg_counts(self): self.__test_counts__(mut_pattern.GG, dict( seq1=[0, 0, 1, 3], seq2=[1, 1, 0, 2], seq3=[0, 2, 1, 1])) def test_gr_counts(self): self.__test_counts__(mut_pattern.GR, dict( seq1=[0, 0, 2, 2], seq2=[2, 0, 0, 2], seq3=[1, 1, 1, 1])) def test_gm_counts(self): self.__test_counts__(mut_pattern.GM, dict( seq1=[0, 0, 2, 2], seq2=[1, 1, 1, 1], seq3=[2, 0, 0, 2])) def test_gv_counts(self): self.__test_counts__(mut_pattern.GV, dict( seq1=[0, 0, 3, 1], seq2=[2, 0, 1, 1], seq3=[2, 0, 1, 1]))
class TestMutCounts(unittest.TestCase): def setUp(self): aln_string = """ >seq1 GGTGACGCT >seq2 AGTAACGCT >seq3 GGTAACACT """ ref_seq = Seq.Seq('GGTGACGCT') self.aln = Alignment(helpers.parse_fasta(aln_string).values(), reference_sequence=ref_seq) def __test_counts__(self, pattern, real_counts): for result in self.aln.analyze(pattern): seq_counts = [ result[x] for x in ('focus_pos', 'control_pos', 'focus_neg', 'control_neg') ] seq_real_counts = real_counts[result['sequence']] self.assertEqual(seq_counts, seq_real_counts) def test_ga_counts(self): self.__test_counts__( mut_pattern.GA, dict(seq1=[0, 0, 1, 3], seq2=[1, 1, 0, 2], seq3=[1, 1, 0, 2])) def test_gg_counts(self): self.__test_counts__( mut_pattern.GG, dict(seq1=[0, 0, 1, 3], seq2=[1, 1, 0, 2], seq3=[0, 2, 1, 1])) def test_gr_counts(self): self.__test_counts__( mut_pattern.GR, dict(seq1=[0, 0, 2, 2], seq2=[2, 0, 0, 2], seq3=[1, 1, 1, 1])) def test_gm_counts(self): self.__test_counts__( mut_pattern.GM, dict(seq1=[0, 0, 2, 2], seq2=[1, 1, 1, 1], seq3=[2, 0, 0, 2])) def test_gv_counts(self): self.__test_counts__( mut_pattern.GV, dict(seq1=[0, 0, 3, 1], seq2=[2, 0, 1, 1], seq3=[2, 0, 1, 1]))
class TestBasicSplit(unittest.TestCase): def assertSeqsEqual(self, seq_record, string): self.assertEqual(str(seq_record.seq), string) def setUp(self): aln_string = """ >seq1 GTCAGTCAGTCAGTCA GTCAGTCAGTCAGTCA >seq2 GTCAGTCAGTCAGTCA GTCAGTCAGTCAGTCA >seq3 ATCAATCAGTCAATCG ATCAATCAGTCAATCG""" self.seqs = helpers.parse_fasta_list(aln_string) self.aln = Alignment(self.seqs) def test_manual_split(self): columns = [1, 2, 3, 5] self.aln.split_hypermuts(hm_columns=columns) neg, pos = self.aln.hm_neg_aln, self.aln.hm_pos_aln self.assertEqual(neg.get_alignment_length(), 28) self.assertEqual(pos.get_alignment_length(), 4) self.assertEqual(neg[:, 0], "AAA") self.assertEqual(neg[:, 11], "AAG") self.assertSeqsEqual(neg[0, :], "ATCAGTCAGTCAGTCAGTCAGTCAGTCA") self.assertEqual(pos[:, 1], "TTT") def test_splitting_final_col(self): columns = [3, 7, 32] self.aln.split_hypermuts(hm_columns=columns) neg, pos = self.aln.hm_neg_aln, self.aln.hm_pos_aln self.assertEqual(neg.get_alignment_length(), 29) self.assertEqual(pos.get_alignment_length(), 3) self.assertEqual(neg[:, 0], "GGA") self.assertEqual(neg[:, 12], "CCC") self.assertSeqsEqual(neg[0, :], "GTAGTAGTCAGTCAGTCAGTCAGTCAGTC") self.assertEqual(pos[:, 1], "CCC") self.assertEqual(pos[:, 2], "AAG") def test_splitting_on_no_hm(self): columns = [] self.aln.split_hypermuts(hm_columns=columns) neg, pos = self.aln.hm_neg_aln, self.aln.hm_pos_aln self.assertEqual(neg.get_alignment_length(), 32) self.assertEqual(pos.get_alignment_length(), 0) self.assertEqual(neg[:, 0], "GGA") self.assertSeqsEqual(neg[0, :], "GTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCA") self.assertSeqsEqual(pos[0, :], "")
class TestBasicSplit(unittest.TestCase): def assertSeqsEqual(self, seq_record, string): self.assertEqual(str(seq_record.seq), string) def setUp(self): aln_string = """ >seq1 GTCAGTCAGTCAGTCA GTCAGTCAGTCAGTCA >seq2 GTCAGTCAGTCAGTCA GTCAGTCAGTCAGTCA >seq3 ATCAATCAGTCAATCG ATCAATCAGTCAATCG""" self.seqs = helpers.parse_fasta_list(aln_string) self.aln = Alignment(self.seqs) def test_manual_split(self): columns = [1, 2, 3, 5] self.aln.split_hypermuts(hm_columns=columns) neg, pos = self.aln.hm_neg_aln, self.aln.hm_pos_aln self.assertEqual(neg.get_alignment_length(), 28) self.assertEqual(pos.get_alignment_length(), 4) self.assertEqual(neg[:, 0], 'AAA') self.assertEqual(neg[:, 11], 'AAG') self.assertSeqsEqual(neg[0, :], 'ATCAGTCAGTCAGTCAGTCAGTCAGTCA') self.assertEqual(pos[:, 1], 'TTT') def test_splitting_final_col(self): columns = [3, 7, 32] self.aln.split_hypermuts(hm_columns=columns) neg, pos = self.aln.hm_neg_aln, self.aln.hm_pos_aln self.assertEqual(neg.get_alignment_length(), 29) self.assertEqual(pos.get_alignment_length(), 3) self.assertEqual(neg[:, 0], 'GGA') self.assertEqual(neg[:, 12], 'CCC') self.assertSeqsEqual(neg[0, :], 'GTAGTAGTCAGTCAGTCAGTCAGTCAGTC') self.assertEqual(pos[:, 1], 'CCC') self.assertEqual(pos[:, 2], 'AAG') def test_splitting_on_no_hm(self): columns = [] self.aln.split_hypermuts(hm_columns=columns) neg, pos = self.aln.hm_neg_aln, self.aln.hm_pos_aln self.assertEqual(neg.get_alignment_length(), 32) self.assertEqual(pos.get_alignment_length(), 0) self.assertEqual(neg[:, 0], 'GGA') self.assertSeqsEqual(neg[0, :], 'GTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCA') self.assertSeqsEqual(pos[0, :], '')
class TestContextBasedEvaluation(unittest.TestCase):
    """Check that a context-specific pattern only flags matching sequences."""

    def setUp(self):
        """Build an alignment whose reference is the parsed ``all`` record.

        ``seq1`` equals the reference; ``seq2`` and ``seq3`` each carry
        four G-to-A changes, in the first and second half of the sequence
        respectively.
        """
        # Each FASTA header must sit on its own line, so the record
        # boundaries are spelled out with explicit newlines instead of
        # relying on the layout of a triple-quoted literal.
        ref_seq = helpers.parse_fasta(
            '>all\n'
            'GGGGGGGGGTGTGTGTGT'
        )
        aln_string = (
            '>seq1\n'
            'GGGGGGGGGTGTGTGTGT\n'
            '>seq2\n'
            'AGAGAGAGGTGTGTGTGT\n'
            '>seq3\n'
            'GGGGGGGGGTATATATAT\n'
        )
        self.aln = Alignment(helpers.parse_fasta(aln_string).values(),
                             reference_sequence=ref_seq['all'])

    def test_gg(self):
        """Under the ``GG`` pattern only seq2 should be called positive."""
        for result in self.aln.analyze(mut_pattern.GG):
            hm_pos = result['hm_pos']
            if result['sequence'] == 'seq2':
                self.assertTrue(hm_pos)
            else:
                self.assertFalse(hm_pos)
class TestContextBasedEvaluation(unittest.TestCase):
    """Check that a context-specific pattern only flags matching sequences."""

    def setUp(self):
        """Build an alignment whose reference is the parsed ``all`` record.

        ``seq1`` equals the reference; ``seq2`` and ``seq3`` each carry
        four G-to-A changes, in the first and second half of the sequence
        respectively.
        """
        # Each FASTA header must sit on its own line, so the record
        # boundaries are spelled out with explicit newlines instead of
        # relying on the layout of a triple-quoted literal.
        ref_seq = helpers.parse_fasta(
            '>all\n'
            'GGGGGGGGGTGTGTGTGT'
        )
        aln_string = (
            '>seq1\n'
            'GGGGGGGGGTGTGTGTGT\n'
            '>seq2\n'
            'AGAGAGAGGTGTGTGTGT\n'
            '>seq3\n'
            'GGGGGGGGGTATATATAT\n'
        )
        self.aln = Alignment(helpers.parse_fasta(aln_string).values(),
                             reference_sequence=ref_seq['all'])

    def test_gg(self):
        """Under the ``GG`` pattern only seq2 should be called positive."""
        for result in self.aln.analyze(mut_pattern.GG):
            hm_pos = result['hm_pos']
            if result['sequence'] == 'seq2':
                self.assertTrue(hm_pos)
            else:
                self.assertFalse(hm_pos)
def setUp(self):
    """Prepare the single-query alignment shared by the caller tests.

    Relative to the reference (9 ``GA``, 6 ``GC``, 15 ``GT`` pairs) the
    query has ``A`` in place of ``G`` in the first four ``GA`` pairs and
    in the first ``GC`` pair.
    """
    # The two candidate contexts every caller test chooses between.
    self.mutation_patterns = [mut_pattern.GA, mut_pattern.GM]

    reference = Seq.Seq('GA' * 9 + 'GC' * 6 + 'GT' * 15)
    query = 'AA' * 4 + 'GA' * 5 + 'AC' + 'GC' * 5 + 'GT' * 15
    fasta_text = '>seq1\n{0}'.format(query)
    parsed = helpers.parse_fasta(fasta_text)
    self.aln = Alignment(parsed.values(), reference_sequence=reference)
def analyze(args):
    """Run the multi-pattern analysis described by ``args`` and write it out.

    FASTA records are read from ``args.alignment`` and analysed with the
    patterns named in ``args.patterns``.  When ``args.cluster_threshold``
    is set, the sequences are re-clustered on their non-hypermutated
    columns and re-analysed ``args.cluster_iterations - 1`` times, and the
    last clustering is written to ``<out_dir>/<prefix>.clst.csv``.  The
    final analysis is handed to ``write_analysis``.

    Args:
        args: namespace providing the open handles ``alignment`` and
            (optionally) ``cluster_map``; ``reference_sequences``;
            ``patterns``; ``out_dir``; ``prefix``; the analysis settings
            ``rpr_cutoff``, ``significance_level``, ``quants``,
            ``pos_quants_only``, ``caller``, ``prior``, ``cdfs``,
            ``quadr_maxiter``, ``optim_maxiter``; the clustering settings
            ``cluster_col``, ``consensus_threshold``, ``cluster_threshold``,
            ``cluster_iterations``, ``recentering_iterations``,
            ``min_per_cluster``; and the flags ``interactive``,
            ``call_only``, ``write_references``.

    ``args.alignment`` and ``args.cluster_map`` are closed before
    returning, including when an error is raised part-way through.
    """
    import logging
    logging.captureWarnings(True)

    try:
        # Fetch sequence records and analysis patterns
        seq_records = SeqIO.to_dict(SeqIO.parse(args.alignment, 'fasta'))
        patterns = [mut_pattern.patterns[p] for p in args.patterns]
        pattern_names = [p.name for p in patterns]
        prefix = path.join(args.out_dir, args.prefix)
        analysis_settings = dict(
            rpr_cutoff=args.rpr_cutoff,
            significance_level=args.significance_level,
            quants=args.quants,
            pos_quants_only=args.pos_quants_only,
            caller=args.caller,
            prior=args.prior,
            cdfs=args.cdfs,
            quadr_maxiter=args.quadr_maxiter,
            optim_maxiter=args.optim_maxiter)

        # Need to think about how best to fork things here; for instance,
        # might make sense to let the user specify the initial clusters for
        # whatever reason... However, specifying the reference sequences
        # shouldn't make any sense there
        if args.reference_sequences:
            reference_sequences = SeqIO.to_dict(
                SeqIO.parse(args.reference_sequences, 'fasta'))
        else:
            reference_sequences = None

        # This lets the cluster map be optional, so that this script can be
        # used for naive hm filtering/analysis
        cluster_map = load_cluster_map(
            args.cluster_map,
            cluster_col=args.cluster_col) if args.cluster_map else None
        alignments = AlignmentSet(seq_records, cluster_map,
                                  consensus_threshold=args.consensus_threshold,
                                  reference_sequences=reference_sequences)

        # Create the analysis generator
        analysis = alignments.multiple_context_analysis(patterns,
                                                        **analysis_settings)

        if args.cluster_threshold:
            # Stays None when no clustering iteration runs (for example
            # cluster_iterations <= 1), so the write below can be skipped
            # instead of failing on an unbound name.
            clustering = None
            for hm_it in range(args.cluster_iterations - 1):
                # Single formatted string: prints identically under the
                # Python 2 print statement and the Python 3 function.
                print(" ..On hm/cluster iteration {0}".format(hm_it))
                # Grab the HM columns from the most recent analysis and
                # split out the pos sites
                hm_columns = []
                for result in analysis:
                    hm_columns += result['call']['mut_columns']
                hm_neg_aln = Alignment(
                    seq_records.values()).split_hypermuts(
                        hm_columns).hm_neg_aln
                # Cluster with the specified settings
                clustering = alnclst.Clustering(hm_neg_aln,
                                                args.cluster_threshold,
                                                args.consensus_threshold)
                clustering = clustering.recenter(args.recentering_iterations)
                clustering.merge_small_clusters(args.min_per_cluster)
                cluster_map = parse_clusters(clustering.mapping_iterator(),
                                             cluster_key=0, sequence_key=1)
                # Create the Alignment set
                clustered_alignment = AlignmentSet(
                    seq_records, cluster_map,
                    consensus_threshold=args.consensus_threshold)
                analysis = clustered_alignment.multiple_context_analysis(
                    patterns, **analysis_settings)
            # write out the final clusters
            if clustering is not None:
                with open(prefix + '.clst.csv', 'w') as clusterout_handle:
                    clustering.write(clusterout_handle)

        if args.interactive:
            local = copy.copy(locals())
            import hyperfreq
            local.update(
                dict(hyperfreq=hyperfreq,
                     Alignment=Alignment,
                     AlignmentSet=AlignmentSet,
                     mut_pattern=mut_pattern,
                     write_analysis=write_analysis))
            code.interact(local=local)

        # Write the final analysis to file
        write_analysis(analysis, prefix, pattern_names, args.quants,
                       args.cdfs, call_only=args.call_only)

        if args.write_references:
            # NOTE(review): this writes the references of the initial
            # ``alignments`` set, not of the re-clustered set built in the
            # loop above -- confirm that is intended.
            write_reference_seqs(alignments, prefix)
    finally:
        # Closing files
        args.alignment.close()
        if args.cluster_map:
            args.cluster_map.close()