def show_failure_types():
    from spats_clean import Spats, Pair, FastqRecord
    spats = Spats("test/5s/5s.fa", "test/5s")
    spats.setup()
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/"
    with open(bp + "t11/x/filtered_R1.fq", 'rb') as r1_in:
        with open(bp + "t11/x/filtered_R2.fq", 'rb') as r2_in:
            r1_record = FastqRecord()
            r2_record = FastqRecord()
            pair = Pair()
            while True:
                r1_record.read(r1_in)
                if not r1_record.identifier:
                    break
                r2_record.read(r2_in)
                pair.set_from_records(r1_record, r2_record)
                spats.process_pair(pair)
                summary = "{} :: {}".format(pair.identifier, pair.site if pair.has_site else pair.failure)
                if pair.r1.match_errors:
                    summary += " R1!: {}".format(pair.r1.match_errors)
                if pair.r1.adapter_errors:
                    summary += " R1A!: {}, adapter_len={}".format(pair.r1.adapter_errors, pair.r1._rtrim)
                if pair.r2.match_errors:
                    summary += " R2!: {}".format(pair.r2.match_errors)
                if pair.r2.adapter_errors:
                    summary += " R2A!: {}, adapter_len={}".format(pair.r2.adapter_errors, pair.r2._rtrim - 4)
                print summary
def d5s_run():
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/"
    #from spats_shape_seq.db import PairDB
    #pair_db = PairDB(bp + "dev_out/pairs.db")
    #if False:
    #    pair_db.add_targets_table(bp + "5s/5S.fa")
    #    pair_db.parse(bp + "5s/data/17571-AD1AW-KEW11-5S-2p1-18x-23FEB15-GGCTAC_S10_L001_R1_001.fastq",
    #                  bp + "5s/data/17571-AD1AW-KEW11-5S-2p1-18x-23FEB15-GGCTAC_S10_L001_R2_001.fastq")
    from spats_shape_seq import Spats
    from spats_shape_seq.partial import PartialFindProcessor
    s = Spats()
    #s.run._processor_class = PartialFindProcessor
    s.run.skip_database = True
    #s.run.writeback_results = True
    #s.run.resume_processing = True
    #s.run.result_set_name = "lookup"
    s.addTargets(bp + "5s/5S.fa")
    #s.process_pair_db(pair_db)
    s.process_pair_data(bp + "5s/data/17571-AD1AW-KEW11-5S-2p1-18x-23FEB15-GGCTAC_S10_L001_R1_001.fastq",
                        bp + "5s/data/17571-AD1AW-KEW11-5S-2p1-18x-23FEB15-GGCTAC_S10_L001_R2_001.fastq")
    s.compute_profiles()
    s.write_reactivities(bp + "dev_out/rx2.out")
def tmut():
    from spats_shape_seq import Spats
    from spats_shape_seq.db import PairDB
    from spats_shape_seq.diagram import diagram
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/mutsl/"
    pair_db = PairDB(bp + "ds_cmp.spats")
    if True:
        print "Parsing to db..."
        pair_db.wipe()
        pair_db.add_targets_table(bp + "mut_single.fa")
        fq_name = "mut2"
        pair_db.parse(bp + fq_name + "_R1.fastq", bp + fq_name + "_R2.fastq")
    spatss = []
    for alg in [ "find_partial", "lookup" ]:
        spats = Spats(cotrans = False)
        spats.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
        spats.run.count_mutations = True
        spats.run.algorithm = alg
        spats.run.allowed_target_errors = 1
        spats.run.adapter_b = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"
        spats.run._process_all_pairs = True
        spats.run.writeback_results = True
        spats.run.num_workers = 1
        spats.run.result_set_name = "mut_" + alg
        spats.process_pair_db(pair_db)
        pair_db.store_run(spats.run)
        pair_db.store_counters(spats.run.result_set_name, spats.counters)
        spatss.append(spats)
    rdiff_func(bp + "ds_cmp.spats", "mut_find_partial", "mut_lookup", diag_spats = spatss[0])
def diag_case():
    from spats_shape_seq import Spats
    from spats_shape_seq.pair import Pair
    from spats_shape_seq.tests.test_mut import cases
    #from spats_shape_seq.tests.test_pairs import prefix_cases as cases
    from spats_shape_seq.diagram import diagram
    #spats_config.minimum_target_match_length = 8
    spats = Spats()
    #spats.addTargets("test/5s/5s.fa")
    spats.addTargets("test/mut/mut_single.fa")
    spats.run.debug = True
    spats.run.algorithm = "find_partial"
    spats.run.count_mutations = True
    #spats.run.mutations_require_quality_score = ord('.') - ord('!')
    spats.run.allowed_target_errors = 1
    spats.run.ignore_stops_with_mismatched_overlap = True
    spats.run.adapter_b = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"
    #spats.run.collapse_left_prefixes = True
    spats._case_errors = False

    def run_case(case):
        pair = Pair()
        pair.set_from_data(case[0], case[1], case[2])
        spats.process_pair(pair)
        print diagram(pair, spats.run)
        if case[3] != pair.site:
            spats._case_errors = True
            print "******* mismatch: {} != {}".format(case[3], pair.site)

    for case in cases:
        if case[0].startswith("*"):
            run_case(case)
    spats.run.debug = False
    if spats._case_errors:
        raise Exception("Case failed")
def setUp(self):
    from spats_shape_seq import Spats
    self.spats = Spats()
    self.spats.run.cotrans = True
    self.spats.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
    self.spats.run.collapse_left_prefixes = True
    self.spats.addTargets("test/cotrans/cotrans_single.fa")
class TestShortAdapterB(unittest.TestCase):

    def tearDown(self):
        self.spats = None

    def pair_for_case(self, case):
        pair = Pair()
        pair.set_from_data(case[0], case[1], case[2])
        return pair

    def run_case(self, case):
        pair = self.pair_for_case(case)
        print('running: {} / {}'.format(case[0], self.spats.run.algorithm))
        self.spats.process_pair(pair)
        self.assertEqual(case[3], pair.site,
                         "site res={} != {} ({}, {}, {}, {})".format(pair.site, case[3], self.__class__.__name__,
                                                                     case[0], self.spats.run.algorithm, pair.failure))

    def test_pairs(self):
        for alg in algorithms:
            if alg == 'native':
                continue
            self.run_algorithm(alg)

    def run_algorithm(self, alg):
        from spats_shape_seq import Spats
        self.spats = Spats()
        self.spats.run.algorithm = alg
        self.spats.addTargets("test/5SrRNA/5SrRNA.fa")
        self.run_pairs()

    def run_pairs(self):
        for case in short_adapter_b_cases:
            self.run_case(case)
        print("Ran {} adapter_b test cases.".format(len(short_adapter_b_cases)))
class TestOnlyPrefixes(unittest.TestCase):

    def setUp(self):
        from spats_shape_seq import Spats
        self.spats = Spats()
        self.spats.run.cotrans = True
        self.spats.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
        self.spats.run.collapse_left_prefixes = True
        self.spats.run.collapse_only_prefixes = "T,ACGT,CCA"
        self.spats.addTargets("test/cotrans/cotrans_single.fa")

    def tearDown(self):
        self.spats = None

    def pair_for_case(self, case):
        pair = Pair()
        pair.set_from_data(case[0], case[1], case[2])
        return pair

    def run_case(self, case):
        pair = self.pair_for_case(case)
        self.spats.counters.reset()
        self.spats.process_pair(pair)
        if case[5]:
            expect = case[4] if case[5] in self.spats.run.collapse_only_prefixes.split(',') else None
            self.assertEqual(expect, pair.site,
                             "PREF res={} != {} ({}, {})".format(pair.site, case[4], self.__class__.__name__, case[0]))
        return pair

    def test_pairs(self):
        for case in prefix_cases:
            self.run_case(case)
        print("Ran {} prefix test cases.".format(len(prefix_cases)))
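# Illustrative sketch only (not taken from the test data): the shape of a prefix_cases
# entry that TestOnlyPrefixes.run_case above indexes into. Fields 0-2 feed
# Pair.set_from_data, field 4 is the expected site, and field 5 is the collapsed-prefix
# handle checked against run.collapse_only_prefixes. All values below are placeholders.
example_prefix_case = [
    'prefix-example',    # case[0]: pair identifier (placeholder)
    'A' * 36,            # case[1]: R1 sequence (placeholder)
    'T' * 36,            # case[2]: R2 sequence (placeholder)
    None,                # case[3]: not read by this class's run_case
    71,                  # case[4]: expected pair.site when the prefix is collapsed
    'CCA',               # case[5]: handle prefix; must appear in collapse_only_prefixes to expect a site
]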
class TestPrefixPairs(unittest.TestCase):

    def setUp(self):
        from spats_shape_seq import Spats
        self.spats = Spats()
        self.spats.run.collapse_left_prefixes = True
        self.spats.addTargets("test/5s/5s.fa")

    def tearDown(self):
        self.spats = None

    def pair_for_case(self, case):
        pair = Pair()
        pair.set_from_data(case[0], case[1], case[2])
        return pair

    def run_case(self, case):
        pair = self.pair_for_case(case)
        self.spats.counters.reset()
        self.spats.process_pair(pair)
        self.assertEqual(case[3], pair.site,
                         "res={} != {} ({}, {})".format(pair.site, case[3], self.__class__.__name__, case[0]))
        if case[4]:
            self.assertEqual(1, getattr(self.spats.counters, 'prefix_RRRY_' + case[4]),
                             "prefix {} not counted ({})".format(case[4], case[0]))
        return pair

    def test_pairs(self):
        for case in prefix_cases:
            self.run_case(case)
        print("Ran {} prefix test cases.".format(len(prefix_cases)))
def setUp(self):
    try:
        self.spats = Spats()
        self.test_set.spats_setUp(self.spats)
    except Exception as e:
        print("exception caught on testset '{}' setup : {}".format(self.test_set.name, e))
        raise e
class TestPairs(unittest.TestCase):

    def setUp(self):
        from spats_shape_seq import Spats
        self.spats = Spats()
        self.spats.addTargets("test/5s/5s.fa")

    def tearDown(self):
        self.spats = None

    def pair_for_case(self, case):
        pair = Pair()
        pair.set_from_data(case[0], case[1], case[2])
        return pair

    def run_case(self, case):
        pair = self.pair_for_case(case)
        self.spats.process_pair(pair)
        self.assertEqual(case[3], pair.site,
                         "res={} != {} ({}, {})".format(pair.site, case[3], self.__class__.__name__, case[0]))
        return pair

    def test_pairs(self):
        for case in cases:
            self.run_case(case)
        print("Ran {} pair->site cases.".format(len(cases)))

    def test_find_partial_weird_case(self):
        pair = Pair()
        pair.set_from_data("x", 'CTCAGTCCTTGGTGCCCGAGTCAGGATCGGAAGAG', 'TGACTCGGGCACCAAAGACTGAGAGATCGGAAGAG')
        self.spats.process_pair(pair)
        print("{} / {}".format(pair.site, pair.failure))

    def test_minimum_length(self):
        from spats_shape_seq import Spats
        self.spats = Spats()
        self.spats.run.algorithm = "find_partial"
        self.spats.run.minimum_target_match_length = 11
        self.spats.addTargets("test/5s/5s.fa")
        self.assertEqual(11, self.spats._targets.minimum_match_length)
        case = [ '1109:22737:14675', 'TCCAGTCCTTGGAGATCGGAAGAGCACACGTCTGA', 'CCAAGGACTGGAAGATCGGAAGAGCGTCGTGTAGG', None ]
        self.run_case(case)

        # this case only matches if the minimum length is set to 8
        self.spats.run.minimum_target_match_length = 8
        self.spats = Spats()
        self.spats.run.algorithm = "lookup"
        self.spats.addTargets("test/5s/5s.fa")
        case[3] = 135
        self.run_case(case)
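# Illustrative sketch only: the minimal [identifier, R1, R2, expected_site] layout that
# TestPairs.run_case above expects for each entry in `cases`. The reads are copied from the
# test_minimum_length case in this file; a site of None means no site should be reported.
example_case = [
    '1109:22737:14675',                       # case[0]: pair identifier
    'TCCAGTCCTTGGAGATCGGAAGAGCACACGTCTGA',    # case[1]: R1 read
    'CCAAGGACTGGAAGATCGGAAGAGCGTCGTGTAGG',    # case[2]: R2 read
    None,                                     # case[3]: expected pair.site (135 with an 8 nt minimum match)
]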
def dbrun():
    db_path = sys.argv[2]
    run_name = sys.argv[3]
    from spats_shape_seq import Spats
    from spats_shape_seq.db import PairDB
    db = PairDB(db_path)
    s = Spats()
    s.run.writeback_results = True
    s.run.result_set_name = run_name
    #s.run.resume_processing = True
    s.process_pair_db(db)
def _dump_indel_lens(self):
    run_name = self._run_file()
    if not os.path.exists(run_name):
        raise Exception("Run must be run before attempting dump")
    spats = Spats()
    spats.load(run_name)
    countinfo = spats.counters.counts_dict()
    ilen_cnt = []
    for lc in sorted([int(k.split('_')[-1]) for k in countinfo.keys() if k.startswith('mapped_indel_len_')]):
        ilen_cnt.append((lc, countinfo["mapped_indel_len_{}".format(lc)]))
    output_path = os.path.join(self.path, 'mapped_indel_len_counts.csv')
    self._write_csv(output_path, [ "Indel Length", "Reads" ], ilen_cnt)
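# Minimal sketch of the counter-to-CSV transformation used by _dump_indel_lens above, with a
# plain dict standing in for spats.counters.counts_dict(); the counts here are made up.
countinfo = {'mapped_indel_len_1': 240, 'mapped_indel_len_3': 57, 'other_counter': 9}
rows = []
for lc in sorted(int(k.split('_')[-1]) for k in countinfo if k.startswith('mapped_indel_len_')):
    rows.append((lc, countinfo["mapped_indel_len_{}".format(lc)]))
# rows == [(1, 240), (3, 57)], written out under the header ["Indel Length", "Reads"]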
def cotrans_test():
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"
    from spats_shape_seq import Spats
    s = Spats()
    from spats_shape_seq.partial import PartialFindProcessor
    #s.run._processor_class = PartialFindProcessor
    s.addTargets(bp + "F_wt.fa")
    from spats_shape_seq.pair import Pair
    pair = Pair()
    pair.set_from_data('x', 'GAGCGTCCTTGGTGCCCGAGTCAGAAATAGACTCCT', 'TATCACTACTGGTAGGAGTCTATTTCTGACTCGGGC')
    s.process_pair(pair)
    print "{}: {}".format(pair.target.name, pair.site)
def ligation_run():
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/Shape_Seq_ligation/"
    from spats_shape_seq import Spats
    s = Spats()
    #s.config.debug = True
    #s.run.minimum_target_match_length = 10
    #s.run.num_workers = 1
    #from spats_shape_seq.partial import PartialFindProcessor
    #s.run._processor_class = PartialFindProcessor
    s.run.skip_database = True
    s.addTargets(bp + "panel_RNAs_complete.fa")
    s.process_pair_data(bp + "data/KEW1_S1_L001_R1_001.fastq", bp + "data/KEW1_S1_L001_R2_001.fastq")
def validate(self):
    """Validate the results of a previous 'process' run against a
       second (slower) algorithm.
    """
    run_name = self._run_file()
    if not os.path.exists(run_name):
        raise Exception("Run must be performed before validating")
    spats = Spats()
    spats.load(run_name)
    if spats.validate_results(self.r1, self.r2):
        self._add_note("Validation pass")
    else:
        self._add_note("Validation FAILURE")
def cotrans_run():
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"
    from spats_shape_seq import Spats
    s = Spats()
    #from spats_shape_seq.partial import PartialFindProcessor
    #s.run._processor_class = PartialFindProcessor
    s.run.skip_database = True
    #s.run.writeback_results = True
    #s.run.resume_processing = True
    #s.run.result_set_name = "lookup"
    s.addTargets(bp + "F_wt.fa")
    s.process_pair_data(bp + "data/EJS_6_F_10mM_NaF_Rep1_GCCAAT_R1.fastq",
                        bp + "data/EJS_6_F_10mM_NaF_Rep1_GCCAAT_R2.fastq")
    s.compute_profiles()
    s.write_reactivities(bp + "dev_out/rx.out")
def tmut_case():
    from spats_shape_seq import Spats
    from spats_shape_seq.db import PairDB
    from spats_shape_seq.diagram import diagram
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/mutsl/"
    spats = Spats(cotrans = False)
    spats.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
    spats.run.count_mutations = True
    spats.run.algorithm = "find_partial"
    spats.run.allowed_target_errors = 1
    spats.run.adapter_b = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"
    spats.run._process_all_pairs = True
    spats.run.writeback_results = True
    spats.run.num_workers = 1
    spats.run.result_set_name = "mut"
    spats.addTargets(bp + "mut_single.fa")

    from spats_shape_seq.pair import Pair
    pair = Pair()
    #c = [ 'GAATGTCCTTGGTGCCCGAGTCAGTCCTTGGTGCCCGAGTCAGTCCTTGGTTCCCGAGTCACTCCTTTGTTCCCC', 'AGGACTGACTCGGGCACCAAGGACTTTCTCGTTCACCTATTTCTTTCTCTTCCCCCTTTTTCTTTCTCTTTCTCC' ]
    #c = [ 'GAGCGTCCTTGGTGCCCGAGTCAGATGCCGACCCGGGTGGGGGCCCTGCCAGCTACATCCCGGCACACGCGTCAT', 'TAGGTCAGGTCCGGAAGGAAGCAGCCAAGGCAGATGACGCGTGTGCCGGGATGTAGCTGGCAGGGCCCCCACCCG' ]
    #c = [ 'GAATGTCCTTGGTGCCCGAGTCAGGACACGCGTCATCTGCCTTGGCTGCTTCCTTCCGGACCTGACCTGGTAAAC', 'ATCGGGGGCTCTGTTGGTTCCCCCGCAACGCTACTCTGTTTACCAGGTCAGGTCCGGAAGGAAGCAGCCAAGTCA' ]
    #c = [ 'AGGCGTCCTTGGTGCCCGAGTCAGCCTTGGCTGCTTCCTTCCGGACCTGACCTGGTAAACAGAGTAGCGTTGCGG', 'ATCGGGGGCTCTGTTGGTTCCCCCGCAACGCTACTCTGTTTACCAGGTCAGGTCCGGAAGGAAGCAGCCAAGTCT' ]
    #c = [ 'TTCAGTCCTTGGTGCCCGAGTCAGCCAGCTACATCCCGGCACACGCGTCATCTGCCTTGGCTGCTTCCTTCCGGA', 'AGGTCAGATCCGGAAGGAAGCAGCCAAGGCAGATGACGCGTGTGCCGGGATGTAGCTGGCTGACTCGGGCACCAA' ]
    #c = [ 'AAATGTCCTTGGTGCCCGAGTCAGATCTGCCTTAAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGA', 'TAAGGCAGATCTGACTCGGGCACCAAGGACATTTAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCG' ]
    #c = [ 'CTCAGTCCTTGGTGCCCGAGTCAGTGAGCTAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTC', 'AGCTCACTGACTCGGGCACCAAGGACTGAGAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGG' ]
    #c = [ 'AAGCGTCCTTGGTGCCCGAGTCAGTGGAGGTAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCT', 'ACCTCCACTGACTCGGGCACCAAGGACGCTTAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTG' ]
    #c = [ 'TCCGGTCCTTGGTGCCCGAGTCAGATGTAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGT', 'ACATCTGACTCGGGCACCAAGGACCGGAAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTC' ]
    #c = [ 'TTTAAGTCCTTGGTGCCCGAGTCAGGTCATCTGCCTTGGCTGCTTCCTTCCGGACCTGACCTGGTAAACAGAGTA', 'TACTCTGTTTACCAGGTCAGGTCCGGAAGGAAGCAGCCAAGGCAGATGACCTGACTCGGGCACCAAGGACTTAAA' ]
    #c = [ 'TTCACAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAATCACTAT', 'GGACAAGCAATGCTTACCTTGATGTTGAACTTTTGAATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGA' ]
    #c = [ 'AGATCAACAAGAATTAGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAATCACTAT', 'ACAAGCAATGCTTGCCTTGATGTTGAACTTTTGAATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGAAC' ]
    #c = [ 'AAATCAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAATCACTAT', 'AATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGATT' ]
    #c = [ 'TCCGCAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAATCACTAT', 'ATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGCGGA' ]
    #c = [ 'TCCACAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCATTTGCTCATCATTAACCTCCTGAATCACTAT', 'GGACAAGCAATGCTTGCCTTGATGTTGAACTTTTGAATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGA' ]
    #c = [ 'GGGTCAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTTAGATCGGAAGAGCACAC', 'AAATGATGAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGACCCAGATCGGAAGAGCGTCG' ]
    c = [ 'GAACCAACAAGAATTGGGACAACTCCAGTGAAAGGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAAGATCGGA', 'TCAGGAGGTTAATGATGAGCAAAGGAGAAGAACCTTTCACTGGAGTTGTCCCAATTCTTGTTGGTTCAGATCGGA' ]
    #c = [ 'CCTACAACAAGAATTGGGACAACTCCAGTGAGAAGTTCTTCTCCTTTGCTCATCATTAAGATCGGAAGAGCACAC', 'TAATGATGAGCAAAGGAGAAGAACTTCTCACTGGAGTTGTCCCAATTCTTGTTGTAGGAGATCGGAAGAGCGTCG' ]
    #c = [ 'CTTGCAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCTTTAACCTCCTGAATCACTAA', 'TAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGCAAGA' ]
    pair.set_from_data('x', c[0], c[1])
    spats.process_pair(pair)
    print diagram(pair, spats.run)
    if pair.has_site:
        print "{}: {} / {} {}".format(pair.target.name, pair.site, pair.end, pair.mutations)
    else:
        print "FAIL: {}".format(pair.failure)
def cotrans_debug():
    from spats_shape_seq import Spats
    s = Spats()
    s.run.cotrans = True
    #s.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
    #s.run.algorithm = "find_partial"
    #s.run._p_v102_compat = True
    s.run.minimum_target_match_length = 10
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"
    s.addTargets(bp + "cotrans_single.fa")

    from spats_shape_seq.pair import Pair
    pair = Pair()
    import cjb.util
    d = cjb.util.jsonAtPath("/tmp/spats_test.json")
    pair.set_from_data(str(d['id']), str(d['r1']), str(d['r2']))
    #c = ['683779', 'TCCGGTCCTTGGTGCCCGAGTCAGAAAAAAATAGAA', 'TCTATTTTTTTCTGACTCGGGCACCAAGGACCGGAA', 82, 71]
    #c = [ "1116:19486:8968", "TCCGGTCCTTGGTGCCCGAGTCAGTCCTTCCTCCTA", "GAGTCTATTTTTTTAGGAGGAAGGACTGACTCGGGC", 93, 68 ]
    #c = [ "301028", "AAGTGTCCTTGGTGCCCGAGTCAGAGATAGATCGGA", "ATCTCTGACTCGGGCACCAAGGACACTTAGATCGGA", 96, 92 ]
    #c = [ "31631284", "TTCAGTCCTTGGTGCCCGAGTCAGAGATAGATCGGA", "ATCTCTGACTCGGGCACCAATGACCGGAAGATCGGA", 96, 92 ]
    #c = [ "7232", "AGGTGTCCTTGGTGCCCGAGTCAGTAGCTAAGAAAT", "TTATAGGCGATGGAGTTCGCCATAAACGCTGCTTAG", -1, -1 ]
    #c = [ "16845404", "AAATGTCCTTGGTGCCCGAGTCAGACTGGTAGGAGT", "TCTTATAGGCGATGGAGTTCGCCATAAACGCTGCTT", -1, -1 ]
    #c = [ "24102328", "AAGCGTCCTTGGTGCCCGAGTCAGGAGTCATAGATC", "ATGACTCCTGACTCGGGCACCAAGGACGCTTAGATC", 46, 39 ]
    #c = [ "51216106", "GGGTGTCCTTGGTGCCCGAGTCAGATTAGCTAAGCA", "AGCTAATCTGACTCGGGCACCAAGGACGCTGCTTAG", 41, 34 ]
    c = [ "1116:19486:8968", "TCCGGTCCTTGGTGCCCGAGTCAGTCCTTCCTCCTA", "GAGTCTATTTTTTTAGGAGGAAGGACTGACTCGGGC", 93, 68 ]
    #c = [ "41823514", "GAATGTCCTTGGTGCCCGAGTCAGAACTCCAAGATC", "TGGAGTTCTGACTCGGGCACCAAGGACATTCAGATC", -1, -1 ]
    #c = [ "180", "AAGCTGTCCTTGGTGCCCGAGTCAGGAAAAGTTCTT", "TTTTTTTAGGAGGAAGGATCTATGAGCAAAGGAGAA", 120, 75 ]
    #c = [ "67219", "GAGTGTCCTTGGTGCCCGAGTCAGTCGACAACTCCA", "TTATAGGCGATGGAGTTCGCCATAAACGCTGCTTAG", 134, 0 ]
    #c = [ "58726", "GGATGTCCTTGGTGCCCGAGTCAGCCTTAGATCGGA", "AAGGCTGACTCGGGCACCAAGGACATCCAGATCGGA", None, None ]
    #c = [ "188425", "GGACGTCCTTGGTGCCCGAGTCAGTATAGATCGGAA", "ATACTGACTCGGGCACCAAGGACTTCCAGATCGGAA", 24, 21 ]
    #c = [ "jjb_L21", "GGACGTCCTTGGTGCCCGAGTCAGGGCGAACTAGAT", "AGTTCGCCCTGACTCGGGCACCAAGGACGTCCAGAT", 21, 13 ]
    #c = [ "jjb_L20", "GGACGTCCTTGGTGCCCGAGTCAGGCGAACTCAGAT", "GAGTTCGCCTGACTCGGGCACCAAGGACGTCCAGAT", 20, 12 ]
    #c = [ "jjb_L19", "GGACGTCCTTGGTGCCCGAGTCAGCGAACTCCAGAT", "GGAGTTCGCTGACTCGGGCACCAAGGACGTCCAGAT", None, None ]
    #c = [ "406149", "AGGTGTCCTTGGTGCCCGAGTCAGGACAACTCCAGT", "TTATAGGCGATGGAGTTCGCCATAAACGCTGCTTAG", 132, 0 ]
    #c = [ "89185", "TCCAGTCCTTGGTGCCCGAGTCAGCTAAGCAGCGTT", "AATGACTCCTACCAGTATCACTACTGGTAGGAGTCT", 36, 38 ]
    #c = [ "3185000", "GAACGTCCTTGGTGCCCGAGTCAGGTTTATGGCGAA", "TCGCCATAAACCTGACTCGGGCACCAAGGACGTTCC", -1, -1 ]
    #c = [ "jjb_3185000'", "GAACGTCCTTGGTGCCCGAGTCAGGTTTATGGCGAA", "TCGCCATAAACCTGACTCGGGCACCAAGGACGTTCA", None, None ]
    #c = ['1', 'TCTGAGATCGGAAGAGCACACGTCTGAACTCCAGT', 'CAGAAGATCGGAAGAGCGTCGTGTAGGGAAAGAGT', None, None]
    #c = ['24941', 'TCCAGTCCTTGGTGCCCGAGTCAGAGACTCCTACCA', 'TATAGGCGATGGAGTTCGCCATAAACGCTGCTTAGC', -1, -1]
    c = ['jjbn', 'TTTGGTCCTTGGTGCCCGAGTCAGTAAAAAAATAGA', 'TCTATTTTTTTACTGACTCGGGCACCAAGGACCAAA', 83, 71 ]
    pair.set_from_data(c[0], c[1], c[2])
    print "{}\n{} / {}".format(pair.identifier, pair.r1.original_seq, pair.r2.original_seq)
    s.process_pair(pair)
    if pair.has_site:
        print "{}: {} / {}".format(pair.target.name, pair.site, pair.end)
    else:
        print "FAIL: {}".format(pair.failure)
def make_test_dataset():
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/data/"
    from spats_shape_seq import Spats
    from spats_shape_seq.db import PairDB
    pair_db = PairDB(bp + "ds.spats")
    pair_db.add_targets_table(bp + "../cotrans_single.fa")
    pair_db.parse(bp + "med_R1.fq", bp + "med_R2.fq")
    s = Spats(cotrans = True)
    s.run.num_workers = 1
    s.run.writeback_results = True
    s.run._process_all_pairs = True
    s.run.algorithm = "find_partial"
    s.run.result_set_name = "test_validation"
    s.process_pair_db(pair_db)
    pair_db.store_run(s.run)
    pair_db.store_counters('spats', s.counters)
def test_refactor():
    from spats_clean import Spats
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/5sq_dev/"
    out = bp + "t3/"
    s = Spats(bp + "5S.fa", out)
    s.setup()
    s.process_pair_data(bp + "data/17571-AD1AW-KEW11-5S-2p1-18x-23FEB15-GGCTAC_S10_L001_R1_001.fastq",
                        bp + "data/17571-AD1AW-KEW11-5S-2p1-18x-23FEB15-GGCTAC_S10_L001_R2_001.fastq")
    s.compute_profiles()
    s.write_reactivities()
    import subprocess
    subprocess.check_call(["diff", bp + "t2/rx.out", out + "/rx.out"])
    print "Diff OK"
def prof_run():
    from spats_shape_seq import Spats
    spats = Spats()
    #spats.run.cotrans = True
    #spats.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
    #spats.run.writeback_results = False
    spats.run._process_all_pairs = True
    spats.run.skip_database = True
    spats.run.algorithm = "lookup"
    spats.run.count_mutations = True
    spats.run.num_workers = 1
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/pdc_muts/PDC_tweaked/PDC_09_001_6/"
    spats.addTargets(bp + "target.fa")
    spats.process_pair_data(bp + "2k_R1.fastq", bp + "2k_R2.fastq")
    exit(0)
def _dump_mut_counts(self):
    run_name = self._run_file()
    if not os.path.exists(run_name):
        raise Exception("Run must be run before attempting dump")
    spats = Spats()
    spats.load(run_name)
    countinfo = spats.counters.counts_dict()
    mut_cnts = []
    for muts in sorted([int(k.split('_')[-1]) for k in countinfo.keys() if k.startswith('mut_count_')]):
        mut_cnts.append((muts, countinfo["mut_count_{}".format(muts)]))
    output_path = os.path.join(self.path, 'mut_counts.csv')
    self._write_csv(output_path, [ "Mutation Count", "Reads" ], mut_cnts)
    mut_cnts = []
    for muts in sorted([int(k.split('_')[-1]) for k in countinfo.keys() if k.startswith('mapped_mut_count_')]):
        mut_cnts.append((muts, countinfo["mapped_mut_count_{}".format(muts)]))
    output_path = os.path.join(self.path, 'mapped_mut_counts.csv')
    self._write_csv(output_path, [ "Mutation Count", "Reads" ], mut_cnts)
class TestMutPairs(unittest.TestCase):

    def setUp(self):
        self.spats = Spats()
        self.spats.run.count_mutations = True
        self.spats.run.mutations_require_quality_score = ord('.') - ord('!')
        self.spats.run.allowed_target_errors = 1
        self.spats.run.ignore_stops_with_mismatched_overlap = True
        self.spats.run.adapter_b = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"
        self.setup_processor()

    def setup_processor(self):
        self.spats.run.algorithm = "find_partial"
        self.spats.addTargets("test/mut/mut_single.fa")

    def tearDown(self):
        self.spats = None

    def pair_for_case(self, case):
        pair = Pair()
        pair.set_from_data(case[0], case[1], case[2])
        if len(case) > 6:
            pair.r1.quality = case[6]
            pair.r2.quality = case[7]
        else:
            pair.r1.quality = 'K' * len(case[1])
            pair.r2.quality = 'K' * len(case[2])
        return pair

    def run_case(self, case):
        pair = self.pair_for_case(case)
        self.spats.process_pair(pair)
        self.assertEqual(case[4], pair.site,
                         "res={} != {} ({}, {}, {})".format(pair.site, case[4], self.__class__.__name__,
                                                            case[0], pair.failure))
        if pair.site is not None:
            self.assertEqual(case[3], pair.end,
                             "end={} != {} ({}, {}, {})".format(pair.end, case[3], self.__class__.__name__,
                                                                case[0], pair.failure))
            self.assertEqual(case[5], sorted(pair.mutations) if pair.mutations else pair.mutations,
                             "muts={} != {} ({}, {}, {})".format(pair.mutations, case[5], self.__class__.__name__,
                                                                 case[0], pair.failure))
        return pair

    def cases(self):
        return cotrans_cases if self.spats.run.cotrans else cases

    def test_pairs(self):
        self.spats.run.pair_length = len(cases[0][1])
        for case in self.cases():
            self.run_case(case)
        print("Ran {} pair->site cases.".format(len(cases)))
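# Illustrative sketch only: the wider case layout TestMutPairs.run_case above indexes into.
# Fields 0-2 are the pair data, 3 the expected end, 4 the expected site, 5 the sorted
# mutation list, and optional fields 6-7 supply explicit R1/R2 quality strings (otherwise
# 'K' qualities are filled in). All values below are placeholders, not real test data.
example_mut_case = [
    'mut-example',      # case[0]: pair identifier (placeholder)
    'A' * 35,           # case[1]: R1 read (placeholder)
    'T' * 35,           # case[2]: R2 read (placeholder)
    96,                 # case[3]: expected pair.end
    92,                 # case[4]: expected pair.site
    [93],               # case[5]: expected sorted mutation positions, or None
    'K' * 35,           # case[6]: optional R1 quality string
    'K' * 35,           # case[7]: optional R2 quality string
]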
def test_tags():
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/5sq_dev/"
    from spats_shape_seq import Spats
    s = Spats()
    from spats_shape_seq.tag import TagProcessor
    s.run._processor_class = TagProcessor
    #from spats_shape_seq.target import Targets
    #s.addTargets(bp + "5S.fa")
    s.addTarget("5s", "GGATGCCTGGCGGCCGTAGCGCGGTGGTCCCACCTGACCCCATGCCGAACTCAGAAGTGAAACGCCGTAGCGCCGATGGTAGTGTGGGGTCTCCCCATGCGAGAGTAGGGAACTGCCAGGCATCTGACTCGGGCACCAAGGAC")
    #s.addTarget("rc(5s)", "GTCCTTGGTGCCCGAGTCAGATGCCTGGCAGTTCCCTACTCTCGCATGGGGAGACCCCACACTACCATCGGCGCTACGGCGTTTCACTTCTGAGTTCGGCATGGGGTCAGGTGGGACCACCGCGCTACGGCCGCCAGGCATCC")
    #s.addTarget("adapter_t", s.run.adapter_t)
    #s.addTarget("adapter_b", s.run.adapter_b)
    #s._targets._index_word_length = 8
    #s._targets._minimum_length = 8
    #s.addTarget("adapter_t_rc", reverse_complement(s.run.adapter_t))
    #s.addTarget("adapter_b_rc", reverse_complement(s.run.adapter_b))
    p = s._processor
    p.addTagTarget("5s", "GGATGCCTGGCGGCCGTAGCGCGGTGGTCCCACCTGACCCCATGCCGAACTCAGAAGTGAAACGCCGTAGCGCCGATGGTAGTGTGGGGTCTCCCCATGCGAGAGTAGGGAACTGCCAGGCATCTGACTCGGGCACCAAGGAC")
    p.addTagTarget("5s_rc", "GTCCTTGGTGCCCGAGTCAGATGCCTGGCAGTTCCCTACTCTCGCATGGGGAGACCCCACACTACCATCGGCGCTACGGCGTTTCACTTCTGAGTTCGGCATGGGGTCAGGTGGGACCACCGCGCTACGGCCGCCAGGCATCC")
    from spats_shape_seq.util import reverse_complement
    p.addTagTarget("adapter_t_rc", reverse_complement(s.run.adapter_t))
    p.addTagTarget("adapter_b", s.run.adapter_b)
    from spats_shape_seq.pair import Pair
    cases = [
        [ "1101:20069:1063", "TTTAGTCCTTGGTGCCCGAGTCAGATGCCTGGCAG", "TCCCACCTGACCCCATGCCGAACTCAGAAGTGAAA" ],
        [ "1101:11562:1050", "AAACGTCCTTGGTGCCCGAGTCAGATGCCTGGCAG", "CCACCTGACCCCATGCCGAACTCAGAAGTGAAACG" ],
        [ "21189", "TTTGGTCCTTGGTGCCCGAGTCAGAGATCGGAAGA", "CTGACTCGGGCACCAAGGACCAAAAGATCGGAAGA" ],
        [ "1101:12888:8140", "GGATGTCCTTGGTGCCCGAGTCAGATGCCAGATCG", "GGCATCTGACTCGGGCACCAAGGACATACAGATCG" ],
        [ "18333", "GAGTGTCCTTGGTGCCCGAGTCAGTGGTAGATCGG", "ACCACTGACTCGGGCACCAAGGACACTCAGATCGG" ],
    ]
    pair = Pair()
    for case in cases:
        pair.set_from_data(case[0], case[1], case[2])
        s.process_pair(pair)
        print pair.r1.original_seq
        print pair.r1.tags
        print pair.r2.original_seq
        print pair.r2.tags
        print "-----------------------------"
class TestPanelPairs(unittest.TestCase):

    def setUp(self):
        from spats_shape_seq import Spats
        self.spats = Spats()
        self.spats.run.minimum_target_match_length = 10
        self.spats.addTargets("test/panel_RNAs/panel_RNAs_complete.fa")

    def tearDown(self):
        self.spats = None

    def test_single_R1_match_with_adapter_multiple_without(self):
        pair = Pair()
        pair.set_from_data('M02465:8:000000000-A5D', 'CCCGCCGTCCTTGGTGCCCGAGTGAGATCGGAAGA', 'CACTCGGGCACCAAGGACGGCGGGAGATCGGAAGA')
        self.spats.run.debug = True
        self.spats.run.algorithm = "find_partial"
        self.spats.process_pair(pair)
        self.assertEqual(None, pair.target)
        self.assertEqual(1, self.spats.counters.multiple_R1_match)
def tags():
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"
    from spats_shape_seq.db import PairDB
    pair_db = PairDB(bp + "db/pairs.db")
    if True:
        print "Parsing to db..."
        pair_db.wipe()
        pair_db.add_targets_table(bp + "cotrans_single.fa")
        pair_db.parse(bp + "data/EJS_6_F_10mM_NaF_Rep1_GCCAAT_R1.fastq",
                      bp + "data/EJS_6_F_10mM_NaF_Rep1_GCCAAT_R2.fastq",
                      sample_size = 100000)
    from spats_shape_seq import Spats
    from spats_shape_seq.tag import TagProcessor
    from spats_shape_seq.util import reverse_complement
    s = Spats()
    s.run._processor_class = TagProcessor
    s.run.writeback_results = True
    s.run.result_set_name = "tags"
    s.run.num_workers = 1
    s.run.cotrans = True
    s.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
    s.loadTargets(pair_db)
    s.run.allow_indeterminate = True
    s.run.allowed_target_errors = 2
    s.run.allowed_adapter_errors = 2
    p = s._processor
    for target in pair_db.targets():
        p.addTagTarget(target[0], target[1])
        p.addTagTarget(target[0] + "_rc", reverse_complement(target[1]))
    p.addTagTarget("adapter_t_rc", reverse_complement(s.run.adapter_t))
    p.addTagTarget("adapter_b", s.run.adapter_b)
    if s.run.cotrans:
        p.addTagTarget("linker_cotrans", s.run.cotrans_linker)
        p.addTagTarget("linker_cotrans_rc", reverse_complement(s.run.cotrans_linker))
    s.process_pair_db(pair_db)
    rsid = pair_db.result_set_id_for_name(s.run.result_set_name)
    pair_db.count_tags(rsid)
    print pair_db.tag_counts(rsid)
class TestOverlap(unittest.TestCase):

    def tearDown(self):
        self.spats = None

    def pair_for_case(self, case):
        pair = Pair()
        pair.set_from_data(case[0], case[1], case[2])
        return pair

    def run_case(self, case):
        pair = self.pair_for_case(case)
        print('running: {} / {}'.format(case[0], self.spats.run.algorithm))
        self.spats.process_pair(pair)
        self.assertEqual(case[3], pair.site,
                         "site res={} != {} ({}, {}, {}, {})".format(pair.site, case[3], self.__class__.__name__,
                                                                     case[0], self.spats.run.algorithm, pair.failure))
        self.assertEqual(case[4], pair.failure,
                         "failure res={} != {} ({}, {}, {})".format(pair.failure, case[4], self.__class__.__name__,
                                                                    case[0], self.spats.run.algorithm))

    def test_pairs(self):
        for alg in algorithms:
            if alg == 'native':
                continue
            self.run_algorithm(alg)

    def run_algorithm(self, alg):
        from spats_shape_seq import Spats
        self.spats = Spats()
        self.spats.run.algorithm = alg
        self.spats.run.count_mutations = True
        self.spats.run.allowed_target_errors = True
        self.spats.run.ignore_stops_with_mismatched_overlap = True
        self.spats.addTargets("test/SRP/SRP.fa")
        self.run_pairs()

    def run_pairs(self):
        for case in overlap_cases:
            self.run_case(case)
        print("Ran {} overlap test cases.".format(len(overlap_cases)))
def d5s_writeback_run():
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/"
    from spats_shape_seq.db import PairDB
    pair_db = PairDB(bp + "dev_out/pairs.db")
    pair_db.add_targets_table(bp + "5s/5S.fa")
    from spats_shape_seq import Spats
    s = Spats()
    s.addTargets(bp + "5s/5S.fa")
    s.run.writeback_results = True
    s.run.result_set_name = "pure_python"
    s.process_pair_db(pair_db)
def tag_test():
    from spats_shape_seq import Spats
    s = Spats()
    s.run.cotrans = True
    s.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
    s.run.algorithm = "find_partial"
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"
    s.addTargets(bp + "cotrans_single.fa")
    from spats_shape_seq.pair import Pair
    pair = Pair()
    import cjb.util
    d = cjb.util.jsonAtPath("/tmp/spats_test.json")
    pair.set_from_data(str(d['id']), str(d['r1']), str(d['r2']))
    print "{}\n{} / {}".format(pair.identifier, pair.r1.original_seq, pair.r2.original_seq)
    s.process_pair(pair)
    if pair.has_site:
        print "{}: {} / {}".format(pair.target.name, pair.site, pair.right)
    else:
        print "FAIL: {}".format(pair.failure)
def _loadDBAndModel(self):
    rsnames = self._db.result_sets()
    if rsnames:
        self.result_set_id = self._db.result_set_id_for_name(rsnames[0])
        self._db.index_results()
        self.has_tags = bool(self.result_set_id)
    else:
        self.result_set_id = -1
        self.has_tags = False
    self.has_counters = self._db.has_counters()
    s = Spats()
    self._db.load_run(s.run)
    s.run._p_use_tag_processor = True
    s.loadTargets(self._db)
    if self.has_counters:
        self._db.load_counters("spats", s.counters)
    if self.has_tags:
        p = s._processor
        for t in s._targets.targets:
            p.addTagTarget(t.name, t.seq)
            p.addTagTarget(t.name + "_rc", reverse_complement(t.seq))
            self.colors._colors[t.name.lower()] = self.colors.color("target")
        p.addTagTarget("adapter_t_rc", reverse_complement(s.run.adapter_t))
        p.addTagTarget("adapter_b", s.run.adapter_b)
        if s.run.cotrans:
            p.addTagTarget("linker_cotrans", s.run.cotrans_linker)
            p.addTagTarget("linker_cotrans_rc", reverse_complement(s.run.cotrans_linker))
        if s.run._p_extra_tags:
            for tag, seq in s.run._p_extra_tags.iteritems():
                p.addTagTarget(tag, seq)
        if not self.has_counters:
            p.counters.load_from_db_data(self._db.counter_data_for_results(self.result_set_id))
    self._spats = s
class TestPairsPartial(unittest.TestCase):

    def setUp(self):
        self.spats = Spats()
        self.spats.run.cotrans = True
        self.spats.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
        self.setup_processor()
        self.spats.addTargets("test/cotrans/cotrans_single.fa")

    def setup_processor(self):
        self.spats.run.algorithm = "find_partial"

    def tearDown(self):
        self.spats = None

    def pair_for_case(self, case):
        pair = Pair()
        pair.set_from_data(case[0], case[1], case[2])
        return pair

    def run_case(self, case):
        pair = self.pair_for_case(case)
        self.spats.process_pair(pair)
        self.assertEqual(case[4], pair.site,
                         "res={} != {} ({}, {})".format(pair.site, case[4], self.__class__.__name__, case[0]))
        if pair.site is not None:
            self.assertEqual(case[3], pair.end)
        return pair

    def test_pairs(self):
        self.spats.run.pair_length = len(cases[0][1])
        if not self.spats._processor.exists():
            # just ignore the native test if it's not available
            self.assertEqual("native", self.spats.run.algorithm)
            return
        for case in cases:
            self.run_case(case)
        print("Ran {} pair->site cases.".format(len(cases)))
def indels_run():
    from spats_shape_seq import Spats
    s = Spats()
    s.run.algorithm = True
    s.run.count_indels = True
    s.run.count_mutations = True
    s.run.allowed_target_errors = 8
    s.run.collapse_left_prefixes = True
    s.run.ignore_stops_with_mismatched_overlap = True
    s.run.allow_negative_values = True
    s.run.mutations_require_quality_score = 30
    bp = "/Users/steve/mos/tasks/oughxX/code"
    s.addTargets(bp + "/test/hairpin/hairpinA_circ.fa")
    rp = bp + "/TESTING/cmp_muts_favored/steve_test"
    s.process_pair_data(rp + "/R1_match_failures.fastq", rp + "/R2_match_failures.fastq")
    exit(0)
def __init__(self, reads_data, cotrans = False):
    self._reads_data = reads_data
    self._pair_db = reads_data.pair_db
    s = Spats()
    s.run._p_use_tag_processor = True
    s.run.cotrans = cotrans
    s.run.writeback_results = True
    s.run.result_set_name = "tags"
    s.run.allow_indeterminate = True
    s.run.allowed_target_errors = 2
    s.run.allowed_adapter_errors = 2
    s.run.num_workers = 8
    self._spats = s
    self._extra_tag_targets = []
    self._plugins = {}
class TestDatasets(unittest.TestCase):

    def test_datasets(self):
        for case in cases:
            for alg in algorithms:
                if os.environ.get('SKIP_SLOW_TESTS') and alg == 'native':
                    raise nose.SkipTest('skipping slow tests')
                self.run_dataset(case, alg)
        print("Ran {} datasets.".format(len(cases)))

    def run_dataset(self, case, algorithm):
        bp = "test/{}/".format(case)
        test_file = bp + "test.spats.tmp"
        try:
            shutil.copyfile(bp + "ds.spats", test_file)
            db = PairDB(test_file)
            s = Spats()
            db.load_run(s.run)
            if not s.run.cotrans and algorithm == "native":
                return
            s.run.writeback_results = True
            s.run.result_set_name = "test"
            s.run.algorithm = algorithm
            s.run.quiet = True
            s.loadTargets(db)
            if not s._processor.exists():
                # just ignore the native test if it's not available
                self.assertEqual("native", algorithm)
                return
            # small batch_size just to exercise multiprocessing code
            s.process_pair_db(db, batch_size = 1024)
            msg = None
            count = 0
            for res in db.differing_results("test", "test_validation"):
                msg = str([str(x) for x in res])
                count += 1
            self.assertEqual(0, count,
                             "{} differing results: {} / {}\n{}".format(count, case, algorithm, msg))
        finally:
            if os.path.exists(test_file):
                os.remove(test_file)