def test_perfect_match():
    """Blast a sequence against an identical copy of itself.

    The single hit is expected to cover the entire sequence on both the
    query and the subject side, with coordinates running from 1 to the
    sequence length.
    """
    sequence = "aaacttcccaccccataccctattaccactgccaattacctagtggtttcatttactctaaacctgtgattcctctgaattattttcatttta"
    subject_record = SeqRecord(seq=Seq(sequence), annotations={"topology": "linear"})
    query_record = SeqRecord(seq=Seq(sequence), annotations={"topology": "linear"})

    blaster = BioBlast([subject_record], [query_record])
    blaster.blastn()
    first_hit = blaster.results[0]

    # Same expectations on both sides of the alignment.
    for side in ("query", "subject"):
        assert first_hit[side]["start"] == 1
        assert first_hit[side]["end"] == len(sequence)
def test_simple_alignment():
    """Align a full random record against a copy trimmed by 10 bases per end.

    Exactly one result is expected; its coordinates are checked through
    the ``compare_result`` helper.
    """
    template = rand_record(1000)
    full_length = template[:]
    trimmed = template[10:-10]

    queries = make_linear([full_length])
    subjects = make_linear([trimmed])

    hits = BioBlast(subjects, queries).blastn()
    assert len(hits) == 1

    total = len(template)
    compare_result(hits[0], 11, total - 10, 1, total - 10 - 10)
def test_circular_over_subject(self):
    """Align a linear query to a circular subject whose match spans its origin.

    The subject is built as ``record[200:300]``, 500 spacer bases from
    ``ns``, then ``record[100:200]``, so the matching region wraps around
    the end of the circular subject.
    """
    template = rand_record(1000)
    wrapped_subject = template[200:300] + ns(500) + template[100:200]

    queries = make_linear([template])
    subjects = make_circular([wrapped_subject])

    blaster = BioBlast(subjects, queries)
    hits = blaster.blastn()
    compare_result(hits[0], 101, 300, 601, 100)
def test_align_Ns():
    """Run blastn on a subject flanked by 500-base runs of ``N``.

    This is a smoke test: the results are printed, not asserted on.
    """
    template = rand_record(1000)
    n_flank = SeqRecord(Seq("N" * 500))

    query_records = [template[:]]
    subject_records = [n_flank + template + n_flank]

    query_records = make_linear(query_records)
    subject_records = make_linear(subject_records)

    blaster = BioBlast(subject_records, query_records)
    print(blaster.blastn())
def test_self_blast(here):
    """Blast a single record against itself and expect no results."""
    template_glob = join(here, "data/test_data/genbank/templates/*.gb")
    templates = load_genbank_glob(template_glob, force_unique_ids=True)

    first_kb = str(templates[0][:1000].seq)
    queries = [SeqRecord(Seq(first_kb))]

    linear_queries = make_linear(queries)
    force_unique_record_ids(linear_queries)

    # The same list serves as both subjects and queries.
    blaster = BioBlast(queries, queries)
    hits = blaster.blastn()
    assert not hits
def test_run_bioblast_twice():
    """Call ``blastn`` twice on the same BioBlast instance.

    This is a smoke test: the final results are printed, not asserted on.
    """
    left_junk = "atgctatgctgatgctgctgtgctgatgctgatgtgtattgctgtatcgcgcgagttagc"
    right_junk = "g" * 30
    fragment = "aaacttcccaccccataccctattaccactgccaattacctagtggtttcatttactctaaacctgtgattcctctgaattattttcatttta"

    query_record = SeqRecord(seq=Seq(fragment), annotations={"circular": False})
    subject_record = SeqRecord(
        seq=Seq(left_junk + fragment + right_junk),
        annotations={"circular": False},
    )

    blaster = BioBlast([subject_record], [query_record])
    for _ in range(2):
        blaster.blastn()

    print(blaster.results)
def test_partial_alignment(left_spacer, ij):
    """Align a query against a slice of itself preceded by a spacer.

    Args:
        left_spacer: length passed to ``ns`` for the spacer placed before
            the slice in the subject.
        ij: indexable pair; ``ij[0]:ij[1]`` is the slice of the record
            used as the subject's matching region.
    """
    template = rand_record(1000)
    lo = ij[0]
    hi = ij[1]

    query_records = [template[:]]
    subject_records = [ns(left_spacer) + template[lo:hi]]

    query_records = make_linear(query_records)
    subject_records = make_linear(subject_records)

    hits = BioBlast(subject_records, query_records).blastn()
    assert len(hits) == 1

    compare_result(
        hits[0],
        lo + 1,
        hi,
        1 + left_spacer,
        hi - lo + left_spacer,
    )
def test_partial_alignment_reverse_complement(left_spacer, ij):
    """Align a query against a reverse-complemented slice of itself.

    The subject is a spacer from ``ns(left_spacer)`` followed by the
    reverse complement of ``record[ij[0]:ij[1]]``.

    Args:
        left_spacer: length passed to ``ns`` for the leading spacer.
        ij: indexable pair giving the slice bounds taken from the record.
    """
    template = rand_record(1000)
    lo = ij[0]
    hi = ij[1]

    query_records = [template[:]]
    fragment = template[lo:hi]
    subject_records = [ns(left_spacer) + fragment.reverse_complement()]

    query_records = make_linear(query_records)
    subject_records = make_linear(subject_records)

    hits = BioBlast(subject_records, query_records).blastn()
    assert len(hits) == 1

    subject_length = len(subject_records[0].seq)
    compare_result(hits[0], lo + 1, hi, subject_length, left_spacer + 1)
def test_circular_complete_subject(self):
    """Align a circular query to a linear subject that spans the query origin.

    The subject is ``record[500:] + record[:400]`` (900 bases), so the
    query coordinates are expected to wrap: start 501, end 400.
    """
    template = rand_record(1000)
    rotated = template[500:] + template[:400]

    queries = make_circular([template])
    subjects = make_linear([rotated])

    blaster = BioBlast(subjects, queries)
    hit = blaster.blastn()[0]
    print(hit)

    subject_region = hit["subject"]
    assert subject_region["start"] == 1
    assert subject_region["end"] == 900

    query_region = hit["query"]
    assert query_region["start"] == 501
    assert query_region["end"] == 400
def test_raises_pyblast_when_not_unique(here):
    """Expect ``PyBlastException`` when records are loaded without forcing unique ids."""
    genbank_dir = "data/test_data/genbank"
    template_records = load_genbank_glob(join(here, genbank_dir + "/templates/*.gb"))
    design_records = load_genbank_glob(join(here, genbank_dir + "/designs/*.gb"))

    print("n_queres: {}".format(len(design_records)))
    print("n_subjects: {}".format(len(template_records)))

    with pytest.raises(PyBlastException):
        BioBlast(template_records, design_records)
def test_reverse_alignment_simple():
    """Align a query to the reverse complement of its ``[10:990]`` slice.

    The query side is expected at 11..990 and the subject side is
    expected to run backwards, 980..1.
    """
    template = rand_record(1000)
    flipped = template[10:990].reverse_complement()

    subjects = make_linear([flipped])
    queries = make_linear([template])

    blaster = BioBlast(subjects, queries)
    hits = blaster.blastn()

    # Dump the sequence database and raw results for debugging.
    for key, stored in blaster.seq_db.records.items():
        print(key)
        print(stored)
    print(json.dumps(hits, indent=2))

    top = hits[0]
    assert top["query"]["start"] == 10 + 1
    assert top["query"]["end"] == 990
    assert top["subject"]["start"] == 980
    assert top["subject"]["end"] == 1
def test_not_raise_pyblast_when_unique(here):
    """Construct a BioBlast without error once record ids are forced unique."""
    template_glob = join(here, "data/test_data/genbank/templates/*.gb")
    design_glob = join(here, "data/test_data/genbank/designs/*.gb")

    template_records = load_genbank_glob(template_glob)
    design_records = load_genbank_glob(design_glob)

    # Ids are made unique across both collections at once.
    force_unique_record_ids(template_records + design_records)

    print("n_queres: {}".format(len(design_records)))
    BioBlast(template_records, design_records)
def test_multiquery_blast(here):
    """Blast several design queries and expect every query to appear in the results.

    The distinct ``origin_record_id`` values on the query side of the
    results must number as many as the queries supplied.
    """
    template_glob = join(here, "data/test_data/genbank/templates/*.gb")
    design_glob = join(here, "data/test_data/genbank/designs/*.gb")

    subjects = load_genbank_glob(template_glob, force_unique_ids=True)
    queries = load_genbank_glob(design_glob, force_unique_ids=True)

    print("n_queres: {}".format(len(queries)))
    print("n_subjects: {}".format(len(subjects)))

    hits = BioBlast(subjects, queries).blastn()
    seen_query_ids = {hit["query"]["origin_record_id"] for hit in hits}

    print("n_records: {}".format(len(hits)))
    assert len(seen_query_ids) == len(queries)
def test_circular_complete_query_1(self):
    """Align a circular query to a subject holding the query rotated by 500.

    The subject is the full query sequence starting at 0-based index 500,
    padded with 100 spacer bases from ``ns`` on each side. Note that
    pyblast results start at index 1.
    """
    template = rand_record(1000)
    rotated = ns(100) + template[500:] + template[:500] + ns(100)

    queries = make_circular([template])
    subjects = make_linear([rotated])

    hit = BioBlast(subjects, queries).blastn()[0]

    query_region = hit["query"]
    subject_region = hit["subject"]
    assert query_region["start"] == 501
    assert query_region["raw_end"] == 1500
    assert subject_region["start"] == 101
    assert subject_region["end"] == 1100
def test_circular_over_query(self):
    """Align a subject that straddles the origin of a circular query.

    The subject is the last 100 bases of the query followed by its first
    100. The sequence rebuilt from the reported query start and end must
    equal the subject sequence.
    """
    template = rand_record(1000)
    straddling = template[-100:] + template[:100]

    queries = make_circular([template])
    subjects = make_linear([straddling])

    hits = BioBlast(subjects, queries).blastn()
    query_region = hits[0]["query"]

    # Rebuild the wrapped region: from start to the end, then from 0 to end.
    tail = template[query_region["start"] - 1:]
    head = template[:query_region["end"]]
    rebuilt = str((tail + head).seq)
    expected = str(subjects[0].seq)
    assert rebuilt == expected

    compare_result(hits[0], 1000 - 100 + 1, 100, 1, 200)
def test_example2():
    """Run a self-contained usage example: linear subject against circular query.

    The subject joins the last 20 and the first 30 bases of the query
    sequence. The results are printed as JSON, not asserted on.
    """
    from pyblast import BioBlast
    from pyblast.utils import make_linear, make_circular
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    import json

    sequence = "ACGTTGTAGTGTAGTTGATGATGATGTCTGTGTCGTGTGATGTGCTAGGGGTTGATGTGAGTAGTTAGTGGTAGTGTTTAGGGGCGGCGCGGAGTATGCTG"
    wrapped = sequence[-20:] + sequence[:30]

    query_records = [SeqRecord(Seq(sequence))]
    subject_records = [SeqRecord(Seq(wrapped))]

    # pyblast requires a 'topology' annotation on the SeqRecords;
    # `make_linear` / `make_circular` are used to set it.
    subject_records = make_linear(subject_records)
    query_records = make_circular(query_records)

    blaster = BioBlast(subject_records, query_records)
    print(json.dumps(blaster.blastn(), indent=2))
def test_basic_run_reverse_complement():
    """Blast a reverse-complemented fragment and expect minus-strand subject hits.

    The subject embeds the fragment between two junk sequences; the query
    is the reverse complement of that fragment. Every alignment must
    report ``strand == -1`` on the subject side, and at least one
    alignment must be present.
    """
    junk1 = "atgctatgctgatgctgctgtgctgatgctgatgtgtattgctgtatcgcgcgagttagc"
    junk2 = "g" * 30
    frag = "aaacttcccaccccataccctattaccactgccaattacctagtggtttcatttactctaaacctgtgattcctctgaattattttcatttta"

    query = SeqRecord(
        seq=Seq(frag), annotations={"circular": False}
    ).reverse_complement()
    subject = SeqRecord(
        seq=Seq(junk1 + frag + junk2), annotations={"circular": False}
    )

    # NOTE(review): presumably needed because reverse_complement() does not
    # carry annotations over by default — confirm against Biopython docs.
    make_linear([query])

    blaster = BioBlast([subject], [query])
    blaster.blastn()
    alignments = blaster.results

    # Without this guard the loop below passes vacuously when BLAST
    # returns nothing, hiding a complete failure to align.
    assert alignments, "expected at least one alignment for the reverse-complemented fragment"

    for a in alignments:
        print(json.dumps(a, indent=2))
        assert a["subject"]["strand"] == -1
def test_circular_complete_query_4(self):
    """Align a circular query to a subject that over-wraps it by 10 bases per side.

    The subject holds the query rotated by 500 plus 10 extra bases on the
    left and on the right, padded with 100 spacer bases from ``ns`` on
    each end. Note that pyblast results start at index 1.
    """
    template = rand_record(1000)
    overwrapped = (
        ns(100) + template[-10 + 500:] + template[:500 + 10] + ns(100)
    )

    queries = make_circular([template])
    subjects = make_linear([overwrapped])

    hit = BioBlast(subjects, queries).blastn()[0]

    query_region = hit["query"]
    subject_region = hit["subject"]
    assert query_region["start"] == 491
    assert query_region["raw_end"] == 1510
    assert subject_region["start"] == 101
    assert subject_region["end"] == 1120
def test_interaction_network():
    """Blast overlapping records against themselves.

    We expect self alignments to be removed from the results: results
    must be non-empty and no result may pair a record with itself.
    """
    # Each record reuses part of the previous one so that cross-record
    # alignments exist.
    first = rand_record(500)
    second = rand_record(100) + first[:-100] + rand_record(1000)
    third = rand_record(200) + second[:700] + rand_record(500)
    fourth = third[-500:] + rand_record(500)
    records = [first, second, third, fourth]

    force_unique_record_ids(records)
    pool = make_linear(records)

    hits = BioBlast(pool, pool).blastn()
    assert hits

    for hit in hits:
        query_key = hit["query"]["origin_key"]
        subject_key = hit["subject"]["origin_key"]
        print(query_key, subject_key)
        assert not query_key == subject_key
def test_circular_complete_query_parametrized_rc(self, extra_right, extra_left):
    """Align a circular query to a reverse-complemented, over-wrapping subject.

    The subject holds the query rotated by 500 with ``extra_left`` and
    ``extra_right`` additional bases, padded with 100 spacer bases from
    ``ns`` per end, then reverse complemented. Both the raw result
    coordinates and the spans from ``parse_result_to_span`` are checked.

    Args:
        extra_right: extra bases past index 500 on the right part.
        extra_left: extra bases before index 500 on the left part.
    """
    template = rand_record(1000)
    forward_subject = (
        ns(100)
        + template[(500 - extra_left):]
        + template[:(500 + extra_right)]
        + ns(100)
    )
    flipped_subject = forward_subject.reverse_complement()

    queries = make_circular([template])
    subjects = make_linear([flipped_subject])

    blaster = BioBlast(subjects, queries)
    hit = blaster.blastn()[0]
    print(json.dumps(hit, indent=2))

    overhang = extra_right + extra_left

    # Raw result coordinates.
    assert hit["query"]["start"] == 501 - extra_left
    assert hit["query"]["raw_end"] == 1500 + extra_right
    assert hit["subject"]["start"] == 1100 + overhang
    assert hit["subject"]["end"] == 101

    # Converted spans, requested with output_index=0.
    query_span = blaster.parse_result_to_span(hit["query"], output_index=0)
    subject_span = blaster.parse_result_to_span(hit["subject"], output_index=0)

    assert len(subject_span) == len(query_span) == 1000 + overhang
    assert query_span.a == 500 - extra_left
    assert query_span.b == 500 + extra_right
    assert subject_span.a == 100
    assert subject_span.b == 1100 + overhang
def test_unnamed_queries(here):
    """Blast queries built without ids, after forcing unique record ids.

    The queries are the first 1000 bases of the first two templates. Each
    query must appear in the results under its own ``origin_record_id``.
    """
    template_glob = join(here, "data/test_data/genbank/templates/*.gb")
    subjects = load_genbank_glob(template_glob, force_unique_ids=True)

    first_kb = str(subjects[0].seq)[:1000]
    second_kb = str(subjects[1].seq)[:1000]
    queries = [SeqRecord(Seq(first_kb)), SeqRecord(Seq(second_kb))]

    linear_queries = make_linear(queries)
    force_unique_record_ids(linear_queries)

    hits = BioBlast(subjects, queries).blastn()
    seen_query_ids = {hit["query"]["origin_record_id"] for hit in hits}

    print("n_records: {}".format(len(hits)))
    assert len(seen_query_ids) == len(queries)
def make_blast():
    """Build a BioBlast of linear primer subjects against one design query.

    Relies on ``here`` from the enclosing scope as the base directory of
    the test data.
    """
    primer_path = join(here, "data/test_data/primers/primers.fasta")
    primers = make_linear(load_fasta_glob(primer_path, force_unique_ids=True))

    design_path = join(
        here, "data/test_data/genbank/designs/pmodkan-ho-pact1-z4-er-vpr.gb"
    )
    designs = load_genbank_glob(design_path, force_unique_ids=True)

    return BioBlast(primers, designs)
def make_blast():
    """Build a BioBlast of template subjects against one circular design query.

    Relies on ``here`` from the enclosing scope as the base directory of
    the test data.
    """
    template_glob = join(here, "data/test_data/genbank/templates/*.gb")
    templates = load_genbank_glob(template_glob, force_unique_ids=True)

    design_path = join(
        here, "data/test_data/genbank/designs/pmodkan-ho-pact1-z4-er-vpr.gb"
    )
    designs = make_circular(load_genbank_glob(design_path, force_unique_ids=True))

    # Sanity check that the design was actually marked circular.
    assert is_circular(designs[0])
    return BioBlast(templates, designs)
def test_unnamed_queries_raises_duplicate_error(here):
    """Expect ``PyBlastException`` for id-less queries whose ids were not made unique.

    The queries are built like those in ``test_unnamed_queries`` but
    ``force_unique_record_ids`` is deliberately not called.
    """
    template_glob = join(here, "data/test_data/genbank/templates/*.gb")
    subjects = load_genbank_glob(template_glob, force_unique_ids=True)

    first_kb = str(subjects[0].seq)[:1000]
    second_kb = str(subjects[1].seq)[:1000]
    queries = [SeqRecord(Seq(first_kb)), SeqRecord(Seq(second_kb))]

    make_linear(queries)

    with pytest.raises(PyBlastException):
        BioBlast(subjects, queries)
def test_ungapped():
    """Run blastn with the ``ungapped`` option set on an interrupted subject.

    The subject is the query fragment with a 27-base insert placed after
    position 400. This is a smoke test: the alignments are printed, not
    asserted on.
    """
    fragment = "GtctaaaggtgaagaattattcactggtgttgtcccaattttggttgaattagatggtgatgttaatggtcacaaattttctgtctccggtgaaggtgaaggtgatgctacttacggtaaattgaccttaaaatttatttgtactactggtaaattgccagttccatggccaaccttagtcactactttcggttatggtgttcaatgttttgcgagatacccagatcatatgaaacaacatgactttttcaagtctgccatgccagaaggttatgttcaagaaagaactatttttttcaaagatgacggtaactacaagaccagagctgaagtcaagtttgaaggtgataccttagttaatagaatcgaattaaaaggtattgattttaaagaagatggtaacattttaggtcacaaattggaatacaactataactctcacaatgtttacatcatggctgacaaacaaaagaatggtatcaaagttaacttcaaaattagacacaacattgaagatggttctgttcaattagctgaccattatcaacaaaatactccaattggtgatggtccagtcttgttaccagacaaccattacttatccactcaatctgccttatccaaagatccaaacgaaaagagagaccacatggtcttgttagaatttgttactgctgctggtattacccatggtatggatgaattgtacaaaTAGTGATACCGTCGACCTCGAGTCAattagttatgtcacgcttacattcacgccctccccccacatccgctctaaccgaaaaggaaggagttagacaacctgaagtctaggtccctatttatttttttatagttatgttagtattaagaacgttatttatatttcaaatttttcttt"
    insert = "atgctatgctgatgctgctgtgctgat"

    query_record = SeqRecord(seq=Seq(fragment), annotations={"circular": False})
    interrupted = fragment[:400] + insert + fragment[400:]
    subject_record = SeqRecord(
        seq=Seq(interrupted),
        annotations={"circular": False},
    )

    blaster = BioBlast([subject_record], [query_record])
    blaster.update_config({"ungapped": None})
    blaster.blastn()

    print(blaster.results)
)) ] subjects = [ SeqRecord(Seq("TCGTGTAGTTGAGTGTTACGTTGCATGTCGTTACGTGATCG"), id="aa1"), SeqRecord(Seq("TCGTGTAGTTGAGTGTTACGTTGCATGTCGGGGACGTGATCG"), id="aa2") ] # pyblast requires a 'topology' annotation on the SeqRecords. # we can make records circular or linear using `make_linear` or `make_circular` methods t0 = time.time() subjects = make_linear(subjects) queries = make_linear(queries) blast = BioBlast(subjects, queries) results = blast.blastn() t1 = time.time() - t0 #print(t1) #print(results) fa_file = "../samples/P21333.fasta" sequences = SeqIO.parse(fa_file, "fasta") queries = [] subjects = [] for record in sequences: queries.append(record) #sequences = SeqIO.parse("viral/viral_classification/sample_genomes/HIV.B.fasta", "fasta") sequences = SeqIO.parse("../samples/H1.txt", "fasta")