def test_random_sequences(aligner, strand1="+", strand2="+"):
    """Compare Alignment.map with map_check on three unrelated random sequences.

    Random DNA strings of length 1000 ("chromosome"), 300 ("transcript") and
    100 ("sequence") are generated, the transcript is aligned to the
    chromosome (strand1) and the sequence to the transcript (strand2), and the
    PSL fields from index 8 onward of the mapped alignment are compared with
    those produced by map_check.
    """

    def random_dna(length):
        # Same generator expression as before, so the RNG is consumed identically.
        return "".join(["ACGT"[random.randint(0, 3)] for _ in range(length)])

    def block_count(aln):
        # Field 17 of a PSL line is read as the block count.
        return int(format(aln, "psl").split()[17])

    chromosome = Seq(random_dna(1000))
    transcript = Seq(random_dna(300))
    sequence = Seq(random_dna(100))
    chromosome.id = "chromosome"
    transcript.id = "transcript"
    sequence.id = "sequence"

    alignment1 = aligner.align(chromosome, transcript, strand=strand1)[0]
    alignment2 = aligner.align(transcript, sequence, strand=strand2)[0]

    # map_check is evaluated before map(), as in the original ordering.
    expected_fields = map_check(alignment1, alignment2).split()
    mapped = alignment1.map(alignment2)
    actual_fields = format(mapped, "psl").split()
    assert actual_fields[8:] == expected_fields[8:]

    nBlocks1 = block_count(alignment1)
    nBlocks2 = block_count(alignment2)
    print(
        "Randomized sequence test %d, %d, %s, %s OK"
        % (nBlocks1, nBlocks2, strand1, strand2)
    )
# Test: aligns the transcript to the chromosome and the sequence to the transcript, maps the second alignment through the first with alignment1.map(alignment2), and checks the mapped coordinates ([[17, 27], [0, 10]]), the printed alignment and the PSL line.
# NOTE(review): the expected triple-quoted strings appear whitespace-flattened in this copy; restore their original line breaks and padding from upstream before relying on them.
def test_internal(self): aligner = self.aligner chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA") chromosome.id = "chromosome" transcript = Seq("GGGGGGGCCCCCGGGGGGA") transcript.id = "transcript" sequence = Seq("GGCCCCCGGG") sequence.id = "sequence" alignments1 = aligner.align(chromosome, transcript) self.assertEqual(len(alignments1), 1) alignment1 = alignments1[0] self.assertTrue( numpy.array_equal(alignment1.coordinates, numpy.array([[12, 31], [0, 19]]))) self.assertEqual( str(alignment1), """\ AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA ||||||||||||||||||| GGGGGGGCCCCCGGGGGGA """, ) alignments2 = aligner.align(transcript, sequence) self.assertEqual(len(alignments2), 1) alignment2 = alignments2[0] self.assertTrue( numpy.array_equal(alignment2.coordinates, numpy.array([[5, 15], [0, 10]]))) self.assertEqual( str(alignment2), """\ GGGGGGGCCCCCGGGGGGA |||||||||| GGCCCCCGGG """, ) alignment = alignment1.map(alignment2) self.assertTrue( numpy.array_equal(alignment.coordinates, numpy.array([[17, 27], [0, 10]]))) self.assertEqual( str(alignment), """\ AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA |||||||||| GGCCCCCGGG """, ) psl = format(alignment, "psl") self.assertEqual( psl, """\ 10 0 0 0 0 0 0 0 + sequence 10 0 10 chromosome 40 17 27 1 10, 0, 17, """, )
def test1(self): aligner = self.aligner chromosome = Seq( "GCCTACCGTATAACAATGGTTATAATACAAGGCGGTCATAATTAAAGGGAGTGCAGCAACGGCCTGCTCTCCAAAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGCCGTCATATCGGTGGGTTCTGCCAGTCACCGGCATACGTCCTGGGACAAAGACTTTTTACTACAATGCCAGGCGGGAGAGTCACCCGCCGCGGTGTCGACCCAGGGGACAGCGGGAAGATGTCGTGGTTTCCTTGTCATTAACCAACTCCATCTTAAAAGCTCCTCTAGCCATGGCATGGTACGTTGCGCGCACCCTTTTATCGGTAAGGCGCGGTGACTCTCTCCCAAAACAGTGCCATAATGGTTCGCTTCCTACCTAAGGCACTTACGGCCAATTAATGCGCAAGCGAGCGGAAGGTCTAACAGGGCACCGAATTCGATTA" ) chromosome.id = "chromosome" transcript = Seq( "GGAATTTTAGCAGCCAAAGGACGGATCCTCCAAGGGGCCCCAGCACAGCACATTTTTAACGCGAACTAAGCGGGAGCGCATGTGGGACAGTTGATCCCATCCGCCTCAAAATTTCTCGCAATATCGGTTGGGGCACAGGTCCACTTTACGAATTCATACCGTGGTAGAGACCTTTATTAGATAGATATGACTGTTTGATTGCGGCATAGTACGACGAAGCAAGGGGATGGACGTTTCGGTTGCATTCGACCGGGTTGGGTCGAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGCCGTCATATCGGTGGGTTC" ) transcript.id = "transcript" sequence = Seq( "TCCAAGGGGCCCCAGCACAGCACATTTTTAACGCGGGGACAGTTGATCCCATCCGCCTTTTACGAATTCATACCGTGGTAGGCGGCATAGTACGACGAAGCGGTTGGGTCGAAAAACAGGTTGCCGTCATATCGGTGGGTTC" ) sequence.id = "sequence" alignments1 = aligner.align(chromosome, transcript) alignment1 = alignments1[0] self.assertEqual(len(alignment1.path), 164) self.assertEqual( str(alignment1), """\ GCCTACCGTATAACAATGGTTATA------ATACAAGG-CGG----TCATAATTAAAGGGAGTG---CAGCAACGGCCTGCTCTCCAAAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGC-----CGTCATATCGGTGG----GTTCTGCCAGTCACCGGCATACGTCCTGGGACAAAGACTTTTTACT-ACAATGCCAGGCGGGAGAGTCACCCGCCGCGGTGTCGACCCAGGGG-ACAGCGGGAAGATGTCGTGGTTTC-CTT---G---TCATTAACC-------A-ACTCCATCTTA--AAAGCTCCTCTAGCCATGGCATG---GT---ACGTTGCGCGCACCCTTTTA-T----CG--GTAAGG-------CG---CGGT-------GACTCTC--------TCCCAAAACAGTGCCATAATGGTTCGCTTCCTACCT-------AAG-GCACTT-ACGGCCAATTAATGCGCAAGCGAGCGGAAGGTC-TAACAG-GGCACCGAATTCGATTA 
|||--||-||------|---||||-|||----||------.|||||---|---|||||-----|.||-----------|||--||||-|------||.|.|----||||----||||-----||-|||----||||----||--||--|-|--||--|||.||-|||----||||-|---|||-||-.||||------------|-|---------||||-|-------||||-||||---------|||-------|-|||---|---||||--|||-------|-|--||-|-|||--|.|-------|||--||---|||---||---|--|||||-|||------||-|----||--|.||||-------||---||||-------|||---|--------||..||||||----------|||----||--||--|-------|||-|||-||-||.|----|||------||||---|-----|||-||.|.|-||----|--||| GGAAT--TT-TAGCAGCCA---AAGGACGGATCCTC------CAAGGG---GCCCCAGCA-----CAGC-----------ACA--TTTT-T------AACGCG----AACT----AAGCGGGAGCG-CAT----GTGGGACAGT--TG--A-T--CC--CATCCG-CCT----CAAA-A---TTT-CTCGCAAT------------A-T---------CGGT-T-------GGGGCACAG---------GTC-------CACTTTACGAATTCAT--ACCGTGGTAGAGA--CC-T-TTATTAGA-------TAG--AT---ATGACTGTTTGA--TTGCG-GCA------TAGTACGACGAAGCAAGGGGATGGACGTTTCGGTTGCATTCGAC---CGGGTTGGGTCGAAAAACA----------GGT----TT--TA--TGAAAAGAAAGTGCA-TTAACTG----TTA------AAGC---C-----GTCATATCGGTGG----G--TTC """, # noqa: W291 ) alignments2 = aligner.align(transcript, sequence) alignment2 = alignments2[0] self.assertEqual(len(alignment2.path), 12) self.assertEqual( str(alignment2), """\ GGAATTTTAGCAGCCAAAGGACGGATCCTCCAAGGGGCCCCAGCACAGCACATTTTTAACGCGAACTAAGCGGGAGCGCATGTGGGACAGTTGATCCCATCCGCCTCAAAATTTCTCGCAATATCGGTTGGGGCACAGGTCCACTTTACGAATTCATACCGTGGTAGAGACCTTTATTAGATAGATATGACTGTTTGATTGCGGCATAGTACGACGAAGCAAGGGGATGGACGTTTCGGTTGCATTCGACCGGGTTGGGTCGAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGCCGTCATATCGGTGGGTTC |||||||||||||||||||||||||||||||||||--------------------|||||||||||||||||||||||--------------------------------------|||||||||||||||||||||||---------------------------------||||||||||||||||||||--------------------------------|||||||||||||||||||||------------------------------|||||||||||||||||||| 
TCCAAGGGGCCCCAGCACAGCACATTTTTAACGCG--------------------GGGACAGTTGATCCCATCCGCCT--------------------------------------TTTACGAATTCATACCGTGGTAG---------------------------------GCGGCATAGTACGACGAAGC--------------------------------GGTTGGGTCGAAAAACAGGTT------------------------------GCCGTCATATCGGTGGGTTC """, ) alignment = alignment1.map(alignment2) self.assertEqual(len(alignment.path), 76) self.assertEqual( str(alignment), """\ GCCTACCGTATAACAATGGTTATAATACAAGGCGGTCATAATTAAAGGGAGTG---CAGCAACGGCCTGCTCTCCAAAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGCCGTCATATCGGTGG----GTTCTGCCAGTCACCGGCATACGTCCTGGGACAAAGACTTTTTACTACAATGCCAGGCGGGAGAGTCACCCGCCGCGGTGTCGACCCAGGGGACAGCGGGAAGATGTCGTGGTTTCCTT---G---TCATTAACCAACTCCATCTTAAAAGCTCCTCTAGCCATGGCATGGTACGTT-------GCGCGCACCCTTTTA-T----CG--GTAAGGCGCGGTGACTCTC-------TCCCAAAACAGTGCCATAATGGTTCGCTTCCTACCTAAGGCACTTACGGCCAATTAATGCGCAAGCGAGCGGAAGGTC-TAACAG-GGCACCGAATTCGATTA ||------.|||||---|---|||||-----|.||-----------|||--||||-|------||.|.|----------------------------||----||--||--|-|--||--|||.||-|||------------------------------------------------------------------------------------------||---|---||||--|||-------------------------------------------------|||-|||------||-|----||--|.------------------------||..||||||----------|||----|------------------------------------||---|-----|||-||.|.|-||----|--||| TC------CAAGGG---GCCCCAGCA-----CAGC-----------ACA--TTTT-T------AACGCG----------------------------GGGACAGT--TG--A-T--CC--CATCCG-CCT------------------------------------------------------------------------------------------TTTACGAATTCAT--ACC------------------------------------------GTGGTAGGCG-GCA------TAGTACGACGAAGC-----------------GGTTGGGTCGAAAAACA----------GGT----T------------------------------------GC---C-----GTCATATCGGTGG----G--TTC """, # noqa: W291 ) psl = format(alignment, "psl") self.assertEqual( psl, """\ 96 10 0 0 11 36 27 294 + sequence 142 0 142 chromosome 440 35 435 37 2,6,1,5,4,3,4,1,6,2,2,2,1,1,2,6,3,2,1,4,3,3,3,2,1,2,2,10,3,1,2,1,3,6,2,1,3, 
0,2,8,12,17,21,24,28,29,35,41,43,45,46,47,49,55,58,63,67,71,81,84,87,90,95,99,108,118,121,122,124,125,129,136,138,139, 35,43,52,53,63,78,83,88,95,129,131,135,139,141,144,148,155,248,250,251,257,302,306,315,317,318,320,339,359,366,403,408,414,417,423,429,432, """, )
def test2(self): aligner = self.aligner chromosome = Seq( "CTAATGCGCCTTGGTTTTGGCTTAACTAGAAGCAACCTGTAAGATTGCCAATTCTTCAGTCGAAGTAAATCTTCAATGTTTTGGACTCTTAGCGGATATGCGGCTGAGAAGTACGACATGTGTACATTCATACCTGCGTGACGGTCAGCCTCCCCCGGGACCTCATTGGGCGAATCTAGGTGTGATAATTGACACACTCTTGGTAAGAAGCACTCTTTACCCGATCTCCAAGTACCGACGCCAAGGCCAAGCTCTGCGATCTAAAGCTGCCGATCGTAGATCCAAGTCCTCAGCAAGCTCGCACGAATACGCAGTTCGAAGGCTGGGTGTTGTACGACGGTACGGTTGCTATAGCACTTTCGCGGTCTCGCTATTTTCAGTTTGACTCACCAGTCAGTATTGTCATCGACCAACTTGGAATAGTGTAACGCAGCGCTTGA" ) chromosome.id = "chromosome" transcript = Seq( "CACCGGCGTCGGTACCAGAGGGCGTGAGTACCTTGTACTAGTACTCATTGGAATAATGCTCTTAGAAGTCATCTAAAAGTGACAACGCCTGTTTGGTTATGACGTTCACGACGCGTCTTAACAGACTAGCATTAGACCGACGGGTTGAGGCGTCTGGGTTGATACAGCCGTTTGCATCAGTGTATCTAACACTCTGAGGGATAATTGATGAACCGTGTTTTCCGATAGGTATGTACAGTACCACCACGCACGACTAAGGACCATTTTCTGCGTGCGACGGTTAAAATAACCTCAATCACT" ) transcript.id = "transcript" sequence = Seq( "TCCCCTTCTAATGGAATCCCCCTCCGAAGGTCGCAGAAGCGGCCACGCCGGAGATACCAGTTCCACGCCTCAGGTTGGACTTGTCACACTTGTACGCGAT" ) sequence.id = "sequence" alignments1 = aligner.align(chromosome, transcript) alignment1 = alignments1[0] self.assertEqual(len(alignment1.path), 126) self.assertEqual( str(alignment1), """\ CTAATGCGCCTTGGTTTTGGCTTAACTAGA-------AGCAACC-TGTAAGATTGCCAATTCTTCAGTCGAAGTAAATCTTCAATGTTTTGGA------CTCTTAG----CGGATATGCGGCTGAGAAGTACGACA-----TGT---GT----ACATTCATAC--CTGCGT-------GACGGTCAGCCT----CCCCCGGGACCTCATTG-GGCGAATCTAGGTGTGATA-A-----TTGACA-CA----CTCTTGGTAAGAAGCACTCT---------TTACCCGATCTCCAAGTACCGACGCCAAGGCCAAGCTCTG-----CGATCTAAAGCTGCCGATCGTAGATCCAAGTCCTCAGCAAGCTCGCACGAATACGCAG-------TTCGAAGGCTGGGTGTTGTACGACGGTACGGTTGCTATAGCACTTTCGCGGTCTCGCTATTTTCAGTTTGACTCACCAGTCAGTATTGTCATCGACCAACTTGGAATAGTGTAACGCAGCGCTTGA 
|||--|.|||---------||.|||-------||-.|||-||||------------||--------||||---|-|||-----|||||------|||||||----|----||----||-|.||||--||||-----|||---||----||.|||--||--|-||||-------|||--|-|||.|----||..||||------|||-||||--|||.|||-|||||-|-----|||-||-||----.|||-------||-||||||---------||----|||------|.||||-----------------||-----||||----||.|----||-|||----|-|||.|-||.||----|||||||.||---||-------||------|||.|||-----|||||||--------||-|--|----------------||----------|---|||--|||--------|||-----||| CACCGGCG--TCGGT---------ACCAGAGGGCGTGAG-TACCTTGTA------------CT--------AGTA---C-TCA-----TTGGAATAATGCTCTTAGAAGTC----AT----CT-AAAAGT--GACAACGCCTGTTTGGTTATGACGTTC--ACGAC-GCGTCTTAACAGAC--T-AGCATTAGACCGACGGG------TTGAGGCG--TCTGGGT-TGATACAGCCGTTTG-CATCAGTGTATCT-------AA-CACTCTGAGGGATAATT----GAT------GAACCG-----------------TGTTTTCCGAT----AGGT----AT-GTA----C-AGTAC-CACCA----CGCACGACTA---AGGACCATTTT------CTGCGTG-----CGACGGT--------TA-A--A----------------AT----------A---ACC--TCA--------ATC-----ACT """, # noqa: W291 ) alignments2 = aligner.align(transcript, sequence) alignment2 = alignments2[0] self.assertEqual(len(alignment2.path), 66) self.assertEqual( str(alignment2), """\ CACCGGCGTCGGTACCAGAGGGCGTGAGTACCTTGTACTAGTACTCATTGGAATAATGCTCTTAGAAGTCATCTAAAAGTGACAACGCCTGTTTGGTTATGACGTTCACGACGCGTCTTAACAGACTAGCATTAGACCGACG--GGTTGAGGCGTCTGGGTTGATACAGCCGTTTGCATCAGTGTATCTAACA---CTCTGAGGGATAATTGATGAACCGTGTTTTCCGATAGGTATGTACAGTACCACCACGCACGACTAAGGACCATTTTCTG--CGTGCGACGGTTAAAATAACCTCAATCACT ||------------|-------||||---|||------|-||||||-------------------------------|.||-------------|--||-|||.|-|||---.||||--|||----|.||-|||--|---||------------|||||------------||||---||---||---|||--|||-----|||--||--|-----||----------------||---------|||-|||-------------||--|--|||| 
TC------------C-------CCTT---CTA------A-TGGAAT-------------------------------CCCC-------------C--TC-CGAAG-GTC---GCAGA--AGC----GGCC-ACGCCG---GA------------GATAC------------CAGT---TC---CACGCCTC--AGG-----TTG--GA--C-----TT----------------GT---------CAC-ACT-------------TGTAC--GCGAT """, # noqa: W291 ) alignment = alignment1.map(alignment2) self.assertEqual(len(alignment.path), 78) self.assertEqual( str(alignment), """\ CTAATGCGCCTTGGTTTTGGCTTAACTAGAAGCAA-CC-TGTAAGATTGCCAATTCTTCAGTCGAAGTAAATCTTCAATGTTTTGGACTCTTAGCGGATATGCGGCTGAGAAGTACGACATGTGTA------CATTCATAC--CTGCGT----GACGGTCAGCCT--CCCCCG--GGACCTCATTGGGCGAATCTAGGTGT-GATAATTGACA-CAC--TCTTGGTAAGAAGCA---CTCT---TTACCCGATCTCCAAGTACCGACGCCAAGGCCAAGCTCTGCGATCTAAAGCTGCCGATCGTAGATCCAA--GTCCTCAGCAAGCTCGCACGAATACGCAGTTCGAAGGCTG--GGTGTTGTACGACGGTACGGTTGCTATAGCACTTTCGCGGTCTCGCTATTTTCAGTTTGACTCACCAGTCAGTATTGTCATCGACCAACTTGGAATAGTGTAACGCAGCGCTTGA |.------------------------||-|---------------||--------|----------|------||||---------------------------------------------|--||---|--.-|-||----||-----|||----||-.||--|---------|----------------||||--------||---||-----------||---|||----||----|--------|.--|---------------------------------------------------||--------------|||-|.|---------------||--.--|-----||| TC-----------------------CCCTT---------------CT--------A----------A------TGGA---------------------------------------ATCCCCC--TC---CGAA-G-GTCGCAGA-----AGC--GGCC-ACGCCG---------G---------------AGATA-------CCA-GTTC-----------CACGCCTC-AGGTT----G--------GA--C-------------------------------------------------TTGT--------------CAC-ACT---------------TGTAC--G-----CGAT """, # noqa: W291 ) psl = format(alignment, "psl") self.assertEqual( psl, """\ 61 6 0 0 14 32 28 260 + sequence 100 0 99 chromosome 440 10 337 35 2,2,1,2,1,1,4,1,2,1,1,1,2,2,3,2,3,1,1,4,2,2,2,3,2,1,2,1,2,3,3,2,1,1,3, 0,3,6,7,9,10,11,21,22,24,27,28,29,35,37,42,44,49,50,52,57,61,63,68,74,76,77,79,82,84,87,90,94,95,96, 
10,35,37,53,63,74,81,124,127,132,133,135,137,139,146,151,154,157,167,183,194,197,210,212,216,222,231,235,285,301,305,323,325,328,334, """, )
def load_csv_file(file, delimiter=";"):
    """
    This function loads a "Primer" file.

    The first row must name exactly the seven expected columns, each once,
    in any order. Every following row with seven fields is turned into a
    PrimerPair; rows of any other length, and pairs rejected by
    check_primer_pair_integrity, are skipped with a logged warning.

    @param file: path of the CSV file to read
    @param delimiter: field separator, ";" by default
    @returns: dict mapping each primer pair id to its PrimerPair instance
    @raises ValueError: if the header row is missing, has the wrong number
        of columns, or contains an unknown or repeated column name
    """
    expected_columns = (
        "id",
        "forwardPrimer",
        "reversePrimer",
        "fPDNA",
        "rPDNA",
        "ampliconMinLength",
        "ampliconMaxLength",
    )
    header_len = len(expected_columns)
    primer_dict = {}
    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=delimiter)
        # An empty file has no header row at all; report it like any other
        # malformed header instead of leaking StopIteration.
        headers = next(csvreader, None)
        if headers is None or len(headers) != header_len:
            raise ValueError("Wrong header")
        pos = {}
        for index, name in enumerate(headers):
            if name not in expected_columns:
                raise ValueError("Unknown header " + name)
            if name in pos:
                # A repeated name means another column is missing; without
                # this check its index would silently default to column 0.
                raise ValueError("Duplicate header " + name)
            pos[name] = index
        # The header is line 1, so data rows are numbered from 2.
        for line_number, row in enumerate(csvreader, start=2):
            if len(row) != header_len:
                logging.warning("Wrong primer pair in line %s", line_number)
                continue
            fprimer = SeqRecord(Seq(row[pos["fPDNA"]], IUPAC.IUPACAmbiguousDNA()))
            fprimer.id = row[pos["forwardPrimer"]]
            rprimer = SeqRecord(Seq(row[pos["rPDNA"]], IUPAC.IUPACAmbiguousDNA()))
            # TODO: the reverse primer is always reverse-complemented for now.
            rprimer = rprimer.reverse_complement()
            rprimer.id = row[pos["reversePrimer"]]
            pair_id = row[pos["id"]]
            primer_pair = PrimerPair(
                pair_id,
                fprimer,
                rprimer,
                int(row[pos["ampliconMinLength"]]),
                int(row[pos["ampliconMaxLength"]]),
            )
            if check_primer_pair_integrity(primer_pair):
                primer_dict[pair_id] = primer_pair
            else:
                logging.warning(
                    "Skipping primer pair %s, bad sequence", primer_pair.id
                )
    return primer_dict
# Test: aligns the transcript to the chromosome and the sequence to the transcript, maps the second alignment through the first with alignment1.map(alignment2), and checks the mapped path ((17, 0) to (27, 10)), the printed alignment and the PSL line.
# NOTE(review): the expected triple-quoted strings appear whitespace-flattened in this copy; restore their original line breaks and padding from upstream before relying on them.
def test_internal(self): aligner = self.aligner chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA") chromosome.id = "chromosome" transcript = Seq("GGGGGGGCCCCCGGGGGGA") transcript.id = "transcript" sequence = Seq("GGCCCCCGGG") sequence.id = "sequence" alignments1 = aligner.align(chromosome, transcript) self.assertEqual(len(alignments1), 1) alignment1 = alignments1[0] self.assertEqual(alignment1.path, ((12, 0), (31, 19))) self.assertEqual( str(alignment1), """\ AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA ||||||||||||||||||| GGGGGGGCCCCCGGGGGGA """, ) alignments2 = aligner.align(transcript, sequence) self.assertEqual(len(alignments2), 1) alignment2 = alignments2[0] self.assertEqual(alignment2.path, ((5, 0), (15, 10))) self.assertEqual( str(alignment2), """\ GGGGGGGCCCCCGGGGGGA |||||||||| GGCCCCCGGG """, ) alignment = alignment1.map(alignment2) self.assertEqual(len(alignment.path), 2) self.assertSequenceEqual(alignment.path[0], [17, 0]) self.assertSequenceEqual(alignment.path[1], [27, 10]) self.assertEqual( str(alignment), """\ AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA |||||||||| GGCCCCCGGG """, ) psl = format(alignment, "psl") self.assertEqual( psl, """\ 10 0 0 0 0 0 0 0 + sequence 10 0 10 chromosome 40 17 27 1 10, 0, 17, """, )
# Test: the transcript is aligned to the chromosome with strand "-" and the sequence to the transcript on the default strand; the mapped alignment is expected to run from (17, 10) to (27, 0) and to be reported with strand "-" in the PSL line.
# NOTE(review): the expected triple-quoted strings appear whitespace-flattened in this copy (the "noqa: W291" markers suggest they originally carried trailing spaces); restore them from upstream before relying on them.
def test_reverse_transcript_sequence(self): aligner = self.aligner chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA") chromosome.id = "chromosome" transcript = Seq("TCCCCCCGGGGGCCCCCCC") transcript.id = "transcript" sequence = Seq("CCCGGGGGCC") sequence.id = "sequence" alignments1 = aligner.align(chromosome, transcript, "-") self.assertEqual(len(alignments1), 1) alignment1 = alignments1[0] self.assertEqual(alignment1.path, ((12, 19), (31, 0))) self.assertEqual( str(alignment1), """\ AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA ||||||||||||||||||| GGGGGGGCCCCCGGGGGGA """, # noqa: W291 ) alignments2 = aligner.align(transcript, sequence) self.assertEqual(len(alignments2), 1) alignment2 = alignments2[0] self.assertEqual(alignment2.path, ((4, 0), (14, 10))) self.assertEqual( str(alignment2), """\ TCCCCCCGGGGGCCCCCCC |||||||||| CCCGGGGGCC """, # noqa: W291 ) alignment = alignment1.map(alignment2) self.assertEqual(len(alignment.path), 2) self.assertSequenceEqual(alignment.path[0], [17, 10]) self.assertSequenceEqual(alignment.path[1], [27, 0]) self.assertEqual( str(alignment), """\ AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA |||||||||| GGCCCCCGGG """, # noqa: W291 ) psl = format(alignment, "psl") self.assertEqual( psl, """\ 10 0 0 0 0 0 0 0 - sequence 10 0 10 chromosome 40 17 27 1 10, 0, 17, """, )
# Test: the transcript starts with bases that have no counterpart at the left end of the chromosome; after alignment1.map(alignment2) the mapped coordinates are expected to be [[0, 11], [2, 13]], i.e. the first two sequence bases are left unaligned.
# NOTE(review): the expected triple-quoted strings appear whitespace-flattened in this copy; restore their original line breaks and padding from upstream before relying on them.
def test_left_overhang(self): aligner = self.aligner chromosome = Seq("GGGCCCCCGGGGGGAAAAAAAAAA") chromosome.id = "chromosome" transcript = Seq("AGGGGGCCCCCGGGGGGA") transcript.id = "transcript" sequence = Seq("GGGGGCCCCCGGG") sequence.id = "sequence" alignments1 = aligner.align(chromosome, transcript) self.assertEqual(len(alignments1), 1) alignment1 = alignments1[0] self.assertEqual( str(alignment1), """\ GGGCCCCCGGGGGGAAAAAAAAAA ||||||||||||||| AGGGGGCCCCCGGGGGGA """, ) alignments2 = aligner.align(transcript, sequence) self.assertEqual(len(alignments2), 1) alignment2 = alignments2[0] self.assertEqual( str(alignment2), """\ AGGGGGCCCCCGGGGGGA ||||||||||||| GGGGGCCCCCGGG """, ) alignment = alignment1.map(alignment2) self.assertTrue( numpy.array_equal(alignment.coordinates, numpy.array([[0, 11], [2, 13]]))) self.assertEqual( str(alignment), """\ GGGCCCCCGGGGGGAAAAAAAAAA ||||||||||| GGGGGCCCCCGGG """, ) psl = format(alignment, "psl") self.assertEqual( psl, """\ 11 0 0 0 0 0 0 0 + sequence 13 2 13 chromosome 24 0 11 1 11, 2, 0, """, )
# Test: the transcript starts with bases that have no counterpart at the left end of the chromosome; after alignment1.map(alignment2) the mapped path is expected to run from (0, 2) to (11, 13), i.e. the first two sequence bases are left unaligned.
# NOTE(review): the expected triple-quoted strings appear whitespace-flattened in this copy (the "noqa: W291" markers suggest they originally carried trailing spaces); restore them from upstream before relying on them.
def test_left_overhang(self): aligner = self.aligner chromosome = Seq("GGGCCCCCGGGGGGAAAAAAAAAA") chromosome.id = "chromosome" transcript = Seq("AGGGGGCCCCCGGGGGGA") transcript.id = "transcript" sequence = Seq("GGGGGCCCCCGGG") sequence.id = "sequence" alignments1 = aligner.align(chromosome, transcript) self.assertEqual(len(alignments1), 1) alignment1 = alignments1[0] self.assertEqual( str(alignment1), """\ GGGCCCCCGGGGGGAAAAAAAAAA ||||||||||||||| AGGGGGCCCCCGGGGGGA """, # noqa: W291 ) alignments2 = aligner.align(transcript, sequence) self.assertEqual(len(alignments2), 1) alignment2 = alignments2[0] self.assertEqual( str(alignment2), """\ AGGGGGCCCCCGGGGGGA ||||||||||||| GGGGGCCCCCGGG """, # noqa: W291 ) alignment = alignment1.map(alignment2) self.assertEqual(len(alignment.path), 2) self.assertSequenceEqual(alignment.path[0], [0, 2]) self.assertSequenceEqual(alignment.path[1], [11, 13]) self.assertEqual( str(alignment), """\ GGGCCCCCGGGGGGAAAAAAAAAA ||||||||||| GGGGGCCCCCGGG """, # noqa: W291 ) psl = format(alignment, "psl") self.assertEqual( psl, """\ 11 0 0 0 0 0 0 0 + sequence 13 2 13 chromosome 24 0 11 1 11, 2, 0, """, )
# Test: the transcript extends past the right end of the chromosome; after alignment1.map(alignment2) the mapped path is expected to run from (17, 0) to (27, 10), so only 10 of the 12 sequence bases are aligned (PSL: query size 12, query end 10).
# NOTE(review): the expected triple-quoted strings appear whitespace-flattened in this copy (the "noqa: W291" markers suggest they originally carried trailing spaces); restore them from upstream before relying on them.
def test_right_overhang(self): aligner = self.aligner chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGG") chromosome.id = "chromosome" transcript = Seq("GGGGGGGCCCCCGGGGGGA") transcript.id = "transcript" sequence = Seq("GGCCCCCGGGGG") sequence.id = "sequence" alignments1 = aligner.align(chromosome, transcript) self.assertEqual(len(alignments1), 1) alignment1 = alignments1[0] self.assertEqual( str(alignment1), """\ AAAAAAAAAAAAGGGGGGGCCCCCGGG ||||||||||||||| GGGGGGGCCCCCGGGGGGA """, # noqa: W291 ) alignments2 = aligner.align(transcript, sequence) self.assertEqual(len(alignments2), 1) alignment2 = alignments2[0] self.assertEqual( str(alignment2), """\ GGGGGGGCCCCCGGGGGGA |||||||||||| GGCCCCCGGGGG """, # noqa: W291 ) alignment = alignment1.map(alignment2) self.assertEqual(len(alignment.path), 2) self.assertSequenceEqual(alignment.path[0], [17, 0]) self.assertSequenceEqual(alignment.path[1], [27, 10]) self.assertEqual( str(alignment), """\ AAAAAAAAAAAAGGGGGGGCCCCCGGG |||||||||| GGCCCCCGGGGG """, # noqa: W291 ) psl = format(alignment, "psl") self.assertEqual( psl, """\ 10 0 0 0 0 0 0 0 + sequence 12 0 10 chromosome 27 17 27 1 10, 0, 17, """, )
def makeFasta(fa_path, path):
    """Write "ref" and "alt" FASTA variants of every chromosome in fa_path.

    Reads path + "SNPAnnotation.txt" (tab-separated) and adds, per value of
    column 1, a (column 2, alleles) entry to the module-level snp_dict, where
    alleles is columns 3/4 without the column-5 allele, followed by the
    column-5 allele. For every *.fa file in fa_path the base at each listed
    1-based position is then replaced by alleles[0] in the "ref" copy and by
    alleles[1] in the "alt" copy, written to path + <name>_ref.fa and
    path + <name>_alt.fa.

    fa_path -- directory holding one <name>.fa file per chromosome
    path    -- prefix (used by plain string concatenation) for the
               annotation file and for the generated FASTA files

    Written so that it runs unchanged on Python 2 and Python 3.
    """
    print("generating SNP dictionary")
    # Context manager: the annotation handle was previously never closed.
    with open(path + "SNPAnnotation.txt") as snps:
        for snp in snps:
            fields = snp.strip().split("\t")
            minor = fields[5]
            alleles = list(set([fields[3], fields[4]]).difference([minor]))
            alleles.append(minor)
            snp_dict.setdefault(fields[1], []).append((fields[2], alleles))

    print("generating new fasta files")
    for in_f in glob.glob(os.path.join(fa_path, '*.fa')):
        chr_num = in_f.split("/")[-1].replace(".fa", "")
        # Both outputs are closed even if parsing or writing fails part-way.
        with open(path + chr_num + "_ref.fa", "w") as ref_fa, \
                open(path + chr_num + "_alt.fa", "w") as alt_fa:
            print("processing " + in_f)
            print("processing " + chr_num)
            # A chromosome without annotated SNPs is copied through unchanged
            # instead of aborting the whole run with a KeyError.
            chr_snps = snp_dict.get(chr_num, [])
            for seq_record in SeqIO.parse(in_f, "fasta"):
                # Work on plain strings; one Seq is built per output record.
                seq_ref = str(seq_record.seq)
                seq_alt = seq_ref
                for pos, al in chr_snps:
                    pos = int(pos)
                    seq_ref = seq_ref[:pos - 1] + al[0] + seq_ref[pos:]
                    seq_alt = seq_alt[:pos - 1] + al[1] + seq_alt[pos:]
                SeqIO.write(
                    [SeqRecord(Seq(seq_ref), id=chr_num, description='ref')],
                    ref_fa, "fasta")
                SeqIO.write(
                    [SeqRecord(Seq(seq_alt), id=chr_num, description='alt')],
                    alt_fa, "fasta")
    print("finished generating fasta files")
def make_fasta_from_list(querylist, queryfasta, gaplen, seqoutname, outfilename):
    """Join the listed components into one FASTA record and write it out.

    querylist   -- iterable of 5-item entries (format below); entries whose
                   third item is "gap" are skipped
    queryfasta  -- mapping from component name to its sequence record
                   (presumably SeqRecord objects; verify against the caller)
    gaplen      -- number of "N" characters inserted between components
                   (converted with int())
    seqoutname  -- id given to the resulting record
    outfilename -- writable file-like object that receives the FASTA text
    """
    ### Query list element format
    # Gap:    [61252 , (0:61252) , gap , 61252 , 0]
    #         [length , (T_start:Tstop) , "gap" , length , 0]
    # Object: [b40-14.HS_iter4_seq3733|+ , (61252:6463804) , 93612:7595148 , -6402552 , 4526208]
    #         [ID|strand , (T_start:Tstop) , Q_start:Q_stop , -(alignment length) , matches]
    spacer = "N" * int(gaplen)
    seq = Seq("")
    seq.id = seqoutname
    for Id, _t_range, Q_range, _alignment, _matches in querylist:
        if Q_range == "gap":
            continue
        # The id ends in "|+" or "|-": strip the two-character suffix for the
        # name and keep the last character as the orientation.
        component_name = Id[:-2]
        orientation = Id[-1]
        # Add gap between components (not before the first one).
        if str(seq) != "":
            seq = seq + spacer
        component = queryfasta[component_name]
        if orientation == "-":
            component = component.reverse_complement()
        seq = seq + component
    # Write the entire sequence.
    seq.id = seqoutname
    seq.description = ""
    # Explicit writes replace the Python-2-only "print >> file" statement;
    # the trailing newline it appended is kept so the output is unchanged.
    outfilename.write(seq.format('fasta'))
    outfilename.write("\n")
def test_random(aligner, nBlocks1=1, nBlocks2=1, strand1="+", strand2="+"):
    """Check Alignment.map against map_check on nested random sub-sequences.

    A random 1000-nt chromosome is generated; the transcript is the
    concatenation of nBlocks1 slices of it and the sequence the
    concatenation of nBlocks2 slices of the transcript. For strand "-" the
    chromosome (strand1) or the sequence (strand2) is reverse-complemented
    before aligning. The PSL text of the mapped alignment must equal the
    map_check result exactly.
    """

    def take_blocks(source, count, low, high):
        # Skip a random stretch, then copy a random-length block, `count` times.
        pieces = ""
        cursor = 0
        for _ in range(count):
            cursor += random.randint(low, high)
            size = random.randint(low, high)
            pieces += source[cursor:cursor + size]
            cursor += size
        return pieces

    chromosome = "".join(["ACGT"[random.randint(0, 3)] for _ in range(1000)])
    transcript = take_blocks(chromosome, nBlocks1, 60, 80)
    sequence = take_blocks(transcript, nBlocks2, 20, 40)

    chromosome = Seq(chromosome)
    transcript = Seq(transcript)
    sequence = Seq(sequence)
    if strand1 == "-":
        chromosome = chromosome.reverse_complement()
    if strand2 == "-":
        sequence = sequence.reverse_complement()
    chromosome.id = "chromosome"
    transcript.id = "transcript"
    sequence.id = "sequence"

    alignment1 = aligner.align(chromosome, transcript, strand=strand1)[0]
    alignment2 = aligner.align(transcript, sequence, strand=strand2)[0]

    # map() runs before map_check(), as in the original ordering.
    mapped = alignment1.map(alignment2)
    expected_psl = map_check(alignment1, alignment2)
    assert format(mapped, "psl") == expected_psl
    print(
        "Randomized test %d, %d, %s, %s OK"
        % (nBlocks1, nBlocks2, strand1, strand2)
    )
record.id) continue elif record.id in trims: trimloc = trims[record.id] print(record.id, "before is ", len(record), "long") if len(trimloc) > 1: print("more than one trim location ... maybe a chimera?", record.id) else: locs = trimloc[0].split("..") left = int(locs[0]) - 1 right = int(locs[0]) if left == 0: record = record[right - 1:] elif right == len(record): record = record[:left] else: # internal slicing temprecord = Seq(record[:left] + record[right - 1:], DNAAlphabet()) temprecord.id = record.id print(record.id, len(record)) record = temprecord print(len(record)) print(record.id, "after is ", len(record), "long") elif record.id in duplicates: log.write("Skipping %s as is considered a duplicate\n" % record.id) continue SeqIO.write(record, output_handle, "fasta")
#!/usr/bin/env python
"""calculate and plot the base pair probability matrix"""
# from __future__ import print_function
import RNA
import matplotlib
import Bio
import matplotlib.pyplot as plt
from Bio.Seq import Seq
from ss_dotplot import versions_used

# Sample sequences, each labelled through its ``id`` attribute.
godzilla = Seq('GAGACCCGTAAAAGGGTCTCGAAAGTGTGTAAAAAACACAC')
godzilla.id = 'Godzilla Queen of Monsters'
#foldgod = RNA.fold_compound(str(godzilla))

Hirsch = Seq('CCGCACAGCGGGCAGUGCCC')
Hirsch.id = 'Papa Hirsch protects us all'
#foldHirsch = RNA.fold_compound(str(Hirsch))

monsters = (godzilla, Hirsch)

# Colormap for the matrix plot: either 'BuPu' or 'Greys'.
colormap = 'BuPu'


def plot_bppm(bppm, name):
    """Draw *bppm* with matshow and save the figure as ``<name>.ps``.

    *name* is used both as the figure identifier and as the stem of the
    PostScript output file; the module-level ``colormap`` selects the
    colours. The figure is closed afterwards.
    """
    chosen_cmap = plt.get_cmap(colormap)
    plt.matshow(bppm, fignum=name, cmap=chosen_cmap)
    target = '{:s}.ps'.format(name)
    plt.savefig(target, format='ps')
    plt.close()
def test_sort(self):
    """Check the row order of a two-sequence alignment after each sort() call.

    The alignment is sorted with the default key (before and after ids are
    assigned to the sequences) and with key=GC, each time in both
    directions, and its printed form is compared after every call.
    """
    target = Seq("ACTT")
    query = Seq("ACCT")
    alignment = Align.Alignment((target, query), numpy.array([[0, 4], [0, 4]]))

    # The only two printed forms the test ever expects.
    actt_on_top = "ACTT\n||.|\nACCT\n"
    acct_on_top = "ACCT\n||.|\nACTT\n"

    self.assertEqual(str(alignment), actt_on_top)

    # Default key, sequences without ids.
    alignment.sort()
    self.assertEqual(str(alignment), acct_on_top)
    alignment.sort(reverse=True)
    self.assertEqual(str(alignment), actt_on_top)

    # Default key, after ids have been assigned.
    target.id = "seq1"
    query.id = "seq2"
    alignment.sort()
    self.assertEqual(str(alignment), actt_on_top)
    alignment.sort(reverse=True)
    self.assertEqual(str(alignment), acct_on_top)

    # Explicit key function.
    alignment.sort(key=GC)
    self.assertEqual(str(alignment), actt_on_top)
    alignment.sort(key=GC, reverse=True)
    self.assertEqual(str(alignment), acct_on_top)
# Handles one scaffold given as a list of blocks: collects optional scaffold_map rows (genome.revised_db) and optional sequence slices (genome.revised_fasta), updates the 'scaffolds' and 'scaffold_length' counters in stats, then appears to either (a) register the scaffold under a new revised name, insert the DB rows, write the FASTA record and append a "D" line plus a 100-long "N" fragment line to the chr<N>_unmapped AGP of one of its chromosomes, or (b) count it as discarded and print every DB block whose length differs from the summed length of the overlapping 'active'/'retained' original parts.
# NOTE(review): `mapped` is assigned but never read in this function.
# NOTE(review): `seq` starts as Seq(''); `seq.features` presumably relies on genome.sequences[...] slices being SeqRecord objects so that `+=` turns seq into a SeqRecord - confirm, otherwise the attribute access fails when genome.revised_fasta is falsy.
# NOTE(review): statement nesting is not recoverable from this flattened copy (e.g. which `if` the `else:` pairs with, and whether `origpart.parttype = 'removed'` sits inside the parttype check); restore indentation from the upstream file.
def write_unmapped_scaffold(outblocks, scaffold, stats, unmapped_output, genome, chromosomes): seq = Seq('') dbblocks = [] mapped = False scaffold_chromosomes = [] for block in outblocks: if genome.revised_db: dbblocks.append([block.chromosome, block.cm, scaffold, block.start, block.end, block.length]) if genome.revised_fasta: seq += genome.sequences[scaffold][block.start-1:block.end] if block.chromosome != '0': scaffold_chromosomes.append(int(block.chromosome)) stats['scaffolds'] += 1 scaffold_chromosomes = set(scaffold_chromosomes) scaffold_start, scaffold_end = outblocks[0].start, outblocks[-1].end scaffold_length = scaffold_end - scaffold_start + 1 stats['scaffold_length'] += scaffold_length if seq.features or len(scaffold_chromosomes) > 0 and len(dbblocks) > 1: unmapped_output.append([gd.Block(scaffold, scaffold_start, scaffold_end)]) scaffold_name = genome.revised + "{:05d}".format(genome.revised_count) genome.revised_count += 1 genome.revised_names["{}_{}_{}".format(scaffold, scaffold_start, scaffold_end)] = scaffold_name stats['written_scaffolds'] += 1 stats['written_length'] += scaffold_length if genome.revised_db: for block in dbblocks: genome.revised_db.execute("insert into scaffold_map values (?,?,?,?,?,?)", block) if genome.revised_fasta: seq.description = "length={}".format(len(seq)) seq.id = scaffold_name SeqIO.write(seq, genome.revised_fasta, "fasta") if len(scaffold_chromosomes) > 0: chrom = next(iter(scaffold_chromosomes)) chr_unmapped_end = chromosomes[chrom].unmapped_start + scaffold_length - 1 chromosomes[chrom].agp.append("{}\t{}\t{}\t{}\tD\t{}\t1\t{}\t+\n".format("chr{}_unmapped".format(chrom), chromosomes[chrom].unmapped_start, chr_unmapped_end, chromosomes[chrom].unmapped_part, scaffold_name, scaffold_length)) chromosomes[chrom].unmapped_part += 1 chromosomes[chrom].agp.append("{}\t{}\t{}\t{}\tN\t100\tfragment\tno\n".format("chr{}_unmapped".format(chrom), chr_unmapped_end+1, chr_unmapped_end+100, chromosomes[chrom].unmapped_part)) 
chromosomes[chrom].unmapped_part += 1 chromosomes[chrom].unmapped_start = chr_unmapped_end + 101 else: stats['discard_scaffolds'] += 1 stats['discard_length'] += scaffold_length for dbblock in dbblocks: dblength = dbblock[5] partslength = 0 for newpart in genome.newparts[scaffold]: for origpart in genome.origparts[newpart.oldname]: if dbblock[2] == origpart.newname and (dbblock[3] <= origpart.newstart <= dbblock[4] or dbblock[3] <= origpart.newend <= dbblock[4]): if origpart.parttype in ['active', 'retained']: partslength += origpart.newend - origpart.newstart + 1 origpart.parttype = 'removed' if dblength != partslength: print(scaffold, dblength, partslength, dbblock)
if len(trimloc) > 1: trimloc = sorted(merge_intervals(trimloc), reverse=True, key=lambda locitem: locitem[0]) seqlen = len(record) for loc in trimloc: left = int(loc[0]) - 1 right = int(loc[1]) newrecord = Seq("", generic_dna) log.write("trimming %d to %d in %s len=%d" % (left, right, record.id, len(record))) if left == 0: newrecord = record[right - 1:] elif right == len(record): newrecord = record[:left] else: # internal slicing log.write("-->internal slicing :%d .. %d:" % (left, right - 1)) log.write(' left string is %s' % record[0:left]) log.write(' right string is %s' % record[right - 1:]) newrecord = record[0:left] + record[right - 1:] newrecord.id = record.id log.write(" -- new len for %s is %d: %s" % (newrecord.id, len(newrecord), newrecord)) record = newrecord if (len(record) >= 200): SeqIO.write(record, output_handle, "fasta")
# Handles one scaffold given as a list of blocks: collects optional scaffold_map rows (genome.revised_db) and optional sequence slices (genome.revised_fasta), updates the 'scaffolds' and 'scaffold_length' counters in stats, then appears to either (a) register the scaffold under a new revised name, insert the DB rows, write the FASTA record and append a "D" line plus a 100-long "N" fragment line to the chr<N>_unmapped AGP of one of its chromosomes, or (b) count it as discarded and print every DB block whose length differs from the summed length of the overlapping 'active'/'retained' original parts.
# NOTE(review): `mapped` is assigned but never read in this function.
# NOTE(review): `seq` starts as Seq(''); `seq.features` presumably relies on genome.sequences[...] slices being SeqRecord objects so that `+=` turns seq into a SeqRecord - confirm, otherwise the attribute access fails when genome.revised_fasta is falsy.
# NOTE(review): statement nesting is not recoverable from this flattened copy (e.g. which `if` the `else:` pairs with, and whether `origpart.parttype = 'removed'` sits inside the parttype check); restore indentation from the upstream file.
def write_unmapped_scaffold(outblocks, scaffold, stats, unmapped_output, genome, chromosomes): seq = Seq('') dbblocks = [] mapped = False scaffold_chromosomes = [] for block in outblocks: if genome.revised_db: dbblocks.append([ block.chromosome, block.cm, scaffold, block.start, block.end, block.length ]) if genome.revised_fasta: seq += genome.sequences[scaffold][block.start - 1:block.end] if block.chromosome != '0': scaffold_chromosomes.append(int(block.chromosome)) stats['scaffolds'] += 1 scaffold_chromosomes = set(scaffold_chromosomes) scaffold_start, scaffold_end = outblocks[0].start, outblocks[-1].end scaffold_length = scaffold_end - scaffold_start + 1 stats['scaffold_length'] += scaffold_length if seq.features or len(scaffold_chromosomes) > 0 and len(dbblocks) > 1: unmapped_output.append( [gd.Block(scaffold, scaffold_start, scaffold_end)]) scaffold_name = genome.revised + "{:05d}".format(genome.revised_count) genome.revised_count += 1 genome.revised_names["{}_{}_{}".format(scaffold, scaffold_start, scaffold_end)] = scaffold_name stats['written_scaffolds'] += 1 stats['written_length'] += scaffold_length if genome.revised_db: for block in dbblocks: genome.revised_db.execute( "insert into scaffold_map values (?,?,?,?,?,?)", block) if genome.revised_fasta: seq.description = "length={}".format(len(seq)) seq.id = scaffold_name SeqIO.write(seq, genome.revised_fasta, "fasta") if len(scaffold_chromosomes) > 0: chrom = next(iter(scaffold_chromosomes)) chr_unmapped_end = chromosomes[ chrom].unmapped_start + scaffold_length - 1 chromosomes[chrom].agp.append( "{}\t{}\t{}\t{}\tD\t{}\t1\t{}\t+\n".format( "chr{}_unmapped".format(chrom), chromosomes[chrom].unmapped_start, chr_unmapped_end, chromosomes[chrom].unmapped_part, scaffold_name, scaffold_length)) chromosomes[chrom].unmapped_part += 1 chromosomes[chrom].agp.append( "{}\t{}\t{}\t{}\tN\t100\tfragment\tno\n".format( "chr{}_unmapped".format(chrom), chr_unmapped_end + 1, chr_unmapped_end + 100, 
chromosomes[chrom].unmapped_part)) chromosomes[chrom].unmapped_part += 1 chromosomes[chrom].unmapped_start = chr_unmapped_end + 101 else: stats['discard_scaffolds'] += 1 stats['discard_length'] += scaffold_length for dbblock in dbblocks: dblength = dbblock[5] partslength = 0 for newpart in genome.newparts[scaffold]: for origpart in genome.origparts[newpart.oldname]: if dbblock[2] == origpart.newname and ( dbblock[3] <= origpart.newstart <= dbblock[4] or dbblock[3] <= origpart.newend <= dbblock[4]): if origpart.parttype in ['active', 'retained']: partslength += origpart.newend - origpart.newstart + 1 origpart.parttype = 'removed' if dblength != partslength: print(scaffold, dblength, partslength, dbblock)