def test_read_gff(self):
    '''Test read_gff returns the expected level records and other records'''
    def record(feature, start, end, attributes):
        # Every expected row shares seqname, source, score, strand and phase
        columns = ['seqname', 'SOURCE', feature, start, end, '.', '+', '.', attributes]
        return gff.GFF_record('\t'.join(columns))

    gff_path = os.path.join(data_dir, 'test_read_gff.gff')
    level_records, other_records = helper.read_gff(gff_path)

    # One inner list per feature type, in the order gene, mRNA, exon
    gene_level = [record('gene', '42', '100', 'ID=id')]
    mrna_level = [record('mRNA', '42', '100', 'ID=id.1;Parent=id')]
    exon_level = [record('exon', '52', '62', 'ID=id.1:exon:1;Parent=id.1')]
    expected_level_records = [gene_level, mrna_level, exon_level]

    # The remaining record is expected under its sequence name
    expected_other_records = {
        'seqname': [record('eggs', '150', '200', 'ID=spam')],
    }

    self.assertEqual(expected_level_records, level_records)
    self.assertEqual(expected_other_records, other_records)
def test_interscts(self):
    '''Test intersects'''
    def gene_on(seqname, start, end):
        # Gene record with no attributes column at the given position
        columns = [seqname, 'SOURCE', 'gene', start, end, '.', '.', '.']
        return gff.GFF_record('\t'.join(columns))

    reference = gene_on('seq', '42', '45')

    # (start, end) pairs on the same sequence that must intersect 42-45
    overlapping = [
        ('43', '44'),
        ('42', '43'),
        ('43', '45'),
        ('41', '42'),
        ('41', '43'),
    ]
    for start, end in overlapping:
        candidate = gene_on('seq', start, end)
        self.assertTrue(reference.intersects(candidate))

    # (seqname, start, end) triples that must not intersect 42-45 on 'seq'
    disjoint = [
        ('seq', '40', '41'),
        ('seq', '46', '50'),
        ('seq', '1', '10'),
        ('seq', '100', '200'),
        ('different_seq', '43', '45'),
    ]
    for seqname, start, end in disjoint:
        candidate = gene_on(seqname, start, end)
        self.assertFalse(reference.intersects(candidate))
def test_load_cufflinks_gtf(self):
    '''Test load_cufflinks_gtf on a loadable GTF and on one that must raise helper.Error'''
    def cufflinks_record(seqname, feature, start, end, score, attributes):
        columns = [seqname, 'Cufflinks', feature, start, end, score, '+', '.', attributes]
        return gff.GFF_record('\t'.join(columns))

    genes = helper.load_cufflinks_gtf(
        os.path.join(data_dir, 'test_load_cufflinks_gtf.gtf'))

    cuff1_attributes = 'gene_id "CUFF.1"; transcript_id "CUFF.1.1";'
    cuff2_attributes = 'gene_id "CUFF.2"; transcript_id "CUFF.2.1";'

    gene1_trans = cufflinks_record('seq', 'transcript', '1', '100', '1000', cuff1_attributes)
    gene1_exon = cufflinks_record('seq', 'exon', '1', '100', '1000', cuff1_attributes)
    gene2_trans = cufflinks_record('seq2', 'transcript', '1', '200', '2000', cuff2_attributes)
    gene2_exon1 = cufflinks_record('seq2', 'exon', '1', '100', '2000', cuff2_attributes)
    gene2_exon2 = cufflinks_record('seq2', 'exon', '150', '200', '2000', cuff2_attributes)

    gene1 = gene.Gene(gene1_trans)
    gene1.add_gff_record(gene1_exon)

    gene2 = gene.Gene(gene2_trans)
    for exon_record in (gene2_exon1, gene2_exon2):
        gene2.add_gff_record(exon_record)

    expected_genes = {'seq': [gene1], 'seq2': [gene2]}
    # Only the 'seq2' entry is compared against the loaded genes
    self.assertEqual(expected_genes['seq2'], genes['seq2'])

    no_parent_path = os.path.join(data_dir, 'test_load_cufflinks_gtf.no_parent.gtf')
    with self.assertRaises(helper.Error):
        helper.load_cufflinks_gtf(no_parent_path)
def test_remove_all_but_longest_transcript(self):
    '''Test remove_all_but_longest_transcript'''
    # With the fixture as built, gene_id.1 is the only key left
    self.gene.remove_all_but_longest_transcript()
    remaining = list(self.gene.transcripts.keys())
    self.assertEqual(['gene_id.1'], remaining)

    # Add a second transcript, gene_id.2, with an exon spanning 50-65
    second_transcript_rows = (
        ['seqname', 'SOURCE', 'mRNA', '42', '100', '.', '+', '.',
         'ID=gene_id.2;Parent=gene_id'],
        ['seqname', 'SOURCE', 'exon', '50', '65', '.', '+', '.',
         'ID=gene_id.2:exon:1;Parent=gene_id.2'],
    )
    new_records = [gff.GFF_record('\t'.join(row)) for row in second_transcript_rows]
    for new_record in new_records:
        self.gene.add_gff_record(new_record)

    # Now gene_id.2 is expected to be the one that survives
    self.gene.remove_all_but_longest_transcript()
    remaining = list(self.gene.transcripts.keys())
    self.assertEqual(['gene_id.2'], remaining)
def test_longest_transcript_by_exon_length(self):
    '''Test longest_transcript_by_exon_length'''
    longest = self.gene.longest_transcript_by_exon_length()
    self.assertEqual('gene_id.1', longest)

    # Add a second transcript, gene_id.2, with an exon spanning 50-65
    second_transcript_rows = (
        ['seqname', 'SOURCE', 'mRNA', '42', '100', '.', '+', '.',
         'ID=gene_id.2;Parent=gene_id'],
        ['seqname', 'SOURCE', 'exon', '50', '65', '.', '+', '.',
         'ID=gene_id.2:exon:1;Parent=gene_id.2'],
    )
    new_records = [gff.GFF_record('\t'.join(row)) for row in second_transcript_rows]
    for new_record in new_records:
        self.gene.add_gff_record(new_record)

    longest = self.gene.longest_transcript_by_exon_length()
    self.assertEqual('gene_id.2', longest)
def test_set_attribute(self):
    '''Test set_attribute

    Sets a new key, then overwrites it, and checks each time that
    get_attribute returns the value that was just stored.
    '''
    gff_record = gff.GFF_record('\t'.join(
        ['seq', 'SOURCE', 'feature', '42', '43', '.', '.', '.']))

    # assertEqual is required here: assertTrue(x, '42') treats '42' as the
    # failure message and passes for any truthy x, so the stored value
    # would never actually be compared.
    for value in ('42', '43'):
        gff_record.set_attribute('key1', value)
        self.assertEqual(gff_record.get_attribute('key1'), value)
def setUp(self):
    '''Create mRNA, gene and exon records and a Gene built from them'''
    def record(feature, start, end, attributes):
        columns = ['seqname', 'SOURCE', feature, start, end, '.', '+', '.', attributes]
        return gff.GFF_record('\t'.join(columns))

    self.gff_mRNA = record('mRNA', '42', '100', 'ID=gene_id.1;Parent=gene_id')
    self.gff_gene = record('gene', '42', '100', 'ID=gene_id')
    self.gff_exon1 = record('exon', '50', '60', 'ID=gene_id.1:exon:1;Parent=gene_id.1')

    # The Gene is seeded with the mRNA record; the gene record and then the
    # exon are added afterwards, in that order
    self.gene = gene.Gene(self.gff_mRNA)
    for extra_record in (self.gff_gene, self.gff_exon1):
        self.gene.add_gff_record(extra_record)
def test_sort(self):
    '''Test sort'''
    def bare_record(start, end):
        return gff.GFF_record('\t'.join(['x', 'x', 'x', start, end, '.', '.', '.']))

    # Deliberately out of order: 42-43 comes before 12-13
    out_of_order = [bare_record('42', '43'), bare_record('12', '13')]
    in_order = sorted(out_of_order)

    feature_list_names = (
        'five_utr', 'three_utr', 'exons', 'ncRNA', 'rRNA', 'tRNA', 'snRNA',
    )

    for name in feature_list_names:
        records = getattr(self.trans, name)
        records += out_of_order

    self.trans._sort()

    # Attributes are looked up again after _sort() rather than reusing the
    # objects fetched above
    for name in feature_list_names:
        self.assertEqual(in_order, getattr(self.trans, name))
def test_update_other_records(self):
    '''Test update_other_records collects records in a list under their seqname'''
    def record(feature, record_id):
        columns = ['seqname', 'SOURCE', feature, '150', '200', '.', '+', '.',
                   'ID=' + record_id]
        return gff.GFF_record('\t'.join(columns))

    gff1 = record('eggs', 'spam')
    gff2 = record('spam', 'eggs')

    other_records = {}
    expected = {}

    # After each update the dict must hold every record added so far, in order
    for added in (gff1, gff2):
        helper.update_other_records(other_records, added)
        expected.setdefault('seqname', []).append(copy.deepcopy(added))
        self.assertEqual(other_records, expected)
def test_can_extend(self):
    '''Test can_extend'''
    def exon(start, end):
        columns = ['seqname', 'SOURCE', 'exon', start, end, '.', '+', '.',
                   'ID=gene_id.1:exon:1;Parent=gene_id.1']
        return gff.GFF_record('\t'.join(columns))

    new_left_exon = exon('40', '55')
    new_right_exon = exon('55', '70')

    # An unmodified copy must not count as an extension
    candidate = copy.deepcopy(self.gene)
    self.assertFalse(self.gene.can_extend(candidate))

    # Adding the 40-55 exon makes the copy an extension ...
    candidate.add_gff_record(new_left_exon)
    self.assertTrue(self.gene.can_extend(candidate))

    # ... but not once it claims a different sequence name
    candidate.seqname = 'holyhandgrenade'
    self.assertFalse(self.gene.can_extend(candidate))

    # A fresh copy with the 55-70 exon added is an extension as well
    candidate = copy.deepcopy(self.gene)
    candidate.add_gff_record(new_right_exon)
    self.assertTrue(self.gene.can_extend(candidate))
def test_lenient_mode(self):
    '''Test __init__ when being lenient

    With gff.lenient set, each input line must be accepted by GFF_record
    and come back unchanged from str().
    '''
    test_input = [
        ['seq', 'Cufflinks', 'transcript', '42', '43', '.', '.', '.',
         'key1 "val1"; key2 "val2";'],
        ['seq', 'SOURCE', 'gene', '42', '43', '.', '.', '.',
         'attribute with no equals sign'],
    ]

    gff.lenient = True
    try:
        for columns in test_input:
            line = '\t'.join(columns)
            record = gff.GFF_record(line)
            self.assertEqual(line, str(record))
    finally:
        # Reset the module-level flag even when an assertion or the
        # constructor fails, so lenient mode cannot leak into other tests.
        gff.lenient = False
def test_get_attribute(self):
    '''Test get_attribute

    Checks key=value attributes, a bare key with no value (expected to
    give None), and that an unknown key raises gff.Error.
    '''
    gff.lenient = True
    try:
        gff_record = gff.GFF_record('\t'.join(
            ['seq', 'SOURCE', 'feature', '42', '43', '.', '.', '.',
             'key1=val1;key2=val2;key3']))

        self.assertEqual(gff_record.get_attribute('key1'), 'val1')
        self.assertEqual(gff_record.get_attribute('key2'), 'val2')
        self.assertIsNone(gff_record.get_attribute('key3'))

        with self.assertRaises(gff.Error):
            gff_record.get_attribute('killer rabbit')
    finally:
        # Reset the module-level flag even when something above fails,
        # so lenient mode cannot leak into other tests.
        gff.lenient = False
def test_load_ref_gff(self):
    '''Test load_ref_gff returns the expected genes and other records'''
    def record(seqname, feature, start, end, attributes):
        columns = [seqname, 'SOURCE', feature, start, end, '.', '+', '.', attributes]
        return gff.GFF_record('\t'.join(columns))

    def build_gene(records):
        # The first record seeds the Gene; the rest are added in order
        built = gene.Gene(records[0])
        for extra_record in records[1:]:
            built.add_gff_record(extra_record)
        return built

    genes, other_records = helper.load_ref_gff(
        os.path.join(data_dir, 'test_get_genes_from_ref.gff'))

    gene1 = build_gene([
        record('seq', 'gene', '42', '100', 'ID=gene'),
        record('seq', 'mRNA', '42', '100', 'ID=gene.1;Parent=gene'),
        record('seq', 'exon', '42', '62', 'ID=gene.1:exon:1;Parent=gene.1'),
        record('seq', 'exon', '92', '100', 'ID=gene.1:exon:2;Parent=gene.1'),
    ])
    gene2 = build_gene([
        record('seq2', 'gene', '1', '10', 'ID=gene2'),
        record('seq2', 'mRNA', '1', '10', 'ID=gene2.1;Parent=gene2'),
        record('seq2', 'exon', '1', '10', 'ID=gene2.1:exon:1;Parent=gene2.1'),
    ])

    expected_genes = {'seq': [gene1], 'seq2': [gene2]}
    expected_other = {
        'seq': [record('seq', 'eggs', '150', '200', 'ID=spam')],
    }

    self.assertEqual(expected_genes, genes)
    self.assertEqual(expected_other, other_records)
def test_init_bad_input(self):
    '''Test __init__ for bad input'''
    malformed_rows = (
        # only seven columns
        ['seq', 'SOURCE', 'gene', '42', '43', '.', '.'],
        # ten columns
        ['seq', 'SOURCE', 'gene', '42', '43', '.', '.', '.', '.', 'too many'],
        # non-integer start, then non-integer end
        ['seq', 'SOURCE', 'gene', 'not_int', '43', '.', '.', '.'],
        ['seq', 'SOURCE', 'gene', '42', 'not_int', '.', '.', '.'],
        # non-numeric score
        ['seq', 'SOURCE', 'gene', '42', '43', 'not_float', '.', '.'],
        # unrecognised strand
        ['seq', 'SOURCE', 'gene', '42', '43', '.', 'bad_strand', '.'],
        # attribute column without key=value
        ['seq', 'SOURCE', 'gene', '42', '43', '.', '.', '.',
         'attribute with no equals sign'],
    )
    malformed_lines = ['\t'.join(row) for row in malformed_rows]

    for line in malformed_lines:
        with self.assertRaises(gff.Error):
            gff.GFF_record(line)
def test_init_good_input(self):
    '''Test __init__ for good input'''
    # Each row must be accepted and must come back unchanged from str()
    well_formed_rows = (
        ['seq', 'SOURCE', 'gene', '42', '43', '.', '.', '.'],
        ['seq', 'SOURCE', 'gene', '42', '43', '1.4', '-', '0'],
        ['seq', 'SOURCE', 'gene', '42', '43', '1.4', '+', '1'],
        ['seq', 'SOURCE', 'gene', '42', '43', '1.4', '+', '1', 'key=value'],
        ['seq', 'SOURCE', 'gene', '42', '43', '1.4', '+', '1',
         'key=value;key2=value 2'],
        ['seq', 'Cufflinks', 'transcript', '42', '43', '.', '.', '.',
         'key1 "val1";'],
        ['seq', 'Cufflinks', 'transcript', '42', '43', '.', '.', '.',
         'key1 "val1";'],
        ['seq', 'Cufflinks', 'transcript', '42', '43', '.', '.', '.',
         'key1 "val1"; key2 "val2";'],
    )

    for row in well_formed_rows:
        line = '\t'.join(row)
        parsed = gff.GFF_record(line)
        self.assertEqual(line, str(parsed))
def test_less_than(self):
    '''Test less than operator'''
    def gene_on(seqname, start, end):
        columns = [seqname, 'SOURCE', 'gene', start, end, '.', '.', '.']
        return gff.GFF_record('\t'.join(columns))

    base = gene_on('seq', '42', '44')

    # (right-hand operand, whether `base < operand` should hold)
    cases = [
        (base, False),
        (gene_on('seq', '42', '43'), False),
        (gene_on('seq', '42', '45'), True),
        (gene_on('seq', '41', '42'), False),
        (gene_on('seq', '41', '45'), False),
        (gene_on('different_seq', '42', '45'), False),
    ]

    for other, should_be_less in cases:
        if should_be_less:
            self.assertTrue(base < other)
        else:
            self.assertFalse(base < other)
def test_len(self):
    '''Test __len__'''
    columns = ['seq', 'SOURCE', 'gene', '42', '44', '.', '.', '.']
    record = gff.GFF_record('\t'.join(columns))
    # A record spanning 42-44 is expected to report a length of 3
    self.assertEqual(3, len(record))
def test_update_utrs(self):
    '''Test update_utrs()'''
    def cds(start, end):
        # CDS record with no attributes column
        columns = ['seqname', 'SOURCE', 'CDS', start, end, '.', '+', '.']
        return gff.GFF_record('\t'.join(columns))

    def utr(feature, start, end, utr_id):
        # UTR record with source UTR_updater and parent gene_id.1
        columns = ['seqname', 'UTR_updater', feature, start, end, '.', '+', '.',
                   'ID=gene_id.1:' + utr_id + ';Parent=gene_id.1']
        return gff.GFF_record('\t'.join(columns))

    def five_copies():
        return [copy.deepcopy(self.trans) for _ in range(5)]

    for fixture_exon in (self.gff_exon, self.gff_exon2, self.gff_exon3):
        self.trans.add_gff_record(fixture_exon)

    new_left_exon = cds('30', '50')
    new_left_exon2 = cds('10', '20')
    new_right_exon = cds('90', '100')
    new_right_exon2 = cds('110', '120')

    new_five_utr = utr('five_prime_UTR', '30', '43', '5utr')
    new_five_utr_2 = utr('five_prime_UTR', '30', '43', '5utr:2')
    new_five_utr_1 = utr('five_prime_UTR', '10', '20', '5utr:1')
    new_three_utr = utr('three_prime_UTR', '95', '100', '3utr')
    new_three_utr_1 = utr('three_prime_UTR', '95', '100', '3utr:1')
    new_three_utr_2 = utr('three_prime_UTR', '110', '120', '3utr:2')

    # Three parallel lists of independent copies: index i of each list
    # belongs to the same scenario
    before_adding = five_copies()
    after_adding = five_copies()
    to_add = five_copies()

    # Scenario 0: extending the first exon
    to_add[0].exons.pop(0)
    to_add[0].add_gff_record(new_left_exon)
    after_adding[0].exons[0].coords.start = 44
    after_adding[0].add_gff_record(new_five_utr)

    # Scenario 1: adding a spliced UTR at the start
    to_add[1].exons.pop(0)
    to_add[1].add_gff_record(new_left_exon)
    to_add[1].add_gff_record(new_left_exon2)
    after_adding[1].exons[0].coords.start = 44
    after_adding[1].add_gff_record(new_five_utr_1)
    after_adding[1].add_gff_record(new_five_utr_2)

    # Scenario 2: extending the last exon
    to_add[2].exons.pop()
    to_add[2].add_gff_record(new_right_exon)
    after_adding[2].exons[-1].coords.end = 94
    after_adding[2].add_gff_record(new_three_utr)

    # Scenario 3: adding a spliced UTR at the end
    to_add[3].exons.pop()
    to_add[3].add_gff_record(new_right_exon)
    to_add[3].add_gff_record(new_right_exon2)
    after_adding[3].exons[-1].coords.end = 94
    after_adding[3].add_gff_record(new_three_utr_1)
    after_adding[3].add_gff_record(new_three_utr_2)

    # Scenario 4: trying to add too many new UTR splices; the expected
    # result is left as an unmodified copy
    to_add[4].add_gff_record(new_left_exon)
    for _ in range(3):
        to_add[4].add_gff_record(new_left_exon2)

    # A 5'UTR that overlaps an existing gene should change nothing
    untouched = copy.deepcopy(before_adding[0])
    untouched.update_utrs(to_add[0], exclude_coords=[intervals.Interval(10, 31)])
    self.assertEqual(untouched, before_adding[0])

    # A 3'UTR that overlaps an existing gene should change nothing
    untouched = copy.deepcopy(before_adding[2])
    untouched.update_utrs(to_add[2], exclude_coords=[intervals.Interval(100, 110)])
    self.assertEqual(untouched, before_adding[2])

    # Run every scenario without exclusions and compare with its expectation
    for before, source, after in zip(before_adding, to_add, after_adding):
        before.update_utrs(source)
        self.assertEqual(before, after)
def setUp(self):
    '''Create one GFF record per feature type and a Transcript from the mRNA record'''
    def record(feature, start, end, attributes=None):
        # All fixture rows share seqname, source, score, strand and phase;
        # the attributes column is left out entirely when not given
        columns = ['seqname', 'SOURCE', feature, start, end, '.', '+', '.']
        if attributes is not None:
            columns.append(attributes)
        return gff.GFF_record('\t'.join(columns))

    transcript_attributes = 'ID=gene_id.1;Parent=gene_id'

    # Transcript-level records
    self.gff_mRNA = record('mRNA', '42', '100', transcript_attributes)
    self.gff_transcript = record('transcript', '42', '100', transcript_attributes)
    self.gff_pseudogenic_transcript = record(
        'pseudogenic_transcript', '42', '100', transcript_attributes)

    # UTRs
    self.gff_five_utr = record(
        'five_prime_UTR', '42', '43', 'ID=gene_id.1:5utr;Parent=gene_id.1')
    self.gff_three_utr = record(
        'three_prime_UTR', '95', '100', 'ID=gene_id.1:3utr;Parent=gene_id.1')

    # Exon-like records (the first one is a CDS feature)
    self.gff_exon = record(
        'CDS', '44', '53', 'ID=gene_id.1:exon:1;Parent=gene_id.1')
    self.gff_exon2 = record(
        'exon', '60', '63', 'ID=gene_id.1:exon:2;Parent=gene_id.1')
    self.gff_exon3 = record(
        'exon', '90', '94', 'ID=gene_id.1:exon:2;Parent=gene_id.1')
    self.gff_pseudogenic_exon = record(
        'pseudogenic_exon', '60', '63', 'ID=gene_id.1:exon:3;Parent=gene_id.1')

    self.gff_polypeptide = record(
        'polypeptide', '42', '100',
        'ID=gene_id.1:pep;Derives_from=gene_id.1;translation=abcdefghjiklmnopwhatisyourfavouritecolourqrstuvwxyz')

    # RNA features, all at 42-43
    self.gff_ncRNA = record('ncRNA', '42', '43', 'ID=gene_id.1:ncRNA;Parent=gene_id.1')
    self.gff_tRNA = record('tRNA', '42', '43', 'ID=gene_id.1:tRNA;Parent=gene_id.1')
    self.gff_snRNA = record('snRNA', '42', '43', 'ID=gene_id.1:snRNA;Parent=gene_id.1')
    self.gff_rRNA = record('rRNA', '42', '43', 'ID=gene_id.1:rRNA;Parent=gene_id.1')

    # A feature type with no attributes column
    self.gff_other = record('eggs', '42', '4242')

    self.trans = transcript.Transcript(self.gff_mRNA)