def test_save_fasta(self):
    """saveFASTA() writes correct headers both with and without custom name keys."""
    genome = glbase3.genome()
    genome.bindSequence("test_data/seq")

    entries = [
        {"name": "A", "loc": glbase3.location(loc="chr1:100-150")},
        {"name": "X", "loc": glbase3.location(loc="chrA:100-150")},
    ]
    regions = glbase3.genelist()
    regions.load_list(entries)
    fasta = genome.getSequences(regions)

    seq_chr1 = 'ATCAGACAGGTAGATCATCTCGCTCCGAGCTTGCCACCAGCAAACCATTGC'
    seq_chrA = 'GTAAAAACCCGATGGAATACTCATCCAGTAAGTCCGAACCACTTCAACATC'

    # Headers built from the ["loc", "name"] keys, joined with '_'.
    fasta.saveFASTA(filename="/tmp/test_fasta.fa", name=["loc", "name"])
    with open("/tmp/test_fasta.fa") as oh:
        self.assertEqual(oh.readline().strip(), '>chr1:100-150_A')
        self.assertEqual(oh.readline().strip(), seq_chr1)
        self.assertEqual(oh.readline().strip(), '>chrA:100-150_X')
        self.assertEqual(oh.readline().strip(), seq_chrA)

    # Default headers: location only.
    fasta.saveFASTA(filename="/tmp/test_fasta.fa")
    with open("/tmp/test_fasta.fa") as oh:
        self.assertEqual(oh.readline().strip(), '>chr1:100-150')
        self.assertEqual(oh.readline().strip(), seq_chr1)
        self.assertEqual(oh.readline().strip(), '>chrA:100-150')
        self.assertEqual(oh.readline().strip(), seq_chrA)
def setUp(self):
    """Create a fresh genome_sql database holding a single 'Nanog' feature."""
    # NOTE: the hard-coded /tmp path is platform specific and breaks on Windows.
    self.gsql = glbase3.genome_sql(
        new=True,
        filename='/tmp/test_genome_sql.sql')
    self.gsql.add_feature(
        glbase3.location(chr='chr1', left=110, right=120),  # feature span
        glbase3.location(chr='chr1', left=110, right=120),  # transcript span
        10,
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        'Nanog',
        '+')
def test_mask(self):
    """mask_zero=True should return a masked array hiding the zero bins."""
    track = glbase3.flat_track(filename="/tmp/test.flat", bin_format="f")

    # Unmasked get: zeros come through as plain values.
    values = track.get(glbase3.location(loc="chr2:99-111"))
    unmasked = [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]
    self.assertTrue(all(int(v) == int(u) for v, u in zip(values, unmasked)))

    # Masked get: the only practical check is the string form of the
    # masked array, so compare against its repr.
    masked = track.get(glbase3.location(loc="chr2:99-111"), mask_zero=True)
    expected = "[2.0 -- -- -- -- -- -- -- -- -- -- --]"
    self.assertEqual(str(masked), expected)
def test_read_extend(self):
    """get() with read_extend pads each read by the given number of bases."""
    # These expected values are correct; get() always returns floats now.
    observed = self.t.get(gl.location(loc="chr1:10-20"), read_extend=1)
    self.assertListEqual(
        list(observed),
        [5., 5., 5., 4., 4., 5., 5., 4., 4., 5., 5.])

    observed = self.t.get(gl.location(loc="chr1:10-20"), read_extend=2)
    self.assertListEqual(
        list(observed),
        [5., 5., 5., 5., 4., 5., 5., 5., 4., 5., 5.])
def test_remove_dupes_by_loc(self):
    """removeDuplicatesByLoc(mode='pointify_expand') collapses nearby loci."""
    spans = [
        "chr1:1000-1200",
        "chr1:1000-1200",   # exact duplicate
        "chr1:1100-1200",   # overlapping within delta
        "chr1:1300-1400",
        "chr1:1300-1400",
        "chr1:1300-1400",
        "chr1:1600-1600",
        "chr1:1423-1423",
        "chr2:1000-1200",   # different chromosome, kept separately
    ]
    genes = glbase3.genelist()
    genes.load_list([{"loc": glbase3.location(loc=s)} for s in spans])

    deduped = genes.removeDuplicatesByLoc(delta=100, mode='pointify_expand')
    self.assertEqual(len(deduped), 4)
def generate_track_frags_only(filename, norm_factor):
    """Build and finalise a small test track containing only short fragments.

    Returns the finalised gl.track instance.
    """
    track = gl.track(filename=filename, new=True, name="Test Track",
                     norm_factor=norm_factor)
    for span in ("chr1:10-20",
                 "chr1:10-21",   # test duplicates
                 "chr1:10-22",   # test 1's
                 "chr1:9-23"):   # test 1's
        track.add_location(gl.location(loc=span))
    track.finalise()
    return track
def test_buckets(self):
    """Loaded locations must be indexed into the correct genelist buckets."""
    glbase3.config.bucket_size = 100  # shrink for testing; restored at the end
    bucket = glbase3.config.bucket_size

    spans = [
        "chr1:1000-1200",
        "chr1:1200-1300",
        "chr1:1200-1201",
        "chr1:1300-1400",
        "chr1:1400-1500",
        "chr1:1500-1600",
        "chr1:1600-1600",   # point loc on the edge of a bucket
        "chr1:1423-1423",   # point loc in the middle of a bucket
        "chr1:0-1500",      # span much larger than a bucket
    ]
    g = glbase3.genelist()
    g.load_list([{"loc": glbase3.location(loc=s)} for s in spans])

    # Compute the bucket range covering chr1:1299-1788, making sure to
    # include the right-spanning and left-spanning sites.
    left_buck = ((1299 - 1) // bucket) * bucket
    right_buck = (1788 // bucket) * bucket
    buckets_reqd = range(left_buck, right_buck + bucket, bucket)

    loc_ids = set()
    for buck in buckets_reqd:
        if buck in g.buckets["1"]:
            loc_ids.update(g.buckets["1"][buck])  # set keeps ids unique

    self.assertSetEqual(loc_ids, {0, 1, 2, 3, 4, 5, 6, 7, 8})
    self.assertEqual(len(g.buckets), 1)
    self.assertEqual(len(g.buckets["1"]), 17)

    glbase3.config.bucket_size = 10000  # change it back
def test_reload(self):
    """A flat_track reopened from disk must return the stored per-base data.

    Bug fix: the original reopened the flat file into a local `t` but then
    queried `self.t`, so the reload path was never actually exercised and
    `t` was unused. Query the reloaded track instead.
    """
    t = glbase3.flat_track(filename="/tmp/test.flat", bin_format="f")
    observed = t.get(glbase3.location(loc="chr1:0-100"))
    # Stored values are 0..99, plus one trailing 0 for the extra base.
    expected = list(range(100)) + [0]
    self.assertTrue(
        all(int(x) == int(y) for x, y in zip(observed, expected)))
def test_read_extend_frags_only(self):
    """Fragment-only tracks honour read_extend, including at region edges."""
    observed = self.frags.get(gl.location(loc="chr1:5-25"), read_extend=1)
    expected = [0, 0, 0, 0, 1, 4, 4, 4, 4, 4, 4,
                4, 4, 4, 4, 4, 4, 3, 2, 1, 0]
    self.assertTrue(all(o == e for o, e in zip(observed, expected)))
def test_pileup(self):
    """pileup() returns read counts per item in the genelist around each peak."""
    trk = gl.track(filename="/tmp/test_pileup.trk", new=True,
                   name="Test Track")
    # Six identical reads at chr1:10-15.
    for left in (10, 10, 10, 10, 10, 10):
        trk.add_location(gl.location(chr="chr1", left=left, right=left + 5))
    trk.finalise()

    peaks = gl.genelist(filename="test_data/track_test.bed",
                        format=gl.format.bed)
    res = trk.pileup(genelist=peaks,
                     filename="test_images/test_output.png",
                     heatmap_filename="test_images/test_heatmap.png",
                     window_size=15,
                     bin_size=1,
                     respect_strand=True,
                     normalise=False,
                     read_extend=1,
                     raw_tag_filename="test_images/test_tags.tsv")

    # Units are now reads per item in the genelist.
    expected = numpy.array(
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         3., 3., 3., 6., 6., 6., 6., 3., 3., 3.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    self.assertTrue(all(x == y for x, y in zip(res["pileup"], expected)))
def test_norm_factor(self):
    """Tracks built with a norm_factor must scale their pileups accordingly."""
    baseline = self.t.get(gl.location(loc="chr1:10-20"), read_extend=1)

    # norm_factor=1.0 reproduces the reference track exactly
    # (also tests the silent argument).
    n1 = generate_track(filename="/tmp/test_1.0.trk", norm_factor=1.0)
    observed = n1.get(gl.location(loc="chr1:10-20"), read_extend=1)
    self.assertTrue(all(x == y for x, y in zip(baseline, observed)))

    # norm_factor=2.0 doubles every value.
    n2 = generate_track(filename="/tmp/test_2.0.trk", norm_factor=2.0)
    observed = n2.get(gl.location(loc="chr1:10-20"), read_extend=1)
    self.assertTrue(all(x == y for x, y in zip(baseline * 2.0, observed)))

    # norm_factor=0.5 halves every value.
    n3 = generate_track(filename="/tmp/test_0.5.trk", norm_factor=0.5)
    observed = n3.get(gl.location(loc="chr1:10-20"), read_extend=1)
    self.assertTrue(all(x == y for x, y in zip(baseline * 0.5, observed)))
def test_reload(self):
    """A track reopened from disk must contain all reads written by setUp.

    Bug fix: the original opened the saved track into a local `t` but then
    called `self.t.get_reads(...)`, leaving `t` unused — the reload path
    was never actually tested. Query the reloaded track instead.
    """
    t = gl.track(filename="/tmp/test.trk")  # reopen the file backing self.t
    # (left, right, strand) tuples expected back from the store.
    expected_reads = [(10, 20, '+'), (10, 20, '+'), (21, 22, '+'),
                      (23, 23, '+'), (1, 100, '+'), (9, 19, '+'),
                      (15, 15, '+'), (19, 25, '+'), (5, 11, '+')]
    reads = t.get_reads(gl.location(loc="chr1:1-100"))
    self.assertTrue(
        all((r[li], r[ri], r[st]) in expected_reads for r in reads))
    self.assertEqual(len(reads), len(expected_reads))
def test_get_reads(self):
    """get_reads() returns the correct reads across a range of edge cases."""

    def check(loc_str, expected):
        # Can't test chrom: track1 does not return the chromosome.
        reads = self.t.get_reads(gl.location(loc=loc_str))
        self.assertTrue(
            all((r[li], r[ri], r[st]) in expected for r in reads))
        self.assertEqual(len(reads), len(expected))

    # A range of reads and edges.
    check("chr1:1-100",
          [(10, 20, '+'), (10, 20, '+'), (21, 22, '+'), (23, 23, '+'),
           (1, 100, '+'), (9, 19, '+'), (15, 15, '+'), (19, 25, '+'),
           (5, 11, '+')])

    # An inner get for a long read.
    check("chrX:10-20", [(1, 1000, '+')])

    # Negative-strand read.
    check("chr3:1-5", [(1, 2, '-')])

    # Empty region test.
    self.assertEqual(
        len(self.t.get_reads(gl.location(loc="chr3:50-100"))), 0)

    # Overspanning read test.
    check("chr1:10-20",
          [(10, 20, '+'), (10, 20, '+'), (1, 100, '+'), (9, 19, '+'),
           (5, 11, '+'), (19, 25, '+'), (15, 15, '+')])

    # Single point location read test.
    check("chr1:15-15",
          [(15, 15, '+'), (10, 20, '+'), (10, 20, '+'), (1, 100, '+'),
           (9, 19, '+')])
def test_get_array(self):
    """A plain get() (no read_extend) returns the per-base pileup counts."""
    observed = self.t.get(gl.location(loc="chr1:10-20"))
    expected = [5., 5., 4., 4., 4., 5., 4., 4., 4., 5., 4.]
    self.assertListEqual(list(observed), expected)
def test_removeDuplicatesByLoc_delete_any_matches(self):
    """delete_any_matches=True drops every member of a duplicate group,
    keeping only loci that had no match at all.

    Fix: the original bound a local variable named `gl`, shadowing the
    `gl` module alias used by sibling tests in this file — renamed to
    `genes` to avoid confusion (behavior unchanged).
    """
    spans = (
        [(100, 200)] * 8 +       # one large group of exact duplicates
        [(130, 230)] * 2 +       # a second duplicate pair
        [(9800, 9990),           # across a bucket boundary
         (10001, 10200)])        # the two loci that should survive
    rows = [{'loc': glbase3.location(chr=1, left=left, right=right)}
            for left, right in spans]

    genes = glbase3.genelist()
    genes.load_list(rows)

    kept = genes.removeDuplicatesByLoc('pointify_expand', 'loc', 10,
                                       delete_any_matches=True)
    self.assertEqual(len(kept), 2)

    kept = genes.removeDuplicatesByLoc('overlap', 'loc', 0,
                                       delete_any_matches=True)
    self.assertEqual(len(kept), 2)
def generate_track(filename, norm_factor):
    """Create and finalise the reference test track used across the tests.

    Returns the finalised gl.track instance.
    """
    track = gl.track(filename=filename, new=True, name="Test Track",
                     norm_factor=norm_factor)
    # Positive-strand reads covering the edge cases the tests probe.
    plus_strand = [
        "chr1:10-20",
        "chr1:10-20",    # test duplicates
        "chr1:21-22",    # test 1's
        "chr1:23-23",    # test single outside
        "chr1:1-100",    # test massive span
        "chr1:9-19",     # inside test
        "chr1:15-15",    # 1bp inside
        "chr1:19-25",    # over right border
        "chr1:5-11",     # over left border
        "chrX:1-1000",   # letter chromosome
        "chr2:2-2000",   # other numeric chr
    ]
    for span in plus_strand:
        track.add_location(gl.location(loc=span))
    # Negative-strand reads: test strand handling.
    for span in ("chr3:1-2", "chr10:1-2", "chr100:1-2"):
        track.add_location(gl.location(loc=span), strand="-")
    track.finalise()
    return track
def load_bed(self, filename, out_filename, expand_bed=0):
    '''
    **Purpose**
        Load in a BED file, ideally output by collect_valid_pairs.py,
        although I guess any valid BED will do.

        This function is not officially part of te_hic, but could be useful
        to annotate (for example) a BED list of peaks from a ChIP-seq.

    **Arguments**
        filename (Required)
            filename of the BED file

        out_filename (Required)
            filename to write the annotated, tab-separated output to

        expand_bed (Optional, default=0)
            Optionally expand the BED coordinates left and right by expand_bed
    '''
    assert filename, 'You must specify a filename'

    done = 0
    bucket_size = glbase3.config.bucket_size

    output = []
    oh = open(filename, 'r')
    for idx, line in enumerate(oh):
        line = line.strip().split('\t')

        # reach into the genelist guts...
        # work out which of the buckets is required:
        loc = glbase3.location(chr=line[0], left=int(line[1])-expand_bed, right=int(line[2])+expand_bed)
        # Bucket indices are floored multiples of bucket_size spanning the
        # (possibly expanded) location.
        left_buck = ((loc["left"]-1)//bucket_size) * bucket_size
        right_buck = ((loc["right"])//bucket_size) * bucket_size
        buckets_reqd = range(left_buck, right_buck+bucket_size, bucket_size)
        result = []

        # get the ids reqd.
        loc_ids = set()
        if buckets_reqd:
            for buck in buckets_reqd:
                if buck in self.genome.buckets[loc["chr"]]:
                    loc_ids.update(self.genome.buckets[loc["chr"]][buck]) # set = unique ids

        # Candidate ids from the buckets still need a precise overlap test.
        for index in loc_ids:
            #print loc.qcollide(self.linearData[index]["loc"]), loc, self.linearData[index]["loc"]
            if loc.qcollide(self.genome.linearData[index]["loc"]):
                result.append(self.genome.linearData[index])

        # Collect the names/types of every overlapping annotation;
        # assumes each genome entry carries 'name' and 'type' keys.
        read1_feat = []
        read1_type = []
        if result:
            for r in result:
                read1_feat.append(r['name'])
                read1_type.append(r['type'])

        # Collapse to unique, comma-separated strings; 'None' if no overlap.
        if read1_feat:
            read1_feat = ', '.join(set(read1_feat))
            read1_type = ', '.join(set(read1_type))
        else:
            read1_feat = 'None'
            read1_type = 'None'

        output.append('\t'.join(line[0:3] + [read1_feat, read1_type]))
        #print(output[-1])

        done += 1
        if done % 1000000 == 0:
            print('Processed: {:,}'.format(done))
            #break

    print('Processed {:,} reads'.format(len(output)))
    oh.close()

    out = open(out_filename, 'w')
    for o in output:
        out.write('%s\n' % o)
    out.close()
def test_get(self):
    """get() on the flat track returns the stored per-base values."""
    observed = self.t.get(glbase3.location(loc="chr1:10-20"))
    self.assertListEqual(
        list(observed),
        [10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0])