class TestMaize(unittest.TestCase):
    """Exercise BLAST parsing against a rice (query) / maize (subject) pair.

    All fixtures are read from absolute paths on the original author's
    machine, so the tests only run where those files exist.
    """

    def setUp(self):
        """Load the BLAST fixture, the unmasked FASTA and both BED files."""
        blast_path = (
            '/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt'
        )
        # Read inside a context manager so the fixture handle is closed
        # deterministically instead of being leaked once per test.
        with open(blast_path) as handle:
            blast_lines = handle.readlines()
        self.blast_str = ' , '.join(blast_lines)

        self.unmasked_fasta = Fasta('/Users/gturco/find_cns/maize_v2_UM.fasta')

        self.qbed = Bed('/Users/gturco/rice_maize/rice_v6.bed')
        self.qbed.fill_dict()
        self.sbed = Bed('/Users/gturco/maize/maize_v2.bed',
                        '/Users/gturco/maize/maize_v2.fasta')
        self.sbed.fill_dict()

        self.sfeat = self.sbed.accn('GRMZM2G086714')
        self.qfeat = self.qbed.accn('Os09g27050')

    def test_get_cmd(self):
        """Placeholder: only names the split FASTA inputs, asserts nothing."""
        # NOTE(review): both paths are currently unused; kept because they
        # record which inputs this test was meant to cover.
        sfasta = 'data/rice_v6_maize_v2/maize_v2_split/2.fasta'
        qfasta = 'data/rice_v6_maize_v2/rice_v6_split/4.fasta'

    def test_parse_balse(self):
        """Run parse_blast on the fixture and print the result (smoke test)."""
        orientation = -1
        cns = parse_blast(self.blast_str, orientation, self.qfeat, self.sfeat,
                          self.qbed, self.sbed, 12000, 26000,
                          self.unmasked_fasta)
        # Single-argument call form prints identically under Python 2 and 3.
        print(cns)
class TestPseudo(unittest.TestCase):
    """Tests for CDS grouping, crossing-hit removal and ORF lookup.

    Fixtures are read relative to the current working directory
    (``data/rice_v6_setaria64/...``, ``blast_res``, ``blast_2``).
    """

    def setUp(self):
        """Load both BED/FASTA pairs and group the ``blast_res`` hits."""
        self.qallbed = Bed("data/rice_v6_setaria64/rice_v6.all.bed",
                           "data/rice_v6_setaria64/rice_v6.fasta")
        self.qallbed.fill_dict()
        self.sallbed = Bed("data/rice_v6_setaria64/setaria64.all.bed",
                           "data/rice_v6_setaria64/setaria64.fasta")
        self.sallbed.fill_dict()
        self.saccn = self.sallbed.accn("Si000834m")

        # Context manager closes the fixture instead of leaking the handle.
        with open("blast_res") as blastfh:
            self.blast = blastfh.read()
        self.d, self.pseudo = group_cds(self.blast, self.saccn)

    def test_group_cds_1(self):
        """The setUp fixture yields 4 groups holding 38 entries in total."""
        self.assertEqual(len(self.d.keys()), 4)
        total_values = sum(len(self.d[key]) for key in self.d.keys())
        self.assertEqual(total_values, 38)

    def test_group_cds_2(self):
        """``blast_2`` yields 5 groups with exactly one entry each."""
        with open("blast_2") as blast_2fh:
            blast_2 = blast_2fh.read()
        d, pseudo = group_cds(blast_2, self.sallbed.accn("Si002524m"))
        self.assertEqual(len(d.keys()), 5)
        for key in d.keys():
            self.assertEqual(1, len(d[key]))

    def test_append_to_included_groups(self):
        """Check which group keys receive the locs tuple for a fixed input."""
        locs = [1, 2, 3, 4]
        group_dict = {(2, 5): [], (3, 6): [], (9, 8): []}
        result_dict = append_to_included_groups(locs, group_dict)
        expected = {(2, 5): [(1, 2, 3, 4)], (3, 6): [(1, 2, 3, 4)], (9, 8): []}
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(expected, result_dict)

    def test_remove_crossing_hit(self):
        """Smoke test: filter every group and run bites on multi-hit results."""
        qaccn = self.qallbed.accn("Os01g01890")
        for group_key in self.d.keys():
            exon_hits = self.d[group_key]
            non_crossing = remove_crossing_hits(exon_hits, qaccn, self.saccn)
            if len(non_crossing) > 1:
                # No assertion on the values; the unpacking only checks that
                # bites returns three items without raising.
                mid, start, stop = bites(non_crossing)

    def test_find_orf(self):
        """find_orf on Os01g01295 returns 141083 (compared after adding 1)."""
        qaccn = self.qallbed.accn("Os01g01295")
        orf = find_orf(self.qallbed, qaccn)
        self.assertEqual(orf + 1, 141084)

    def test_find_orf_neg(self):
        """find_orf on Si001539m returns 7662777."""
        # NOTE(review): the name suggests a minus-strand gene; confirm
        # against the BED file.
        saccn = self.sallbed.accn("Si001539m")
        orf = find_orf(self.sallbed, saccn)
        self.assertEqual(orf, 7662777)
def main(cns_path, fmt, query_bed_path, subject_bed_path):
    """Classify every CNS record and resolve proximal/distal placeholders.

    Pass one converts the four coordinate fields to ``int``, builds the line
    geometries and delegates to ``cns_type`` and ``create_utr_list``.  Pass
    two rewrites the ``"5-prox_dist"`` / ``"3-prox_dist"`` types to a
    proximal or distal label using a 1000-unit cut-off measured from the
    extremes stored in ``utr_dict`` for the query accession.

    Returns the (mutated) collection produced by ``cns_to_dic``.
    """
    records = cns_to_dic(cns_path, fmt)
    qbed = Bed(query_bed_path)
    sbed = Bed(subject_bed_path)
    utr_dict = {}

    for cns in records:
        # Same conversion order as before: qstop, qstart, sstop, sstart.
        for field in ('qstop', 'qstart', 'sstop', 'sstart'):
            cns[field] = int(cns[field])

        qfeat = qbed.accn(cns['qaccn'])
        sfeat = sbed.accn(cns['saccn'])

        space_start = min(qfeat['locs'])[0]
        space_end = max(qfeat['locs'])[1]
        qgene_space_poly = LineString([(0.0, space_start), (0.0, space_end)])
        qgene_poly = LineString([(0.0, qfeat['start']), (0.0, qfeat['end'])])
        sgene_poly = LineString([(0.0, sfeat['start']), (0.0, sfeat['end'])])
        qcns = LineString([(0, cns['qstart']), (0, cns['qstop'])])
        scns = LineString([(0, cns['sstart']), (0, cns['sstop'])])

        cns_type(cns, qgene_space_poly, qgene_poly, sgene_poly, scns, qcns,
                 space_start, qfeat)
        create_utr_list(utr_dict, qfeat, cns, "q")
        create_utr_list(utr_dict, sfeat, cns, "s")

    # Placeholder type -> (label within the cut-off, label beyond it).
    final_labels = {
        "5-prox_dist": ("5-proximal", "5-distal"),
        "3-prox_dist": ("3-proximal", "3-distal"),
    }

    for cns in records:
        placeholder = cns['type']
        if placeholder not in final_labels:
            continue

        extremes = utr_dict[cns['qaccn']]
        gene_lo = min(extremes)
        gene_hi = max(extremes)

        if placeholder == "5-prox_dist":
            plus_gap = abs(gene_lo - cns["qstop"])
            minus_gap = abs(gene_hi - cns["qstart"])
        else:
            plus_gap = abs(cns["qstart"] - gene_hi)
            minus_gap = abs(cns["qstop"] - gene_lo)

        strand = cns["qstrand"]
        if strand == "+":
            gap = plus_gap
        elif strand == "-":
            gap = minus_gap
        else:
            # Any other strand value leaves the placeholder untouched.
            continue

        near_label, far_label = final_labels[placeholder]
        cns["type"] = near_label if gap <= 1000 else far_label

    return records
def utr_present(cns_pck, query_bed_path, UTR):
    """Flag genes whose 3' or 5' UTR is missing as 'ND' in the gene table.

    For every record in the pickled CNS list, the query feature is looked up
    in the BED file.  The gene boundary on the requested side (strand aware)
    is compared with the outermost coordinates in ``qfeat['locs']``; when it
    coincides with either of them the matching ``3_UTR`` / ``5_UTR`` column
    of ``MUSA_GENE_LIST_copy`` is set to ``'ND'`` for that accession.

    Args:
        cns_pck: path to a pickle file holding an iterable of CNS dicts
            (each needs a ``'qaccn'`` key).
        query_bed_path: path handed to ``Bed`` for the query genome.
        UTR: ``3`` or ``5``; any other value performs the lookups but issues
            no update.
    """
    db = MySQLdb.connect(host="127.0.0.1", user="******", db="rice_gene_table")
    try:
        cursor = db.cursor()

        # Binary mode is what pickle expects, and the handle is now closed.
        # NOTE: pickle.load can execute arbitrary code; only feed this
        # function pickle files from a trusted source.
        with open(cns_pck, "rb") as cns_handle:
            cns_pickle = pickle.load(cns_handle)
        query_bed = Bed(query_bed_path)

        for cns in cns_pickle:
            qaccn = cns['qaccn']
            qfeat = query_bed.accn(qaccn)

            # On the minus strand the gene "start" is the larger coordinate.
            if qfeat['strand'] == "+":
                start, end = qfeat["start"], qfeat['end']
            else:
                start, end = qfeat["end"], qfeat['start']

            if UTR == 3:
                boundary, column = end, "3_UTR"
            elif UTR == 5:
                boundary, column = start, "5_UTR"
            else:
                continue

            locs = qfeat['locs']
            if boundary == min(locs)[0] or boundary == max(locs)[1]:
                # The column name comes from the fixed choice above; the
                # accession is bound by the driver rather than interpolated.
                stmt = ("update MUSA_GENE_LIST_copy set MUSA_GENE_LIST_copy.{0} = 'ND' "
                        "where MUSA_GENE_LIST_copy.Rice_MSU6_genes = %s").format(column)
                # Log line only: renders the statement as it used to be printed.
                print(stmt % ("'{0}'".format(qaccn),))
                cursor.execute(stmt, (qaccn,))

        # Without an explicit commit the updates are discarded on
        # transactional tables when the connection goes away.
        db.commit()
    finally:
        db.close()
def utr_present(cns_pck, query_bed_path, UTR):
    """Flag genes whose 3' or 5' UTR is missing as 'ND' in the gene table.

    For every record in the pickled CNS list, the query feature is looked up
    in the BED file.  The gene boundary on the requested side (strand aware)
    is compared with the outermost coordinates in ``qfeat['locs']``; when it
    coincides with either of them the matching ``3_UTR`` / ``5_UTR`` column
    of ``MUSA_GENE_LIST_copy`` is set to ``'ND'`` for that accession.

    Args:
        cns_pck: path to a pickle file holding an iterable of CNS dicts
            (each needs a ``'qaccn'`` key).
        query_bed_path: path handed to ``Bed`` for the query genome.
        UTR: ``3`` or ``5``; any other value performs the lookups but issues
            no update.
    """
    db = MySQLdb.connect(host="127.0.0.1", user="******", db="rice_gene_table")
    try:
        cursor = db.cursor()

        # Binary mode is what pickle expects, and the handle is now closed.
        # NOTE: pickle.load can execute arbitrary code; only feed this
        # function pickle files from a trusted source.
        with open(cns_pck, "rb") as cns_handle:
            cns_pickle = pickle.load(cns_handle)
        query_bed = Bed(query_bed_path)

        for cns in cns_pickle:
            qaccn = cns['qaccn']
            qfeat = query_bed.accn(qaccn)

            # On the minus strand the gene "start" is the larger coordinate.
            if qfeat['strand'] == "+":
                start, end = qfeat["start"], qfeat['end']
            else:
                start, end = qfeat["end"], qfeat['start']

            if UTR == 3:
                boundary, column = end, "3_UTR"
            elif UTR == 5:
                boundary, column = start, "5_UTR"
            else:
                continue

            locs = qfeat['locs']
            if boundary == min(locs)[0] or boundary == max(locs)[1]:
                # The column name comes from the fixed choice above; the
                # accession is bound by the driver rather than interpolated.
                stmt = ("update MUSA_GENE_LIST_copy set MUSA_GENE_LIST_copy.{0} = 'ND' "
                        "where MUSA_GENE_LIST_copy.Rice_MSU6_genes = %s").format(column)
                # Log line only: renders the statement as it used to be printed.
                print(stmt % ("'{0}'".format(qaccn),))
                cursor.execute(stmt, (qaccn,))

        # Without an explicit commit the updates are discarded on
        # transactional tables when the connection goes away.
        db.commit()
    finally:
        db.close()
def main(cns_path, fmt, query_bed_path, subject_bed_path):
    """Classify every CNS record and resolve proximal/distal placeholders.

    Pass one converts the four coordinate fields to ``int``, builds the line
    geometries and delegates to ``cns_type`` and ``create_utr_list``.  Pass
    two rewrites the ``"5-prox_dist"`` / ``"3-prox_dist"`` types to a
    proximal or distal label using a 1000-unit cut-off measured from the
    extremes stored in ``utr_dict`` for the query accession.

    Returns the (mutated) collection produced by ``cns_to_dic``.
    """
    records = cns_to_dic(cns_path, fmt)
    qbed = Bed(query_bed_path)
    sbed = Bed(subject_bed_path)
    utr_dict = {}

    for cns in records:
        # Same conversion order as before: qstop, qstart, sstop, sstart.
        for field in ('qstop', 'qstart', 'sstop', 'sstart'):
            cns[field] = int(cns[field])

        qfeat = qbed.accn(cns['qaccn'])
        sfeat = sbed.accn(cns['saccn'])

        space_start = min(qfeat['locs'])[0]
        space_end = max(qfeat['locs'])[1]
        qgene_space_poly = LineString([(0.0, space_start), (0.0, space_end)])
        qgene_poly = LineString([(0.0, qfeat['start']), (0.0, qfeat['end'])])
        sgene_poly = LineString([(0.0, sfeat['start']), (0.0, sfeat['end'])])
        qcns = LineString([(0, cns['qstart']), (0, cns['qstop'])])
        scns = LineString([(0, cns['sstart']), (0, cns['sstop'])])

        cns_type(cns, qgene_space_poly, qgene_poly, sgene_poly, scns, qcns,
                 space_start, qfeat)
        create_utr_list(utr_dict, qfeat, cns, "q")
        create_utr_list(utr_dict, sfeat, cns, "s")

    # Placeholder type -> (label within the cut-off, label beyond it).
    final_labels = {
        "5-prox_dist": ("5-proximal", "5-distal"),
        "3-prox_dist": ("3-proximal", "3-distal"),
    }

    for cns in records:
        placeholder = cns['type']
        if placeholder not in final_labels:
            continue

        extremes = utr_dict[cns['qaccn']]
        gene_lo = min(extremes)
        gene_hi = max(extremes)

        if placeholder == "5-prox_dist":
            plus_gap = abs(gene_lo - cns["qstop"])
            minus_gap = abs(gene_hi - cns["qstart"])
        else:
            plus_gap = abs(cns["qstart"] - gene_hi)
            minus_gap = abs(cns["qstop"] - gene_lo)

        strand = cns["qstrand"]
        if strand == "+":
            gap = plus_gap
        elif strand == "-":
            gap = minus_gap
        else:
            # Any other strand value leaves the placeholder untouched.
            continue

        near_label, far_label = final_labels[placeholder]
        cns["type"] = near_label if gap <= 1000 else far_label

    return records
def main(bedfile, seqfile, gene_list):
    """Print a CSV header, then search and graph each listed gene.

    For every name in ``gene_list`` the gene is fetched from the BED file,
    ``get_prom`` supplies a pair of sequences, each is passed through
    ``find_seq`` and both results go to ``make_graph``.  The gene name is
    printed before the searches run.
    """
    print("position,gene,element")
    bed = Bed(bedfile)
    fasta = Fasta(seqfile)

    for gene_name in gene_list:
        # presumably the forward / reverse promoter sequences -- confirm
        # against get_prom
        prom_fwd, prom_rev = get_prom(fasta, bed.accn(gene_name))
        print(gene_name)
        hits_fwd = find_seq(prom_fwd)
        hits_rev = find_seq(prom_rev)
        make_graph(hits_fwd, hits_rev, gene_name)
class TestMaize(unittest.TestCase):
    """Exercise BLAST parsing against a rice (query) / maize (subject) pair.

    All fixtures are read from absolute paths on the original author's
    machine, so the tests only run where those files exist.
    """

    def setUp(self):
        """Load the BLAST fixture, the unmasked FASTA and both BED files."""
        blast_path = (
            "/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt"
        )
        # Read inside a context manager so the fixture handle is closed
        # deterministically instead of being leaked once per test.
        with open(blast_path) as handle:
            blast_lines = handle.readlines()
        self.blast_str = " , ".join(blast_lines)

        self.unmasked_fasta = Fasta("/Users/gturco/find_cns/maize_v2_UM.fasta")

        self.qbed = Bed("/Users/gturco/rice_maize/rice_v6.bed")
        self.qbed.fill_dict()
        self.sbed = Bed("/Users/gturco/maize/maize_v2.bed",
                        "/Users/gturco/maize/maize_v2.fasta")
        self.sbed.fill_dict()

        self.sfeat = self.sbed.accn("GRMZM2G086714")
        self.qfeat = self.qbed.accn("Os09g27050")

    def test_get_cmd(self):
        """Placeholder: only names the split FASTA inputs, asserts nothing."""
        # NOTE(review): both paths are currently unused; kept because they
        # record which inputs this test was meant to cover.
        sfasta = "data/rice_v6_maize_v2/maize_v2_split/2.fasta"
        qfasta = "data/rice_v6_maize_v2/rice_v6_split/4.fasta"

    def test_parse_balse(self):
        """Run parse_blast on the fixture and print the result (smoke test)."""
        orientation = -1
        cns = parse_blast(
            self.blast_str, orientation, self.qfeat, self.sfeat,
            self.qbed, self.sbed, 12000, 26000, self.unmasked_fasta
        )
        # Single-argument call form prints identically under Python 2 and 3.
        print(cns)
def main(cns_file, bedpath, fastapath):
    """Write position-class FASTA output for every gene with CNS data.

    Genes come from ``get_genespace(cns_file)``; any gene the BED file does
    not know (``KeyError``) is skipped.  For the rest, a window reaching
    12000 positions beyond each end of the gene (clamped at 0 on the low
    side) is passed to ``write_to_pos_fasta`` together with the gene's
    sorted ``locs`` and the handles returned by ``open_files``.
    """
    genespace = get_genespace(cns_file)
    bed = Bed(bedpath)
    fasta = Fasta(fastapath)
    categories = ['3_utr', '5_utr', 'intronic', '5_prox', '5_distal',
                  '3_prox', '3_distal']
    fhs = open_files(categories)

    for gene in genespace.keys():
        try:
            feature = bed.accn(gene)
        except KeyError:
            # Gene is absent from the BED file: nothing to write for it.
            continue

        window_start = max(0, feature['start'] - 12000)
        window_end = feature['end'] + 12000
        cnsspace = [(window_start, window_end)]

        # Sorted in place, so the feature's own 'locs' list is reordered too.
        gene_locs = feature['locs']
        gene_locs.sort()
        cnsspace.sort()

        write_to_pos_fasta(bed, feature, gene_locs, cnsspace, fhs, fasta)
def main(cns_file, bedpath, fastapath):
    """Write position-class FASTA output for every gene with CNS data.

    Genes come from ``get_genespace(cns_file)``; any gene the BED file does
    not know (``KeyError``) is skipped.  For the rest, a window reaching
    12000 positions beyond each end of the gene (clamped at 0 on the low
    side) is passed to ``write_to_pos_fasta`` together with the gene's
    sorted ``locs`` and the handles returned by ``open_files``.
    """
    genespace = get_genespace(cns_file)
    bed = Bed(bedpath)
    fasta = Fasta(fastapath)
    categories = [
        '3_utr', '5_utr', 'intronic', '5_prox', '5_distal', '3_prox',
        '3_distal'
    ]
    fhs = open_files(categories)

    for gene in genespace.keys():
        try:
            feature = bed.accn(gene)
        except KeyError:
            # Gene is absent from the BED file: nothing to write for it.
            continue

        window_start = max(0, feature['start'] - 12000)
        window_end = feature['end'] + 12000
        cnsspace = [(window_start, window_end)]

        # Sorted in place, so the feature's own 'locs' list is reordered too.
        gene_locs = feature['locs']
        gene_locs.sort()
        cnsspace.sort()

        write_to_pos_fasta(bed, feature, gene_locs, cnsspace, fhs, fasta)
class TestPerfectTargetRegion(unittest.TestCase): def setUp(self): self.gene_name = "Os01g02110" self.bed = Bed("ricetest.bed") self.fasta = Fasta("ricetest.fasta") self.gene = self.bed.accn(self.gene_name) self.exons = self.gene['locs'] def test_rel_pos(self): self.assertEqual((376,486),rel_pos(self.gene,self.exons[0])) self.assertEqual((1289,1789),rel_pos(self.gene,self.exons[-1])) def test_fasta(self): exon = self.exons[-1] seq = self.fasta[self.gene_name][:] self.assertTrue(1789 <= len(seq)) def test_pattern(self): e = exons[-1] start, stop = rel_pos(self.gene,e) for exon in self.exons: