Example #1
 def setUp(self):
     self.cns_filename = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
     self.pairsfile = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"
     self.qbed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed") ;self.qbed.fill_dict()
     self.sbed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed") ;self.sbed.fill_dict()
     self.cns_dict, self.evalue_dict = get_cns_dict(self.cns_filename)
     self.qpair_map, self.spair_map = make_pair_maps(self.pairsfile, "pair", self.qbed, self.sbed)
Example #2
def print_bed(flist, old_path):
    ipath, ext = op.splitext(old_path)
    path = "%s.with_new%s" % (ipath, ext)

    print >> sys.stderr, "writing to: %s.with_new%s" % (ipath, ext)
    fh = open(path, 'wb')
    seen = {}

    for item in flist:
        # convert the locs to a tuple.
        #print >>sys.stderr, item
        item = list(item)
        item[6] = tuple(item[6])
        item = tuple(item)
        if item in seen: continue
        seen[item] = 1
        locs = item[6]  # tuple(sorted([item[1], item[2]]))

        row = dict(accn=item[3],
                   start=item[1],
                   end=item[2],
                   seqid=item[0],
                   locs=locs,
                   score='.',
                   strand=item[5],
                   rgb='.',
                   thickstart='.',
                   thickend=".")
        print >> fh, Bed.row_string(row)
    fh.close()
    return Bed(path)
Example #3
class TestAssign(unittest.TestCase):
    def setUp(self):
        self.cns_filename = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
        self.pairsfile = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"
        self.qbed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed") ;self.qbed.fill_dict()
        self.sbed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed") ;self.sbed.fill_dict()
        self.cns_dict, self.evalue_dict = get_cns_dict(self.cns_filename)
        self.qpair_map, self.spair_map = make_pair_maps(self.pairsfile, "pair", self.qbed, self.sbed)

    def test_get_cns_dict(self):
        """test for test_get_cns_dict"""
        #print self.cns_dict.keys()
        print "keys!",  self.evalue_dict.keys()


    def test_assign(self):
        assign(self.cns_dict, self.qbed, self.sbed, self.qpair_map, self.spair_map)

    def test_cns_fmt_dict(self):
        for cns, qfeat, sfeat in assign(self.cns_dict, self.qbed, self.sbed, self.qpair_map, self.spair_map):
            d = cns_fmt_dict(cns, qfeat, sfeat, self.evalue_dict)
            print "dddddddd", d

    def test_main(self):
        pass
Example #4
def utr_present(cns_pck,query_bed_path, UTR):
  "checks to see if qaccn has utr region"
  db = MySQLdb.connect(host="127.0.0.1", user="******", db = "rice_gene_table")
  cursor = db.cursor()
  cns_handle = open(cns_pck)
  cns_pickle = pickle.load(cns_handle)
  query_bed = Bed(query_bed_path)
  for cns in cns_pickle:
    qfeat = query_bed.accn(cns['qaccn'])
    if qfeat['strand'] == "+":
      end = qfeat['end']
      start = qfeat["start"]
    else:
      end = qfeat['start']
      start = qfeat["end"]
    if UTR == 3:
      if end == min(qfeat['locs'])[0] or end == max(qfeat['locs'])[1]:
        stmt = "update MUSA_GENE_LIST_copy set MUSA_GENE_LIST_copy.3_UTR = 'ND' where MUSA_GENE_LIST_copy.Rice_MSU6_genes = '{0}'".format(cns['qaccn'])
        print stmt
        cursor.execute(stmt)
    elif UTR == 5:
      if start == min(qfeat['locs'])[0] or start == max(qfeat['locs'])[1]:
        stmt = "update MUSA_GENE_LIST_copy set MUSA_GENE_LIST_copy.5_UTR = 'ND' where MUSA_GENE_LIST_copy.Rice_MSU6_genes = '{0}'".format(cns['qaccn'])
        print stmt
        cursor.execute(stmt)
Example #5
def utr_present(cns_pck, query_bed_path, UTR):
    "checks to see if qaccn has utr region"
    db = MySQLdb.connect(host="127.0.0.1", user="******", db="rice_gene_table")
    cursor = db.cursor()
    cns_handle = open(cns_pck)
    cns_pickle = pickle.load(cns_handle)
    query_bed = Bed(query_bed_path)
    for cns in cns_pickle:
        qfeat = query_bed.accn(cns['qaccn'])
        if qfeat['strand'] == "+":
            end = qfeat['end']
            start = qfeat["start"]
        else:
            end = qfeat['start']
            start = qfeat["end"]
        if UTR == 3:
            if end == min(qfeat['locs'])[0] or end == max(qfeat['locs'])[1]:
                stmt = "update MUSA_GENE_LIST_copy set MUSA_GENE_LIST_copy.3_UTR = 'ND' where MUSA_GENE_LIST_copy.Rice_MSU6_genes = '{0}'".format(
                    cns['qaccn'])
                print stmt
                cursor.execute(stmt)
        elif UTR == 5:
            if start == min(qfeat['locs'])[0] or start == max(
                    qfeat['locs'])[1]:
                stmt = "update MUSA_GENE_LIST_copy set MUSA_GENE_LIST_copy.5_UTR = 'ND' where MUSA_GENE_LIST_copy.Rice_MSU6_genes = '{0}'".format(
                    cns['qaccn'])
                print stmt
                cursor.execute(stmt)
Example #6
class TestAssign(unittest.TestCase):
    def setUp(self):
        self.cns_filename = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
        self.pairsfile = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"
        self.qbed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed")
        self.qbed.fill_dict()
        self.sbed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed")
        self.sbed.fill_dict()
        self.cns_dict, self.evalue_dict = get_cns_dict(self.cns_filename)
        self.qpair_map, self.spair_map = make_pair_maps(
            self.pairsfile, "pair", self.qbed, self.sbed)

    def test_get_cns_dict(self):
        """test for test_get_cns_dict"""
        #print self.cns_dict.keys()
        print "keys!", self.evalue_dict.keys()

    def test_assign(self):
        assign(self.cns_dict, self.qbed, self.sbed, self.qpair_map,
               self.spair_map)

    def test_cns_fmt_dict(self):
        for cns, qfeat, sfeat in assign(self.cns_dict, self.qbed, self.sbed,
                                        self.qpair_map, self.spair_map):
            d = cns_fmt_dict(cns, qfeat, sfeat, self.evalue_dict)
            print "dddddddd", d

    def test_main(self):
        pass
Example #7
 def setUp(self):
     self.old_bed = Bed("data/rice_t_sorghum_v1/sorghum_v1.bed")
     self.missed_bed = Bed(
         "data/rice_t_sorghum_v1/missed_sorghum_v1_from_rice_b.bed")
     self.matches = "data/rice_t_sorghum_v1/missed_sorghum_v1_from_rice_b.matches.txt"
     self.missed_genes = parse_missed_genes(self.matches)
     self.missed_genes_grouped, self.missed_genes_dict = group_genes_in_bed(
         self.missed_genes, self.old_bed, self.missed_bed)
Example #8
 def setUp(self):
     self.qallbed = Bed("data/rice_v6_setaria64/rice_v6.all.bed", "data/rice_v6_setaria64/rice_v6.fasta")
     self.qallbed.fill_dict()
     self.sallbed = Bed("data/rice_v6_setaria64/setaria64.all.bed", "data/rice_v6_setaria64/setaria64.fasta")
     self.sallbed.fill_dict()
     self.saccn = self.sallbed.accn("Si000834m")
     blastfh = open("blast_res")
     self.blast = blastfh.read()
     self.d, self.pseudo = group_cds(self.blast, self.saccn)
Example #9
 def setUp(self):
     self.cns_filename = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
     self.pairsfile = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"
     self.qbed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed")
     self.qbed.fill_dict()
     self.sbed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed")
     self.sbed.fill_dict()
     self.cns_dict, self.evalue_dict = get_cns_dict(self.cns_filename)
     self.qpair_map, self.spair_map = make_pair_maps(
         self.pairsfile, "pair", self.qbed, self.sbed)
Example #10
def main(bedfile,seqfile, gene_list):
    print "position,gene,element"
    b = Bed(bedfile)
    f = Fasta(seqfile)
    for gene_name in gene_list:
        gene = b.accn(gene_name)
        promf, promr = get_prom(f, gene)
        print gene_name
        mf = find_seq(promf)
        mr = find_seq(promr)
        make_graph(mf,mr, gene_name)
Example #11
    def setUp(self):
        handle = open("/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt")
        fh = handle.readlines()
        self.blast_str = " , ".join(fh)
        self.unmasked_fasta = Fasta("/Users/gturco/find_cns/maize_v2_UM.fasta")

        self.qbed = Bed("/Users/gturco/rice_maize/rice_v6.bed")
        self.qbed.fill_dict()
        self.sbed = Bed("/Users/gturco/maize/maize_v2.bed", "/Users/gturco/maize/maize_v2.fasta")
        self.sbed.fill_dict()
        self.sfeat = self.sbed.accn("GRMZM2G086714")
        self.qfeat = self.qbed.accn("Os09g27050")
Example #12
class TestPseudo(unittest.TestCase):
    def setUp(self):
        self.qallbed = Bed("data/rice_v6_setaria64/rice_v6.all.bed", "data/rice_v6_setaria64/rice_v6.fasta")
        self.qallbed.fill_dict()
        self.sallbed = Bed("data/rice_v6_setaria64/setaria64.all.bed", "data/rice_v6_setaria64/setaria64.fasta")
        self.sallbed.fill_dict()
        self.saccn = self.sallbed.accn("Si000834m")
        blastfh = open("blast_res")
        self.blast = blastfh.read()
        self.d, self.pseudo = group_cds(self.blast, self.saccn)

    def test_group_cds_1(self):
        self.assertEqual(len(self.d.keys()), 4)
        total_values = []
        for key in self.d.keys():
            values = len(self.d[key])
            total_values.append(values)
        self.assertEqual(sum(total_values), 38)

    def test_group_cds_2(self):
        blast_2fh = open("blast_2")
        blast_2 = blast_2fh.read()

        d, pseudo = group_cds(blast_2, self.sallbed.accn("Si002524m"))

        self.assertEqual(len(d.keys()), 5)
        for key in d.keys():
            # logging.info('key: {0}'.format(key))

            self.assertEqual(1, len(d[key]))

    def test_append_to_included_groups(self):
        locs = [1, 2, 3, 4]
        group_dict = {(2, 5): [], (3, 6): [], (9, 8): []}

        result_dict = append_to_included_groups(locs, group_dict)
        expected = {(2, 5): [(1, 2, 3, 4)], (3, 6): [(1, 2, 3, 4)], (9, 8): []}

        self.assertEquals(expected, result_dict)

    def test_remove_crossing_hit(self):
        qaccn = self.qallbed.accn("Os01g01890")
        for group_key in self.d.keys():
            exon_hits = self.d[group_key]
            non_crossing = remove_crossing_hits(exon_hits, qaccn, self.saccn)
            if len(non_crossing) > 1:
                mid, start, stop = bites(non_crossing)

    def test_find_orf(self):
        qaccn = self.qallbed.accn("Os01g01295")
        orf = find_orf(self.qallbed, qaccn)
        self.assertEqual(orf + 1, 141084)

    def test_find_orf_neg(self):
        saccn = self.sallbed.accn("Si001539m")
        orf = find_orf(self.sallbed, saccn)
        self.assertEqual(orf, 7662777)
Example #13
def main(cnsfile, qbed_file, sbed_file, pairsfile, pairs_fmt, qdsid, sdsid,qpad,spad):
    qcns_file = qbed_file.replace(".nolocaldups", "_cns.gff")
    assert qcns_file != qbed_file
    qcns_gff = open(qcns_file, 'w')
    print >>qcns_gff, "##gff-version 3"
    if sbed_file != qbed_file:
        scns_file = sbed_file.replace(".nolocaldups", "_cns.gff")
        assert scns_file != sbed_file
        scns_gff = open(scns_file, 'w')
        print >>scns_gff, "##gff-version 3"
    else:
        scns_gff = qcns_gff
    
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()


    cnsdict, evaldict = get_cns_dict(cnsfile)
    qpair_map, spair_map = make_pair_maps(pairsfile, pairs_fmt, qbed, sbed)
    out = sys.stdout

    fmt = "%(cns_id)s,%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
                       "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(eval)s,%(link)s"

    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, qfeat, sfeat in assign(cnsdict,qbed, sbed, qpair_map, spair_map):
        d = cns_fmt_dict(cns, qfeat, sfeat, evaldict)
        d['cns_id'] = cns_id(d)
        if d['sstop'] < d['sstart']:
            d['sstop'], d['sstart'] = d['sstart'], d['sstop']
        d['link'] = cns_link(d, qdsid, sdsid,qpad,spad)
        print >>out, fmt % d
        write_gff(d, qcns_gff, scns_gff)
Example #14
def write_new_bed(gene_list, old_bed, missed_genes, out_file):
    merge_fh = open(out_file, "wb")
    hit_list = [hit for hit, qaccn in missed_genes]
    for i, gene in enumerate(old_bed):
        if gene["accn"] in hit_list: continue
        new_line = Bed.row_string(gene)
        merge_fh.write("{0}\n".format(new_line))
    for i, new_gene in enumerate(gene_list):
        ### merge overlapping here
        updated_feat = gene_list[new_gene]
        if len(updated_feat["locs"]) > 1:
            updated_feat = merge_feats(updated_feat)
        new_line = Bed.row_string(updated_feat)
        merge_fh.write("{0}\n".format(new_line))
Example #15
File: merge.py  Project: gturco/co-anno
def write_new_bed(gene_list, old_bed, missed_genes,out_file):
    merge_fh = open(out_file,"wb")
    hit_list = [hit for hit,qaccn in missed_genes]
    for i,gene in enumerate(old_bed):
        if gene["accn"] in hit_list: continue
        new_line = Bed.row_string(gene)
        merge_fh.write("{0}\n".format(new_line))
    for i,new_gene in enumerate(gene_list):
        ### merge overlapping here
        updated_feat = gene_list[new_gene]
        if len(updated_feat["locs"]) > 1:
            updated_feat = merge_feats(updated_feat)
        new_line = Bed.row_string(updated_feat)
        merge_fh.write("{0}\n".format(new_line))
Example #16
    def setUp(self):
        handle = open(
            '/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt'
        )
        fh = handle.readlines()
        self.blast_str = ' , '.join(fh)
        self.unmasked_fasta = Fasta('/Users/gturco/find_cns/maize_v2_UM.fasta')

        self.qbed = Bed('/Users/gturco/rice_maize/rice_v6.bed')
        self.qbed.fill_dict()
        self.sbed = Bed('/Users/gturco/maize/maize_v2.bed',
                        '/Users/gturco/maize/maize_v2.fasta')
        self.sbed.fill_dict()
        self.sfeat = self.sbed.accn('GRMZM2G086714')
        self.qfeat = self.qbed.accn('Os09g27050')
Example #17
def parse_dups(dups_file, flat):
    #####THIS ONLY WORKS IF WE CHANGE QUOTA
    flat.fill_dict()
    dup_dic = {}
    seen = []

    for line in open(dups_file):
        line = line.strip().split("\t")
        parent = line[0]
        dups = line[1:]
        
        all = [Bed.row_to_dict(flat.d[f]) for f in list(set(line))]
        all.sort(key=operator.itemgetter('start'))
        dup_start = all[0]
        dup_end = all[-1]
        dup_dic[parent] = 'P'
        seen += [parent]
        for dup in dups:
            if dup in seen: continue
            seen.append(dup)
            dup_dic[dup] = parent
        # so here, there are all the genes that arent part of the local dup
        # array, but we want to mark them with 'I'
        intervening = flat.get_features_in_region(dup_start['seqid'], dup_start['start'], dup_end['end'])
        for ii in intervening:
            if ii['accn'] == parent or ii['accn'] == dup_end['accn']: continue
            if not ii['accn'] in dup_dic.keys():
                dup_dic[ii['accn']] = 'I'
    return dup_dic
Example #18
def main_gene(feature_file, query_list_pos, query_list_neg):
    cds = []
    three_all = []
    five_all = []
    feature_bed = Bed(feature_file)
    for feature in feature_bed:
        exon_meth = []
        for e in feature['locs']:
            for i in range(e[0], e[1] + 1):
                if feature["strand"] == "+":
                    matches = query_list_pos[feature['seqid']].find(i, i)
                else:
                    matches = query_list_neg[feature['seqid']].find(i, i)

                exon_meth.append(len(matches))
        cds.append(sum(exon_meth))
        if feature["strand"] == "+":
            five_prime = query_list_pos[feature['seqid']].find(
                int(feature['locs'][0][0]) - 300, int(feature['locs'][0][0]))
            three_prime = query_list_pos[feature['seqid']].find(
                int(feature['locs'][-1][1]),
                int(feature['locs'][-1][1]) + 300)
        elif feature["strand"] == "-":
            three_prime = query_list_pos[feature['seqid']].find(
                int(feature['locs'][0][0]) - 300, int(feature['locs'][0][0]))
            five_prime = query_list_pos[feature['seqid']].find(
                int(feature['locs'][-1][1]),
                int(feature['locs'][-1][1]) + 300)
        three_all.append(len(three_prime))
        five_all.append(len(five_prime))
    return cds, three_all, five_all
Example #19
def main(cns_path, fmt, query_bed_path, subject_bed_path):
  cns_dic = cns_to_dic(cns_path,fmt)
  query_bed = Bed(query_bed_path)
  subject_bed = Bed(subject_bed_path)
  utr_dict = {}
  for cns in cns_dic:
    cns['qstop'] = int(cns['qstop'])
    cns['qstart'] = int(cns['qstart'])
    cns['sstop'] = int(cns['sstop'])
    cns['sstart'] = int(cns['sstart'])
 
    qfeat = query_bed.accn(cns['qaccn'])
    sfeat = subject_bed.accn(cns['saccn']) 
    qgene_space_start = min(qfeat['locs'])[0]
    qgene_space_end = max(qfeat['locs'])[1]
    qgene_space_poly = LineString([(0.0, qgene_space_start), (0.0, qgene_space_end)])
    qgene_poly = LineString([(0.0, qfeat['start']), (0.0, qfeat['end'])])
    sgene_poly = LineString([(0.0, sfeat['start']), (0.0, sfeat['end'])])
    # if intron of one dont need to check other
    qcns = LineString([(0,cns['qstart']),(0,cns['qstop'])])
    scns = LineString([(0,cns['sstart']),(0,cns['sstop'])])
    cns_type(cns,qgene_space_poly, qgene_poly, sgene_poly, scns, qcns,qgene_space_start,qfeat)
    create_utr_list(utr_dict,qfeat, cns,"q")
    create_utr_list(utr_dict,sfeat, cns,"s")
  for cns in cns_dic:
    if cns['type'] == "5-prox_dist":
      qgene_start = min(utr_dict[cns['qaccn']])
      qgene_stop =  max(utr_dict[cns['qaccn']])
      # sstart = min(utr_dict[cns['saccn']])
      # sstop =  max(utr_dict[cns['saccn']])
      five_diff_pos = abs(qgene_start - cns["qstop"])
      five_diff_neg = abs(qgene_stop - cns["qstart"])
      if five_diff_pos <=1000 and cns["qstrand"] == "+" or five_diff_neg <=1000 and cns["qstrand"] == "-":
        cns["type"] = "5-proximal"
      elif five_diff_pos >1000 and cns["qstrand"] == "+" or five_diff_neg >1000 and cns["qstrand"] == "-":
        cns["type"] = "5-distal"
    elif cns['type'] == "3-prox_dist":
      qgene_start = min(utr_dict[cns['qaccn']])
      qgene_stop =  max(utr_dict[cns['qaccn']])
      three_diff_pos =  abs(cns["qstart"] - qgene_stop)
      three_diff_neg =  abs(cns["qstop"] - qgene_start)
      if three_diff_pos <=1000 and cns["qstrand"] == "+" or three_diff_neg <=1000 and cns["qstrand"] == "-":
        cns["type"] = "3-proximal"
      elif three_diff_pos > 1000 and cns["qstrand"] == "+" or three_diff_neg > 1000 and cns["qstrand"] == "-":
        cns["type"] = "3-distal"
  return cns_dic
Example #20
def main(cns_file, bedpath, fastapath):
    genespace = get_genespace(cns_file)
    bed = Bed(bedpath)
    f = Fasta(fastapath)
    handles = ['3_utr','5_utr','intronic','5_prox','5_distal','3_prox','3_distal']
    fhs = open_files(handles)
    for gene in genespace.keys():
        #cnsspace = genespace[gene]
        try:
            accn = bed.accn(gene)
        except KeyError: continue
        cnsspace = [(max(0,accn['start'] - 12000), accn['end'] + 12000)]
        #print "GENESPACE {0}".format(cnsspace)
        locs = accn['locs']
        locs.sort()
        cnsspace.sort()
        write_to_pos_fasta(bed,accn,locs,cnsspace,fhs,f)
Example #21
class LocalDups(object):
    def __init__(self,filename,bed):
        self.filename = filename
        self.bed = Bed(bed)
        self.bed.fill_dict()

    def get_order_dups(self):
        d = {}
        dup_fh = open(self.filename)  # keep a handle so the file can be closed below
        for line in dup_fh:
            dupline = DupLine(line)
            dups = dupline.get_order(self.bed)
            d[dups[0]['accn']] = "P"
            for dup in dups[1:]:
                d[dup['accn']] = dups[0]['accn']
            intervening = dupline.get_interving_genes(self.bed)
            for i in intervening:
                if i in d.keys():continue
                d[i] = "I"
        dup_fh.close()
        return d

    def write_ordered(self,out_fh):
        """write localdups to outfile"""
        localdup_fh = open(out_fh, "w")
        d = {}
        for line in open(self.filename):
            dupline = DupLine(line)
            dups = dupline.get_order(self.bed)
            line = "{0}\n".format("\t".join(dups))
            localdup_fh.write(line)
        localdup_fh.close()


    def get_dups(self):
        d = {}
        dup_fh = open(self.filename)
        for line in dup_fh:
            dupline = DupLine(line)
            d[dupline.parent] = 'P'
            for dup in dupline.children:
                d[dup] = dupline.parent
            intervening = dupline.get_interving_genes(self.bed)
            for i in intervening:
                if i in d.keys(): continue
                d[i] = "I"
        dup_fh.close()
        return d
Example #22
def write_genelist(q_or_s, outfile, flat, pairs, orthos, mcnss, link_fmt, this_org, other_org,
        other_flat, dups, local_dups):
    # used in the link_fmt
    qorg, sorg = this_org, other_org

    fmt = "%(accn)s\t%(seqid)s\t%(start)i\t%(end)i\t%(ortholog)s\t%(ortho_cns)s\t"
    fmt +="%(regional_dup_info)s\t%(local_dup_info)s\t%(strand)s\t"
    fmt += "%(new_gene_info)s\t%(link)s"
    header = fmt.replace('%(', '').replace(')s','').replace(')i','')

    outdir = op.dirname(flat.path)
    annos = dict([kv.rstrip().split(",") for kv in open("%s/%s_protein_rna.anno" % (outdir, q_or_s))])
    if flat.path == other_flat.path:
        annos.update(dict([kv.rstrip().split(",") for kv in open("%s/s_protein_rna.anno" % (outdir,))]))

    out = open(outfile, 'w')
    print >>sys.stderr, "writing genelist to %s" % (outfile,)
    print >>out, header.replace('ortho_', other_org + '_')

    same_org = this_org == other_org
    for feat in flat:

        these_pairs = pairs.get(feat['accn'], [])
        cnss = mcnss.get(feat['accn'], [])

        ortholog, other_pairs = split_pairs(feat, [other_flat.d[t] for t in these_pairs], orthos, q_or_s=='s')
        ortho_cns, non_ortho_cns = split_cns(cnss, orthos, q_or_s=='s')
        regional_dup_info = dups.get(feat['accn'], '')
        local_dup_info = local_dups.get(feat['accn'], '')

        if ortholog:
            ortho = ortholog[0]
            link = link_fmt % dict(qorg=qorg, sorg=sorg,
                                   accn1=ortho['accn'], accn2=feat['accn']
                                  )
        else:
            link = ''

        new_gene_info = ""
        if feat['accn'].endswith(("_cns_protein", "_cns_rna")):
            try:
                new_gene_info = annos[feat['accn']]
            except KeyError: # from coannoation of previous run.
                pass

        ortholog = len(ortholog) and ",".join([o["accn"] for o in ortholog]) or ""
        if len(ortho_cns) > 0 and len(ortholog) == 0:
           print >>sys.stderr, "\nBAD", feat, "\n", ortho_cns, "\nthese:", these_pairs, "\nother:", other_pairs, "\n\n"
           # fell right on the edge of a syntenic block. the cns got in, but not the gene.
           #1/0

        other_pairs = ",".join([o["accn"] for o in other_pairs])
        fmt_dict = locals()
        fmt_dict.update(Bed.row_to_dict(feat))
        fmt_dict.update({'ortho_cns': len(ortho_cns) if ortholog else "",
                         'ortho_NON_cns_count': len(non_ortho_cns) if
                         other_pairs else ""})
        print >>out, fmt % fmt_dict
Example #23
class LocalDups(object):
    def __init__(self, filename, bed):
        self.filename = filename
        self.bed = Bed(bed)
        self.bed.fill_dict()

    def get_order_dups(self):
        d = {}
        dup_fh = open(self.filename)  # keep a handle so the file can be closed below
        for line in dup_fh:
            dupline = DupLine(line)
            dups = dupline.get_order(self.bed)
            d[dups[0]['accn']] = "P"
            for dup in dups[1:]:
                d[dup['accn']] = dups[0]['accn']
            intervening = dupline.get_interving_genes(self.bed)
            for i in intervening:
                if i in d.keys(): continue
                d[i] = "I"
        dup_fh.close()
        return d

    def write_ordered(self, out_fh):
        """write localdups to outfile"""
        localdup_fh = open(out_fh, "w")
        d = {}
        for line in open(self.filename):
            dupline = DupLine(line)
            dups = dupline.get_order(self.bed)
            line = "{0}\n".format("\t".join(dups))
            localdup_fh.write(line)
        localdup_fh.close()

    def get_dups(self):
        d = {}
        dup_fh = open(self.filename)
        for line in dup_fh:
            dupline = DupLine(line)
            d[dupline.parent] = 'P'
            for dup in dupline.children:
                d[dup] = dupline.parent
            intervening = dupline.get_interving_genes(self.bed)
            for i in intervening:
                if i in d.keys(): continue
                d[i] = "I"
        dup_fh.close()
        return d
Example #24
def merge_flat(new_name, aflat, bflat):
    """take 2 flat files and return a new one that is the union of the 2
      existing"""
    seen = {}
    both = []
    for flat in (aflat, bflat):
        for row in flat:
            key = row['seqid'], row['accn']
            if key in seen: continue
            seen[key] = True
            both.append(row)
            both.sort(key=lambda a: (a['seqid'], a['start']))
    fh = open(new_name, "w")
    #print >>fh, "\t".join(Flat.names)
    for b in both:
        print >> fh, Bed.row_string(b)
    fh.close()
    return Bed(fh.name)
Example #25
def main(cns_file, bedpath, fastapath):
    genespace = get_genespace(cns_file)
    bed = Bed(bedpath)
    f = Fasta(fastapath)
    handles = [
        '3_utr', '5_utr', 'intronic', '5_prox', '5_distal', '3_prox',
        '3_distal'
    ]
    fhs = open_files(handles)
    for gene in genespace.keys():
        #cnsspace = genespace[gene]
        try:
            accn = bed.accn(gene)
        except KeyError:
            continue
        cnsspace = [(max(0, accn['start'] - 12000), accn['end'] + 12000)]
        #print "GENESPACE {0}".format(cnsspace)
        locs = accn['locs']
        locs.sort()
        cnsspace.sort()
        write_to_pos_fasta(bed, accn, locs, cnsspace, fhs, f)
Example #26
File: merge.py  Project: gturco/find_cns
def merge_same_hits(missed, fh_match, org_bed):
    """ groups genes that hit more than once """
    d = {}
    handle = open(fh_match)
    matches = handle.read()
    org_bed_path = org_bed.path
    path = org_bed_path.split('/')
    dirc = '/'.join(path[:-1])
    org = path[-1]
    fh = open('{0}/missed_from_{1}'.format(dirc,org), "wb")
    for match in matches.split('\n')[:-1]:
        qaccn,saccn = match.split('\t')
        #create dictionary
        try:
            seqid = missed.accn(qaccn)['seqid']
            haccn = missed.accn(qaccn)
        except KeyError: continue
        #if near_gene(haccn,org_bed)==True: continue
        if (seqid,saccn) not in d.keys():
            #append whole dict to keys
            d[(seqid,saccn)]= missed.accn(qaccn)
        else:
            #else add locs to exsting one
            gene_start = min(d[(seqid,saccn)]['locs'])[0]
            gene_end = max(d[(seqid,saccn)]['locs'])[1]
            missed_end = missed.accn(qaccn)['locs'][0][1]
            missed_start = missed.accn(qaccn)['locs'][0][0]
            if missed_end < gene_start:
                # if no intervening genes and they are close together...
                intervening_genes = get_intervening_genes(missed_end,gene_start,seqid, org_bed, d[(seqid,saccn)]['accn'])
                if intervening_genes is False:
                    d[(seqid,saccn)]['locs'] =  d[(seqid,saccn)]['locs'] + missed.accn(qaccn)['locs']
                    d[(seqid,saccn)]['start'] = missed_start
                    if 'Os' in qaccn:
                        d[seqid, saccn]['accn'] = qaccn
                else:
                    d[(seqid,qaccn)] = missed.accn(qaccn)
            elif gene_end < missed_start:
                intervening_genes = get_intervening_genes(gene_end,missed_start,seqid, org_bed,d[(seqid,saccn)]["accn"])
                if intervening_genes is False:
                    d[(seqid,saccn)]['locs'] =  d[(seqid,saccn)]['locs'] + missed.accn(qaccn)['locs']
                    d[(seqid,saccn)]['end'] = missed_end
                    if 'Os' in qaccn:
                        d[seqid,saccn]['accn'] = qaccn
                else:
                    d[(seqid,qaccn)]= missed.accn(qaccn)
            else:
                d[(seqid,saccn)]['locs'] =  d[(seqid,saccn)]['locs'] + missed.accn(qaccn)['locs']
        
    for key in d.keys():
        new_row = d[key]['locs'].sort()
        row = d[key]
        print >>fh, Bed.row_string(row)
Example #27
File: merge.py  Project: gturco/find_cns
def merge(org_bed, missed, merge_file):
    """creates blast.all file and updates everything"""
    merge_fh = open(merge_file, "w")
    #cds_missed = missed[missed['ftype'] == 'CDS']
    #count = org_bed.shape[0] + missed[missed['ftype'] !='CDS'].shape[0]
    new_rows = []
    seen_accns = {}
    # CDS added to existing gene.
    for row_missed in missed:
        if row_missed['accn'] in seen_accns: continue
        try:
            org_bed_row = org_bed.accn(row_missed['accn'])
             # it's a CDS
        except KeyError:
            #its a new gene
            new_rows.append(row_missed)
            seen_accns[row_missed['accn']] = True
            continue
        locs_interval = Intersecter()
        [locs_interval.add_interval(Feature(start,stop)) for start,stop in org_bed_row['locs']]
        for missed_start,missed_end in row_missed['locs']:
            if len(locs_interval.find(missed_start,missed_end)) > 0:
#                print >>sys.stderr, org_bed_row['accn']
                locs_intersects = [(l.start,l.stop) for l in locs_interval.find(missed_start,missed_end)]
                [org_bed_row['locs'].remove(locs_intersect) for locs_intersect in locs_intersects]
                locs_intersects = set(locs_intersects)
                locs_intersects.add((missed_start, missed_end))
                locs_start = min([start for start,end in locs_intersects])
                locs_end = max([end for start,end in locs_intersects])
                org_bed_row['locs'] = org_bed_row['locs'] + [(locs_start,locs_end)]
                row_missed['locs'].remove((missed_start,missed_end))

        org_bed_row['locs'] = org_bed_row['locs'] + row_missed['locs']
        #print >>sys.stderr, "{0},{1}".format(row_missed['accn'], locs)
        org_bed_row['locs'].sort()
        org_bed_row['start'] = min(min([start for start,end in org_bed_row['locs']]), org_bed_row['start'])
        org_bed_row['end'] = max(max([end for start,end in org_bed_row['locs']]), org_bed_row['end'])
        new_rows.append(org_bed_row)
        seen_accns[org_bed_row['accn']] =True

    for org_bed_rw in org_bed:
        if org_bed_rw['accn'] not in seen_accns:
            new_rows.append(org_bed_rw)
            seen_accns[org_bed_rw['accn']] =True

    def row_cmp(a,b):
        return cmp(a['seqid'], b['seqid']) or cmp(a['start'], b['start'])


    new_rows.sort(cmp=row_cmp)
    #print >>merge_fh, "\t".join(Bed.names)
    for i, row in enumerate(new_rows):
        print >>merge_fh, Bed.row_string(row)
Example #28
def freq(feature_file, window_size, interval, meth_data):
  features = Bed(feature_file)
  for feature in features:
        region = range(int(feature["start"]),int(feature["end"])+1)
        for window_start in region[::interval]:
            window_end = window_start + window_size
            if window_end > region[-1]:
                matches = meth_data[feature['seqid']].find(window_start, region[-1])
            else:
                matches = meth_data[feature['seqid']].find(window_start,window_end)
            if len(matches) < 15 : continue
            kw(matches,feature['seqid'],window_start,window_end)
Example #29
class TestMaize(unittest.TestCase):
    def setUp(self):
        handle = open(
            '/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt'
        )
        fh = handle.readlines()
        self.blast_str = ' , '.join(fh)
        self.unmasked_fasta = Fasta('/Users/gturco/find_cns/maize_v2_UM.fasta')

        self.qbed = Bed('/Users/gturco/rice_maize/rice_v6.bed')
        self.qbed.fill_dict()
        self.sbed = Bed('/Users/gturco/maize/maize_v2.bed',
                        '/Users/gturco/maize/maize_v2.fasta')
        self.sbed.fill_dict()
        self.sfeat = self.sbed.accn('GRMZM2G086714')
        self.qfeat = self.qbed.accn('Os09g27050')

    def test_get_cmd(self):
        sfasta = 'data/rice_v6_maize_v2/maize_v2_split/2.fasta'
        qfasta = 'data/rice_v6_maize_v2/rice_v6_split/4.fasta'

    def test_parse_balse(self):
        orientaion = -1
        cns = parse_blast(self.blast_str, orientaion, self.qfeat, self.sfeat,
                          self.qbed, self.sbed, 12000, 26000,
                          self.unmasked_fasta)
        print cns
Example #30
File: assign.py  Project: brentp/find_cns
def main(cnsfile, qbed_file, sbed_file, pairsfile, pairs_fmt):
    qcns_file = qbed_file.replace(".bed", "_cns.gff")
    assert qcns_file != qbed_file
    qcns_gff = open(qcns_file, 'w')
    print >>qcns_gff, "##gff-version 3"
    if sbed_file != qbed_file:
        scns_file = sbed_file.replace(".bed", "_cns.gff")
        assert scns_file != sbed_file
        scns_gff = open(scns_file, 'w')
        print >>scns_gff, "##gff-version 3"
    else:
        scns_gff = qcns_gff
    
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()


    cnsdict = get_cns_dict(cnsfile)
    qpair_map, spair_map = make_pair_maps(pairsfile, pairs_fmt, qbed, sbed)
    out = sys.stdout

    fmt = "%(cns_id)s,%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
                       "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s"

    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed, qpair_map, spair_map):
        d = cns_fmt_dict(cns, qfeat, sfeat)
        d['cns_id'] = cns_id(d)
        if d['sstop'] < d['sstart']:
            d['sstop'], d['sstart'] = d['sstart'], d['sstop']

        print >>out, fmt % d
        write_gff(d, qcns_gff, scns_gff)
Example #31
def main(missed, fh_match, org_bed):
    """first merges all hits to the same gene... then updates the entire bed
    file output: all_ORG.bed """

    merge_same_hits(missed, fh_match, org_bed)
    org_bed_path = org_bed.path
    path = org_bed_path.split('/')
    dirc = '/'.join(path[:-1])
    org = path[-1]
    missed2 = '{0}/missed_from_{1}'.format(dirc, org)
    merge_fh = "{0}/all_{1}".format(dirc, org)
    print missed2
    merge(org_bed, Bed(missed2), merge_fh)
Example #32
def main(cnsfile, qbed_file, sbed_file, pairsfile, pck, qorg, sorg, padding):
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    qpair_map = make_pair_maps(pairsfile, 'pair', qbed, sbed)
    out = sys.stdout
    
    fmt = "%(saccn)s,%(saccnL)s,%(saccnR)s,%(schr)s,%(sstart)i,%(sstop)i," + \
                     "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(link)s" 
                     
    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, saccn, saccn_l, saccn_r, qfeat in assign(cnsdict, qbed, qpair_map): 
        d = cns_fmt_dict(cns, qfeat, saccn, saccn_l, saccn_r)
        d['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr,qfeat, pck, sbed, qbed, sorg, qorg, padding)
        print >>out, fmt % d
Example #33
def main(cnsfile, qbed_file, sbed_file, qorg, sorg, padding):
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    out = sys.stdout
    
    fmt = "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
                       "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(link)s"
                     
    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed): 
        d = cns_fmt_dict(cns, qfeat, sfeat)
        if d['sstop'] < d['sstart']:
            d['sstop'], d['sstart'] = d['sstart'], d['sstop']
        d['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr, sorg, qorg, padding)
        print >>out, fmt % d
Example #34
def main(cns_path, fmt, query_bed_path, subject_bed_path):
    cns_dic = cns_to_dic(cns_path, fmt)
    query_bed = Bed(query_bed_path)
    subject_bed = Bed(subject_bed_path)
    utr_dict = {}
    for cns in cns_dic:
        cns['qstop'] = int(cns['qstop'])
        cns['qstart'] = int(cns['qstart'])
        cns['sstop'] = int(cns['sstop'])
        cns['sstart'] = int(cns['sstart'])

        qfeat = query_bed.accn(cns['qaccn'])
        sfeat = subject_bed.accn(cns['saccn'])
        qgene_space_start = min(qfeat['locs'])[0]
        qgene_space_end = max(qfeat['locs'])[1]
        qgene_space_poly = LineString([(0.0, qgene_space_start),
                                       (0.0, qgene_space_end)])
        qgene_poly = LineString([(0.0, qfeat['start']), (0.0, qfeat['end'])])
        sgene_poly = LineString([(0.0, sfeat['start']), (0.0, sfeat['end'])])
        # if intron of one dont need to check other
        qcns = LineString([(0, cns['qstart']), (0, cns['qstop'])])
        scns = LineString([(0, cns['sstart']), (0, cns['sstop'])])
        cns_type(cns, qgene_space_poly, qgene_poly, sgene_poly, scns, qcns,
                 qgene_space_start, qfeat)
        create_utr_list(utr_dict, qfeat, cns, "q")
        create_utr_list(utr_dict, sfeat, cns, "s")
    for cns in cns_dic:
        if cns['type'] == "5-prox_dist":
            qgene_start = min(utr_dict[cns['qaccn']])
            qgene_stop = max(utr_dict[cns['qaccn']])
            # sstart = min(utr_dict[cns['saccn']])
            # sstop =  max(utr_dict[cns['saccn']])
            five_diff_pos = abs(qgene_start - cns["qstop"])
            five_diff_neg = abs(qgene_stop - cns["qstart"])
            if (five_diff_pos <= 1000 and cns["qstrand"] == "+") or \
                    (five_diff_neg <= 1000 and cns["qstrand"] == "-"):
                cns["type"] = "5-proximal"
            elif (five_diff_pos > 1000 and cns["qstrand"] == "+") or \
                    (five_diff_neg > 1000 and cns["qstrand"] == "-"):
                cns["type"] = "5-distal"
        elif cns['type'] == "3-prox_dist":
            qgene_start = min(utr_dict[cns['qaccn']])
            qgene_stop = max(utr_dict[cns['qaccn']])
            three_diff_pos = abs(cns["qstart"] - qgene_stop)
            three_diff_neg = abs(cns["qstop"] - qgene_start)
            if (three_diff_pos <= 1000 and cns["qstrand"] == "+") or \
                    (three_diff_neg <= 1000 and cns["qstrand"] == "-"):
                cns["type"] = "3-proximal"
            elif (three_diff_pos > 1000 and cns["qstrand"] == "+") or \
                    (three_diff_neg > 1000 and cns["qstrand"] == "-"):
                cns["type"] = "3-distal"
    return cns_dic
Example #35
def merge_flat(new_name, aflat, bflat):
    """take 2 flat files and return a new one that is the union of the 2
      existing"""
    seen = {}
    both = []
    for flat in (aflat, bflat):
        for row in flat:
            key = row['seqid'], row['accn']
            if key in seen: continue
            seen[key] = True
            both.append(row)
            both.sort(key=lambda a: (a['seqid'],a['start']))
    fh = open(new_name, "w")
    #print >>fh, "\t".join(Flat.names)
    for b in both:
        print >>fh, Bed.row_string(b)
    fh.close()
    return Bed(fh.name)
Example #36
def main(cnsfile, qbed_file, sbed_file, qorg, sorg, padding):
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    out = sys.stdout
    
    fmt = "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
                       "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(link)s"
                     
    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed): 
        d = cns_fmt_dict(cns, qfeat, sfeat)
        d['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr, sorg, qorg, padding)
        print >>out, fmt % d
Example #37
def main(cnsfile, qbed_file, sbed_file, pairsfile, pck, qorg, sorg, padding):
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    qpair_map = make_pair_maps(pairsfile, 'pair', qbed, sbed)
    out = sys.stdout
    
    fmt = "%(saccn)s,%(saccnL)s,%(saccnR)s,%(schr)s,%(sstart)i,%(sstop)i," + \
                     "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(link)s" 
                     
    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, saccn, saccn_l, saccn_r, qfeat in assign(cnsdict, qbed, qpair_map): 
        d = cns_fmt_dict(cns, qfeat, saccn, saccn_l, saccn_r)
        d['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr,qfeat, pck, sbed, qbed, sorg, qorg, padding)
        print >>out, fmt % d
Example #38
def loadintointersect(bed_file):
    query_list_pos = {}
    query_list_neg = {}
    feature_list = Bed(bed_file)
    for feature in feature_list:
        ##    if float(feature['accn']) < .4: continue
        if feature["strand"] == "+":
            ### ADD one because bed adds one to the number
            if feature['seqid'] not in list(query_list_pos):
                query_list_pos[feature['seqid']] = Intersecter()
            query_list_pos[feature['seqid']].add_interval(
                Feature(int(feature['start'] - 1),
                        int(feature['start'] - 1),
                        name=feature['strand']))
        elif feature["strand"] == "-":
            if feature['seqid'] not in list(query_list_neg):
                query_list_neg[feature['seqid']] = Intersecter()
            query_list_neg[feature['seqid']].add_interval(
                Feature(int(feature['start'] - 1),
                        int(feature['start'] - 1),
                        name=feature['strand']))
    return query_list_pos, query_list_neg
Example #39
def print_bed(flist, old_path):
    ipath, ext = op.splitext(old_path)
    path = "%s.with_new%s" % (ipath, ext)

    print >>sys.stderr,  "writing to: %s.with_new%s" % (ipath, ext)
    fh = open(path, 'wb')
    seen = {}

    for item in flist:
        # convert the locs to a tuple.
        #print >>sys.stderr, item
        item = list(item)
        item[6] = tuple(item[6])
        item = tuple(item)
        if item in seen: continue
        seen[item] = 1
        locs = item[6] # tuple(sorted([item[1], item[2]]))

        row = dict(accn=item[3], start=item[1], end=item[2], seqid=item[0],
                   locs=locs, score='.', strand=item[5], rgb='.', thickstart='.', thickend=".")
        print >>fh, Bed.row_string(row)
    fh.close()
    return Bed(path)
Example #40
class TestPerfectTargetRegion(unittest.TestCase):
    def setUp(self):
        self.gene_name = "Os01g02110"
        self.bed = Bed("ricetest.bed")
        self.fasta = Fasta("ricetest.fasta")
        self.gene = self.bed.accn(self.gene_name)
        self.exons = self.gene['locs']


    def test_rel_pos(self):

        self.assertEqual((376,486),rel_pos(self.gene,self.exons[0]))
        self.assertEqual((1289,1789),rel_pos(self.gene,self.exons[-1]))

    def test_fasta(self):
        exon = self.exons[-1]
        seq = self.fasta[self.gene_name][:]
        self.assertTrue(1789 <= len(seq))

    def test_pattern(self):
        e = self.exons[-1]
        start, stop = rel_pos(self.gene,e)
        for exon in self.exons:
Example #41
class TestMaize(unittest.TestCase):
    def setUp(self):
        handle = open("/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt")
        fh = handle.readlines()
        self.blast_str = " , ".join(fh)
        self.unmasked_fasta = Fasta("/Users/gturco/find_cns/maize_v2_UM.fasta")

        self.qbed = Bed("/Users/gturco/rice_maize/rice_v6.bed")
        self.qbed.fill_dict()
        self.sbed = Bed("/Users/gturco/maize/maize_v2.bed", "/Users/gturco/maize/maize_v2.fasta")
        self.sbed.fill_dict()
        self.sfeat = self.sbed.accn("GRMZM2G086714")
        self.qfeat = self.qbed.accn("Os09g27050")

    def test_get_cmd(self):
        sfasta = "data/rice_v6_maize_v2/maize_v2_split/2.fasta"
        qfasta = "data/rice_v6_maize_v2/rice_v6_split/4.fasta"

    def test_parse_balse(self):
        orientaion = -1
        cns = parse_blast(
            self.blast_str, orientaion, self.qfeat, self.sfeat, self.qbed, self.sbed, 12000, 26000, self.unmasked_fasta
        )
        print cns
Example #42
 def __init__(self,filename,bed):
     self.filename = filename
     self.bed = Bed(bed)
     self.bed.fill_dict()
Example #43
    import optparse
    parser = optparse.OptionParser("usage: %prog [options] ")
    parser.add_option("-F", dest="mask", help="blast mask simple sequence [default: F]", default="F")
    parser.add_option("-n", dest="ncpu", help="parallelize to this many cores", type='int', default=8)
    parser.add_option("-q", dest="qfasta", help="path to genomic query fasta")
    parser.add_option("--qbed", dest="qbed", help="query bed file")
    parser.add_option("-s", dest="sfasta", help="path to genomic subject fasta")
    parser.add_option("--sbed", dest="sbed", help="subject bed file")
    parser.add_option("-p", dest="pairs", help="the pairs file. output from dagchainer")
    choices = ("dag", "cluster", "pair", 'qa', 'raw')
    parser.add_option("--pair_fmt", dest="pair_fmt", default='raw',
                      help="format of the pairs, one of: %s" % str(choices),
                      choices=choices)
    parser.add_option("--qpad", dest="qpad", type='int', default=12000,
                      help="how far from the end of the query gene to look for cnss")
    parser.add_option("--spad", dest="spad", type='int', default=26000,
                    help="how far from the end of the subject gene to look for cnss")
    parser.add_option("--UMfasta", dest="unmasked_fasta", help="path to unmasked fasta file")
    (options, _) = parser.parse_args()


    if not (options.qfasta and options.sfasta and options.sbed and options.qbed):
        sys.exit(parser.print_help())

    qbed = Bed(options.qbed, options.qfasta); qbed.fill_dict()
    sbed = Bed(options.sbed, options.sfasta); sbed.fill_dict()
    unmasked_fasta = Fasta(options.unmasked_fasta)
    assert options.mask in 'FT'

    main(qbed, sbed, options.pairs, options.qpad, options.spad, unmasked_fasta, options.pair_fmt, options.mask, options.ncpu)
Example #44
        spos = sbed[raw.pos_b]
        key = (raw.seqid_a, raw.seqid_b)
        if not key in trees: trees[key] = []
        qpos = (qpos['start'] + qpos['end']) / 2
        spos = (spos['start'] + spos['end']) / 2
        trees[key].append((int(qpos), int(spos)))
    for k in trees:
        trees[k] = cKDTree(trees[k])
    return trees

if __name__ == "__main__":
    import optparse
    parser = optparse.OptionParser()
    parser.add_option("--qbed", dest="qbed", help="query bed file")
    parser.add_option("--sbed", dest="sbed", help="subject bed file")
    parser.add_option("--cns", dest="cns", help="path to raw cns")
    parser.add_option("--dist", dest="dist", type='int', help="max dist from gene to cns", default=12000)
    parser.add_option("--paralogy", dest="paralogy", help="path to paralogy file")
    parser.add_option("--orthology", dest="orthology", help="path to orthology file")

    options, args = parser.parse_args()    

    if not (options.sbed and options.qbed and options.cns and options.orthology):
        sys.exit(parser.print_help())

    qbed = Bed(options.qbed); qbed.fill_dict()
    sbed = Bed(options.sbed); sbed.fill_dict()
    
    qbed_new, sbed_new, new_pairs = main(qbed, sbed, options.cns, options.dist, options.orthology)
    write_new_pairs(options.paralogy, options.orthology, qbed, qbed_new, sbed, sbed_new, new_pairs) 
Example #45
    parser.add_option("--paralogy",  dest="paralogy",  help="paralogy file")
    parser.add_option("--orthology",  dest="orthology",  help="orthology file")

    opts, _ = parser.parse_args()

    if not (opts.qflat_all and opts.sflat_all and opts.datasheet):
        print "A"
        sys.exit(parser.print_help())
    if not (opts.qdsgid and opts.qorg and opts.sorg):
        print "B"
        sys.exit(parser.print_help())
    if not (opts.qdups and opts.sdups and opts.paralogy and opts.orthology):
        print "C"
        sys.exit(parser.print_help())

    qflat_new = Bed(opts.qflat_new)
    sflat_new = qflat_new if opts.qflat_new == opts.sflat_new else Bed(opts.sflat_new)

    qflat_all = Bed(opts.qflat_all)
    sflat_all = qflat_all if opts.qflat_all == opts.sflat_all else Bed(opts.sflat_all)

    qfpath = "%s.all%s" % op.splitext(qflat_new.path)
    sfpath = "%s.all%s" % op.splitext(sflat_new.path)

    qflat = merge_flat(qfpath, qflat_all, qflat_new)
    sflat = merge_flat(sfpath, sflat_all, sflat_new)

    
    qdups = parse_dups(opts.qdups, qflat)
    sdups = parse_dups(opts.sdups, sflat)
    qlocaldups = parse_dups(opts.qlocaldups,qflat)
Example #46
 def setUp(self):
     self.gene_name = "Os01g02110"
     self.bed = Bed("ricetest.bed")
     self.fasta = Fasta("ricetest.fasta")
     self.gene = self.bed.accn(self.gene_name)
     self.exons = self.gene['locs']
Example #47
        #     print interval_list[0].find(0,100000000)
        #     print interval_list[0].find(3577840,3577841)
        #     print three_prom
        #     print gene_body
        #     print five_prom
        #three_prom = [i for i in three_prom if i.name == gene['strand']]
        #five_prom = [i for i in five_prom if i.name == gene['strand']]
        #gene_body = [i for i in gene_body if i.name == gene['strand']]

        if len(three_prom) > 0:
            l = "{0}\t3_prom\t{1}\t{2}\t{3}\n".format(
                gene_name, three_prom_p, three_prom_p * len(three_prom),
                sum(int(sig.name) for sig in three_prom))
            out.write(l)
        if len(five_prom) > 0:
            l = "{0}\t5_prom\t{1}\t{2}\t{3}\n".format(
                gene_name, five_prom_p, five_prom_p * len(five_prom),
                sum(int(sig.name) for sig in five_prom))
            out.write(l)
        if len(gene_body) > 0:
            l = "{0}\tgene_body\t{1}\t{2}\t{3}\n".format(
                gene_name, gene_body_p, gene_body_p * len(gene_body),
                sum(int(sig.name) for sig in gene_body))
            out.write(l)


genelist = Bed("sorg.bed")
interval_list = insert_queries("DMR_NONVAS_CG_HYPO")

find_intersections(3000, interval_list, genelist, "DMR_nonvas.genes")
Example #48
def merge_same_hits(missed, fh_match, org_bed):
    """ groups genes that hit more than once """
    d = {}
    handle = open(fh_match)
    matches = handle.read()
    org_bed_path = org_bed.path
    path = org_bed_path.split('/')
    dirc = '/'.join(path[:-1])
    org = path[-1]
    fh = open('{0}/missed_from_{1}'.format(dirc, org), "wb")
    for match in matches.split('\n')[:-1]:
        qaccn, saccn = match.split('\t')
        #create dictionary
        try:
            seqid = missed.accn(qaccn)['seqid']
            haccn = missed.accn(qaccn)
        except KeyError:
            continue
        #if near_gene(haccn,org_bed)==True: continue
        if (seqid, saccn) not in d.keys():
            #append whole dict to keys
            d[(seqid, saccn)] = missed.accn(qaccn)
        else:
            #else add locs to exsting one
            gene_start = min(d[(seqid, saccn)]['locs'])[0]
            gene_end = max(d[(seqid, saccn)]['locs'])[1]
            missed_end = missed.accn(qaccn)['locs'][0][1]
            missed_start = missed.accn(qaccn)['locs'][0][0]
            if missed_end < gene_start:
                # if no intervening genes and they are close together...
                intervening_genes = get_intervening_genes(
                    missed_end, gene_start, seqid, org_bed,
                    d[(seqid, saccn)]['accn'])
                if intervening_genes is False:
                    d[(seqid, saccn)]['locs'] = d[
                        (seqid, saccn)]['locs'] + missed.accn(qaccn)['locs']
                    d[(seqid, saccn)]['start'] = missed_start
                    if 'Os' in qaccn:
                        d[seqid, saccn]['accn'] = qaccn
                else:
                    d[(seqid, qaccn)] = missed.accn(qaccn)
            elif gene_end < missed_start:
                intervening_genes = get_intervening_genes(
                    gene_end, missed_start, seqid, org_bed,
                    d[(seqid, saccn)]["accn"])
                if intervening_genes is False:
                    d[(seqid, saccn)]['locs'] = d[
                        (seqid, saccn)]['locs'] + missed.accn(qaccn)['locs']
                    d[(seqid, saccn)]['end'] = missed_end
                    if 'Os' in qaccn:
                        d[seqid, saccn]['accn'] = qaccn
                else:
                    d[(seqid, qaccn)] = missed.accn(qaccn)
            else:
                d[(seqid, saccn)]['locs'] = d[
                    (seqid, saccn)]['locs'] + missed.accn(qaccn)['locs']

    for key in d.keys():
        new_row = d[key]['locs'].sort()
        row = d[key]
        print >> fh, Bed.row_string(row)
Example #49
 def test_main(self):
     """test for main"""
     qbed = Bed(self.qbed, self.qfasta); qbed.fill_dict()
     sbed = Bed(self.sbed, self.sfasta); sbed.fill_dict()
     x = main(qbed, sbed, self.pairs, 12000,12000, "pair", self.blast_path, "T",2)
     print x
Example #50
# write a genomic fasta file with all sequences covered by features
# in the specified Bed file masked to N.
from flatfeature import Bed
import sys
# b = Bed(sys.argv[1], sys.argv[2])
b = Bed("/Users/gturco/data/rice_v6.bed", "/Users/gturco/data/rice_v6.fasta")

for seqid, seq in b.mask_cds():
    seqids =  []
    seq.tostring()
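
The comment above says the script writes a masked genomic fasta, but the snippet only iterates over b.mask_cds() and never writes anything out. A minimal sketch of the missing write step, assuming mask_cds() yields (seqid, seq) pairs where seq supports .tostring() as shown above; the output filename is made up for illustration:

# Sketch only: write each CDS-masked chromosome to a new fasta file.
# Assumes mask_cds() yields (seqid, seq) pairs as in the snippet above;
# "rice_v6.masked.fasta" is a hypothetical output path.
out = open("rice_v6.masked.fasta", "w")
for seqid, seq in b.mask_cds():
    out.write(">%s\n" % seqid)
    out.write("%s\n" % seq.tostring())
out.close()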
Example #51
def main(feature_bed, query_list_pos, query_list_neg, fasta_file, mtype, rand):
    features = Bed(feature_bed)
    fasta = Fasta(fasta_file)
    All_sites = defaultdict(list)
    r = {}
    cgene = {}
    for feature in features:
        rc = feature["strand"] == "-"
        if feature["strand"] == "+":
            TSS_region = range(
                int(feature['locs'][0][0]) - 2000, int(feature['locs'][0][0]))
            TTS_region = range(int(feature['locs'][-1][1]),
                               int(feature['locs'][-1][1]) + 2000)
            TSS_sites = get_matchs(query_list_pos, feature['seqid'],
                                   TSS_region,
                                   fasta["chromosome_" + feature["seqid"]],
                                   -2000, rc)
            TE_sites = get_matchs(query_list_pos, feature['seqid'], TTS_region,
                                  fasta["chromosome_" + feature['seqid']],
                                  1000, rc)
            gene_body, rebin = get_genebody(
                query_list_pos, feature,
                fasta["chromosome_" + feature["seqid"]], rc, rand)
            r[feature["accn"]] = rebin
            cgene[feature["accn"]] = gene_body

            #       [All_sites[str(region)].append(freq) for region,freq in TSS_sites]
            # 	[All_sites[str(region)].append(freq) for region,freq in TE_sites]
            [
                All_sites[feature["accn"]].append((region, freq))
                for region, freq in TSS_sites
            ]
            [
                All_sites[feature["accn"]].append((region, freq))
                for region, freq in TE_sites
            ]

        if feature["strand"] == "-":
            TTS_region = range(
                int(feature['locs'][0][0]) - 2000, int(feature['locs'][0][0]))
            TSS_region = range(int(feature['locs'][-1][1]),
                               int(feature['locs'][-1][1]) + 2000)
            TSS_sites = get_matchs(query_list_neg, feature['seqid'],
                                   TSS_region,
                                   fasta["chromosome_" + feature["seqid"]],
                                   -2000, rc)
            TE_sites = get_matchs(query_list_neg, feature['seqid'], TTS_region,
                                  fasta["chromosome_" + feature['seqid']],
                                  1100, rc)

            ###RV complent
            gene_body, rebin = get_genebody(
                query_list_neg, feature,
                fasta["chromosome_" + feature["seqid"]], rc, rand)
            r[feature["accn"]] = rebin
            cgene[feature["accn"]] = gene_body

            ## [All_sites[str(region)].append(freq) for region, freq in TSS_sites]
            ## [All_sites[str(region)].append(freq) for region, freq in TE_sites]
            for region, freq in TSS_sites:
                All_sites[feature["accn"]].append((region, freq))
            for region, freq in TE_sites:
                All_sites[feature["accn"]].append((region, freq))

    return All_sites, r, cgene
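# Hypothetical call sketch for the function above; the bed/fasta paths mirror
# ones used elsewhere in these examples, query_list_pos/query_list_neg stand in
# for whatever precomputed motif-match lists get_matchs() expects, and the
# mtype/rand values are placeholders rather than values taken from the source:
#
#sites, rebinned, gene_bodies = main("/Users/gturco/data/rice_v6.bed",
#                                    query_list_pos, query_list_neg,
#                                    "/Users/gturco/data/rice_v6.fasta",
#                                    "CNS", rand=False)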
Example #52
        if strand == '-':
            fasta = str(Seq(fasta).reverse_complement())
        if len(fasta) == 0:
            #print start,stop,accn['accn']
            continue
        seq_w = "{0}\n".format(fasta)
        new_fasta.write(w)
        new_fasta.write(seq_w)


####### tair ##########
#x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/thaliana_v8.fasta","/Users/gt/thaliana_v8_control_SB.fasta")
x = random_noncoding(
    '/Users/gt/Desktop/tmp.csv',
    Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_golden/thaliana_v8.with_new_cns_mask.bed'),
    "/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta",
    "/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta")

######### rice,sorg,set #####
##### took out strand info used N to mask bed also ########

#x =
#random_noncoding('/Users/gt/Desktop/paper/G-box-seq/rice_rice/tmp.csv',Bed('/Users/gt/Desktop/paper/G-box-seq/rice.with_new_cns_mask.bed'),"/Users/gt/Desktop/paper/G-box-seq/rice_rice/rice_j.fasta","/Users/gt/Desktop/paper/G-box-seq/rice_rice/rice_rice_control_fasta")
#x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta")
#
Example #53
        seq = f[seqid][start:end]
        if "X" in seq:
            print accn, seqid, start, end
        if len(seq) < 15 and len(seq) > 0:
            print "OH NO!!!!!!"
        w = ">cns{0}\n".format(n)
        seq_w = "{0}\n".format(seq)
        new_fasta.write(w)
        new_fasta.write(seq_w)


dict_size = gene_size_dict('/Users/gturco/Desktop/rice_sorg_size.tsv')
#dict_size = gene_size_dict("test_file")
x = random_noncoding(
    dict_size,
    Bed('/Users/gturco/data/paper3/rice_b_sorghum_v1.nolocaldups.with_new_cns_mask.bed'))
print len(x)
#####print x
get_seq(x, "/Users/gturco/data/paper3/rice_b.fasta",
        "/Users/gturco/test.fasta")
##
##### seq for cns
#handle = open("/Users/gturco/data/paper3/rice_b_sorghum_v1.cns.assigned_real.csv")
#fh = handle.read()
#cns_list = []
#for line in fh.split("\n")[:-1]:
#    if line[0] == "#": continue
#    cns_id,accn,seqid,start,end,strand = line.split(",")[:6]
#    cns_list.append((seqid,int(start),int(end)))
#
#len(cns_list)
Example #54
def write_bed(gene, merge_fh):
    new_line = Bed.row_string(gene)
    merge_fh.write("{0}\n".format(new_line))
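# Minimal usage sketch for write_bed(), assuming an existing Bed file and an
# open output handle; both paths below are placeholders, not from the source:
#
#bed = Bed("/Users/gturco/data/rice_v6.bed")
#merge_fh = open("/Users/gturco/data/rice_v6.merged.bed", "w")
#for gene in bed:
#    write_bed(gene, merge_fh)
#merge_fh.close()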
Example #55
                      type='string',
                      help="path to query localdup_file")
    parser.add_option("--sdups",
                      dest="sdups",
                      type='string',
                      help="path to subject localdup_file")
    parser.add_option("--cns_file",
                      dest="cns_file",
                      type='string',
                      help="path to cns file cns.txt")
    parser.add_option("--UMfasta",
                      dest="unmasked_fasta",
                      help="path to unmasked fasta file file")
    (options, _) = parser.parse_args()

    qbed = Bed(options.qbed, options.qfasta)
    qbed.fill_dict()
    sbed = Bed(options.sbed, options.sfasta)
    sbed.fill_dict()
    unmasked_fasta = Fasta(options.unmasked_fasta)
    assert options.mask in 'FT'

    qnolocaldups_path = qbed.path.split(".")[0] + ".nolocaldups.bed"
    snolocaldups_path = sbed.path.split(".")[0] + ".nolocaldups.bed"
    #pairs_to_qa("{0}.local".format(options.pairs),'pair',"{0}.nolocaldups.local".format(qbed.path.split(".")[0]),"{0}.nolocaldups.local".format(sbed.path.split(".")[0]),"{0}.raw.filtered.local".format(options.pairs.split(".")[0]))

    import logging
    LOG_FILENAME = path.join(path.dirname(options.qfasta), "dup_rdups.log")
    logging.basicConfig(filename=LOG_FILENAME, level=logging.INFO)

    main(options.cns_file, options.qdups, options.sdups, options.pairs,
Example #56
def merge(org_bed, missed, merge_file):
    """creates blast.all file and updates everything"""
    merge_fh = open(merge_file, "w")
    #cds_missed = missed[missed['ftype'] == 'CDS']
    #count = org_bed.shape[0] + missed[missed['ftype'] !='CDS'].shape[0]
    new_rows = []
    seen_accns = {}
    # CDS added to existing gene.
    for row_missed in missed:
        if row_missed['accn'] in seen_accns: continue
        try:
            org_bed_row = org_bed.accn(row_missed['accn'])
            # it's a CDS
        except KeyError:
            # it's a new gene
            new_rows.append(row_missed)
            seen_accns[row_missed['accn']] = True
            continue
        locs_interval = Intersecter()
        for start, stop in org_bed_row['locs']:
            locs_interval.add_interval(Feature(start, stop))
        for missed_start, missed_end in row_missed['locs']:
            if len(locs_interval.find(missed_start, missed_end)) > 0:
                #                print >>sys.stderr, org_bed_row['accn']
                locs_intersects = [
                    (l.start, l.stop)
                    for l in locs_interval.find(missed_start, missed_end)
                ]
                for locs_intersect in locs_intersects:
                    org_bed_row['locs'].remove(locs_intersect)
                locs_intersects = set(locs_intersects)
                locs_intersects.add((missed_start, missed_end))
                locs_start = min([start for start, end in locs_intersects])
                locs_end = max([end for start, end in locs_intersects])
                org_bed_row['locs'] = org_bed_row['locs'] + [
                    (locs_start, locs_end)
                ]
                row_missed['locs'].remove((missed_start, missed_end))

        org_bed_row['locs'] = org_bed_row['locs'] + row_missed['locs']
        #print >>sys.stderr, "{0},{1}".format(row_missed['accn'], locs)
        org_bed_row['locs'].sort()
        org_bed_row['start'] = min(
            min([start for start, end in org_bed_row['locs']]),
            org_bed_row['start'])
        org_bed_row['end'] = max(
            max([end for start, end in org_bed_row['locs']]),
            org_bed_row['end'])
        new_rows.append(org_bed_row)
        seen_accns[org_bed_row['accn']] = True

    for org_bed_rw in org_bed:
        if org_bed_rw['accn'] not in seen_accns:
            new_rows.append(org_bed_rw)
            seen_accns[org_bed_rw['accn']] = True

    def row_cmp(a, b):
        return cmp(a['seqid'], b['seqid']) or cmp(a['start'], b['start'])

    new_rows.sort(cmp=row_cmp)
    #print >>merge_fh, "\t".join(Bed.names)
    for i, row in enumerate(new_rows):
        print >> merge_fh, Bed.row_string(row)
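# Hypothetical usage sketch for merge(); the bed paths below are placeholders,
# with "missed" holding the recovered features to fold into the original bed:
#
#org_bed = Bed("/Users/gturco/data/rice_v6.bed")
#missed = Bed("/Users/gturco/data/rice_v6.missed_exons.bed")
#merge(org_bed, missed, "/Users/gturco/data/rice_v6.all.bed")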
Example #57
            print "OH NO!!!!!!"
        w = ">cns{0}\n".format(n)
        seq_w = "{0}\n".format(seq)
        new_fasta.write(w)
        new_fasta.write(seq_w)


######## rice_set ##########
#dict_size = gene_size_dict('/Users/gt/tmp.tsv')
#x = random_noncoding(dict_size,Bed('/Users/gt/data/paper4/rice_j_setaria_n/rice_j_set.nolocaldups.with_new_cns_mask.bed'))
#get_seq(x,"/Users/gt/data/paper4/rice_j.fasta","/Users/gt/data/paper4/rice_j_setaria_n/testing.fasta")
####### rice_sorg #########
dict_size = gene_size_dict('/Users/gt/tmp.tsv')
x = random_noncoding(
    dict_size,
    Bed('/Users/gt/data/paper4/rice_j_sorghum_n/rice_j_sorg.nolocaldups.with_new_cns_mask.bed'))
get_seq(x, "/Users/gt/data/paper4/rice_j.fasta",
        "/Users/gt/data/paper4/rice_j_sorghum_n/testing.fasta")

##### seq for cns
#handle = open("/Users/gturco/data/paper3/rice_b_sorghum_v1.cns.assigned_real.csv")
#fh = handle.read()
#cns_list = []
#for line in fh.split("\n")[:-1]:
#    if line[0] == "#": continue
#    cns_id,accn,seqid,start,end,strand = line.split(",")[:6]
#    cns_list.append((seqid,int(start),int(end)))
#
#len(cns_list)
#get_seq(cns_list,"/Users/gturco/data/paper3/rice_b.fasta","/Users/gturco/test_cns.fasta")
##
Example #58
def main(qbed_path, sbed_path, cnsfile, dist, orthology_path):
    """
    here, we remove cnss that have been called proteins/rnas from 
    the cns list, and add them to the bed files.
    AND have to do the preliminary assignment of cnss that remain to the new-genes
    that _were_ cnss. the proper assignment is then handled in assign.py
    """
    qcns_file = qbed_path.replace(".bed", "_cns.gff")
    assert qcns_file != qbed_path
    qcns_gff = open(qcns_file, 'w')
    print >>qcns_gff, "##gff-version 3"
    if sbed_path != qbed_path:
        scns_file = sbed_path.replace(".bed", "_cns.gff")
        assert scns_file != sbed_path
        scns_gff = open(scns_file, 'w')
        print >>scns_gff, "##gff-version 3"
    else: scns_gff = qcns_gff

    qrawbed = RawBed(qbed_path)
    srawbed = RawBed(sbed_path)
  
    ortho_trees = read_orthos_to_trees(orthology_path, qrawbed,srawbed)
    
    qbed = Bed(qbed_path); qbed.fill_dict()
    sbed = Bed(sbed_path); sbed.fill_dict()

    name, ext = op.splitext(cnsfile)
    real_cns_fh = open("%s.real%s" % (name, ext), "w")
    print >>sys.stderr, "writing to:", real_cns_fh.name
    outdir = op.dirname(cnsfile)
    print >>real_cns_fh, "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval"

    crna = read_cns_to_rna(outdir)
    cpro = read_cns_to_protein_exons(outdir)

    #cns_items = list(parse_raw_cns(cnsfile))
    proteins = collections.defaultdict(list)
    rnas = collections.defaultdict(list)
    real_cns_items = []
    for cnsi in CNS.parse_raw_line(cnsfile):
        cns_id = cnsi.cns_id
        cns = cnsi.to_dict()
        key = (cns['qseqid'], cns['sseqid'])
        if cns_id in cpro:
            proteins[key].append((cns, cpro[cns_id]))
        elif cns_id in crna:
            rnas[key].append((cns, crna[cns_id]))
        else:
            real_cns_items.append((cns_id, cns))
    p_trees = fill_tree(proteins)
    r_trees = fill_tree(rnas)

    def assign_new_names(prs, protein_or_rna):
        n = {}
        for seqid_pair, li in prs.iteritems():
            if not seqid_pair in n: n[seqid_pair] = []
            for gnew, info in li[:]:
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                # and give them both an id so we know they were a pair.
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                #print >>sys.stderr, gnew['qaccn'], cns["qaccn"]
                try:
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except:
                    print >>sys.stderr, gnew
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                n[seqid_pair].append((gnew, info))
        return n
    nproteins = assign_new_names(proteins, "protein")
    nrnas = assign_new_names(rnas, "rna")

    cns_seen = {}
    # go through the remaining cnss, print and assign them to the new
    # genes (previously cnss) in within dist.
    for cns_id, cns in real_cns_items:
        print >>real_cns_fh, cns_to_str(cns)
        key = (cns['qseqid'], cns['sseqid'])
        
        for pnew, info in get_new(cns, p_trees, key, nproteins, dist + 1000):
            cns['qaccn'] = pnew['qaccn']
            cns['saccn'] = pnew['saccn']
            cns_str = cns_to_str(cns)
            if cns_str in cns_seen: continue
            cns_seen[cns_str] = 1
            print >>real_cns_fh, cns_str

        for rnew, info in get_new(cns, r_trees, key, nrnas, dist + 1000):
            cns['qaccn'] = rnew['qaccn']
            cns['saccn'] = rnew['saccn']
            cns_str = cns_to_str(cns)
            if cns_str in cns_seen: continue
            cns_seen[cns_str] = 1
            print >>real_cns_fh, cns_str

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    print >> sys.stderr, len(qnew_pairs)
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, the new cnss appear in both lists, so we send both
    # lists in together; print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new.path, sbed_new.path, qnew_pairs
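# Hypothetical invocation sketch for the function above; every path and the
# 12000 bp dist below are placeholders, not values taken from the source:
#
#new_qbed_path, new_sbed_path, qnew_pairs = main("query.bed", "subject.bed",
#                                                "query_subject.cns.txt", 12000,
#                                                "query_subject.orthology.txt")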