示例#1
0
def main(cnsfile, qbed_file, sbed_file, pairsfile, pairs_fmt, qdsid, sdsid,qpad,spad):
    qcns_file = qbed_file.replace(".nolocaldups", "_cns.gff")
    assert qcns_file != qbed_file
    qcns_gff = open(qcns_file, 'w')
    print >>qcns_gff, "##gff-version 3"
    if sbed_file != qbed_file:
        scns_file = sbed_file.replace(".nolocaldups", "_cns.gff")
        assert scns_file != sbed_file
        scns_gff = open(scns_file, 'w')
        print >>scns_gff, "##gff-version 3"
    else:
        scns_gff = qcns_gff
    
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()


    cnsdict, evaldict = get_cns_dict(cnsfile)
    qpair_map, spair_map = make_pair_maps(pairsfile, pairs_fmt, qbed, sbed)
    out = sys.stdout

    fmt = "%(cns_id)s,%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
                       "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(eval)s,%(link)s"

    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, qfeat, sfeat in assign(cnsdict,qbed, sbed, qpair_map, spair_map):
        d = cns_fmt_dict(cns, qfeat, sfeat, evaldict)
        d['cns_id'] = cns_id(d)
        if d['sstop'] < d['sstart']:
            d['sstop'], d['sstart'] = d['sstart'], d['sstop']
        d['link'] = cns_link(d, qdsid, sdsid,qpad,spad)
        print >>out, fmt % d
        write_gff(d, qcns_gff, scns_gff)
示例#2
0
class TestMaize(unittest.TestCase):
    def setUp(self):
        handle = open(
            '/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt'
        )
        fh = handle.readlines()
        self.blast_str = ' , '.join(fh)
        self.unmasked_fasta = Fasta('/Users/gturco/find_cns/maize_v2_UM.fasta')

        self.qbed = Bed('/Users/gturco/rice_maize/rice_v6.bed')
        self.qbed.fill_dict()
        self.sbed = Bed('/Users/gturco/maize/maize_v2.bed',
                        '/Users/gturco/maize/maize_v2.fasta')
        self.sbed.fill_dict()
        self.sfeat = self.sbed.accn('GRMZM2G086714')
        self.qfeat = self.qbed.accn('Os09g27050')

    def test_get_cmd(self):
        sfasta = 'data/rice_v6_maize_v2/maize_v2_split/2.fasta'
        qfasta = 'data/rice_v6_maize_v2/rice_v6_split/4.fasta'

    def test_parse_balse(self):
        orientaion = -1
        cns = parse_blast(self.blast_str, orientaion, self.qfeat, self.sfeat,
                          self.qbed, self.sbed, 12000, 26000,
                          self.unmasked_fasta)
        print cns
示例#3
0
class TestAssign(unittest.TestCase):
    def setUp(self):
        self.cns_filename = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
        self.pairsfile = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"
        self.qbed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed") ;self.qbed.fill_dict()
        self.sbed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed") ;self.sbed.fill_dict()
        self.cns_dict, self.evalue_dict = get_cns_dict(self.cns_filename)
        self.qpair_map, self.spair_map = make_pair_maps(self.pairsfile, "pair", self.qbed, self.sbed)

    def test_get_cns_dict(self):
        """test for test_get_cns_dict"""
        #print self.cns_dict.keys()
        print "keys!",  self.evalue_dict.keys()


    def test_assign(self):
      assign(self.cns_dict, self.qbed, self.sbed, self.qpair_map, self.spair_map)
    
    def test_cns_fmt_dict(self):
      for cns, qfeat, sfeat in assign(self.cns_dict, self.qbed, self.sbed, self.qpair_map, self.spair_map):
        d = cns_fmt_dict(cns, qfeat, sfeat, self.evalue_dict)
        print "dddddddd", d
        
    def test_main(self):
      pass
示例#4
0
文件: assign.py 项目: brentp/find_cns
def main(cnsfile, qbed_file, sbed_file, pairsfile, pairs_fmt):
    qcns_file = qbed_file.replace(".bed", "_cns.gff")
    assert qcns_file != qbed_file
    qcns_gff = open(qcns_file, 'w')
    print >>qcns_gff, "##gff-version 3"
    if sbed_file != qbed_file:
        scns_file = sbed_file.replace(".bed", "_cns.gff")
        assert scns_file != sbed_file
        scns_gff = open(scns_file, 'w')
        print >>scns_gff, "##gff-version 3"
    else:
        scns_gff = qcns_gff
    
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()


    cnsdict = get_cns_dict(cnsfile)
    qpair_map, spair_map = make_pair_maps(pairsfile, pairs_fmt, qbed, sbed)
    out = sys.stdout

    fmt = "%(cns_id)s,%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
                       "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s"

    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed, qpair_map, spair_map):
        d = cns_fmt_dict(cns, qfeat, sfeat)
        d['cns_id'] = cns_id(d)
        if d['sstop'] < d['sstart']:
            d['sstop'], d['sstart'] = d['sstart'], d['sstop']

        print >>out, fmt % d
        write_gff(d, qcns_gff, scns_gff)
示例#5
0
class TestAssign(unittest.TestCase):
    def setUp(self):
        self.cns_filename = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
        self.pairsfile = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"
        self.qbed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed")
        self.qbed.fill_dict()
        self.sbed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed")
        self.sbed.fill_dict()
        self.cns_dict, self.evalue_dict = get_cns_dict(self.cns_filename)
        self.qpair_map, self.spair_map = make_pair_maps(
            self.pairsfile, "pair", self.qbed, self.sbed)

    def test_get_cns_dict(self):
        """test for test_get_cns_dict"""
        #print self.cns_dict.keys()
        print "keys!", self.evalue_dict.keys()

    def test_assign(self):
        assign(self.cns_dict, self.qbed, self.sbed, self.qpair_map,
               self.spair_map)

    def test_cns_fmt_dict(self):
        for cns, qfeat, sfeat in assign(self.cns_dict, self.qbed, self.sbed,
                                        self.qpair_map, self.spair_map):
            d = cns_fmt_dict(cns, qfeat, sfeat, self.evalue_dict)
            print "dddddddd", d

    def test_main(self):
        pass
示例#6
0
class TestPseudo(unittest.TestCase):
    def setUp(self):
        self.qallbed = Bed("data/rice_v6_setaria64/rice_v6.all.bed", "data/rice_v6_setaria64/rice_v6.fasta")
        self.qallbed.fill_dict()
        self.sallbed = Bed("data/rice_v6_setaria64/setaria64.all.bed", "data/rice_v6_setaria64/setaria64.fasta")
        self.sallbed.fill_dict()
        self.saccn = self.sallbed.accn("Si000834m")
        blastfh = open("blast_res")
        self.blast = blastfh.read()
        self.d, self.pseudo = group_cds(self.blast, self.saccn)

    def test_group_cds_1(self):
        self.assertEqual(len(self.d.keys()), 4)
        total_values = []
        for key in self.d.keys():
            values = len(self.d[key])
            total_values.append(values)
        self.assertEqual(sum(total_values), 38)

    def test_group_cds_2(self):
        blast_2fh = open("blast_2")
        blast_2 = blast_2fh.read()

        d, pseudo = group_cds(blast_2, self.sallbed.accn("Si002524m"))

        self.assertEqual(len(d.keys()), 5)
        for key in d.keys():
            # logging.info('key: {0}'.format(key))

            self.assertEqual(1, len(d[key]))

    def test_append_to_included_groups(self):
        locs = [1, 2, 3, 4]
        group_dict = {(2, 5): [], (3, 6): [], (9, 8): []}

        result_dict = append_to_included_groups(locs, group_dict)
        expected = {(2, 5): [(1, 2, 3, 4)], (3, 6): [(1, 2, 3, 4)], (9, 8): []}

        self.assertEquals(expected, result_dict)

    def test_remove_crossing_hit(self):
        qaccn = self.qallbed.accn("Os01g01890")
        for group_key in self.d.keys():
            exon_hits = self.d[group_key]
            non_crossing = remove_crossing_hits(exon_hits, qaccn, self.saccn)
            if len(non_crossing) > 1:
                mid, start, stop = bites(non_crossing)

    def test_find_orf(self):
        qaccn = self.qallbed.accn("Os01g01295")
        orf = find_orf(self.qallbed, qaccn)
        self.assertEqual(orf + 1, 141084)

    def test_find_orf_neg(self):
        saccn = self.sallbed.accn("Si001539m")
        orf = find_orf(self.sallbed, saccn)
        self.assertEqual(orf, 7662777)
示例#7
0
def main(cnsfile, qbed_file, sbed_file, qorg, sorg, padding):
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    out = sys.stdout
    
    fmt = "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
                       "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(link)s"
                     
    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed): 
        d = cns_fmt_dict(cns, qfeat, sfeat)
        d['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr, sorg, qorg, padding)
        print >>out, fmt % d
示例#8
0
def main(cnsfile, qbed_file, sbed_file, pairsfile, pck, qorg, sorg, padding):
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    qpair_map = make_pair_maps(pairsfile, 'pair', qbed, sbed)
    out = sys.stdout
    
    fmt = "%(saccn)s,%(saccnL)s,%(saccnR)s,%(schr)s,%(sstart)i,%(sstop)i," + \
                     "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(link)s" 
                     
    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, saccn, saccn_l, saccn_r, qfeat in assign(cnsdict, qbed, qpair_map): 
        d = cns_fmt_dict(cns, qfeat, saccn, saccn_l, saccn_r)
        d['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr,qfeat, pck, sbed, qbed, sorg, qorg, padding)
        print >>out, fmt % d
示例#9
0
def main(cnsfile, qbed_file, sbed_file, pairsfile, pck, qorg, sorg, padding):
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    qpair_map = make_pair_maps(pairsfile, 'pair', qbed, sbed)
    out = sys.stdout
    
    fmt = "%(saccn)s,%(saccnL)s,%(saccnR)s,%(schr)s,%(sstart)i,%(sstop)i," + \
                     "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(link)s" 
                     
    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, saccn, saccn_l, saccn_r, qfeat in assign(cnsdict, qbed, qpair_map): 
        d = cns_fmt_dict(cns, qfeat, saccn, saccn_l, saccn_r)
        d['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr,qfeat, pck, sbed, qbed, sorg, qorg, padding)
        print >>out, fmt % d
示例#10
0
def main(cnsfile, qbed_file, sbed_file, qorg, sorg, padding):
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    out = sys.stdout
    
    fmt = "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
                       "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(link)s"
                     
    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed): 
        d = cns_fmt_dict(cns, qfeat, sfeat)
	if d['sstop'] < d['sstart']:
            d['sstop'], d['sstart'] = d['sstart'], d['sstop']        
	d['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr, sorg, qorg, padding)
        print >>out, fmt % d
示例#11
0
class LocalDups(object):
    def __init__(self,filename,bed):
        self.filename = filename
        self.bed = Bed(bed)
        self.bed.fill_dict()

    def get_order_dups(self):
        d = {}
        for line in open(self.filename):
            dupline = DupLine(line)
            dups = dupline.get_order(self.bed)
            d[dups[0]['accn']] = "P"
            for dup in dups[1:]:
                d[dup['accn']] = dups[0]['accn']
            intervening = dupline.get_interving_genes(self.bed)
            for i in intervening:
                if i in d.keys():continue
                d[i] = "I"
        self.filename.close()
        return d

    def write_ordered(self,out_fh):
        """write localdups to outfile"""
        localdup_fh = open(out_fh, "w")
        d = {}
        for line in open(self.filename):
            dupline = DupLine(line)
            dups = dupline.get_order(self.bed)
            line = "{0}\n".format("\t".join(dups))
            localdup_fh.write(line)
        localdup_fh.close()


    def get_dups(self):
        d = {}
        for  line in open(self.filename):
            dupline = DupLine(line)
            d[dupline.parent] = 'P'
            for dup in dupline.children:
                d[dup] = dupline.parent
            intervening = dupline.get_interving_genes(self.bed)
            for i in intervening:
                if i in d.keys(): continue
                d[i] = "I"
        self.filename.close()
        return d
示例#12
0
class LocalDups(object):
    def __init__(self, filename, bed):
        self.filename = filename
        self.bed = Bed(bed)
        self.bed.fill_dict()

    def get_order_dups(self):
        d = {}
        for line in open(self.filename):
            dupline = DupLine(line)
            dups = dupline.get_order(self.bed)
            d[dups[0]['accn']] = "P"
            for dup in dups[1:]:
                d[dup['accn']] = dups[0]['accn']
            intervening = dupline.get_interving_genes(self.bed)
            for i in intervening:
                if i in d.keys(): continue
                d[i] = "I"
        self.filename.close()
        return d

    def write_ordered(self, out_fh):
        """write localdups to outfile"""
        localdup_fh = open(out_fh, "w")
        d = {}
        for line in open(self.filename):
            dupline = DupLine(line)
            dups = dupline.get_order(self.bed)
            line = "{0}\n".format("\t".join(dups))
            localdup_fh.write(line)
        localdup_fh.close()

    def get_dups(self):
        d = {}
        for line in open(self.filename):
            dupline = DupLine(line)
            d[dupline.parent] = 'P'
            for dup in dupline.children:
                d[dup] = dupline.parent
            intervening = dupline.get_interving_genes(self.bed)
            for i in intervening:
                if i in d.keys(): continue
                d[i] = "I"
        self.filename.close()
        return d
示例#13
0
class TestMaize(unittest.TestCase):
    def setUp(self):
        handle = open("/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt")
        fh = handle.readlines()
        self.blast_str = " , ".join(fh)
        self.unmasked_fasta = Fasta("/Users/gturco/find_cns/maize_v2_UM.fasta")

        self.qbed = Bed("/Users/gturco/rice_maize/rice_v6.bed")
        self.qbed.fill_dict()
        self.sbed = Bed("/Users/gturco/maize/maize_v2.bed", "/Users/gturco/maize/maize_v2.fasta")
        self.sbed.fill_dict()
        self.sfeat = self.sbed.accn("GRMZM2G086714")
        self.qfeat = self.qbed.accn("Os09g27050")

    def test_get_cmd(self):
        sfasta = "data/rice_v6_maize_v2/maize_v2_split/2.fasta"
        qfasta = "data/rice_v6_maize_v2/rice_v6_split/4.fasta"

    def test_parse_balse(self):
        orientaion = -1
        cns = parse_blast(
            self.blast_str, orientaion, self.qfeat, self.sfeat, self.qbed, self.sbed, 12000, 26000, self.unmasked_fasta
        )
        print cns
示例#14
0
def main(qbed_path, sbed_path, cnsfile, dist, orthology_path):
    """
    here, we remove cnss that have been called proteins/rnas from 
    the cns list, and add them to the bed files.
    AND have to do the preliminary assignment of cnss that remain to the new-genes
    that _were_ cnss. the proper assignment is then handled in assign.py
    """
    qcns_file = qbed_path.replace(".bed", "_cns.gff")
    assert qcns_file != qbed_path
    qcns_gff = open(qcns_file, 'w')
    print >> qcns_gff, "##gff-version 3"
    if sbed_path != qbed_path:
        scns_file = sbed_path.replace(".bed", "_cns.gff")
        assert scns_file != sbed_path
        scns_gff = open(scns_file, 'w')
        print >> scns_gff, "##gff-version 3"
    else:
        scns_gff = qcns_gff

    qrawbed = RawBed(qbed_path)
    srawbed = RawBed(sbed_path)

    ortho_trees = read_orthos_to_trees(orthology_path, qrawbed, srawbed)

    qbed = Bed(qbed_path)
    qbed.fill_dict()
    sbed = Bed(sbed_path)
    sbed.fill_dict()

    name, ext = op.splitext(cnsfile)
    real_cns_fh = open("%s.real%s" % (name, ext), "w")
    print >> sys.stderr, "writing to:", real_cns_fh.name
    outdir = op.dirname(cnsfile)
    print >> real_cns_fh, "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval"

    crna = read_cns_to_rna(outdir)
    cpro = read_cns_to_protein_exons(outdir)

    #cns_items = list(parse_raw_cns(cnsfile))
    proteins = collections.defaultdict(list)
    rnas = collections.defaultdict(list)
    real_cns_items = []
    for cnsi in CNS.parse_raw_line(cnsfile):
        cns_id = cnsi.cns_id
        cns = cnsi.to_dict()
        key = (cns['qseqid'], cns['sseqid'])
        if cns_id in cpro:
            proteins[key].append((cns, cpro[cns_id]))
        elif cns_id in crna:
            rnas[key].append((cns, crna[cns_id]))
        else:
            real_cns_items.append((cns_id, cns))
    p_trees = fill_tree(proteins)
    r_trees = fill_tree(rnas)

    def assign_new_names(prs, protein_or_rna):
        n = {}
        for seqid_pair, li in prs.iteritems():
            if not seqid_pair in n: n[seqid_pair] = []
            for gnew, info in li[:]:
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                # and give them both an id so we know they were a pair.
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                #print >>sys.stderr, gnew['qaccn'], cns["qaccn"]
                try:
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except:
                    print >> sys.stderr, gnew
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                n[seqid_pair].append((gnew, info))
        return n

    nproteins = assign_new_names(proteins, "protein")
    nrnas = assign_new_names(rnas, "rna")

    cns_seen = {}
    # go through the remaining cnss, print and assign them to the new
    # genes (previously cnss) in within dist.
    for cns_id, cns in real_cns_items:
        print >> real_cns_fh, cns_to_str(cns)
        key = (cns['qseqid'], cns['sseqid'])

        for pnew, info in get_new(cns, p_trees, key, nproteins, dist + 1000):
            cns['qaccn'] = pnew['qaccn']
            cns['saccn'] = pnew['saccn']
            cns_str = cns_to_str(cns)
            if cns_str in cns_seen: continue
            cns_seen[cns_str] = 1
            print >> real_cns_fh, cns_str

        for rnew, info in get_new(cns, r_trees, key, nrnas, dist + 1000):
            cns['qaccn'] = rnew['qaccn']
            cns['saccn'] = rnew['saccn']
            cns_str = cns_to_str(cns)
            if cns_str in cns_seen: continue
            cns_seen[cns_str] = 1
            print >> real_cns_fh, cns_str

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    print >> sys.stderr, len(qnew_pairs)
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, we add the new cnss again to the same we send in both lists.
    # print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new.path, sbed_new.path, qnew_pairs
示例#15
0
                      help="path to query localdup_file")
    parser.add_option("--sdups",
                      dest="sdups",
                      type='string',
                      help="path to subject localdup_file")
    parser.add_option("--cns_file",
                      dest="cns_file",
                      type='string',
                      help="path to cns file cns.txt")
    parser.add_option("--UMfasta",
                      dest="unmasked_fasta",
                      help="path to unmasked fasta file file")
    (options, _) = parser.parse_args()

    qbed = Bed(options.qbed, options.qfasta)
    qbed.fill_dict()
    sbed = Bed(options.sbed, options.sfasta)
    sbed.fill_dict()
    unmasked_fasta = Fasta(options.unmasked_fasta)
    assert options.mask in 'FT'

    qnolocaldups_path = qbed.path.split(".")[0] + ".nolocaldups.bed"
    snolocaldups_path = sbed.path.split(".")[0] + ".nolocaldups.bed"
    #pairs_to_qa("{0}.local".format(options.pairs),'pair',"{0}.nolocaldups.local".format(qbed.path.split(".")[0]),"{0}.nolocaldups.local".format(sbed.path.split(".")[0]),"{0}.raw.filtered.local".format(options.pairs.split(".")[0]))

    import logging
    LOG_FILENAME = path.dirname(options.qfasta) + "dup_rdups.log"
    logging.basicConfig(filename=LOG_FILENAME, level=logging.INFO)

    main(options.cns_file, options.qdups, options.sdups, options.pairs,
         options.pair_fmt, qbed, sbed, options.qpad, options.spad,
示例#16
0
def main(qbed_path, sbed_path, cnsfile, dist, orthology_path):
    """
    here, we remove cnss that have been called proteins/rnas from 
    the cns list, and add them to the bed files.
    AND have to do the preliminary assignment of cnss that remain to the new-genes
    that _were_ cnss. the proper assignment is then handled in assign.py
    """
    qcns_file = qbed_path.replace(".bed", "_cns.gff")
    assert qcns_file != qbed_path
    qcns_gff = open(qcns_file, 'w')
    print >>qcns_gff, "##gff-version 3"
    if sbed_path != qbed_path:
        scns_file = sbed_path.replace(".bed", "_cns.gff")
        assert scns_file != sbed_path
        scns_gff = open(scns_file, 'w')
        print >>scns_gff, "##gff-version 3"
    else: scns_gff = qcns_gff

    qrawbed = RawBed(qbed_path)
    srawbed = RawBed(sbed_path)
  
    ortho_trees = read_orthos_to_trees(orthology_path, qrawbed,srawbed)
    
    qbed = Bed(qbed_path); qbed.fill_dict()
    sbed = Bed(sbed_path); sbed.fill_dict()

    name, ext = op.splitext(cnsfile)
    real_cns_fh = open("%s.real%s" % (name, ext), "w")
    print >>sys.stderr, "writing to:", real_cns_fh.name
    outdir = op.dirname(cnsfile)
    print >>real_cns_fh, "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval"

    crna = read_cns_to_rna(outdir)
    cpro = read_cns_to_protein_exons(outdir)

    #cns_items = list(parse_raw_cns(cnsfile))
    proteins = collections.defaultdict(list)
    rnas = collections.defaultdict(list)
    real_cns_items = []
    for cnsi in CNS.parse_raw_line(cnsfile):
        cns_id = cnsi.cns_id
        cns = cnsi.to_dict()
        key = (cns['qseqid'], cns['sseqid'])
        if cns_id in cpro:
            proteins[key].append((cns, cpro[cns_id]))
        elif cns_id in crna:
            rnas[key].append((cns, crna[cns_id]))
        else:
            real_cns_items.append((cns_id, cns))
    p_trees = fill_tree(proteins)
    r_trees = fill_tree(rnas)

    def assign_new_names(prs, protein_or_rna):
        n = {}
        for seqid_pair, li in prs.iteritems():
            if not seqid_pair in n: n[seqid_pair] = []
            for gnew, info in li[:]:
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                # and give them both an id so we know they were a pair.
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                #print >>sys.stderr, gnew['qaccn'], cns["qaccn"]
                try:
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except:
                    print >>sys.stderr, gnew
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                n[seqid_pair].append((gnew, info))
        return n
    nproteins = assign_new_names(proteins, "protein")
    nrnas = assign_new_names(rnas, "rna")

    cns_seen = {}
    # go through the remaining cnss, print and assign them to the new
    # genes (previously cnss) in within dist.
    for cns_id, cns in real_cns_items:
        print >>real_cns_fh, cns_to_str(cns)
        key = (cns['qseqid'], cns['sseqid'])
        
        for pnew, info in get_new(cns, p_trees, key, nproteins, dist + 1000):
            cns['qaccn'] = pnew['qaccn']
            cns['saccn'] = pnew['saccn']
            cns_str = cns_to_str(cns)
            if cns_str in cns_seen: continue
            cns_seen[cns_str] = 1
            print >>real_cns_fh, cns_str

        for rnew, info in get_new(cns, r_trees, key, nrnas, dist + 1000):
            cns['qaccn'] = rnew['qaccn']
            cns['saccn'] = rnew['saccn']
            cns_str = cns_to_str(cns)
            if cns_str in cns_seen: continue
            cns_seen[cns_str] = 1
            print >>real_cns_fh, cns_str

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    print >> sys.stderr, len(qnew_pairs)
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, we add the new cnss again to the same we send in both lists.
    # print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new.path, sbed_new.path, qnew_pairs
示例#17
0
    import optparse
    parser = optparse.OptionParser("usage: %prog [options] ")
    parser.add_option("-F", dest="mask", help="blast mask simple sequence [default: F]", default="F")
    parser.add_option("-n", dest="ncpu", help="parallelize to this many cores", type='int', default=8)
    parser.add_option("-q", dest="qfasta", help="path to genomic query fasta")
    parser.add_option("--qbed", dest="qbed", help="query bed file")
    parser.add_option("-s", dest="sfasta", help="path to genomic subject fasta")
    parser.add_option("--sbed", dest="sbed", help="subject bed file")
    parser.add_option("-p", dest="pairs", help="the pairs file. output from dagchainer")
    choices = ("dag", "cluster", "pair", 'qa', 'raw')
    parser.add_option("--pair_fmt", dest="pair_fmt", default='raw',
                      help="format of the pairs, one of: %s" % str(choices),
                      choices=choices)
    parser.add_option("--qpad", dest="qpad", type='int', default=12000,
                      help="how far from the end of the query gene to look for cnss")
    parser.add_option("--spad", dest="spad", type='int', default=26000,
                    help="how far from the end of the subject gene to look for cnss")
    parser.add_option("--UMfasta", dest="unmasked_fasta", help="path to unmasked fasta file file")
    (options, _) = parser.parse_args()


    if not (options.qfasta and options.sfasta and options.sbed and options.qbed):
        sys.exit(parser.print_help())

    qbed = Bed(options.qbed, options.qfasta); qbed.fill_dict()
    sbed = Bed(options.sbed, options.sfasta); sbed.fill_dict()
    unmasked_fasta = Fasta(options.unmasked_fasta)
    assert options.mask in 'FT'

    main(qbed, sbed, options.pairs, options.qpad, options.spad, unmasked_fasta, options.pair_fmt, options.mask, options.ncpu)
示例#18
0
 def test_main(self):
     """test for test_get_cns_dict"""
     qbed = Bed(self.qbed, self.qfasta); qbed.fill_dict()
     sbed = Bed(self.sbed, self.sfasta); sbed.fill_dict()
     x = main(qbed, sbed, self.pairs, 12000,12000, "pair", self.blast_path, "T",2)
     print x
示例#19
0
        spos = sbed[raw.pos_b]
        key = (raw.seqid_a, raw.seqid_b)
        if not key in trees: trees[key] = []
        qpos = (qpos['start'] + qpos['end']) / 2
        spos = (spos['start'] + spos['end']) / 2
        trees[key].append((int(qpos), int(spos)))
    for k in trees:
        trees[k] = cKDTree(trees[k])
    return trees

if __name__ == "__main__":
    import optparse
    parser = optparse.OptionParser()
    parser.add_option("--qbed", dest="qbed", help="query bed file")
    parser.add_option("--sbed", dest="sbed", help="subject bed file")
    parser.add_option("--cns", dest="cns", help="path to raw cns")
    parser.add_option("--dist", dest="dist", type='int', help="max dist from gene to cns", default=12000)
    parser.add_option("--paralogy", dest="paralogy", help="path to paralogy file")
    parser.add_option("--orthology", dest="orthology", help="path to orthology file")

    options, args = parser.parse_args()    

    if not (options.sbed and options.qbed and options.cns, options.orthology):
        sys.exit(parser.print_help())

    qbed = Bed(options.qbed); qbed.fill_dict()
    sbed = Bed(options.sbed); sbed.fill_dict()
    
    qbed_new, sbed_new, new_pairs = main(qbed, sbed, options.cns, options.dist, options.orthology)
    write_new_pairs(options.paralogy, options.orthology, qbed, qbed_new, sbed, sbed_new, new_pairs) 
示例#20
0
            print >> fcnss, "%s,%s,%s,[%s,%s],%s,%s" % (qname, qfeat['seqid'], sname, sfeat['qleft_gene'], sfeat['qright_gene'], sfeat['seqid'],
                             ",".join(map(lambda l: ",".join(map(str,l)), cnss)))

    return None

if __name__ == "__main__":
    import optparse
    parser = optparse.OptionParser("usage: %prog [options] ")
    parser.add_option("-F", dest="mask", help="blast mask simple sequence [default: F]", default="F")
    parser.add_option("-n", dest="ncpu", help="parallelize to this many cores", type='int', default=8)
    parser.add_option("-q", dest="qfasta", help="path to genomic query fasta")
    parser.add_option("--qbed", dest="qbed", help="query bed file")
    parser.add_option("-s", dest="sfasta", help="path to genomic subject fasta")
    parser.add_option("--sbed", dest="sbed", help="subject bed file")
    parser.add_option("-p", dest="pairs", help="the pairs file. output from dagchainer")
    choices = ("dag", "cluster", "pair", 'qa', 'raw', 'pck')
    parser.add_option("--pair_fmt", dest="pair_fmt", default='raw',
                          help="format of the pairs, one of: %s" % str(choices),
                          choices=choices)
    (options, _) = parser.parse_args()


    if not (options.qfasta and options.sfasta and options.sbed and options.qbed):
        sys.exit(parser.print_help())

    qbed = Bed(options.qbed, options.qfasta); qbed.fill_dict()
    sbed = Bed(options.sbed, options.sfasta); sbed.fill_dict()
    assert options.mask in 'FT'

    main(qbed, sbed, options.pairs, options.pair_fmt, options.mask, options.ncpu)