示例#1
0
def parse_dups(dups_file, flat):
    """Classify the genes named in a local-duplicates file.

    Every line of ``dups_file`` is a tab-separated list of accessions: the
    first field is the parent of a duplicate array and the remaining fields
    are its duplicates.

    Args:
        dups_file: path of the tab-delimited duplicates file.
        flat: feature collection. It must provide ``fill_dict()``, a ``d``
            mapping from accession to row, and
            ``get_features_in_region(seqid, start, end)`` yielding items
            that can be indexed with ``'accn'``.

    Returns:
        dict mapping accession to ``'P'`` (parent of an array), to the
        parent's accession (member of an array), or to ``'I'`` (feature
        returned for the array's region that is not listed in any array
        processed so far). An ``'I'`` entry is overwritten if a later line
        lists that accession as a parent or duplicate.
    """
    #####THIS ONLY WORKS IF WE CHANGE QUOTA
    flat.fill_dict()
    dup_dic = {}
    # Accessions already recorded as a parent or as a duplicate. A set keeps
    # the membership test cheap; 'I' entries are deliberately not tracked
    # here so that they can still be promoted by a later line.
    seen = set()

    # The context manager guarantees the file handle is released.
    with open(dups_file) as handle:
        for raw in handle:
            accns = raw.strip().split("\t")
            parent = accns[0]
            if not parent:
                # Blank line: nothing to look up.
                continue
            dups = accns[1:]

            members = [Bed.row_to_dict(flat.d[accn]) for accn in set(accns)]
            members.sort(key=operator.itemgetter('start'))
            dup_start = members[0]
            # NOTE(review): this is the member that starts last, which is
            # not necessarily the one with the largest 'end' -- confirm that
            # is the intended right-hand boundary of the region.
            dup_end = members[-1]

            dup_dic[parent] = 'P'
            seen.add(parent)
            for dup in dups:
                if dup in seen:
                    continue
                seen.add(dup)
                dup_dic[dup] = parent

            # so here, there are all the genes that arent part of the local
            # dup array, but we want to mark them with 'I'
            intervening = flat.get_features_in_region(
                dup_start['seqid'], dup_start['start'], dup_end['end'])
            for feat in intervening:
                # Every accession on this line is already a key of dup_dic,
                # so "not yet present" is the only test needed. The earlier
                # explicit guard compared the accession with the dup_end
                # dict and therefore never matched.
                if feat['accn'] not in dup_dic:
                    dup_dic[feat['accn']] = 'I'
    return dup_dic
示例#2
0
def _read_annos(path):
    """Load an ``accession,annotation`` file into a dict.

    Only the first comma of a line separates key from value, so the
    annotation text itself may contain commas. Blank lines are skipped; a
    non-blank line without any comma raises ValueError.
    """
    annos = {}
    with open(path) as handle:
        for raw in handle:
            record = raw.rstrip()
            if not record:
                continue
            accn, info = record.split(",", 1)
            annos[accn] = info
    return annos


def write_genelist(q_or_s, outfile, flat, pairs, orthos, mcnss, link_fmt, this_org, other_org,
        other_flat, dups, local_dups):
    """Write a tab-delimited gene list with one row per feature in ``flat``.

    The first line written is a header derived from the row format, with
    ``ortho_`` replaced by ``other_org + '_'``. Each following row holds the
    feature's accn, seqid, start, end, its ortholog accessions (comma
    joined), the ortholog CNS count, regional and local duplicate info,
    strand, the annotation of newly called genes, and a link.

    Args:
        q_or_s: ``'q'`` or ``'s'``; selects the ``*_protein_rna.anno`` file
            and is passed to ``split_pairs`` / ``split_cns`` as
            ``q_or_s == 's'``.
        outfile: path of the file to write.
        flat: features to list; ``flat.path`` locates the annotation files.
        pairs: mapping accn -> accessions paired with it in ``other_flat``.
        orthos: passed through to ``split_pairs`` and ``split_cns``.
        mcnss: mapping accn -> CNS records for that feature.
        link_fmt: %-format string filled with ``qorg``, ``sorg``, ``accn1``
            and ``accn2``.
        this_org: organism name bound to ``qorg`` in ``link_fmt``.
        other_org: organism name bound to ``sorg`` in ``link_fmt`` and used
            in the header.
        other_flat: features of the other organism; ``other_flat.d`` maps
            accession to row.
        dups: mapping accn -> regional duplicate info.
        local_dups: mapping accn -> local duplicate info.
    """
    # used in the link_fmt
    qorg, sorg = this_org, other_org

    fmt = "%(accn)s\t%(seqid)s\t%(start)i\t%(end)i\t%(ortholog)s\t%(ortho_cns)s\t"
    fmt +="%(regional_dup_info)s\t%(local_dup_info)s\t%(strand)s\t"
    fmt += "%(new_gene_info)s\t%(link)s"
    header = fmt.replace('%(', '').replace(')s','').replace(')i','')

    outdir = op.dirname(flat.path)
    annos = _read_annos("%s/%s_protein_rna.anno" % (outdir, q_or_s))
    if flat.path == other_flat.path:
        annos.update(_read_annos("%s/s_protein_rna.anno" % (outdir,)))

    is_subject = q_or_s == 's'

    # The context manager closes (and therefore flushes) the output even if
    # a feature fails part-way through.
    with open(outfile, 'w') as out:
        print >>sys.stderr, "writing genelist to %s" % (outfile,)
        print >>out, header.replace('ortho_', other_org + '_')

        for feat in flat:
            accn = feat['accn']
            these_pairs = pairs.get(accn, [])
            cnss = mcnss.get(accn, [])

            orthologs, other_pairs = split_pairs(
                feat, [other_flat.d[t] for t in these_pairs], orthos, is_subject)
            ortho_cns, _ = split_cns(cnss, orthos, is_subject)

            if orthologs:
                # The link always points at the first ortholog.
                link = link_fmt % dict(qorg=qorg, sorg=sorg,
                                       accn1=orthologs[0]['accn'], accn2=accn)
            else:
                link = ''

            new_gene_info = ""
            if accn.endswith(("_cns_protein", "_cns_rna")):
                # Missing entries come from coannotation of a previous run.
                new_gene_info = annos.get(accn, "")

            # Joining an empty list already yields "", so no special case.
            ortholog = ",".join([o["accn"] for o in orthologs])
            if len(ortho_cns) > 0 and not ortholog:
                print >>sys.stderr, "\nBAD", feat, "\n", ortho_cns, "\nthese:", these_pairs, "\nother:", other_pairs, "\n\n"
                # fell right on the edge of a syntenic block. the cns got in, but not the gene.

            # Explicit mapping of exactly the fields fmt refers to. The
            # feature's own columns are applied on top of the computed ones,
            # and the CNS count is set last.
            row = {
                'ortholog': ortholog,
                'regional_dup_info': dups.get(accn, ''),
                'local_dup_info': local_dups.get(accn, ''),
                'new_gene_info': new_gene_info,
                'link': link,
            }
            row.update(Bed.row_to_dict(feat))
            row['ortho_cns'] = len(ortho_cns) if ortholog else ""
            print >>out, fmt % row