Пример #1
0
def _main(args):

    if len(args) != 3:
        print("usage: xls_get_region_from_fasta.py <fasta> <xls> <window>")
        sys.exit(1)

    win = int(args[2])

    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(args[0])

    seqs = []

    for ln in open(args[1]):
        sp = ln[:-1].split()
        print sp
        pk = int(sp[1]) + int(sp[4])
        seq = fasta[sp[0]]['sequence'][(pk - win):(pk + win)]
        get_in = sp[
            -1]  #raw_input(">%s:%d..%d\n\'k\'=keep; \'r\' = reverse comp; \'<anything else>\' = discard: " % (sp[0],pk-win,pk+win))
        if get_in == 'k':
            pass
        elif get_in == 'r':
            seq = fasta_subseq_2.revcomp(seq)
        else:
            continue

        seqs.append(">%s:%d..%d\n%s" % (sp[0], pk - win, pk + win, seq))

    outfile = raw_input("name of output file: ")
    outfh = open(outfile, "w")
    for s in seqs:
        print >> outfh, s
def get_enrich(xls_regs, sgr, winsize, fasta):
    """Attach sequence and SGR signal statistics to the motif hits of regions.

    Parameters
    ----------
    xls_regs : iterable of dict
        Each region must carry the keys ``'mtx1_hits'`` and ``'mtx2_hits'``,
        each a list of hit dicts with ``'chr'``, ``'start'``, ``'end'`` and
        ``'strand'``.
    sgr : str
        Path of a whitespace-delimited file whose lines hold exactly three
        fields: chromosome, integer position, integer value.
    winsize : number
        Full width of the window; values closer than ``winsize / 2`` to a
        hit's location are collected for that hit.
    fasta : str
        Path handed to ``fasta_subseq_2.FastaDB.openFastaFile``.

    Returns
    -------
    The same ``xls_regs`` object. Every region gains ``'mtx1_hits_info'``
    and ``'mtx2_hits_info'`` lists; each entry holds ``hit_obj``, ``loc``,
    ``seq``, ``nearest`` (position, value of the closest SGR line), ``vals``
    (values inside the window), ``win_mean``, ``win_median``, ``enrich_mn``
    and ``enrich_md``. Progress dictionaries are echoed to stderr.
    """
    hit_keys = ('mtx1_hits', 'mtx2_hits')

    fasta_db = fasta_subseq_2.FastaDB()
    fasta_db.openFastaFile(fasta)

    # Hits grouped by chromosome so each SGR line only visits hits that can
    # match it. Traversal order (region, key, hit) is kept within a group.
    hits_by_chrom = {}

    for reg in xls_regs:
        for key in hit_keys:
            hit_info = []
            for h in reg[key]:
                width = abs(h['start'] - h['end'])
                chrom_seq = fasta_db[h['chr']]['sequence']
                if h['strand'] == "+":
                    loc = h['start']
                    seq = chrom_seq[loc:loc + width]
                else:
                    ### !!!!! CHANGE IF FIX HIT DATABASE!!!!
                    # Non-'+' hits are anchored on 'end' and reverse
                    # complemented.
                    loc = h['end']
                    seq = fasta_subseq_2.revcomp(chrom_seq[loc:loc + width])
                # 'loc' is the same coordinate the sequence was cut from;
                # previously an unconditional assignment of h['end'] also
                # overwrote the '+' strand value.
                hit = {
                    'hit_obj': h,
                    'loc': loc,
                    'nearest': (0, 0),
                    'vals': [],
                    'seq': seq,
                }
                hit_info.append(hit)
                hits_by_chrom.setdefault(h['chr'], []).append(hit)
            reg[key + '_info'] = hit_info

    # Loop invariant. NOTE: on Python 2 an integer winsize is floor-divided.
    half_window = winsize / 2

    with open(sgr) as sgr_file:
        for line in sgr_file:
            (chrom, loc, val) = line.split()
            loc = int(loc)
            val = int(val)
            for d in hits_by_chrom.get(chrom, ()):
                distance = abs(loc - d['loc'])
                # Keep whichever SGR position is closest to the hit so far.
                if distance < abs(loc - d['nearest'][0]):
                    d['nearest'] = (loc, val)
                if distance < half_window:
                    d['vals'].append(val)
                    sys.stderr.write("%s\n" % (d,))

    for reg in xls_regs:
        for key in hit_keys:
            for h in reg[key + '_info']:
                h['win_mean'] = np.mean(h['vals'])
                h['win_median'] = np.median(h['vals'])
                h['enrich_md'] = h['nearest'][1] / h['win_median']
                h['enrich_mn'] = h['nearest'][1] / h['win_mean']
                sys.stderr.write("%s\n" % (h,))

    return xls_regs
Пример #3
0
def _main(args):

    if len(args) < 1:
        print "usage: revcomseq.py <fasta>"
        sys.exit(0)

    sq = ""
    header = ""
    for l in open(args[0]):
        if re.search("^>",l):
            header = l[:-1] + "_rc"
            sq=""
            if (sq):
                tr_seq = re.sub("\n","",sq)
                print "%s\n%s" % (header,revcomp(tr_seq))
        else:
            sq += l
    tr_seq = re.sub("\n","",sq)
    print "%s\n%s" % (header,revcomp(tr_seq))
Пример #4
0
def _main(args):

    if len(args) < 4:
        print >> sys.stderr, "usage: xls_motif_window.py <xls> <fasta> <matrix_file> <window>"
        sys.exit(1)

    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(args[1])
    xls_regions = []
    for x in open(args[0]):
        spl = x[:-1].split()
        region = {
            'chr': spl[0],
            'start': int(spl[1]),
            'end': int(spl[2]),
            'enrich': spl[7]
        }
        region['seq'] = fasta[
            region['chr']]['sequence'][region['start']:region['end']]
        xls_regions.append(region)

    for r in xls_regions:
        try:
            annot = patser_tools.makePatserAnnotation(sequence=r['seq'],
                                                      matrix=args[2])
        except IOError:
            print >> sys.stderr, "Error in seq %s:%d..%d:" % (
                r['chr'], r['start'], r['end'])
            continue
        if len(annot.getAllFeatures()) < 1:
            continue
        maxhit = annot.getMaxFeature("score")
        winstart = None
        winend = None
        winseq = None
        if maxhit.tags["strand"] == '+':
            winstart = r['start'] + (maxhit.start - int(args[3]) / 2)
            winend = r['start'] + (maxhit.start + int(args[3]) / 2)
            win_seq = fasta[r['chr']]['sequence'][winstart:winend]
        else:
            winstart = r['start'] + ((maxhit.end - 3) - int(args[3]) / 2)
            winend = r['start'] + ((maxhit.end - 3) + int(args[3]) / 2)
            win_seq = fasta_subseq_2.revcomp(
                fasta[r['chr']]['sequence'][winstart:winend])
        print ">%s:%d..%d:%s enr=%s mtx=%s" % (
            r['chr'], winstart, winend, maxhit.tags['strand'], r['enrich'],
            maxhit.tags['score'])
        print win_seq
Пример #5
0
def run_aln_mapline(map_line):
    """Write one FASTA file per species and align them with Pecan.

    ``map_line`` is a dict providing:

    * ``'fastas'``   -- iterable of ``(species, fasta)`` pairs, where
      ``fasta[chr]`` yields a sliceable sequence,
    * ``'map_dict'`` -- per-species dict with ``'chr'``, ``'start'``,
      ``'end'`` and ``'strand'``,
    * ``'map_idx'``  -- integer used to name the temporary and output files,
    * ``'tree'``     -- tree string passed to Pecan's ``-E`` option (a
      ``;`` is appended here).

    For each species the slice ``start:end - 1`` is written to
    ``<map_idx>_<species>.fa`` (reverse complemented for strand ``"-"``).
    Pecan is then run through ``sub.check_call`` with ``<map_idx>.mfa`` as
    its ``-G`` argument, and the per-species files are deleted once it
    returns successfully. A failing Pecan call propagates its exception and
    leaves the per-species files in place.
    """
    idx = map_line['map_idx']
    files = []

    for (sp, fasta) in map_line['fastas']:
        sp_map = map_line['map_dict'][sp]
        outname = str(idx) + "_" + sp + ".fa"

        sys.stderr.write("%s\n" % (sp_map,))
        sys.stderr.write("%s\n" % (sp,))

        # NOTE(review): the end bound is reduced by one before slicing;
        # kept as in the original -- confirm the coordinate convention.
        seq = fasta[sp_map['chr']][sp_map['start']:sp_map['end'] - 1]
        if sp_map['strand'] == "-":
            print(sp + " revcomp")
            seq = fasta_subseq_2.revcomp(seq)

        # Context manager so the handle is closed even if a write fails.
        with open(outname, "w") as out:
            out.write(">%s %s-%d:%d\n" % (sp, sp_map['chr'], sp_map['start'],
                                          sp_map['end']))
            out.write("%s\n" % (seq,))
        print("Wrote fasta %s" % (outname, ))
        files.append(outname)

    call_pecan = [
        "java", "-classpath", PECANPATH, "-Xmx2000m", "bp.pecan.Pecan", "-E",
        map_line['tree'] + ";", "-F"
    ]
    call_pecan.extend(files)
    call_pecan.extend(["-G", str(idx) + ".mfa"])

    print("Running pecan with command: %s" % (" ".join(call_pecan)))
    print("Starting alignment %d..." % (idx, ))
    sub.check_call(call_pecan)
    print("Alignment %d finished" % (idx, ))

    # Explicit loop: map() used only for its side effects does nothing on
    # interpreters where map is lazy.
    for path in files:
        os.remove(path)
Пример #6
0
def _main(args):

    if len(args) != 3:
        print "usage: <bed_file> <seq_file> <matrix>"
        sys.exit(0)

    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(args[1])

    bed_annots = []
    bed_in = open(args[0])

    for line in bed_in:

        spl = line[:-1].split()
        fseq = fasta[spl[0]]["sequence"][int(spl[1]):int(spl[2])]
        if spl[5] == "-":
            fseq = fasta_subseq_2.revcomp(fseq)
        #print spl
        try:
            patannot = patser_tools.makePatserAnnotation(sequence=fseq,
                                                         matrix=args[2])
        except:
            continue
        #print "-" * 30
        #print spl
        #print pp(patannot.getAllFeatures())
        bed_annots.append({
            "seq": spl[0] + "_" + spl[1] + "_" + spl[2],
            "annotation": patannot
        })

    for ann in bed_annots:
        for feat in ann["annotation"].getAllFeatures():
            print "%s\t%i\t%i\t%f\t%f\t%s" % (
                ann["seq"], feat.st, feat.en, feat.tagset["score"],
                feat.tagset["pval"], feat.tagset["strand"])
Пример #7
0
def _main(args):

    if len(args) < 1:
        print "usage: patser_list.py [mtx1][mtx2] ... < seqs.fa"
        sys.exit(1)

    matrices = args
    seqs = sys.stdin
    hits = {}
    name = ""
    seq = ""
    pssms = convertFreqMtx(matrices)
    for s in seqs:
        nameres = re.search(">(\S+)", s)

        if nameres and not (name == ""):
            hits[name] = {'seq': seq}
            name = nameres.group(1)
            seq = ""
        elif nameres and (name == ""):
            #print "1"
            name = nameres.group(1)
            seq = ""
        else:
            seq += s[:-1]
    if not (name == ""):
        hits[name] = {'seq': seq}

    #print hits

    mtx_names = []
    for (name, d) in hits.iteritems():
        for mtx in matrices:
            hit_annot = patser_tools.makePatserAnnotation(sequence=d['seq'],
                                                          matrix=mtx,
                                                          seqname=name,
                                                          scorecut=-100)
            features = hit_annot.getAllFeatures()
            hit = None
            if len(features) > 0:
                max = features[0]
                for x in features:
                    if x.tags['score'] > max.tags['score']:
                        max = x
                hit = max
            else:
                print >> sys.stderr, "Sequence %s: No hit for matrix %s in %s" % (
                    name, mtx, d['seq'])
                continue
            #print hit
            d[hit.tags['motif_name']] = hit
            if hit.tags['motif_name'] not in mtx_names:
                mtx_names.append(hit.tags['motif_name'])
    #print hits
    print "name\tsequence\t",
    for x in mtx_names:
        print "%s_score\t%s_pval\t%s_PSSM_score" % (x, x, x),
    print ""
    for (name, h) in hits.iteritems():
        matrices = [x for x in h.keys() if not x == 'seq']
        print "%s\t%s\t" % (name, h['seq']),
        for x in matrices:
            print str(h[x].tags['score']) + "\t",
            if 'pval' in h[x].tags.keys():
                print str(h[x].tags['pval']) + "\t",
            else:
                print "-",
            pssm_scores = [
                scoreSeq(pssms[x], h['seq']),
                scoreSeq(pssms[x], fasta_subseq_2.revcomp(h['seq']))
            ]
            score = None
            if pssm_scores[1] > pssm_scores[0]:
                score = pssm_scores[1]
            else:
                score = pssm_scores[0]
            print str(score) + "\t",
        print ""