Пример #1
0
def main():
    """Scan a MAF alignment with weight matrices and print threshold hits.

    Usage: ``prog transfac|basic pwmfile inmaf threshold spec1,spec2,...``

    Every matrix read from ``pwmfile`` is keyed by its ``id`` and handed to
    ``MafBlockScorer`` together with the species list and each alignment
    read from ``inmaf``. For each matrix and each column offset of a scored
    block, if at least one of the listed species scores strictly above
    ``threshold``, one line is printed to stdout with:

    * the second dot-separated field of the block's first header source,
    * the start: header start + offset - number of ``-`` characters in the
      first component's text before that offset,
    * the end: start + ``len()`` of the matrix,
    * the matrix id with spaces replaced by underscores,
    * the score of every listed species at that offset (``%.2f``).

    With fewer than five arguments a usage line is written to stderr and
    the process exits.
    """
    if len(sys.argv) < 6:
        print("%s transfac|basic pwmfile inmaf threshold spec1,spec2,... " %
              sys.argv[0],
              file=sys.stderr)
        # NOTE(review): exit status 0 on a usage error is kept as-is so
        # existing callers see no change; consider a non-zero status.
        sys.exit(0)

    pwm_format = sys.argv[1]
    # Context manager so the matrix file is closed once all matrices are read.
    with open(sys.argv[2]) as pwm_file:
        pwm = {wm.id: wm for wm in pwmx.Reader(pwm_file, format=pwm_format)}

    threshold = float(sys.argv[4])
    species = sys.argv[5].split(',')
    species_idx = range(len(species))

    with open(sys.argv[3]) as inmaf:
        for maf in align_maf.Reader(inmaf):
            # Gapped text of the first component; used to convert a column
            # offset into an ungapped coordinate.
            reftext = maf.components[0].text

            # maf block scores for each matrix
            for scoremax, width, headers in MafBlockScorer(pwm, species, maf):
                mafsrc, mafstart, _mafend = headers[0]
                mafchrom = mafsrc.split('.')[1]

                # lists of scores for each position in scoremax
                for motif_id, mx in scoremax.items():
                    for offset in range(width):
                        # Report a column once if any species passes.
                        if not any(mx[i][offset] > threshold
                                   for i in species_idx):
                            continue

                        refstart = mafstart + offset - reftext.count(
                            '-', 0, offset)
                        refend = refstart + len(pwm[motif_id])
                        data = " ".join(
                            "%.2f" % mx[i][offset] for i in species_idx)
                        # underscore spaces in the name
                        print(mafchrom, refstart, refend,
                              motif_id.replace(' ', '_'), data)
Пример #2
0
def scoreMaf(maf, motif, sources, sourceDist):
    """
    Score a single maf against a single motif.

    For every block yielded by ``MafBlockScorer`` and every entry of
    ``sources``, each non-NaN score is passed through
    ``sigmoid(val, .8, 20)`` and accumulated per position, both as-is and
    combined with ``weight_fxn(sourceDist[srcName])``.

    maf        -- alignment object; ``maf.text_size`` sizes the accumulators
    motif      -- weight matrix; ``motif.id`` is used as its lookup key
    sources    -- sequence of source names, one score row per entry
    sourceDist -- mapping from source name to the value given to
                  ``weight_fxn``

    NOTE(review): the original docstring says the score matrices are
    returned, but no ``return`` statement is visible in this function, so
    it returns None as written -- confirm whether the tail of the function
    was lost.
    """
    pwms = {motif.id: motif}
    wtCoef, valCoef = 1, 1
    fullSize = maf.text_size
    summedWeightedScore = np.zeros(shape=(fullSize, 1))
    summedScore = np.zeros(shape=(fullSize, 1))
    componentScores = np.zeros(shape=(len(sources), fullSize))
    # NOTE(review): `weights` is allocated but never written in the visible
    # code; kept in case it belongs to the missing return value.
    weights = np.zeros(shape=(fullSize, 1))
    componentWeightedScores = np.zeros(shape=(len(sources), fullSize))

    for scores, width, headers in MafBlockScorer(pwms, sources, maf):
        motifScores = scores[motif.id]

        for speciesN, srcName in enumerate(sources):
            similarity = motifScores[speciesN]

            for pos, val in enumerate(similarity):
                # NaN entries contribute nothing to any accumulator.
                if np.isnan(val):
                    continue

                val = sigmoid(val, .8, 20)
                componentScores[speciesN, pos] = val
                summedScore[pos] += val

                weight = weight_fxn(sourceDist[srcName])
                # The score counts once unweighted and once scaled by the
                # source weight (both coefficients are currently 1).
                weightedScore = (valCoef * val) + wtCoef * (val * weight)

                summedWeightedScore[pos] += weightedScore
                componentWeightedScores[speciesN, pos] = weightedScore
def main():
    """Report motif hits in a MAF alignment that overlap BED intervals.

    Usage: ``prog bedfile inmaf spec1,spec2,... motif_file``

    Intervals from ``bedfile`` (chrom, start, end and an optional name in
    the fourth column) are stored per chromosome. Every matrix read from
    ``motif_file`` is scored against each alignment in ``inmaf`` via
    ``MafBlockScorer``. For each matrix and column offset where at least
    one listed species scores strictly above the fixed threshold of 0.5,
    the hit is printed to stdout if the interval lookup for its
    chromosome returns at least one region:

        chrom start end region_label motif_name score1 score2 ...

    ``region_label`` is the stored value of the first region returned, the
    motif name has spaces replaced by underscores, and scores use ``%.2f``.
    Hits on chromosomes that have no intervals are skipped.

    With fewer than four arguments a usage line is written to stderr and
    the process exits. Output is produced with constructs that behave the
    same under the print statement and the print function.
    """
    if len(sys.argv) < 5:
        sys.stderr.write(
            "%s bedfile inmaf spec1,spec2,... motif_file \n" % sys.argv[0])
        # NOTE(review): exit status 0 on a usage error is kept as-is so
        # existing callers see no change; consider a non-zero status.
        sys.exit(0)

    # read in intervals
    regions = {}
    with open(sys.argv[1]) as bed_file:
        for line in bed_file:
            if line.startswith('#'):
                continue
            fields = line.strip().split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            # The name column is optional.
            name = fields[3] if len(fields) > 3 else None
            if chrom not in regions:
                regions[chrom] = intervals.Intersecter()
            regions[chrom].add(start, end, name)

    pwm = {}
    with open(sys.argv[4]) as motif_file:
        for wm in pwmx.Reader(motif_file):
            pwm[wm.id] = wm
            sys.stderr.write("%s %s\n" % (wm.id, len(wm)))

    threshold = 0.5
    species = sys.argv[3].split(',')
    species_idx = range(len(species))

    with open(sys.argv[2]) as inmaf:
        for maf in align_maf.Reader(inmaf):
            # Gapped text of the first component; used to convert a column
            # offset into an ungapped coordinate.
            reftext = maf.components[0].text

            # maf block scores for each matrix
            for scoremax, width, headers in MafBlockScorer(pwm, species, maf):
                mafsrc, mafstart, _mafend = headers[0]
                mafchrom = mafsrc.split('.')[1]

                # Look the chromosome up before querying it: indexing
                # `regions` directly raised KeyError for chromosomes that
                # never appeared in the BED file.
                chrom_regions = regions.get(mafchrom)
                if chrom_regions is None:
                    continue

                # lists of scores for each position in scoremax
                for mx_name, mx in scoremax.items():
                    for offset in range(width):
                        # Report a column once if any species passes.
                        if not any(mx[i][offset] > threshold
                                   for i in species_idx):
                            continue

                        refstart = mafstart + offset - reftext.count(
                            '-', 0, offset)
                        refend = refstart + len(pwm[mx_name])

                        overlaps = chrom_regions.find(refstart, refend)
                        if len(overlaps) == 0:
                            continue
                        region_label = overlaps[0].value

                        data = " ".join(
                            "%.2f" % mx[i][offset] for i in species_idx)
                        v_name = mx_name.replace(' ', '_')
                        # Single pre-joined string so the line is identical
                        # whether print is a statement or a function.
                        print(" ".join(str(item) for item in (
                            mafchrom, refstart, refend, region_label,
                            v_name, data)))