def main():
    """Scan a MAF file with weight matrices and print positions above a threshold.

    Command line (read from ``sys.argv``)::

        prog transfac|basic pwmfile inmaf threshold spec1,spec2,...

    * ``argv[1]`` -- matrix file format, passed to ``pwmx.Reader`` as ``format``.
    * ``argv[2]`` -- path of the matrix file.
    * ``argv[3]`` -- path of the MAF file.
    * ``argv[4]`` -- score threshold (parsed with ``float``).
    * ``argv[5]`` -- comma-separated species names.

    For every alignment column where at least one species scores strictly
    above the threshold, one line is printed to stdout: chromosome, start,
    end, matrix id (spaces replaced by underscores) and the ``%.2f``
    formatted score of every species at that column.

    With fewer than five arguments a usage message is written to stderr and
    the process exits.
    """
    if len(sys.argv) < 6:
        print("%s transfac|basic pwmfile inmaf threshold spec1,spec2,... " % sys.argv[0], file=sys.stderr)
        # NOTE: exit status 0 is kept for backward compatibility with callers.
        sys.exit(0)

    pwm_format = sys.argv[1]
    # Load all matrices up front so the matrix file can be closed promptly.
    with open(sys.argv[2]) as pwm_file:
        pwm = {wm.id: wm for wm in pwmx.Reader(pwm_file, format=pwm_format)}

    threshold = float(sys.argv[4])
    species = sys.argv[5].split(',')
    n_species = len(species)

    with open(sys.argv[3]) as inmaf:
        for maf in align_maf.Reader(inmaf):
            reftext = maf.components[0].text

            # maf block scores for each matrix
            for scoremax, width, headers in MafBlockScorer(pwm, species, maf):
                mafsrc, mafstart, _mafend = headers[0]
                mafchrom = mafsrc.split('.')[1]

                # lists of scores for each position in scoremax
                for motif_id, mx in scoremax.items():
                    for offset in range(width):
                        # report the column once if any species passes the threshold
                        if not any(mx[i][offset] > threshold for i in range(n_species)):
                            continue

                        # Subtract the '-' characters that precede this column
                        # in the first component's text (presumably converting
                        # the alignment column to an ungapped coordinate).
                        refstart = mafstart + offset - reftext.count('-', 0, offset)
                        refend = refstart + len(pwm[motif_id])
                        data = " ".join("%.2f" % mx[x][offset] for x in range(n_species))
                        # underscore spaces in the name
                        print(mafchrom, refstart, refend, motif_id.replace(' ', '_'), data)
def scoreMaf(maf, motif, sources, sourceDist):
    """
    Score a single maf against a single motif.

    The motif is handed to ``MafBlockScorer`` (keyed by ``motif.id``) together
    with ``sources`` and ``maf``.  Every non-NaN score is passed through
    ``sigmoid(val, .8, 20)`` and accumulated into per-source and per-position
    arrays, both unweighted and weighted by ``weight_fxn(sourceDist[source])``.

    NOTE(review): no ``return`` statement is visible in this function, so as
    written the accumulated arrays stay local -- confirm whether a return was
    intended.
    """
    motif_table = {}
    motif_table[motif.id] = motif

    weight_coef = 1
    value_coef = 1

    n_columns = maf.text_size
    n_sources = len(sources)

    # Accumulators: one column vector per total, one row per source otherwise.
    summedWeightedScore = np.zeros(shape=(n_columns, 1))
    summedScore = np.zeros(shape=(n_columns, 1))
    componentScores = np.zeros(shape=(n_sources, n_columns))
    weights = np.zeros(shape=(n_columns, 1))
    componentWeightedScores = np.zeros(shape=(n_sources, n_columns))

    for scores, _width, _headers in MafBlockScorer(motif_table, sources, maf):
        motif_scores = scores[motif.id]
        for row, source_name in enumerate(sources):
            for col, raw in enumerate(motif_scores[row]):
                if np.isnan(raw):
                    continue

                squashed = sigmoid(raw, .8, 20)
                componentScores[row, col] = squashed
                summedScore[col] += squashed

                source_weight = weight_fxn(sourceDist[source_name])
                # plain value plus the same value scaled by the source weight
                combined = (value_coef * squashed) + weight_coef * (squashed * source_weight)
                componentWeightedScores[row, col] = combined
                summedWeightedScore[col] += combined
def main():
    """Scan a MAF file with weight matrices and print hits that overlap BED intervals.

    Command line (read from ``sys.argv``)::

        prog bedfile inmaf spec1,spec2,... motif_file

    * ``argv[1]`` -- whitespace-separated interval file: chrom, start, end and
      an optional fourth column used as the interval label.  Lines starting
      with ``#`` and blank lines are skipped.
    * ``argv[2]`` -- path of the MAF file.
    * ``argv[3]`` -- comma-separated species names.
    * ``argv[4]`` -- path of the matrix file read with ``pwmx.Reader``.

    Each matrix id and length is echoed to stderr while loading.  For every
    alignment column where at least one species scores strictly above the
    fixed threshold of 0.5 and the hit overlaps a loaded interval, one line
    is printed to stdout: chromosome, start, end, label of the first
    overlapping interval, matrix name (spaces replaced by underscores) and
    the ``%.2f`` formatted score of every species at that column.

    With fewer than four arguments a usage message is written to stderr and
    the process exits.
    """
    if len(sys.argv) < 5:
        print("%s bedfile inmaf spec1,spec2,... motif_file " % sys.argv[0], file=sys.stderr)
        # NOTE: exit status 0 is kept for backward compatibility with callers.
        sys.exit(0)

    # read in intervals
    regions = {}
    with open(sys.argv[1]) as bedfile:
        for line in bedfile:
            if line.startswith('#'):
                continue
            fields = line.strip().split()
            if not fields:
                # blank line: nothing to parse
                continue
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            # the label column is optional
            name = fields[3] if len(fields) > 3 else None
            if chrom not in regions:
                regions[chrom] = intervals.Intersecter()
            regions[chrom].add(start, end, name)

    pwm = {}
    with open(sys.argv[4]) as motif_file:
        for wm in pwmx.Reader(motif_file):
            pwm[wm.id] = wm
            print(wm.id, len(wm), file=sys.stderr)

    threshold = 0.5
    species = sys.argv[3].split(',')
    n_species = len(species)

    with open(sys.argv[2]) as inmaf:
        for maf in align_maf.Reader(inmaf):
            reftext = maf.components[0].text

            # maf block scores for each matrix
            for scoremax, width, headers in MafBlockScorer(pwm, species, maf):
                mafsrc, mafstart, _mafend = headers[0]
                mafchrom = mafsrc.split('.')[1]

                # Check membership BEFORE indexing: a chromosome with no
                # intervals can never produce output, so skip the block
                # instead of raising KeyError.
                chrom_regions = regions.get(mafchrom)
                if chrom_regions is None:
                    continue

                # lists of scores for each position in scoremax
                for mx_name, mx in scoremax.items():
                    for offset in range(width):
                        # report the column once if any species passes the threshold
                        if not any(mx[i][offset] > threshold for i in range(n_species)):
                            continue

                        # Subtract the '-' characters that precede this column
                        # in the first component's text (presumably converting
                        # the alignment column to an ungapped coordinate).
                        refstart = mafstart + offset - reftext.count('-', 0, offset)
                        refend = refstart + len(pwm[mx_name])

                        overlaps = chrom_regions.find(refstart, refend)
                        if len(overlaps) == 0:
                            continue
                        region_label = overlaps[0].value

                        data = " ".join("%.2f" % mx[x][offset] for x in range(n_species))
                        v_name = mx_name.replace(' ', '_')
                        print(mafchrom, refstart, refend, region_label, v_name, data)