def read_dir(self, d):
    """Load interval lookups and score arrays from directory *d*.

    If *d* does not exist as given, each entry of ``DATA_SEARCH_PATH`` is
    tried as a prefix and the first existing combination is used.

    Every file ending in ``.match`` is parsed as whitespace-separated text:
    lines starting with ``#`` or having fewer than four fields are skipped,
    and the first four fields (chromosome, start, end, label) are added to a
    per-chromosome ``intervals.Intersecter``.  Every other file is wrapped in
    a ``FileBinnedArray`` keyed by the part of its name before the first
    ``.``; only the first file seen for a given key is used.

    Results are stored on the instance:
      * ``self.lookup`` -- dict of chromosome -> Intersecter, or ``None``
        when no interval was loaded.
      * ``self.scores`` -- dict of key -> FileBinnedArray.
    """
    lookup = {}
    scores_by_what = {}

    if not os.path.exists(d):
        for prefix in DATA_SEARCH_PATH:
            candidate = os.path.join(prefix, d)
            if os.path.exists(candidate):
                d = candidate
                break

    for filename in os.listdir(d):
        path = os.path.join(d, filename)
        if filename.endswith(".match"):
            # Close the handle deterministically instead of leaking it.
            with open(path) as match_file:
                for line in match_file:
                    if line.startswith("#"):
                        continue
                    fields = line.split()
                    if len(fields) < 4:
                        continue
                    chrom, start, end, what = fields[:4]
                    if chrom not in lookup:
                        lookup[chrom] = intervals.Intersecter()
                    lookup[chrom].add_interval(
                        intervals.Interval(int(start), int(end), what))
        else:
            key = filename.split('.')[0]
            if key not in scores_by_what:
                # NOTE(review): the handle is deliberately left open;
                # FileBinnedArray presumably reads from it lazily -- confirm.
                scores_by_what[key] = FileBinnedArray(open(path))

    # An empty lookup is reported as None, as callers may test for it.
    self.lookup = lookup or None
    self.scores = scores_by_what
def __main__():
    """Clip an AXT stream to the parts that overlap a set of intervals.

    Command line: ``args[0]`` is the interval file (start and end in the
    first two whitespace-separated columns) and ``args[1]`` is the integer
    index of the reference component.  ``options.mincols`` (default 10) is
    the column threshold: a sliced block is written only when its
    ``text_size`` is strictly greater than it.

    Alignments are read from stdin and the surviving slices are written to
    stdout.  Missing or non-integer arguments end in ``doc_optparse.exit()``.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        range_filename = args[0]
        refindex = int(args[1])
        mincols = int(options.mincols) if options.mincols else 10
    except (IndexError, ValueError):
        # Only argument problems show the usage text; a bare except would
        # also swallow KeyboardInterrupt / SystemExit.
        doc_optparse.exit()

    # Load Intervals
    intersecter = intervals.Intersecter()
    with open(range_filename) as range_file:
        for line in range_file:
            fields = line.split()
            intersecter.add_interval(
                intervals.Interval(int(fields[0]), int(fields[1])))

    # Start axt on stdout
    out = bx.align.axt.Writer(sys.stdout)

    # Iterate over input axt
    for axt in bx.align.axt.Reader(sys.stdin):
        ref_component = axt.components[refindex]
        # Find overlap with reference component
        intersections = intersecter.find(ref_component.start,
                                         ref_component.end)
        # Keep output axt ordered
        intersections.sort()
        # Write each intersecting block
        for interval in intersections:
            start = max(interval.start, ref_component.start)
            end = min(interval.end, ref_component.end)
            sliced = axt.slice_by_component(refindex, start, end)
            # Drop slices in which any component ended up empty.
            has_empty = any(c.size < 1 for c in sliced.components)
            if not has_empty and sliced.text_size > mincols:
                out.write(sliced)

    # Close output axt
    out.close()
def main():
    """Count intervals in the second file that overlap any in the first.

    ``sys.argv[1]`` and ``sys.argv[2]`` are opened with
    ``misc.open_compressed`` and parsed by ``read_intervals`` into
    ``(chrom, start, end)`` triples.  Intervals from the first file are
    indexed per chromosome; each interval of the second file that has at
    least one overlap on the same chromosome adds one to the total, which is
    printed to stdout.
    """
    intersecters = {}

    # Read ranges
    for chrom, start, end in read_intervals(misc.open_compressed(sys.argv[1])):
        # `in` replaces dict.has_key, which no longer exists in Python 3.
        if chrom not in intersecters:
            intersecters[chrom] = intervals.Intersecter()
        intersecters[chrom].add_interval(intervals.Interval(start, end))

    # Count intersection
    total = 0
    for chrom, start, end in read_intervals(misc.open_compressed(sys.argv[2])):
        if chrom in intersecters and intersecters[chrom].find(start, end):
            total += 1

    # Single-argument call form prints identically on Python 2 and 3.
    print(total)
def __main__():
    """Pass through only MAF blocks whose first component hits no interval.

    Every positional argument is an interval file; lines that start with
    ``#`` or are blank are ignored, and the first two whitespace-separated
    columns are taken as start and end.  MAF blocks are read from stdin and a
    block is written to stdout only when the span of ``components[0]``
    overlaps none of the loaded intervals.

    With no positional arguments, ``doc_optparse.exit()`` is called.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    # Explicit check: an `assert` would be stripped under `python -O`.
    if not args:
        doc_optparse.exit()

    # Load Intervals
    intersector = intervals.Intersecter()
    for path in args:
        with open(path) as range_file:
            for line in range_file:
                if line.startswith("#") or line.isspace():
                    continue
                fields = line.split()
                intersector.add_interval(
                    intervals.Interval(int(fields[0]), int(fields[1])))

    # Start MAF on stdout
    out = bx.align.maf.Writer(sys.stdout)

    # Iterate over input MAF
    for maf in bx.align.maf.Reader(sys.stdin):
        ref_component = maf.components[0]
        # Find overlap with reference component
        intersections = intersector.find(ref_component.start,
                                         ref_component.end)
        # Write only if no overlap
        if len(intersections) == 0:
            out.write(maf)

    # Close output MAF
    out.close()
def __main__():
    """Clip a MAF stream to the parts that overlap per-source intervals.

    Command line: ``args[0]`` is the interval file (source name, start, end
    in the first three whitespace-separated columns).  ``args[1]`` selects
    the reference component: an integer is used as a component index, and
    anything else is treated as a name matched against the part of each
    component's ``src`` before the first ``.``.

    Options: ``mincols`` (default 10) -- a slice is written only when its
    ``text_size`` is strictly greater; ``prefix`` (default ``""``) -- a
    string prepended to every source name read from the interval file.

    MAF blocks are read from stdin and slices are written to stdout.  Blocks
    without the named reference, or whose reference ``src`` has no intervals,
    are skipped.  Bad or missing arguments end in ``doc_optparse.exit()``.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        range_filename = args[0]
        try:
            refindex = int(args[1])
            refname = None
        except ValueError:
            # Not an integer: treat it as a species/source name instead.
            refindex = None
            refname = args[1]
        mincols = int(options.mincols) if options.mincols else 10
        prefix = options.prefix if options.prefix else ""
    except (IndexError, ValueError):
        doc_optparse.exit()

    # Load Intervals
    intersecters = {}
    with open(range_filename) as range_file:
        for line in range_file:
            fields = line.split()
            src = prefix + fields[0]
            if src not in intersecters:
                intersecters[src] = intervals.Intersecter()
            intersecters[src].add_interval(
                intervals.Interval(int(fields[1]), int(fields[2])))

    # Start MAF on stdout
    out = bx.align.maf.Writer(sys.stdout)

    # Iterate over input MAF
    for maf in bx.align.maf.Reader(sys.stdin):
        if refname is not None:
            # Resolve the reference by name for every block, since its
            # position among the components may vary.
            sourcenames = [comp.src.split('.')[0] for comp in maf.components]
            try:
                refindex = sourcenames.index(refname)
            except ValueError:
                continue
        ref_component = maf.components[refindex]

        # Find overlap with reference component
        if ref_component.src not in intersecters:
            continue
        intersections = intersecters[ref_component.src].find(
            ref_component.start, ref_component.end)
        # Keep output maf ordered
        intersections.sort()

        # Write each intersecting block
        for interval in intersections:
            start = max(interval.start, ref_component.start)
            end = min(interval.end, ref_component.end)
            sliced = maf.slice_by_component(refindex, start, end)
            # Drop slices in which any component ended up empty.
            has_empty = any(c.size < 1 for c in sliced.components)
            if not has_empty and sliced.text_size > mincols:
                out.write(sliced)

    # Close output MAF
    out.close()
def main():
    """Scan MAF blocks with position weight matrices and report hits in regions.

    Command line (``sys.argv``): ``bedfile inmaf spec1,spec2,... motif_file``.

    * ``bedfile`` -- whitespace-separated intervals (chrom, start, end and an
      optional name in column 4); lines starting with ``#`` are ignored.
    * ``inmaf`` -- MAF alignment to scan.
    * ``spec1,spec2,...`` -- comma-separated species list handed to the scorer.
    * ``motif_file`` -- matrices read with ``pwmx.Reader``; each matrix id and
      length is echoed to stderr.

    For every scored block, matrix and column offset at which any species
    scores above 0.5, the hit is mapped to reference coordinates (gap
    characters ``-`` before the offset in the reference text are not
    counted).  If the hit overlaps a loaded region on that chromosome, one
    line is printed: chromosome, start, end, the first overlapping region's
    value, the matrix name with spaces replaced by ``_``, and the per-species
    scores formatted with two decimals.

    With fewer than four arguments the usage line is written to stderr and
    the process exits with status 0.
    """
    if len(sys.argv) < 5:
        sys.stderr.write(
            "%s bedfile inmaf spec1,spec2,... motif_file \n" % sys.argv[0])
        sys.exit(0)

    # read in intervals
    regions = {}
    with open(sys.argv[1]) as bed_file:
        for line in bed_file:
            if line.startswith('#'):
                continue
            fields = line.strip().split()
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            name = fields[3] if len(fields) > 3 else None
            if chrom not in regions:
                regions[chrom] = intervals.Intersecter()
            regions[chrom].add(start, end, name)

    pwm = {}
    with open(sys.argv[4]) as motif_file:
        for wm in pwmx.Reader(motif_file):
            pwm[wm.id] = wm
            sys.stderr.write("%s %s\n" % (wm.id, len(wm)))

    threshold = 0.5
    species = sys.argv[3].split(',')

    with open(sys.argv[2]) as inmaf:
        for maf in align_maf.Reader(inmaf):
            reftext = maf.components[0].text

            # maf block scores for each matrix
            for scoremax, width, headers in MafBlockScorer(pwm, species, maf):
                mafsrc, mafstart, _ = headers[0]
                mafchrom = mafsrc.split('.')[1]

                # Look the chromosome up *before* querying it: indexing
                # `regions` first raised KeyError for chromosomes that are
                # absent from the bed file.
                chrom_regions = regions.get(mafchrom)
                if chrom_regions is None:
                    continue

                # lists of scores for each position in scoremax
                for mx_name, mx in scoremax.items():
                    for offset in range(width):
                        # A column is a hit when any species passes the
                        # threshold; it is reported at most once.
                        if not any(mx[i][offset] > threshold
                                   for i in range(len(species))):
                            continue

                        refstart = (mafstart + offset
                                    - reftext.count('-', 0, offset))
                        refend = refstart + len(pwm[mx_name])
                        data = " ".join(
                            "%.2f" % mx[x][offset]
                            for x in range(len(species)))

                        hits = chrom_regions.find(refstart, refend)
                        if len(hits) == 0:
                            continue
                        region_label = hits[0].value
                        v_name = mx_name.replace(' ', '_')
                        print("%s %s %s %s %s %s" % (
                            mafchrom, refstart, refend,
                            region_label, v_name, data))
sys.exit() ##### INPUTS AND OUTPUTS ##### # get intervals print "Loading data..." p1_ints = np.loadtxt(options.i, 'str', delimiter="\t") print "Calculating rank..." # save as fast lookup bedInts = {} for j in range(0, len(p1_ints)): chr = p1_ints[j][0] start = int(p1_ints[j][1]) end = int(p1_ints[j][2]) if not bedInts.has_key(chr): bedInts[chr] = intervals.Intersecter() bedInts[chr].insert(start, end, np.append(p1_ints[j], j)) # pass filter idx idx = np.array(np.ones(len(p1_ints)), dtype=int) ##### SCRIPT ##### # loop through files print "Looping through intervals..." for i in range(0, len(p1_ints)): # look for overlaps atacChr = p1_ints[i][0] atacStart = int(p1_ints[i][1]) atacEnd = int(p1_ints[i][2]) # if pass filter and only 1 match
def main():
    """Scan MAF blocks for motif strings and report hits inside regions.

    Command line (``sys.argv``):
    ``bedfile inmaf spec1,spec2,... string [string2,...]``.

    * ``bedfile`` -- whitespace-separated intervals (chrom, start, end and an
      optional name in column 4); lines starting with ``#`` are ignored.
    * ``inmaf`` -- MAF alignment to scan.
    * ``spec1,spec2,...`` -- comma-separated species list handed to the scorer.
    * remaining arguments -- motif strings handed to ``MafMotifScorer``.

    A MAF block is scored only when its first component overlaps a loaded
    region.  For every scored block, motif and column offset at which any
    species scores above 0.5, the hit is mapped to reference coordinates (gap
    characters ``-`` before the offset in the reference text are not
    counted).  If the hit overlaps a region on that chromosome, one line is
    printed: chromosome, start, end, the first overlapping region's value,
    the motif with spaces replaced by ``_``, and the per-species scores
    formatted with two decimals.

    With fewer than four arguments the usage line is written to stderr and
    the process exits with status 0.
    """
    if len(sys.argv) < 5:
        print("%s bedfile inmaf spec1,spec2,... string [string2,...]" % sys.argv[0], file=sys.stderr)
        sys.exit(0)

    # read in intervals
    regions = {}
    with open(sys.argv[1]) as bed_file:
        for line in bed_file:
            if line.startswith('#'):
                continue
            fields = line.strip().split()
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            name = fields[3] if len(fields) > 3 else None
            if chrom not in regions:
                regions[chrom] = intervals.Intersecter()
            regions[chrom].add(start, end, name)

    # A slice of sys.argv is always a list, so no type check is needed.
    motif_strings = sys.argv[4:]

    threshold = 0.5
    species = sys.argv[3].split(',')

    with open(sys.argv[2]) as inmaf:
        for maf in align_maf.Reader(inmaf):
            ref = maf.components[0]
            mafchrom = ref.src.split('.')[1]
            reftext = ref.text

            # Test membership *before* indexing: the lookup used to run
            # first and raised KeyError for chromosomes absent from the bed
            # file.
            if mafchrom not in regions:
                continue
            if len(regions[mafchrom].find(ref.start, ref.end)) == 0:
                continue

            # maf block scores for each matrix
            for scoremax, width, headers in MafMotifScorer(species, maf, motif_strings):
                mafsrc, mafstart, _ = headers[0]
                mafchrom = mafsrc.split('.')[1]

                # The chromosome is re-derived from the scorer's header, so
                # it has to be re-checked here as well.
                chrom_regions = regions.get(mafchrom)
                if chrom_regions is None:
                    continue

                # lists of scores for each position in scoremax
                for mx_name, mx in scoremax.items():
                    for offset in range(width):
                        # A column is a hit when any species passes the
                        # threshold; it is reported at most once.
                        if not any(mx[i][offset] > threshold
                                   for i in range(len(species))):
                            continue

                        refstart = (mafstart + offset
                                    - reftext.count('-', 0, offset))
                        refend = refstart + len(mx_name)
                        data = " ".join(
                            "%.2f" % mx[x][offset]
                            for x in range(len(species)))

                        hits = chrom_regions.find(refstart, refend)
                        if len(hits) == 0:
                            continue
                        region_label = hits[0].value
                        v_name = mx_name.replace(' ', '_')
                        print(mafchrom, refstart, refend, region_label, v_name, data)