def main():
    """Print the mean bigWig signal of every feature in a BED file.

    Command line:
        --pattern        file name pattern handed to ``find_bigwig_files``
                         together with ``bigwig_file_dir``
        --bed            BED file with the features to score (required)
        bigwig_file_dir  directory that is searched for bigWig files

    For each BED feature one tab-separated line is written to stdout:
    ``chrom``, ``tx_start``, ``tx_end`` and the ``np.mean`` of the array
    returned by ``extract_bigwig_data`` for that feature.

    Returns:
        0 on success.  Argument problems are reported through
        ``parser.error`` which prints a usage message and exits.

    Raises:
        KeyError: if a feature lies on a chromosome for which
            ``find_bigwig_files`` returned no entry.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--pattern", dest="pattern", default=r'{{CHROM}}.phastCons46way.bw')
    parser.add_argument('--bed', dest='bed_file', default=None)
    parser.add_argument("bigwig_file_dir")
    args = parser.parse_args()
    # validate the environment and arguments; parser.error() does not return
    if which(BIGWIG_TO_BEDGRAPH_BIN) is None:
        parser.error('bigWigToBedGraph binary not found in PATH')
    if args.bed_file is None:
        parser.error('specify a bed file using --bed')
    if not os.path.exists(args.bed_file):
        parser.error('bed file %s not found' % (args.bed_file))
    # find bigwig files (mapping is indexed by chromosome name below)
    chrom_bigwig_dict = find_bigwig_files(args.bigwig_file_dir, args.pattern)
    # the context manager guarantees the BED file is closed even when a
    # feature fails part-way through the run
    with open(args.bed_file) as bed_fileh:
        for i, f in enumerate(BEDFeature.parse(bed_fileh), 1):
            # read the per-feature signal from the matching bigwig file
            bigwig_file = chrom_bigwig_dict[f.chrom]
            arr = extract_bigwig_data(f, bigwig_file)
            fields = [f.chrom, str(f.tx_start), str(f.tx_end), str(np.mean(arr))]
            # single-argument call form prints the same text on Python 2 and 3
            print('\t'.join(fields))
            if (i % 1000) == 0:
                logging.debug("finished %d", i)
    return 0
# Example #2
# 0
def main():
    """Randomly sample fixed-size windows whose mean bigWig score passes a threshold.

    Command line:
        --bin-path       parsed but not used by this function
        --window-size    window length (default 300, clamped to >= 1)
        --threshold      minimum mean score a window must reach (default 0.99)
        -n               number of windows to sample (default 1000, clamped to >= 1)
        bigwig_file      bigWig file the scores are read from
        genome_bed_file  BED file with the regions windows are drawn from
        transcript_bed_file  BED file with transcripts

    Every feature of the genome BED file that is at least ``window_size`` long
    contributes ``length - window_size + 1`` candidate windows.  Windows are
    drawn uniformly at random (with replacement, so the same window can be
    reported more than once) until enough of them reach the threshold.  The
    accepted windows are written to stdout, sorted, as tab-separated
    ``chrom``, ``start``, ``end`` lines.

    Returns:
        0 on success.  Argument problems are reported through
        ``parser.error`` which prints a usage message and exits.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--bin-path", dest="bin_path", default='')
    parser.add_argument('--window-size', type=int, dest='window_size', default=300)
    parser.add_argument('--threshold', type=float, dest='mapability_threshold', default=0.99)
    parser.add_argument('-n', type=int, dest='num_samples', default=1000)
    parser.add_argument('bigwig_file')
    parser.add_argument('genome_bed_file')
    parser.add_argument('transcript_bed_file')
    # parse arguments
    args = parser.parse_args()
    bigwig_file = args.bigwig_file
    genome_bed_file = args.genome_bed_file
    transcript_bed_file = args.transcript_bed_file
    # non-positive sizes/counts are silently raised to 1
    window_size = max(1, args.window_size)
    mapability_threshold = args.mapability_threshold
    num_samples = max(1, args.num_samples)
    # check arguments; parser.error() does not return
    if which(BIGWIG_TO_BEDGRAPH_BIN) is None:
        parser.error('bigWigToBedGraph binary not found in PATH')
    if not os.path.exists(bigwig_file):
        parser.error('bigwig file %s not found' % (bigwig_file))
    if not os.path.exists(genome_bed_file):
        parser.error('genome bed file %s not found' % (genome_bed_file))
    if not os.path.exists(transcript_bed_file):
        parser.error('transcript bed file %s not found' % (transcript_bed_file))
    # parameters
    logging.info("Window size: %d", window_size)
    logging.info("Mapability file: %s", bigwig_file)
    logging.info("Mapability threshold: %f", mapability_threshold)
    # get transcript lengths (sum of exon sizes) from transcript bed file
    # NOTE(review): transcript_lengths is not consumed anywhere below; it
    # looks like an unfinished plan to draw window sizes from it -- confirm
    # before removing.
    with open(transcript_bed_file) as fileh:
        transcript_lengths = [sum((e[1] - e[0]) for e in f.exons)
                              for f in BEDFeature.parse(fileh)]
    # first pass to index windows: window_cumsums[k] is the number of
    # candidate windows contributed by all usable features before features[k]
    features = []
    window_cumsums = []
    num_windows = 0
    with open(genome_bed_file) as fileh:
        for f in BEDFeature.parse(fileh):
            length = sum((e[1] - e[0]) for e in f.exons)
            # window_size >= 1, so this also skips zero-length features
            if window_size > length:
                continue
            window_cumsums.append(num_windows)
            features.append(f)
            num_windows += length - window_size + 1
    logging.debug('Found %d total windows', num_windows)
    # now sample windows
    if num_windows < num_samples:
        logging.warning('Only sampling %d windows', num_windows)
    num_samples = min(num_samples, num_windows)
    windows = []
    trials = 0
    # NOTE(review): if no candidate window reaches the threshold this loop
    # never terminates; there is no trial limit.
    while len(windows) < num_samples:
        # count the attempt first so the log below includes the current trial
        trials += 1
        # choose random window and map its global index back to
        # (feature, offset within feature)
        windex = random.randrange(num_windows)
        feature_index = bisect.bisect_right(window_cumsums, windex) - 1
        woffset = windex - window_cumsums[feature_index]
        f = features[feature_index]
        # fetch data for the whole feature, then score just the window
        # NOTE(review): offsets are computed from the summed exon length but
        # applied to tx_start; presumably genome features are single-block --
        # verify against extract_bigwig_data.
        arr = extract_bigwig_data(f, bigwig_file)
        score = np.mean(arr[woffset:woffset+window_size])
        if score >= mapability_threshold:
            windows.append((f.chrom, f.tx_start + woffset, f.tx_start + woffset + window_size))
            logging.debug('Sampled %d windows (%d trials)', len(windows), trials)
    for window in sorted(windows):
        # single-argument call form prints the same text on Python 2 and 3
        print('\t'.join(map(str, window)))
    return 0

# Script entry point: terminate the interpreter with main()'s return value
# as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())