def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <bed_file>'
  parser = OptionParser(usage)
  parser.add_option('-f', dest='genome_fasta',
      default=None,
      help='Genome FASTA for sequences [Default: %default]')
  parser.add_option('-l', dest='mut_len',
      default=200, type='int',
      help='Length of center sequence to mutate [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='sat_mut',
      help='Output directory [Default: %default]')
  parser.add_option('--plots', dest='plots',
      default=False, action='store_true',
      help='Make heatmap plots [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number of processes, passed by multi script')
  parser.add_option('--rc', dest='rc',
      default=False, action='store_true',
      help='Ensemble forward and reverse complement predictions [Default: %default]')
  parser.add_option('--shifts', dest='shifts',
      default='0',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('-t', dest='targets_file',
      default=None, type='str',
      help='File specifying target indexes and labels in table format')
  (options, args) = parser.parse_args()

  if len(args) == 3:
    # single worker
    params_file = args[0]
    model_file = args[1]
    bed_file = args[2]

  elif len(args) == 5:
    # multi worker
    options_pkl_file = args[0]
    params_file = args[1]
    model_file = args[2]
    bed_file = args[3]
    worker_index = int(args[4])

    # load options
    options_pkl = open(options_pkl_file, 'rb')
    options = pickle.load(options_pkl)
    options_pkl.close()

    # update output directory
    options.out_dir = '%s/job%d' % (options.out_dir, worker_index)

  else:
    parser.error('Must provide parameters and model files and BED file')

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  options.shifts = [int(shift) for shift in options.shifts.split(',')]

  #################################################################
  # read parameters and collect target information

  job = params.read_job_params(params_file)

  if options.targets_file is None:
    target_ids = ['t%d' % ti for ti in range(job['num_targets'])]
    target_labels = [''] * len(target_ids)
    target_subset = None
  else:
    targets_df = pd.read_table(options.targets_file, index_col=0)
    target_ids = targets_df.identifier
    target_labels = targets_df.description
    target_subset = targets_df.index
    if len(target_subset) == job['num_targets']:
      target_subset = None

  num_targets = len(target_ids)

  #################################################################
  # sequence dataset

  # read sequences from BED
  seqs_dna, seqs_coords = bed_seqs(bed_file, options.genome_fasta, job['seq_length'])

  # filter for worker sequences
  if options.processes is not None:
    worker_bounds = np.linspace(0, len(seqs_dna), options.processes + 1, dtype='int')
    seqs_dna = seqs_dna[worker_bounds[worker_index]:worker_bounds[worker_index + 1]]
    seqs_coords = seqs_coords[worker_bounds[worker_index]:worker_bounds[worker_index + 1]]

  num_seqs = len(seqs_dna)

  # determine mutation region limits
  seq_mid = job['seq_length'] // 2
  mut_start = seq_mid - options.mut_len // 2
  mut_end = mut_start + options.mut_len

  # make data ops
  data_ops = satmut_data_ops(seqs_dna, mut_start, mut_end, job['batch_size'])

  #################################################################
  # setup model

  # build model
  model = seqnn.SeqNN()
  model.build_sad(job, data_ops,
      target_subset=target_subset,
      ensemble_rc=options.rc,
      ensemble_shifts=options.shifts)

  #################################################################
  # setup output

  scores_h5_file = '%s/scores.h5' % options.out_dir
  if os.path.isfile(scores_h5_file):
    os.remove(scores_h5_file)
  scores_h5 = h5py.File(scores_h5_file, 'w')
  scores_h5.create_dataset('scores', dtype='float16',
      shape=(num_seqs, options.mut_len, 4, num_targets))
  scores_h5.create_dataset('seqs', dtype='bool',
      shape=(num_seqs, options.mut_len, 4))

  # store mutagenesis sequence coordinates
  seqs_chr, seqs_start, _ = zip(*seqs_coords)
  seqs_chr = np.array(seqs_chr, dtype='S')
  seqs_start = np.array(seqs_start) + mut_start
  seqs_end = seqs_start + options.mut_len
  scores_h5.create_dataset('chrom', data=seqs_chr)
  scores_h5.create_dataset('start', data=seqs_start)
  scores_h5.create_dataset('end', data=seqs_end)

  preds_per_seq = 1 + 3 * options.mut_len

  score_threads = []
  score_queue = Queue()
  for i in range(1):
    sw = ScoreWorker(score_queue, scores_h5)
    sw.start()
    score_threads.append(sw)

  #################################################################
  # predict scores, write output

  # initialize saver
  saver = tf.train.Saver()

  with tf.Session() as sess:
    # coordinator
    coord = tf.train.Coordinator()
    tf.train.start_queue_runners(coord=coord)

    # load variables into session
    saver.restore(sess, model_file)

    # initialize predictions stream
    preds_stream = PredStream(sess, model, 32)

    # predictions index
    pi = 0

    for si in range(num_seqs):
      print('Predicting %d' % si, flush=True)

      # collect sequence predictions
      seq_preds = []
      for spi in range(preds_per_seq):
        seq_preds.append(preds_stream[pi])
        pi += 1

      # wait for previous to finish
      score_queue.join()

      # queue sequence for scoring
      score_queue.put((seqs_dna[si], seq_preds, si))

      # queue sequence for plotting
      if options.plots:
        plot_queue.put((seqs_dna[si], seq_preds, si))

  # finish queue
  print('Waiting for threads to finish.', flush=True)
  score_queue.join()

  # close output HDF5
  scores_h5.close()
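# Illustrative sketch (not the repo's ScoreWorker): one way a consumer thread could
# reduce the seq_preds list queued above into the (mut_len, 4, num_targets) 'scores'
# array. It assumes satmut_data_ops yields the reference sequence first, then the
# three alternate nucleotides per mutated position, and scores each substitution as
# the prediction difference summed over the sequence axis; the reference base keeps 0.
# The real ScoreWorker may normalize differently.
def satmut_scores_sketch(seq_1hot_mut, seq_preds):
  """seq_1hot_mut: (mut_len, 4) one-hot of the mutated region.
     seq_preds: list of (preds_length, num_targets) arrays, length 1 + 3*mut_len."""
  import numpy as np
  mut_len, _ = seq_1hot_mut.shape
  num_targets = seq_preds[0].shape[-1]
  ref_sum = seq_preds[0].sum(axis=0)
  scores = np.zeros((mut_len, 4, num_targets), dtype='float32')
  pi = 1
  for mi in range(mut_len):
    for ni in range(4):
      if seq_1hot_mut[mi, ni]:
        continue  # reference nucleotide keeps a score of zero
      scores[mi, ni] = seq_preds[pi].sum(axis=0) - ref_sum
      pi += 1
  return scores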
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <vcf_file>'
  parser = OptionParser(usage)
  parser.add_option('-c', dest='center_pct',
      default=0.25, type='float',
      help='Require clustered SNPs lie in center region [Default: %default]')
  parser.add_option('-f', dest='genome_fasta',
      default='%s/data/hg19.fa' % os.environ['BASENJIDIR'],
      help='Genome FASTA for sequences [Default: %default]')
  parser.add_option('--flip', dest='flip_ref',
      default=False, action='store_true',
      help='Flip reference/alternate alleles when simple [Default: %default]')
  parser.add_option('--local', dest='local',
      default=1024, type='int',
      help='Local SAD score [Default: %default]')
  parser.add_option('-n', dest='norm_file',
      default=None,
      help='Normalize SAD scores')
  parser.add_option('-o', dest='out_dir',
      default='sad',
      help='Output directory for tables and plots [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number of processes, passed by multi script')
  parser.add_option('--pseudo', dest='log_pseudo',
      default=1, type='float',
      help='Log2 pseudocount [Default: %default]')
  parser.add_option('--rc', dest='rc',
      default=False, action='store_true',
      help='Average forward and reverse complement predictions [Default: %default]')
  parser.add_option('--shifts', dest='shifts',
      default='0', type='str',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('--stats', dest='sad_stats',
      default='SAD',
      help='Comma-separated list of stats to save. [Default: %default]')
  parser.add_option('-t', dest='targets_file',
      default=None, type='str',
      help='File specifying target indexes and labels in table format')
  parser.add_option('--ti', dest='track_indexes',
      default=None, type='str',
      help='Comma-separated list of target indexes to output BigWig tracks')
  parser.add_option('-u', dest='penultimate',
      default=False, action='store_true',
      help='Compute SED in the penultimate layer [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) == 3:
    # single worker
    params_file = args[0]
    model_file = args[1]
    vcf_file = args[2]

  elif len(args) == 5:
    # multi worker
    options_pkl_file = args[0]
    params_file = args[1]
    model_file = args[2]
    vcf_file = args[3]
    worker_index = int(args[4])

    # load options
    options_pkl = open(options_pkl_file, 'rb')
    options = pickle.load(options_pkl)
    options_pkl.close()

    # update output directory
    options.out_dir = '%s/job%d' % (options.out_dir, worker_index)

  else:
    parser.error('Must provide parameters and model files and QTL VCF file')

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  if options.track_indexes is None:
    options.track_indexes = []
  else:
    options.track_indexes = [int(ti) for ti in options.track_indexes.split(',')]
    if not os.path.isdir('%s/tracks' % options.out_dir):
      os.mkdir('%s/tracks' % options.out_dir)

  options.shifts = [int(shift) for shift in options.shifts.split(',')]
  options.sad_stats = options.sad_stats.split(',')

  #################################################################
  # read parameters and collect target information

  job = params.read_job_params(params_file, require=['seq_length', 'num_targets'])

  if options.targets_file is None:
    target_ids = ['t%d' % ti for ti in range(job['num_targets'])]
    target_labels = [''] * len(target_ids)
    target_subset = None
  else:
    targets_df = pd.read_table(options.targets_file, index_col=0)
    target_ids = targets_df.identifier
    target_labels = targets_df.description
    target_subset = targets_df.index
    if len(target_subset) == job['num_targets']:
      target_subset = None

  #################################################################
  # load SNPs

  # read sorted SNPs from VCF
  snps = bvcf.vcf_snps(vcf_file, require_sorted=True,
                       flip_ref=options.flip_ref,
                       validate_ref_fasta=options.genome_fasta)

  # filter for worker SNPs
  if options.processes is not None:
    worker_bounds = np.linspace(0, len(snps), options.processes + 1, dtype='int')
    snps = snps[worker_bounds[worker_index]:worker_bounds[worker_index + 1]]

  num_snps = len(snps)

  # cluster SNPs by position
  snp_clusters = cluster_snps(snps, job['seq_length'], options.center_pct)

  # delimit sequence boundaries
  [sc.delimit(job['seq_length']) for sc in snp_clusters]

  # open genome FASTA
  genome_open = pysam.Fastafile(options.genome_fasta)

  # make SNP sequence generator
  def snp_gen():
    for sc in snp_clusters:
      snp_1hot_list = sc.get_1hots(genome_open)
      for snp_1hot in snp_1hot_list:
        yield {'sequence': snp_1hot}

  snp_types = {'sequence': tf.float32}
  snp_shapes = {'sequence': tf.TensorShape([tf.Dimension(job['seq_length']),
                                            tf.Dimension(4)])}

  dataset = tf.data.Dataset.from_generator(snp_gen,
      output_types=snp_types,
      output_shapes=snp_shapes)
  dataset = dataset.batch(job['batch_size'])
  dataset = dataset.prefetch(2 * job['batch_size'])
  # dataset = dataset.apply(tf.contrib.data.prefetch_to_device('/device:GPU:0'))

  iterator = dataset.make_one_shot_iterator()
  data_ops = iterator.get_next()

  #################################################################
  # setup model

  # build model
  t0 = time.time()
  model = seqnn.SeqNN()
  model.build_sad(job, data_ops,
      ensemble_rc=options.rc,
      ensemble_shifts=options.shifts,
      embed_penultimate=options.penultimate,
      target_subset=target_subset)
  print('Model building time %f' % (time.time() - t0), flush=True)

  if options.penultimate:
    # labels become inappropriate
    target_ids = [''] * model.hp.cnn_filters[-1]
    target_labels = target_ids

  # read target normalization factors
  target_norms = np.ones(len(target_labels))
  if options.norm_file is not None:
    ti = 0
    for line in open(options.norm_file):
      target_norms[ti] = float(line.strip())
      ti += 1

  num_targets = len(target_ids)

  #################################################################
  # setup output

  snp_flips = np.array([snp.flipped for snp in snps], dtype='bool')

  sad_out = initialize_output_h5(options.out_dir, options.sad_stats,
                                 snps, target_ids, target_labels)

  snp_threads = []
  snp_queue = Queue()
  for i in range(1):
    sw = SNPWorker(snp_queue, sad_out, options.sad_stats, options.log_pseudo)
    sw.start()
    snp_threads.append(sw)

  #################################################################
  # predict SNP scores, write output

  # initialize saver
  saver = tf.train.Saver()

  with tf.Session() as sess:
    # load variables into session
    saver.restore(sess, model_file)

    # initialize predictions stream
    preds_stream = PredStream(sess, model, 32)

    # predictions index
    pi = 0

    # SNP index
    si = 0

    for snp_cluster in snp_clusters:
      ref_preds = preds_stream[pi]
      pi += 1

      for snp in snp_cluster.snps:
        # print(snp, flush=True)

        alt_preds = preds_stream[pi]
        pi += 1

        # queue SNP
        if snp_flips[si]:
          snp_queue.put((alt_preds, ref_preds, si))
        else:
          snp_queue.put((ref_preds, alt_preds, si))

        # update SNP index
        si += 1

  # finish queue
  print('Waiting for threads to finish.', flush=True)
  snp_queue.join()

  # close genome
  genome_open.close()

  ###################################################
  # compute SAD distributions across variants

  # define percentiles
  d_fine = 0.001
  d_coarse = 0.01
  percentiles_neg = np.arange(d_fine, 0.1, d_fine)
  percentiles_base = np.arange(0.1, 0.9, d_coarse)
  percentiles_pos = np.arange(0.9, 1, d_fine)

  percentiles = np.concatenate([percentiles_neg, percentiles_base, percentiles_pos])
  sad_out.create_dataset('percentiles', data=percentiles)
  pct_len = len(percentiles)

  for sad_stat in options.sad_stats:
    sad_stat_pct = '%s_pct' % sad_stat

    # compute
    sad_pct = np.percentile(sad_out[sad_stat], 100 * percentiles, axis=0).T
    sad_pct = sad_pct.astype('float16')

    # save
    sad_out.create_dataset(sad_stat_pct, data=sad_pct, dtype='float16')

  sad_out.close()
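# Illustrative sketch (not the repo's SNPWorker): the per-SNP reduction applied to
# each (ref_preds, alt_preds) pair pulled from snp_queue, shown for the default SAD
# stat. The log2-ratio shown alongside is a hypothetical example of how the --pseudo
# log_pseudo option could enter; the stats actually written are governed by
# options.sad_stats and SNPWorker.
def snp_stats_sketch(ref_preds, alt_preds, log_pseudo=1.0):
  """ref_preds, alt_preds: (preds_length, num_targets) arrays for one SNP."""
  import numpy as np
  ref_sum = ref_preds.sum(axis=0)
  alt_sum = alt_preds.sum(axis=0)
  sad = (alt_sum - ref_sum).astype('float16')
  log_ratio = np.log2(alt_sum + log_pseudo) - np.log2(ref_sum + log_pseudo)
  return {'SAD': sad, 'log_ratio': log_ratio}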
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <bed_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='bigwig_indexes',
      default=None,
      help='Comma-separated list of target indexes to write BigWigs')
  parser.add_option('-e', dest='embed_layer',
      default=None, type='int',
      help='Embed sequences using the specified layer index.')
  parser.add_option('-f', dest='genome_fasta',
      default=None,
      help='Genome FASTA for sequences [Default: %default]')
  parser.add_option('-g', dest='genome_file',
      default=None,
      help='Chromosome length information [Default: %default]')
  parser.add_option('-l', dest='site_length',
      default=None, type='int',
      help='Prediction site length. [Default: params.seq_length]')
  parser.add_option('-o', dest='out_dir',
      default='pred_out',
      help='Output directory [Default: %default]')
  # parser.add_option('--plots', dest='plots',
  #     default=False, action='store_true',
  #     help='Make heatmap plots [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number of processes, passed by multi script')
  parser.add_option('--rc', dest='rc',
      default=False, action='store_true',
      help='Ensemble forward and reverse complement predictions [Default: %default]')
  parser.add_option('-s', dest='sum',
      default=False, action='store_true',
      help='Sum site predictions [Default: %default]')
  parser.add_option('--shifts', dest='shifts',
      default='0',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('-t', dest='targets_file',
      default=None, type='str',
      help='File specifying target indexes and labels in table format')
  (options, args) = parser.parse_args()

  if len(args) == 3:
    params_file = args[0]
    model_file = args[1]
    bed_file = args[2]

  elif len(args) == 5:
    # multi worker
    options_pkl_file = args[0]
    params_file = args[1]
    model_file = args[2]
    bed_file = args[3]
    worker_index = int(args[4])

    # load options
    options_pkl = open(options_pkl_file, 'rb')
    options = pickle.load(options_pkl)
    options_pkl.close()

    # update output directory
    options.out_dir = '%s/job%d' % (options.out_dir, worker_index)

  else:
    parser.error('Must provide parameter and model files and BED file')

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  options.shifts = [int(shift) for shift in options.shifts.split(',')]

  if options.bigwig_indexes is not None:
    options.bigwig_indexes = [int(bi) for bi in options.bigwig_indexes.split(',')]
  else:
    options.bigwig_indexes = []

  if len(options.bigwig_indexes) > 0:
    bigwig_dir = '%s/bigwig' % options.out_dir
    if not os.path.isdir(bigwig_dir):
      os.mkdir(bigwig_dir)

  #################################################################
  # read parameters and collect target information

  job = params.read_job_params(params_file, require=['num_targets', 'seq_length'])

  if job.get('batch_buffer', 0) > 0:
    print('Turn off batch_buffer.', file=sys.stderr)
    exit(1)

  num_targets = np.sum(job['num_targets'])
  if options.targets_file is None:
    target_subset = None
  else:
    targets_df = pd.read_table(options.targets_file, index_col=0)
    target_subset = targets_df.index
    if len(target_subset) == num_targets:
      target_subset = None
    else:
      num_targets = len(target_subset)

  if options.site_length is None:
    options.site_length = job['seq_length']

  #################################################################
  # sequence dataset

  # construct model sequences
  model_seqs_dna, model_seqs_coords = make_bed_data(bed_file, options.genome_fasta,
                                                    job['seq_length'])

  # construct site coordinates
  site_seqs_coords = read_bed(bed_file, options.site_length)

  # filter for worker sequences
  if options.processes is not None:
    worker_bounds = np.linspace(0, len(model_seqs_dna), options.processes + 1, dtype='int')
    model_seqs_dna = model_seqs_dna[worker_bounds[worker_index]:worker_bounds[worker_index + 1]]
    model_seqs_coords = model_seqs_coords[worker_bounds[worker_index]:worker_bounds[worker_index + 1]]
    site_seqs_coords = site_seqs_coords[worker_bounds[worker_index]:worker_bounds[worker_index + 1]]

  num_seqs = len(model_seqs_dna)

  # make data ops
  data_ops = seq_data_ops(model_seqs_dna, job['batch_size'])

  #################################################################
  # setup model

  # build model
  model = seqnn.SeqNN()
  model.build_sad(job, data_ops,
      ensemble_rc=options.rc,
      ensemble_shifts=options.shifts,
      embed_layer=options.embed_layer,
      target_subset=target_subset)

  #################################################################
  # setup output

  # determine site boundaries in predictions space
  assert job['seq_length'] % model.preds_length == 0
  preds_window = job['seq_length'] // model.preds_length
  assert model.preds_length % 2 == 0
  preds_mid = model.preds_length // 2

  assert options.site_length % preds_window == 0
  site_preds_length = options.site_length // preds_window
  assert site_preds_length % 2 == 0
  site_preds_start = preds_mid - site_preds_length // 2
  site_preds_end = site_preds_start + site_preds_length

  # initialize HDF5
  out_h5_file = '%s/predict.h5' % options.out_dir
  if os.path.isfile(out_h5_file):
    os.remove(out_h5_file)
  out_h5 = h5py.File(out_h5_file, 'w')

  # create predictions
  if options.sum:
    out_h5.create_dataset('preds',
        shape=(num_seqs, model.preds_depth),
        dtype='float16')
  else:
    out_h5.create_dataset('preds',
        shape=(num_seqs, site_preds_length, model.preds_depth),
        dtype='float16')

  # store site coordinates
  site_seqs_chr, site_seqs_start, site_seqs_end = zip(*site_seqs_coords)
  site_seqs_chr = np.array(site_seqs_chr, dtype='S')
  site_seqs_start = np.array(site_seqs_start)
  site_seqs_end = np.array(site_seqs_end)
  out_h5.create_dataset('chrom', data=site_seqs_chr)
  out_h5.create_dataset('start', data=site_seqs_start)
  out_h5.create_dataset('end', data=site_seqs_end)

  #################################################################
  # predict scores, write output

  # initialize saver
  saver = tf.train.Saver()

  with tf.Session() as sess:
    # load variables into session
    saver.restore(sess, model_file)

    # initialize predictions stream
    preds_stream = PredStream(sess, model, 64)

    for si in range(num_seqs):
      print('Predicting %d' % si, flush=True)

      # predict
      preds_full = preds_stream[si]

      # slice site
      preds_site = preds_full[site_preds_start:site_preds_end, :]

      # write
      if options.sum:
        out_h5['preds'][si] = preds_site.sum(axis=0)
      else:
        out_h5['preds'][si] = preds_site

      # write bigwig
      for ti in options.bigwig_indexes:
        bw_file = '%s/s%d_t%d.bw' % (bigwig_dir, si, ti)
        bigwig_write(preds_full[:, ti], model_seqs_coords[si], bw_file,
                     options.genome_file, model.hp.batch_buffer)

  # close output HDF5
  out_h5.close()
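# Illustrative sketch: reading back the predict.h5 written above. The dataset names
# ('preds', 'chrom', 'start', 'end') match the create_dataset calls; the default path
# below assumes the default out_dir and would change with -o or multi-worker job dirs.
def read_predict_h5_sketch(h5_file='pred_out/predict.h5'):
  import h5py
  import pandas as pd
  with h5py.File(h5_file, 'r') as h5:
    preds = h5['preds'][:]
    sites = pd.DataFrame({
        'chrom': [c.decode('UTF-8') for c in h5['chrom'][:]],
        'start': h5['start'][:],
        'end': h5['end'][:]})
  return sites, preds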
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <bed_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='bigwig_indexes',
      default=None,
      help='Comma-separated list of target indexes to write BigWigs')
  parser.add_option('-f', dest='genome_fasta',
      default=None,
      help='Genome FASTA for sequences [Default: %default]')
  parser.add_option('-g', dest='genome_file',
      default=None,
      help='Chromosome length information [Default: %default]')
  # parser.add_option('-l', dest='mid_len',
  #     default=256, type='int',
  #     help='Length of center sequence to sum predictions for [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='pred_out',
      help='Output directory [Default: %default]')
  # parser.add_option('--plots', dest='plots',
  #     default=False, action='store_true',
  #     help='Make heatmap plots [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number of processes, passed by multi script')
  parser.add_option('--rc', dest='rc',
      default=False, action='store_true',
      help='Ensemble forward and reverse complement predictions [Default: %default]')
  parser.add_option('--shifts', dest='shifts',
      default='0',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('-t', dest='targets_file',
      default=None, type='str',
      help='File specifying target indexes and labels in table format')
  (options, args) = parser.parse_args()

  if len(args) == 3:
    params_file = args[0]
    model_file = args[1]
    bed_file = args[2]

  elif len(args) == 5:
    # multi worker
    options_pkl_file = args[0]
    params_file = args[1]
    model_file = args[2]
    bed_file = args[3]
    worker_index = int(args[4])

    # load options
    options_pkl = open(options_pkl_file, 'rb')
    options = pickle.load(options_pkl)
    options_pkl.close()

    # update output directory
    options.out_dir = '%s/job%d' % (options.out_dir, worker_index)

  else:
    parser.error('Must provide parameter and model files and BED file')

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  options.shifts = [int(shift) for shift in options.shifts.split(',')]

  if options.bigwig_indexes is not None:
    options.bigwig_indexes = [int(bi) for bi in options.bigwig_indexes.split(',')]
  else:
    options.bigwig_indexes = []

  if len(options.bigwig_indexes) > 0:
    bigwig_dir = '%s/bigwig' % options.out_dir
    if not os.path.isdir(bigwig_dir):
      os.mkdir(bigwig_dir)

  #################################################################
  # read parameters and collect target information

  job = params.read_job_params(params_file, require=['num_targets', 'seq_length'])
  num_targets = np.sum(job['num_targets'])

  if options.targets_file is None:
    target_subset = None
  else:
    targets_df = pd.read_table(options.targets_file, index_col=0)
    target_subset = targets_df.index
    if len(target_subset) == num_targets:
      target_subset = None
    else:
      num_targets = len(target_subset)

  #################################################################
  # sequence dataset

  # read sequences from BED
  seqs_dna, seqs_coords = bed_seqs(bed_file, options.genome_fasta, job['seq_length'])

  # filter for worker sequences
  if options.processes is not None:
    worker_bounds = np.linspace(0, len(seqs_dna), options.processes + 1, dtype='int')
    seqs_dna = seqs_dna[worker_bounds[worker_index]:worker_bounds[worker_index + 1]]
    seqs_coords = seqs_coords[worker_bounds[worker_index]:worker_bounds[worker_index + 1]]

  num_seqs = len(seqs_dna)

  # make data ops
  data_ops = seq_data_ops(seqs_dna, job['batch_size'])

  #################################################################
  # setup model

  # build model
  model = seqnn.SeqNN()
  model.build_sad(job, data_ops,
      ensemble_rc=options.rc,
      ensemble_shifts=options.shifts,
      target_subset=target_subset)

  #################################################################
  # setup output

  out_h5_file = '%s/predict.h5' % options.out_dir
  if os.path.isfile(out_h5_file):
    os.remove(out_h5_file)
  out_h5 = h5py.File(out_h5_file, 'w')
  out_h5.create_dataset('preds', shape=(num_seqs, num_targets), dtype='float16')

  # store sequence coordinates
  seqs_chr, seqs_start, _ = zip(*seqs_coords)
  seqs_chr = np.array(seqs_chr, dtype='S')
  seqs_start = np.array(seqs_start)
  seqs_end = seqs_start + job['seq_length']
  out_h5.create_dataset('chrom', data=seqs_chr)
  out_h5.create_dataset('start', data=seqs_start)
  out_h5.create_dataset('end', data=seqs_end)

  if model.preds_length % 2 == 0:
    # sum center two
    mid_start = model.preds_length // 2 - 1
    mid_end = mid_start + 2
  else:
    # take center one
    mid_start = model.preds_length // 2
    mid_end = mid_start + 1

  #################################################################
  # predict scores, write output

  # initialize saver
  saver = tf.train.Saver()

  with tf.Session() as sess:
    # coordinator
    coord = tf.train.Coordinator()
    tf.train.start_queue_runners(coord=coord)

    # load variables into session
    saver.restore(sess, model_file)

    # initialize predictions stream
    preds_stream = PredStream(sess, model, 64)

    for si in range(num_seqs):
      print('Predicting %d' % si, flush=True)

      # predict
      preds_full = preds_stream[si]

      # slice middle and summarize
      preds = preds_full[mid_start:mid_end, :].sum(axis=0)

      # write
      out_h5['preds'][si] = preds

      # write bigwig
      for ti in options.bigwig_indexes:
        bw_file = '%s/s%d_t%d.bw' % (bigwig_dir, si, ti)
        bigwig_write(preds_full[:, ti], seqs_coords[si], bw_file,
                     options.genome_file, model.hp.batch_buffer)

  # close output HDF5
  out_h5.close()
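# Illustrative sketch: the genomic interval covered by the center bin(s) summed above,
# assuming predictions tile the sequence evenly in windows of seq_length // preds_length
# and ignoring any internal cropping applied by the model.
def center_span_sketch(seq_start, seq_length, preds_length):
  preds_window = seq_length // preds_length
  if preds_length % 2 == 0:
    mid_start = preds_length // 2 - 1
    mid_end = mid_start + 2
  else:
    mid_start = preds_length // 2
    mid_end = mid_start + 1
  return seq_start + mid_start * preds_window, seq_start + mid_end * preds_window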
def main():
    usage = "usage: %prog [options] <params_file> <model_file> <vcf_file>"
    parser = OptionParser(usage)
    parser.add_option("-c", dest="center_pct", default=0.25, type="float",
        help="Require clustered SNPs lie in center region [Default: %default]")
    parser.add_option("-f", dest="genome_fasta",
        default="%s/data/hg19.fa" % os.environ["BASENJIDIR"],
        help="Genome FASTA for sequences [Default: %default]")
    parser.add_option("-g", dest="genome_file",
        default="%s/data/human.hg19.genome" % os.environ["BASENJIDIR"],
        help="Chromosome lengths file [Default: %default]")
    parser.add_option("--local", dest="local", default=1024, type="int",
        help="Local SAD score [Default: %default]")
    parser.add_option("-n", dest="norm_file", default=None,
        help="Normalize SAD scores")
    parser.add_option("-o", dest="out_dir", default="sad",
        help="Output directory for tables and plots [Default: %default]")
    parser.add_option("-p", dest="processes", default=None, type="int",
        help="Number of processes, passed by multi script")
    parser.add_option("--pseudo", dest="log_pseudo", default=1, type="float",
        help="Log2 pseudocount [Default: %default]")
    parser.add_option("--rc", dest="rc", default=False, action="store_true",
        help="Average forward and reverse complement predictions [Default: %default]")
    parser.add_option("--shifts", dest="shifts", default="0", type="str",
        help="Ensemble prediction shifts [Default: %default]")
    parser.add_option("--stats", dest="sad_stats", default="SAD",
        help="Comma-separated list of stats to save. [Default: %default]")
    parser.add_option("-t", dest="targets_file", default=None, type="str",
        help="File specifying target indexes and labels in table format")
    parser.add_option("--ti", dest="track_indexes", default=None, type="str",
        help="Comma-separated list of target indexes to output BigWig tracks")
    parser.add_option("-u", dest="penultimate", default=False, action="store_true",
        help="Compute SED in the penultimate layer [Default: %default]")
    (options, args) = parser.parse_args()

    if len(args) == 3:
        # single worker
        params_file = args[0]
        model_file = args[1]
        vcf_file = args[2]

    elif len(args) == 5:
        # multi worker
        options_pkl_file = args[0]
        params_file = args[1]
        model_file = args[2]
        vcf_file = args[3]
        worker_index = int(args[4])

        # load options
        options_pkl = open(options_pkl_file, "rb")
        options = pickle.load(options_pkl)
        options_pkl.close()

        # update output directory
        options.out_dir = "%s/job%d" % (options.out_dir, worker_index)

    else:
        parser.error("Must provide parameters and model files and QTL VCF file")

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.track_indexes is None:
        options.track_indexes = []
    else:
        options.track_indexes = [int(ti) for ti in options.track_indexes.split(",")]
        if not os.path.isdir("%s/tracks" % options.out_dir):
            os.mkdir("%s/tracks" % options.out_dir)

    options.shifts = [int(shift) for shift in options.shifts.split(",")]
    options.sad_stats = options.sad_stats.split(",")

    #################################################################
    # read parameters and collect target information

    job = params.read_job_params(params_file, require=["seq_length", "num_targets"])

    if options.targets_file is None:
        target_ids = ["t%d" % ti for ti in range(job["num_targets"])]
        target_labels = [""] * len(target_ids)
        target_subset = None
    else:
        targets_df = pd.read_table(options.targets_file, index_col=0)
        target_ids = targets_df.identifier
        target_labels = targets_df.description
        target_subset = targets_df.index
        if len(target_subset) == job["num_targets"]:
            target_subset = None

    #################################################################
    # load SNPs

    # read sorted SNPs from VCF
    snps = bvcf.vcf_snps(
        vcf_file,
        require_sorted=True,
        flip_ref=False,
        validate_ref_fasta=options.genome_fasta,
    )

    # filter for worker SNPs
    if options.processes is not None:
        worker_bounds = np.linspace(0, len(snps), options.processes + 1, dtype="int")
        snps = snps[worker_bounds[worker_index] : worker_bounds[worker_index + 1]]

    num_snps = len(snps)

    # cluster SNPs by position
    snp_clusters = cluster_snps(snps, job["seq_length"], options.center_pct)

    # delimit sequence boundaries
    [sc.delimit(job["seq_length"]) for sc in snp_clusters]

    # open genome FASTA
    genome_open = pysam.Fastafile(options.genome_fasta)

    # make SNP sequence generator
    def snp_gen():
        for sc in snp_clusters:
            snp_1hot_list = sc.get_1hots(genome_open)
            for snp_1hot in snp_1hot_list:
                yield {"sequence": snp_1hot}

    snp_types = {"sequence": tf.float32}
    snp_shapes = {
        "sequence": tf.TensorShape([tf.Dimension(job["seq_length"]), tf.Dimension(4)])
    }

    dataset = tf.data.Dataset.from_generator(
        snp_gen, output_types=snp_types, output_shapes=snp_shapes
    )
    dataset = dataset.batch(job["batch_size"])
    dataset = dataset.prefetch(2 * job["batch_size"])
    # dataset = dataset.apply(tf.contrib.data.prefetch_to_device('/device:GPU:0'))

    iterator = dataset.make_one_shot_iterator()
    data_ops = iterator.get_next()

    #################################################################
    # setup model

    # build model
    t0 = time.time()
    model = seqnn.SeqNN()
    model.build_sad(
        job,
        data_ops,
        ensemble_rc=options.rc,
        ensemble_shifts=options.shifts,
        embed_penultimate=options.penultimate,
        target_subset=target_subset,
    )
    print("Model building time %f" % (time.time() - t0), flush=True)

    if options.penultimate:
        # labels become inappropriate
        target_ids = [""] * model.hp.cnn_filters[-1]
        target_labels = target_ids

    # read target normalization factors
    target_norms = np.ones(len(target_labels))
    if options.norm_file is not None:
        ti = 0
        for line in open(options.norm_file):
            target_norms[ti] = float(line.strip())
            ti += 1

    num_targets = len(target_ids)

    #################################################################
    # setup output

    sad_out = initialize_output_h5(
        options.out_dir, options.sad_stats, snps, target_ids, target_labels
    )

    snp_threads = []
    snp_queue = Queue()
    for i in range(1):
        sw = SNPWorker(snp_queue, sad_out, options.sad_stats, options.log_pseudo)
        sw.start()
        snp_threads.append(sw)

    #################################################################
    # predict SNP scores, write output

    # initialize saver
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # load variables into session
        saver.restore(sess, model_file)

        # initialize predictions stream
        preds_stream = PredStream(sess, model, 32)

        # predictions index
        pi = 0

        # SNP index
        si = 0

        for snp_cluster in snp_clusters:
            ref_preds = preds_stream[pi]
            pi += 1

            for snp in snp_cluster.snps:
                # print(snp, flush=True)

                alt_preds = preds_stream[pi]
                pi += 1

                # queue SNP
                snp_queue.put((ref_preds, alt_preds, si))

                # update SNP index
                si += 1

    # finish queue
    print("Waiting for threads to finish.", flush=True)
    snp_queue.join()

    # close genome
    genome_open.close()

    ###################################################
    # compute SAD distributions across variants

    # define percentiles
    d_fine = 0.001
    d_coarse = 0.01
    percentiles_neg = np.arange(d_fine, 0.1, d_fine)
    percentiles_base = np.arange(0.1, 0.9, d_coarse)
    percentiles_pos = np.arange(0.9, 1, d_fine)
    percentiles = np.concatenate([percentiles_neg, percentiles_base, percentiles_pos])

    sad_out.create_dataset("percentiles", data=percentiles)
    pct_len = len(percentiles)

    for sad_stat in options.sad_stats:
        sad_stat_pct = "%s_pct" % sad_stat

        # compute
        sad_pct = np.percentile(sad_out[sad_stat], 100 * percentiles, axis=0).T
        sad_pct = sad_pct.astype("float16")

        # save
        sad_out.create_dataset(sad_stat_pct, data=sad_pct, dtype="float16")

    sad_out.close()
def main():
    usage = "usage: %prog [options] <params_file> <model_file> <bed_file>"
    parser = OptionParser(usage)
    parser.add_option("-f", dest="genome_fasta", default=None,
        help="Genome FASTA for sequences [Default: %default]")
    parser.add_option("-l", dest="mut_len", default=200, type="int",
        help="Length of center sequence to mutate [Default: %default]")
    parser.add_option("-o", dest="out_dir", default="sat_mut",
        help="Output directory [Default: %default]")
    parser.add_option("--plots", dest="plots", default=False, action="store_true",
        help="Make heatmap plots [Default: %default]")
    parser.add_option("-p", dest="processes", default=None, type="int",
        help="Number of processes, passed by multi script")
    parser.add_option("--rc", dest="rc", default=False, action="store_true",
        help="Ensemble forward and reverse complement predictions [Default: %default]")
    parser.add_option("--shifts", dest="shifts", default="0",
        help="Ensemble prediction shifts [Default: %default]")
    parser.add_option("-t", dest="targets_file", default=None, type="str",
        help="File specifying target indexes and labels in table format")
    (options, args) = parser.parse_args()

    if len(args) == 3:
        # single worker
        params_file = args[0]
        model_file = args[1]
        bed_file = args[2]

    elif len(args) == 5:
        # multi worker
        options_pkl_file = args[0]
        params_file = args[1]
        model_file = args[2]
        bed_file = args[3]
        worker_index = int(args[4])

        # load options
        options_pkl = open(options_pkl_file, "rb")
        options = pickle.load(options_pkl)
        options_pkl.close()

        # update output directory
        options.out_dir = "%s/job%d" % (options.out_dir, worker_index)

    else:
        parser.error("Must provide parameter and model files and BED file")

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    options.shifts = [int(shift) for shift in options.shifts.split(",")]

    #################################################################
    # read parameters and collect target information

    job = params.read_job_params(params_file)

    if options.targets_file is None:
        target_ids = ["t%d" % ti for ti in range(job["num_targets"])]
        target_labels = [""] * len(target_ids)
        target_subset = None
    else:
        targets_df = pd.read_table(options.targets_file, index_col=0)
        target_ids = targets_df.identifier
        target_labels = targets_df.description
        target_subset = targets_df.index
        if len(target_subset) == job["num_targets"]:
            target_subset = None

    num_targets = len(target_ids)

    #################################################################
    # sequence dataset

    # read sequences from BED
    seqs_dna, seqs_coords = bed_seqs(bed_file, options.genome_fasta, job["seq_length"])

    # filter for worker sequences
    if options.processes is not None:
        worker_bounds = np.linspace(0, len(seqs_dna), options.processes + 1, dtype="int")
        seqs_dna = seqs_dna[worker_bounds[worker_index]:worker_bounds[worker_index + 1]]
        seqs_coords = seqs_coords[worker_bounds[worker_index]:worker_bounds[worker_index + 1]]

    num_seqs = len(seqs_dna)

    # determine mutation region limits
    seq_mid = job["seq_length"] // 2
    mut_start = seq_mid - options.mut_len // 2
    mut_end = mut_start + options.mut_len

    # make data ops
    data_ops = satmut_data_ops(seqs_dna, mut_start, mut_end, job["batch_size"])

    #################################################################
    # setup model

    # build model
    model = seqnn.SeqNN()
    model.build_sad(
        job,
        data_ops,
        target_subset=target_subset,
        ensemble_rc=options.rc,
        ensemble_shifts=options.shifts,
    )

    #################################################################
    # setup output

    scores_h5_file = "%s/scores.h5" % options.out_dir
    if os.path.isfile(scores_h5_file):
        os.remove(scores_h5_file)
    scores_h5 = h5py.File(scores_h5_file, "w")
    scores_h5.create_dataset("scores", dtype="float16",
        shape=(num_seqs, options.mut_len, 4, num_targets))
    scores_h5.create_dataset("seqs", dtype="bool",
        shape=(num_seqs, options.mut_len, 4))

    # store mutagenesis sequence coordinates
    seqs_chr, seqs_start, _, seqs_strand = zip(*seqs_coords)
    seqs_chr = np.array(seqs_chr, dtype="S")
    seqs_start = np.array(seqs_start) + mut_start
    seqs_end = seqs_start + options.mut_len
    seqs_strand = np.array(seqs_strand, dtype="S")
    scores_h5.create_dataset("chrom", data=seqs_chr)
    scores_h5.create_dataset("start", data=seqs_start)
    scores_h5.create_dataset("end", data=seqs_end)
    scores_h5.create_dataset("strand", data=seqs_strand)

    preds_per_seq = 1 + 3 * options.mut_len

    score_threads = []
    score_queue = Queue()
    for i in range(1):
        sw = ScoreWorker(score_queue, scores_h5)
        sw.start()
        score_threads.append(sw)

    #################################################################
    # predict scores, write output

    # initialize saver
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # coordinator
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord)

        # load variables into session
        saver.restore(sess, model_file)

        # initialize predictions stream
        preds_stream = PredStream(sess, model, 32)

        # predictions index
        pi = 0

        for si in range(num_seqs):
            print("Predicting %d" % si, flush=True)

            # collect sequence predictions
            seq_preds = []
            for spi in range(preds_per_seq):
                seq_preds.append(preds_stream[pi])
                pi += 1

            # wait for previous to finish
            score_queue.join()

            # queue sequence for scoring
            score_queue.put((seqs_dna[si], seq_preds, si))

            # queue sequence for plotting
            if options.plots:
                plot_queue.put((seqs_dna[si], seq_preds, si))

    # finish queue
    print("Waiting for threads to finish.", flush=True)
    score_queue.join()

    # close output HDF5
    scores_h5.close()