def plot_filter_logo(filter_outs, filter_size, seqs, out_prefix, raw_t=0, maxpct_t=None):
    if maxpct_t:
        all_outs = np.ravel(filter_outs)
        all_outs_mean = all_outs.mean()
        all_outs_norm = all_outs - all_outs_mean
        raw_t = maxpct_t * all_outs_norm.max() + all_outs_mean

    # print fasta file of positive outputs
    filter_fasta_out = open('%s.fa' % out_prefix, 'w')
    filter_count = 0
    for i in range(filter_outs.shape[0]):
        for j in range(filter_outs.shape[1]):
            if filter_outs[i, j] > raw_t:
                kmer = seqs[i][j:j + filter_size]
                print >> filter_fasta_out, '>%d_%d' % (i, j)
                print >> filter_fasta_out, kmer
                filter_count += 1
    filter_fasta_out.close()

    # make weblogo
    if filter_count > 0:
        weblogo_cmd = 'weblogo %s < %s.fa > %s.eps' % (weblogo_opts, out_prefix, out_prefix)
        if subprocess.call(weblogo_cmd, shell=True):
            message('Error running weblogo', 'error')
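# A minimal sketch (defined but never called) of how the maxpct_t option
# above resolves to a raw activation threshold; the toy array and the 0.5
# cutoff are illustrative assumptions, not pipeline values.
def _demo_maxpct_threshold():
    toy_outs = np.array([[0.1, 0.9], [0.4, 0.6]])
    mean = toy_outs.mean()                        # 0.5
    raw_t = 0.5 * (toy_outs - mean).max() + mean  # 0.5 * 0.4 + 0.5 = 0.7
    # only positions whose activation exceeds raw_t contribute k-mers
    return raw_t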
def seq_logo(seq, heights, out_eps, weblogo_args=''):
    # print the sequence to a temp fasta file
    fasta_fd, fasta_file = tempfile.mkstemp()
    fasta_out = open(fasta_file, 'w')
    print >> fasta_out, '>seq\n%s' % seq
    fasta_out.close()

    # print figure to a temp eps file
    eps_fd, eps_file = tempfile.mkstemp()
    weblogo_cmd = 'weblogo --errorbars NO --show-xaxis NO --show-yaxis NO --fineprint "" -c classic -n %d %s < %s > %s' % (len(seq), weblogo_args, fasta_file, eps_file)
    if subprocess.call(weblogo_cmd, shell=True):
        message('Error running weblogo', 'error')

    # copy eps file over and write in my own heights
    start_stack_re = re.compile(r'^\(\d*\) StartStack')

    out_eps_open = open(out_eps, 'w')
    weblogo_eps_in = open(eps_file)

    line = weblogo_eps_in.readline()
    si = 0
    while line:
        start_stack_match = start_stack_re.search(line)

        # nt column begins
        if start_stack_match:
            print >> out_eps_open, line,

            # loop over 4 nt's
            for i in range(4):
                line = weblogo_eps_in.readline()
                a = line.split()
                nt = a[2][1:-1]
                if nt != seq[si]:
                    print >> out_eps_open, line,
                else:
                    # change the nt of seq
                    a[1] = '%.6f' % heights[si]
                    print >> out_eps_open, ' %s' % ' '.join(a)

            # move to next nucleotide
            si += 1

        else:
            print >> out_eps_open, line,

        # advance to next line
        line = weblogo_eps_in.readline()

    # clean
    os.close(fasta_fd)
    os.remove(fasta_file)
    os.close(eps_fd)
    os.remove(eps_file)
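# Hedged usage sketch for seq_logo (defined but never called): render a logo
# whose letter heights come from an external score vector rather than the
# FASTA content itself. One height per nucleotide is required; the sequence,
# heights, and output path below are illustrative assumptions.
def _demo_seq_logo():
    demo_seq = 'ACGTACGT'
    demo_heights = np.linspace(0.25, 2.0, len(demo_seq))
    seq_logo(demo_seq, demo_heights, 'demo_seq.eps')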
def prep_snp_seqs(vcf_file, out_dir, seq_len, genome_fasta, from_=None, to_=None):
    message('prep SNP sequences')

    # prepare hdf5 file
    # (first dim initialized to 1 because creating it with 0 raised an error;
    #  it is resized as SNPs stream in below)
    h5f = h5py.File('%s/model_in.h5' % out_dir, 'w')
    dset = h5f.create_dataset('test_in', (1, 4, 1, seq_len), maxshape=(None, 4, 1, seq_len))

    # read through VCF
    current_shape = 0
    if vcf_file.endswith('.gz'):
        f = gzip.open(vcf_file, 'rb')
    else:
        f = open(vcf_file, 'r')

    for line in f:
        # skip VCF header lines
        if line.startswith('#'):
            continue

        # get one hot coded sequence
        snp = vcf.SNP(line)
        if from_ is not None and snp.pos < from_:
            continue
        if to_ is not None and snp.pos > to_:
            break
        seq_vecs, seqs, seq_headers = vcf.snps_seq1([snp], genome_fasta, seq_len)
        seq_vecs = seq_vecs.reshape((seq_vecs.shape[0], 4, 1, seq_vecs.shape[1]/4))

        # add to hdf5 file
        dset.resize(current_shape + seq_vecs.shape[0], axis=0)
        dset[current_shape:, ...] = seq_vecs
        current_shape = dset.shape[0]

    f.close()
    h5f.close()
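# Hedged sketch (defined but never called) of reading back the dataset that
# prep_snp_seqs writes; the 'sad' output directory is an illustrative
# assumption.
def _demo_read_model_in(out_dir='sad'):
    h5f = h5py.File('%s/model_in.h5' % out_dir, 'r')
    shape = h5f['test_in'].shape  # (num_snp_seqs, 4, 1, seq_len)
    h5f.close()
    return shape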
def main():
    usage = 'usage: %prog [options] <model_file> <vcf_file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-f', dest='genome_fasta', default='%s/data/genomes/hg19.fa' % os.environ['BASSETDIR'], help='Genome FASTA from which sequences will be drawn [Default: %default]')
    parser.add_option('-g', dest='gain_height', default=False, action='store_true', help='Nucleotide heights determined by the max of loss and gain [Default: %default]')
    parser.add_option('-l', dest='seq_len', type='int', default=600, help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float', help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-n', dest='center_nt', default=200, type='int', help='Nt around the SNP to mutate and plot in the heatmap [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input SNPs in VCF format')
    else:
        model_file = args[0]
        vcf_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # prep SNP sequences
    #################################################################
    # load SNPs
    snps = vcf.vcf_snps(vcf_file)

    # get one hot coded input sequences
    seqs_1hot, seqs, seq_headers = vcf.snps_seq1(snps, options.genome_fasta, options.seq_len)

    # reshape sequences for torch
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1]/4))

    # write to HDF5
    model_input_hdf5 = '%s/model_in.h5' % options.out_dir
    h5f = h5py.File(model_input_hdf5, 'w')
    h5f.create_dataset('test_in', data=seqs_1hot)
    h5f.close()

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_sat_predict.lua -center_nt %d %s %s %s' % (options.center_nt, model_file, model_input_hdf5, options.model_hdf5_file)
        if subprocess.call(torch_cmd, shell=True):
            message('Error running basset_sat_predict.lua', 'error')

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < options.seq_len:
        delta_start = (options.seq_len - delta_len)/2
        for si in range(len(seqs)):
            seqs[si] = seqs[si][delta_start:delta_start+delta_len]

    # decide which cells to plot
    if options.targets == '-1':
        plot_cells = xrange(seq_mod_preds.shape[3])
    else:
        plot_cells = [int(ci) for ci in options.targets.split(',')]

    #################################################################
    # plot
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    rdbu = sns.color_palette("RdBu_r", 10)

    nts = 'ACGT'
    for si in range(seq_mod_preds.shape[0]):
        header = seq_headers[si]
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_cells:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell])

            # prepare figure
            sns.set(style='white', font_scale=0.5)
            sns.axes_style({'axes.linewidth': 1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20, 3))
            ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start), colspan=(logo_end - logo_start))
            ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start), colspan=(sad_end - sad_start))
            ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75/vlim*(abs(minmax_matrix).max(axis=0))
            else:
                seq_heights = 0.25 + 1.75/vlim*(-minmax_matrix[0])
            logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir, header.replace(':', '_'), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = '%s.png' % logo_eps[:-4]
            logo_cmd = 'convert -density 300 %s %s' % (logo_eps, logo_png)
            if subprocess.call(logo_cmd, shell=True):
                message('Error running convert', 'error')
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label='loss', linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label='gain', linewidth=1)
            ax_sad.set_xlim(0, minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ['top', 'bottom', 'left', 'right']:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap='RdBu_r', vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels('TGCA', rotation='horizontal')  # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig('%s/%s_c%d_heat.pdf' % (options.out_dir, header.replace(':', '_'), ci), dpi=300)
            plt.close()

        #################################################################
        # print table of nt variability for each cell
        #################################################################
        for ci in range(seq_mod_preds.shape[3]):
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)

            loss_matrix = real_pred_cell - min_scores
            gain_matrix = max_scores - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start+pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, '\t'.join([str(c) for c in cols])

    table_out.close()
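# Hedged worked example (defined but never called) of the loss/gain
# arithmetic above; the 4 x 3 prediction matrix (mutations x positions) and
# the reference prediction are fabricated.
def _demo_loss_gain():
    toy_preds = np.array([[0.2, 0.5, 0.1],
                          [0.4, 0.6, 0.3],
                          [0.3, 0.7, 0.2],
                          [0.5, 0.4, 0.6]])
    real_pred = 0.4
    loss = real_pred - toy_preds.min(axis=0)  # [0.2, 0.0, 0.3]
    gain = toy_preds.max(axis=0) - real_pred  # [0.1, 0.3, 0.2]
    return loss, gain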
def main():
    usage = 'usage: %prog [options] <target_beds_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='db_act_file', help='Existing database activity table.')
    parser.add_option('-b', dest='db_bed', help='Existing database BED file.')
    parser.add_option('-c', dest='chrom_lengths_file', help='Table of chromosome lengths')
    parser.add_option('-m', dest='merge_overlap', default=200, type='int', help='Overlap length (after extension to feature_size) above which to merge features [Default: %default]')
    parser.add_option('-n', dest='no_db_activity', default=False, action='store_true', help='Do not pass along the activities of the database sequences [Default: %default]')
    parser.add_option('-o', dest='out_prefix', default='features', help='Output file prefix [Default: %default]')
    parser.add_option('-s', dest='feature_size', default=600, type='int', help='Extend features to this size [Default: %default]')
    parser.add_option('-y', dest='ignore_y', default=False, action='store_true', help='Ignore Y chromosome features [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide file labeling the targets and providing BED file paths.')
    else:
        target_beds_file = args[0]

    # determine whether we'll add to an existing DB
    db_targets = []
    db_add = False
    if options.db_bed:
        db_add = True
        if not options.no_db_activity:
            if options.db_act_file is None:
                parser.error('Must provide the database activity table (-a) or specify -n to add to an existing database')
            else:
                # read db target names
                db_act_in = open(options.db_act_file)
                db_targets = db_act_in.readline().strip().split('\t')
                db_act_in.close()

    # read in targets and assign them indexes into the db
    target_beds = []
    target_dbi = []
    for line in open(target_beds_file):
        a = line.split()
        if len(a) != 2:
            print a
            print >> sys.stderr, 'Each row of the target BEDS file must contain a label and BED file separated by whitespace'
            exit(1)
        target_dbi.append(len(db_targets))
        db_targets.append(a[0])
        target_beds.append(a[1])

    # read in chromosome lengths
    chrom_lengths = {}
    if options.chrom_lengths_file:
        for line in open(options.chrom_lengths_file):
            a = line.split()
            chrom_lengths[a[0]] = int(a[1])
    else:
        print >> sys.stderr, 'Warning: chromosome lengths not provided, so regions near ends may be incorrect.'

    #################################################################
    # print peaks to chromosome-specific files
    #################################################################
    chrom_files = {}
    chrom_outs = {}

    peak_beds = target_beds
    if db_add:
        peak_beds.append(options.db_bed)

    for bi in range(len(peak_beds)):
        if peak_beds[bi][-3:] == '.gz':
            peak_bed_in = gzip.open(peak_beds[bi])
        else:
            peak_bed_in = open(peak_beds[bi])

        for line in peak_bed_in:
            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            chrom = a[0]
            strand = '+'
            if len(a) > 5 and a[5] in '+-':
                strand = a[5]
            chrom_key = (chrom, strand)

            # open chromosome file
            if chrom_key not in chrom_outs:
                chrom_files[chrom_key] = '%s_%s_%s.bed' % (options.out_prefix, chrom, strand)
                chrom_outs[chrom_key] = open(chrom_files[chrom_key], 'w')

            # if it's the db bed
            if db_add and bi == len(peak_beds)-1:
                if options.no_db_activity:
                    # set activity to null
                    a[6] = '.'
                    print >> chrom_outs[chrom_key], '\t'.join(a[:7])
                else:
                    print >> chrom_outs[chrom_key], line,

            # if it's a new bed
            else:
                # specify the target index
                while len(a) < 7:
                    a.append('')
                a[5] = strand
                a[6] = str(target_dbi[bi])
                print >> chrom_outs[chrom_key], '\t'.join(a[:7])

        peak_bed_in.close()

    # close chromosome-specific files
    for chrom_key in chrom_outs:
        chrom_outs[chrom_key].close()

    # ignore Y
    if options.ignore_y:
        for orient in '+-':
            chrom_key = ('chrY', orient)
            if chrom_key in chrom_files:
                os.remove(chrom_files[chrom_key])
                del chrom_files[chrom_key]

    #################################################################
    # sort chromosome-specific files
    #################################################################
    for chrom_key in chrom_files:
        chrom, strand = chrom_key
        chrom_sbed = '%s_%s_%s_sort.bed' % (options.out_prefix, chrom, strand)
        sort_cmd = 'sortBed -i %s > %s' % (chrom_files[chrom_key], chrom_sbed)
        if subprocess.call(sort_cmd, shell=True):
            message('Error running sortBed. Is bedtools installed?', 'error')
        os.remove(chrom_files[chrom_key])
        chrom_files[chrom_key] = chrom_sbed

    #################################################################
    # parse chromosome-specific files
    #################################################################
    final_bed_out = open('%s.bed' % options.out_prefix, 'w')

    for chrom_key in chrom_files:
        chrom, strand = chrom_key

        open_peaks = []
        for line in open(chrom_files[chrom_key]):
            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            # construct Peak
            peak_start = int(a[1])
            peak_end = int(a[2])
            peak_act = activity_set(a[6])
            peak = Peak(peak_start, peak_end, peak_act)
            peak.extend(options.feature_size, chrom_lengths.get(chrom, None))

            if len(open_peaks) == 0:
                # initialize open peak
                open_end = peak.end
                open_peaks = [peak]

            else:
                # operate on existing open peak

                # if beyond existing open peak
                if open_end - options.merge_overlap <= peak.start:
                    # close open peak
                    mpeaks = merge_peaks(open_peaks, options.feature_size, options.merge_overlap, chrom_lengths.get(chrom, None))

                    # print to file
                    for mpeak in mpeaks:
                        print >> final_bed_out, mpeak.bed_str(chrom, strand)

                    # initialize open peak
                    open_end = peak.end
                    open_peaks = [peak]

                else:
                    # extend open peak
                    open_peaks.append(peak)
                    open_end = max(open_end, peak.end)

        if len(open_peaks) > 0:
            # close open peak
            mpeaks = merge_peaks(open_peaks, options.feature_size, options.merge_overlap, chrom_lengths.get(chrom, None))

            # print to file
            for mpeak in mpeaks:
                print >> final_bed_out, mpeak.bed_str(chrom, strand)

    final_bed_out.close()

    # clean
    for chrom_key in chrom_files:
        os.remove(chrom_files[chrom_key])

    #################################################################
    # construct/update activity table
    #################################################################
    final_act_out = open('%s_act.txt' % options.out_prefix, 'w')

    # print header
    cols = [''] + db_targets
    print >> final_act_out, '\t'.join(cols)

    # print sequences
    for line in open('%s.bed' % options.out_prefix):
        a = line.rstrip().split('\t')

        # index peak
        peak_id = '%s:%s-%s(%s)' % (a[0], a[1], a[2], a[5])

        # construct full activity vector
        peak_act = [0]*len(db_targets)
        for ai in a[6].split(','):
            if ai != '.':
                peak_act[int(ai)] = 1

        # print line
        cols = [peak_id] + peak_act
        print >> final_act_out, '\t'.join([str(c) for c in cols])

    final_act_out.close()
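# Hedged worked example (defined but never called) of the merge decision
# above, with toy coordinates: a new extended peak merges into the open
# group only when it overlaps it by more than merge_overlap bp.
def _demo_merge_decision(open_end=1000, merge_overlap=200, peak_start=850):
    # overlap here is open_end - peak_start = 150 <= 200, so the group closes;
    # a peak starting at 799 or earlier (overlap > 200) would merge instead
    return open_end - merge_overlap <= peak_start  # True -> close, False -> merge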
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file', help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='gain_height', default=False, action='store_true', help='Nucleotide heights determined by the max of loss and gain [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float', help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-n', dest='center_nt', default=200, type='int', help='Center nt to mutate and plot in the heat map [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            # seq_headers is a Python list here, so index it element-wise
            seq_headers = [seq_headers[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1]/4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_sat_predict.lua -center_nt %d %s %s %s' % (options.center_nt, model_file, model_input_hdf5, options.model_hdf5_file)
        if subprocess.call(torch_cmd, shell=True):
            message('Error running basset_sat_predict.lua', 'error')

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len)/2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start:delta_start+delta_len]

    # decide which cells to plot
    if options.targets == '-1':
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(',')]

    #################################################################
    # plot
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    rdbu = sns.color_palette("RdBu_r", 10)

    nts = 'ACGT'
    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = 'seq%d' % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell])

            # prepare figure
            sns.set(style='white', font_scale=0.5)
            sns.axes_style({'axes.linewidth': 1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20, 3))
            ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start), colspan=(logo_end - logo_start))
            ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start), colspan=(sad_end - sad_start))
            ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75/vlim*(abs(minmax_matrix).max(axis=0))
            else:
                seq_heights = 0.25 + 1.75/vlim*(-minmax_matrix[0])
            logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir, header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = '%s.png' % logo_eps[:-4]
            logo_cmd = 'convert -density 300 %s %s' % (logo_eps, logo_png)
            if subprocess.call(logo_cmd, shell=True):
                message('Error running convert', 'error')
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label='loss', linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label='gain', linewidth=1)
            ax_sad.set_xlim(0, minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ['top', 'bottom', 'left', 'right']:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap='RdBu_r', vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels('TGCA', rotation='horizontal')  # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig('%s/%s_c%d_heat.pdf' % (options.out_dir, header.replace(':', '_'), ci), dpi=300)
            plt.close()

        #################################################################
        # print table of nt variability for each cell
        #################################################################
        for ci in range(seq_mod_preds.shape[3]):
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)

            loss_matrix = real_pred_cell - min_scores
            gain_matrix = max_scores - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start+pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, '\t'.join([str(c) for c in cols])

    table_out.close()
def main(): usage = "usage: %prog [options] <model_file> <input_file>" parser = OptionParser(usage) parser.add_option( "-a", dest="input_activity_file", help="Optional activity table corresponding to an input FASTA file" ) parser.add_option( "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]" ) parser.add_option( "-g", dest="gain_height", default=False, action="store_true", help="Nucleotide heights determined by the max of loss and gain [Default: %default]", ) parser.add_option( "-m", dest="min_limit", default=0.1, type="float", help="Minimum heatmap limit [Default: %default]" ) parser.add_option( "-n", dest="center_nt", default=200, type="int", help="Center nt to mutate and plot in the heat map [Default: %default]", ) parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]") parser.add_option( "-s", dest="sample", default=None, type="int", help="Sample sequences from the test set [Default:%default]" ) parser.add_option( "-t", dest="targets", default="0", help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]", ) (options, args) = parser.parse_args() if len(args) != 2: parser.error("Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file") else: model_file = args[0] input_file = args[1] if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) ################################################################# # parse input file ################################################################# try: # input_file is FASTA # load sequences and headers seqs = [] seq_headers = [] for line in open(input_file): if line[0] == ">": seq_headers.append(line[1:].rstrip()) seqs.append("") else: seqs[-1] += line.rstrip() model_input_hdf5 = "%s/model_in.h5" % options.out_dir if options.input_activity_file: # one hot code seqs_1hot, targets = dna_io.load_data_1hot( input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False ) # read in target names target_labels = open(options.input_activity_file).readline().strip().split("\t") else: # load sequences seqs_1hot = dna_io.load_sequences(input_file, permute=False) targets = None target_labels = None # sample if options.sample: sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample)) seqs_1hot = seqs_1hot[sample_i] seq_headers = seq_headers[sample_i] if targets is not None: targets = targets[sample_i] # reshape sequences for torch seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4)) # write as test data to a HDF5 file h5f = h5py.File(model_input_hdf5, "w") h5f.create_dataset("test_in", data=seqs_1hot) h5f.close() except (IOError, IndexError): # input_file is HDF5 try: model_input_hdf5 = input_file # load (sampled) test data from HDF5 hdf5_in = h5py.File(input_file, "r") seqs_1hot = np.array(hdf5_in["test_in"]) targets = np.array(hdf5_in["test_out"]) try: # TEMP seq_headers = np.array(hdf5_in["test_headers"]) target_labels = np.array(hdf5_in["target_labels"]) except: seq_headers = None target_labels = None hdf5_in.close() # sample if options.sample: sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample)) seqs_1hot = seqs_1hot[sample_i] seq_headers = seq_headers[sample_i] targets = targets[sample_i] # write sampled data to a new HDF5 file model_input_hdf5 = "%s/model_in.h5" % options.out_dir h5f = h5py.File(model_input_hdf5, "w") h5f.create_dataset("test_in", data=seqs_1hot) 
h5f.close() # convert to ACGT sequences seqs = dna_io.vecs2dna(seqs_1hot) except IOError: parser.error("Could not parse input file as FASTA or HDF5.") ################################################################# # Torch predict modifications ################################################################# if options.model_hdf5_file is None: options.model_hdf5_file = "%s/model_out.h5" % options.out_dir torch_cmd = "basset_sat_predict.lua -center_nt %d %s %s %s" % ( options.center_nt, model_file, model_input_hdf5, options.model_hdf5_file, ) if subprocess.call(torch_cmd, shell=True): message("Error running basset_sat_predict.lua", "error") ################################################################# # load modification predictions ################################################################# hdf5_in = h5py.File(options.model_hdf5_file, "r") seq_mod_preds = np.array(hdf5_in["seq_mod_preds"]) hdf5_in.close() # trim seqs to match seq_mod_preds length seq_len = len(seqs[0]) delta_start = 0 delta_len = seq_mod_preds.shape[2] if delta_len < seq_len: delta_start = (seq_len - delta_len) / 2 for i in range(len(seqs)): seqs[i] = seqs[i][delta_start : delta_start + delta_len] # decide which cells to plot if options.targets == "-1": plot_targets = xrange(seq_mod_preds.shape[3]) else: plot_targets = [int(ci) for ci in options.targets.split(",")] ################################################################# # plot ################################################################# table_out = open("%s/table.txt" % options.out_dir, "w") rdbu = sns.color_palette("RdBu_r", 10) nts = "ACGT" for si in range(seq_mod_preds.shape[0]): try: header = seq_headers[si] except TypeError: header = "seq%d" % si seq = seqs[si] # plot some descriptive heatmaps for each individual cell type for ci in plot_targets: seq_mod_preds_cell = seq_mod_preds[si, :, :, ci] real_pred_cell = get_real_pred(seq_mod_preds_cell, seq) # compute matrices norm_matrix = seq_mod_preds_cell - real_pred_cell min_scores = seq_mod_preds_cell.min(axis=0) max_scores = seq_mod_preds_cell.max(axis=0) minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell]) # prepare figure sns.set(style="white", font_scale=0.5) sns.axes_style({"axes.linewidth": 1}) heat_cols = 400 sad_start = 1 sad_end = 323 logo_start = 0 logo_end = 324 fig = plt.figure(figsize=(20, 3)) ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start), colspan=(logo_end - logo_start)) ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start), colspan=(sad_end - sad_start)) ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols) # print a WebLogo of the sequence vlim = max(options.min_limit, abs(minmax_matrix).max()) if options.gain_height: seq_heights = 0.25 + 1.75 / vlim * (abs(minmax_matrix).max(axis=0)) else: seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0]) logo_eps = "%s/%s_c%d_seq.eps" % (options.out_dir, header_filename(header), ci) seq_logo(seq, seq_heights, logo_eps) # add to figure logo_png = "%s.png" % logo_eps[:-4] logo_cmd = "convert -density 300 %s %s" % (logo_eps, logo_png) if subprocess.call(logo_cmd, shell=True): message("Error running convert", "error") logo = Image.open(logo_png) ax_logo.imshow(logo) ax_logo.set_axis_off() # plot loss and gain SAD scores ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label="loss", linewidth=1) ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label="gain", linewidth=1) ax_sad.set_xlim(0, minmax_matrix.shape[1]) ax_sad.legend() # ax_sad.grid(True, linestyle=':') for axis in ["top", "bottom", 
"left", "right"]: ax_sad.spines[axis].set_linewidth(0.5) # plot real-normalized scores vlim = max(options.min_limit, abs(norm_matrix).max()) sns.heatmap(norm_matrix, linewidths=0, cmap="RdBu_r", vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat) ax_heat.yaxis.set_ticklabels("TGCA", rotation="horizontal") # , size=10) # save final figure plt.tight_layout() plt.savefig("%s/%s_c%d_heat.pdf" % (options.out_dir, header.replace(":", "_"), ci), dpi=300) plt.close() ################################################################# # print table of nt variability for each cell ################################################################# for ci in range(seq_mod_preds.shape[3]): seq_mod_preds_cell = seq_mod_preds[si, :, :, ci] real_pred_cell = get_real_pred(seq_mod_preds_cell, seq) min_scores = seq_mod_preds_cell.min(axis=0) max_scores = seq_mod_preds_cell.max(axis=0) loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0) gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell for pos in range(seq_mod_preds_cell.shape[1]): cols = [header, delta_start + pos, ci, loss_matrix[pos], gain_matrix[pos]] print >> table_out, "\t".join([str(c) for c in cols]) table_out.close()
def main(): usage = "usage: %prog [options] <model_th> <vcf_file>" parser = OptionParser(usage) parser.add_option('-f', dest='genome_fasta', default='%s/data/genomes/hg19.fa'%os.environ['BASSETDIR'], help='Genome FASTA from which sequences will be drawn [Default: %default]') parser.add_option('-b', dest='batchsize', default=128, help='Batch size for prediction. [Default: %default]') parser.add_option('-i', dest='index_snp', default=False, action='store_true', help='SNPs are labeled with their index SNP as column 6 [Default: %default]') parser.add_option('-s', dest='score', default=False, action='store_true', help='SNPs are labeled with scores as column 7 [Default: %default]') parser.add_option('-o', dest='out_dir', default='sad', help='Output directory for tables and plots [Default: %default]') parser.add_option('-l', dest='seq_len', type='int', default=600, help='Sequence length provided to the model [Default: %default]') parser.add_option('-t', dest='targets_file', default=None, help='File specifying target indexes and labels in table format') parser.add_option('--from', dest='from_coord', default=None, type='int', help='Process SNPs starting from this coord. Assume VCF sorted.') parser.add_option('--to', dest='to_coord', default=None, type='int', help='Process SNPs ending at this coord. Assume VCF sorted.') parser.add_option('--chrom', dest='chrom', default=None, type='str', help='Which chromosome is being processed.') parser.add_option('--only-generate-inputh5', dest='only_gen_inputh5', default=False, action='store_true', help='Do not run prediction step [Default: %default]') parser.add_option('--only-run-pred', dest='only_run_pred', default=False, action='store_true', help='Input h5 file already generated. Only run prediction [Default: %default]') parser.add_option('--only-make-sad', dest='only_make_sad', default=False, action='store_true', help='Input h5 file and model already generated. 
Only generate output [Default: %default]') (options,args) = parser.parse_args() if len(args) != 2: parser.error('Must provide Torch model and VCF file') else: model_th = args[0] vcf_file = args[1] if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) if options.from_coord is not None and options.to_coord is not None: if options.to_coord <= options.from_coord: parser.error('to_coord must be greater than from_coord') ################################################################# # prep SNP sequences ################################################################# if not options.only_run_pred and not options.only_make_sad: prep_snp_seqs(vcf_file, options.out_dir, options.seq_len, options.genome_fasta, from_=options.from_coord, to_=options.to_coord) if options.only_gen_inputh5: sys.exit(0) ################################################################# # predict in Torch ################################################################# model_hdf5_file = '%s/model_out.txt' % options.out_dir if not options.only_make_sad: torch_predict(options.out_dir, options.batchsize, model_th, model_hdf5_file) ################################################################# # collect and print SADs ################################################################# message('collect and print SADs') if options.targets_file is not None: target_labels = [line.split()[0] for line in open(options.targets_file)] sad_out = open('%s/sad_scores_table.txt' % options.out_dir, 'w') header_cols = ['rsid', 'index', 'score', 'ref', 'alt'] + target_labels sad_out.write('\t'.join(header_cols)+'\n') # Read simultaneously from SNP and predictions file if vcf_file.endswith(".gz"): snp_reader = gzip.open(vcf_file, "r") else: snp_reader = open(vcf_file, "r") snpline = snp_reader.readline().strip() while snpline.startswith("#"): snpline = snp_reader.readline().strip() pred_reader = open(model_hdf5_file, "r") predline = pred_reader.readline().strip() # Iterate through SNPs while snpline != "" and predline != "": snp = vcf.SNP(snpline, index_snp=options.index_snp, score=options.score) if options.chrom is not None and snp.chrom != options.chrom: continue if (options.from_coord is not None and snp.pos < options.from_coord): snpline = snp_reader.readline().strip() continue if (options.to_coord is not None and snp.pos > options.to_coord): break ref_pred = np.array([float(p) for p in predline.split()]) predline = pred_reader.readline().strip() for alt_al in snp.alt_alleles: alt_pred = np.array([float(p) for p in predline.split()]) predline = pred_reader.readline().strip() alt_sad = alt_pred - ref_pred # TODO assuming biallelic sad_out.write('\t'.join(map(str, [snp.rsid, snp.index_snp, snp.score, snp.ref_allele, snp.alt_alleles[0]] + \ map(lambda x: '%7.4f'%x, list(alt_sad)))) + '\n') snpline = snp_reader.readline().strip() snp_reader.close() pred_reader.close() sad_out.close()
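# Hedged worked example (defined but never called) of the SAD arithmetic
# above, with fabricated three-target prediction vectors: positive entries
# mean the alternate allele is predicted more accessible for that target.
def _demo_sad():
    ref_pred = np.array([0.10, 0.50, 0.30])
    alt_pred = np.array([0.05, 0.70, 0.30])
    return alt_pred - ref_pred  # [-0.05, 0.20, 0.00]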
def torch_predict(out_dir, batchsize, model_th, model_hdf5_file):
    message('predict in torch')
    cuda_str = ''
    cmd = 'basset_predict_local.lua -batchsize %s -norm %s %s %s/model_in.h5 %s' % (batchsize, cuda_str, model_th, out_dir, model_hdf5_file)
    if subprocess.call(cmd, shell=True):
        message('Error running basset_predict_local.lua', 'error')
def main():
    usage = 'usage: %prog [options] <model_file> <test_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-b', dest='batch_size', default=1000, type='int', help='Batch size (affects memory usage) [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5.')
    parser.add_option('-i', dest='informative_only', default=False, action='store_true', help='Plot informative filters only [Default: %default]')
    parser.add_option('-m', dest='motifs_file')
    parser.add_option('-n', dest='norm_targets', default=False, action='store_true', help='Use the norm of the target influences as the primary influence measure [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('--subset', dest='subset_file', default=None, help='Subset targets to the ones in this file')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default: %default]')
    parser.add_option('--seqs', dest='seqs', default=False, action='store_true', help='Output sequence-specific influence [Default: %default]')
    parser.add_option('-t', dest='targets_file', default=None, help='File specifying target indexes and labels in table format')
    parser.add_option('--width', dest='heat_width', default=10, type='float')
    parser.add_option('--height', dest='heat_height', default=20, type='float')
    parser.add_option('--font', dest='heat_font', default=0.4, type='float')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and test data in HDF5 format.')
    else:
        model_file = args[0]
        test_hdf5_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, 'r')
    seq_vecs = np.array(test_hdf5_in['test_in'])
    seq_targets = np.array(test_hdf5_in['test_out'])
    seq_headers = np.array(test_hdf5_in['test_headers'])
    test_hdf5_in.close()

    # name the targets
    target_names = name_targets(seq_targets.shape[1], options.targets_file)

    if options.subset_file:
        target_subset = set([line.rstrip() for line in open(options.subset_file)])

    # get additional motif information
    df_motifs = None
    if options.motifs_file:
        df_motifs = pd.read_table(options.motifs_file, delim_whitespace=True)

    #################################################################
    # sample
    #################################################################
    if options.sample is not None:
        # choose sampled indexes
        sample_i = np.array(random.sample(xrange(seq_vecs.shape[0]), options.sample))

        # filter
        seq_vecs = seq_vecs[sample_i]
        seq_targets = seq_targets[sample_i]
        seq_headers = seq_headers[sample_i]

        # create a new HDF5 file
        sample_hdf5_file = '%s/sample.h5' % options.out_dir
        sample_hdf5_out = h5py.File(sample_hdf5_file, 'w')
        sample_hdf5_out.create_dataset('test_in', data=seq_vecs)
        sample_hdf5_out.create_dataset('test_out', data=seq_targets)
        sample_hdf5_out.close()

        # update test HDF5
        test_hdf5_file = sample_hdf5_file

    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        torch_opts = ''
        if options.seqs:
            torch_opts += '-seqs'

        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_motifs_infl.lua -batch_size %d %s %s %s %s' % (options.batch_size, torch_opts, model_file, test_hdf5_file, options.model_hdf5_file)
        if subprocess.call(torch_cmd, shell=True):
            message('Error running basset_motifs_infl.lua', 'error')

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    filter_means = np.array(model_hdf5_in['filter_means'])
    filter_stds = np.array(model_hdf5_in['filter_stds'])
    filter_infl = np.array(model_hdf5_in['filter_infl'])
    filter_infl_targets = np.array(model_hdf5_in['filter_infl_targets'])
    if options.seqs:
        seq_filter_targets = np.array(model_hdf5_in['seq_filter_targets'])
    model_hdf5_in.close()

    #############################################################
    # use target-based influence
    #############################################################
    if options.norm_targets:
        # save the loss-based influence
        filter_infl_loss = np.array(filter_infl, copy=True)

        # set to the target-based influence
        for fi in range(filter_infl_targets.shape[0]):
            filter_infl[fi] = np.mean(filter_infl_targets[fi]**2)

        # print to a table
        tnorm_out = open('%s/loss_target.txt' % options.out_dir, 'w')
        for fi in range(len(filter_infl)):
            cols = (fi, filter_infl_loss[fi], filter_infl[fi])
            print >> tnorm_out, '%3d %7.4f %7.4f' % cols
        tnorm_out.close()

        # compare the two
        xmin, xmax = coord_range(filter_infl_loss, buf_pct=0.1)
        ymin, ymax = coord_range(filter_infl, buf_pct=0.1)

        sns.set(style='ticks', font_scale=1)
        plt.figure()
        g = sns.jointplot(x=filter_infl_loss, y=filter_infl, color='black', joint_kws={'alpha': 0.7})
        ax = g.ax_joint
        ax.set_xlim(xmin, xmax)
        ax.set_xlabel('loss-based influence')
        ax.xaxis.label.set_fontsize(18)
        map(lambda xl: xl.set_fontsize(15), ax.get_xticklabels())
        ax.set_ylim(ymin, ymax)
        ax.set_ylabel('target-based influence')
        ax.yaxis.label.set_fontsize(18)
        map(lambda yl: yl.set_fontsize(15), ax.get_yticklabels())
        ax.grid(True, linestyle=':')
        plt.tight_layout(w_pad=0, h_pad=0)
        plt.savefig('%s/loss_target.pdf' % options.out_dir)
        plt.close()

    #############################################################
    # print filter influence table
    #############################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')
    for i in range(len(filter_infl)):
        if df_motifs is not None:
            cols = (i, filter_infl[i], filter_means[i], filter_stds[i], df_motifs.ic.iloc[i], df_motifs.annotation.iloc[i])
            print >> table_out, '%3d %7.4f %6.4f %6.3f %4.1f %8s' % cols
        else:
            cols = (i, filter_infl[i], filter_means[i], filter_stds[i])
            print >> table_out, '%3d %7.4f %6.4f %6.3f' % cols
    table_out.close()

    #################################################################
    # plot filter influence
    #################################################################
    sb_blue = sns.color_palette('deep')[0]
    sns.set(style='ticks', font_scale=1)

    ymin, ymax = coord_range(filter_infl, buf_pct=0.1)

    if options.motifs_file:
        nonzero = np.array(df_motifs.ic > 0)
        xmin, xmax = coord_range(df_motifs.ic.loc[nonzero])

        plt.figure()
        g = sns.jointplot(x=np.array(df_motifs.ic.loc[nonzero]), y=filter_infl[nonzero], color='black', stat_func=None, joint_kws={'alpha': 0.8})
        ax = g.ax_joint
        ax.set_xlim(xmin, xmax)
        ax.set_xlabel('Information content')
        ax.xaxis.label.set_fontsize(18)
        map(lambda xl: xl.set_fontsize(15), ax.get_xticklabels())
        ax.set_ylim(ymin, ymax)
        ax.set_ylabel('Influence')
        ax.yaxis.label.set_fontsize(18)
        map(lambda yl: yl.set_fontsize(15), ax.get_yticklabels())
        # ax.grid(True, linestyle=':')
        plt.tight_layout(w_pad=0, h_pad=0)
        plt.savefig('%s/ic_infl.pdf' % options.out_dir)
        plt.close()

    #############################################################
    # prep for cell-specific analyses
    #############################################################
    filter_names = name_filters(len(filter_infl), df_motifs)

    # construct a pandas data frame of the target influences
    df_ft = pd.DataFrame(filter_infl_targets, index=filter_names, columns=target_names)

    # print filter influence per target table
    table_out = open('%s/table_target.txt' % options.out_dir, 'w')
    for i in range(df_ft.shape[0]):
        for ti in range(len(target_names)):
            cols = (i, ti, target_names[ti], df_ft.iloc[i, ti])
            print >> table_out, '%-3d %3d %20s %7.4f' % cols
    table_out.close()

    # print sequence-specific filter influence per target table
    if options.seqs:
        table_out = open('%s/table_seqs.txt' % options.out_dir, 'w')
        for si in range(seq_filter_targets.shape[0]):
            for fi in range(seq_filter_targets.shape[1]):
                for ti in range(seq_filter_targets.shape[2]):
                    cols = (seq_headers[si], fi, ti, seq_filter_targets[si][fi][ti])
                    print >> table_out, '%-25s %3d %3d %7.4f' % cols
        table_out.close()

    # use only high information filters
    if options.informative_only and df_motifs is not None:
        df_ft = df_ft[df_motifs.ic > 6]
    elif df_ft.shape[1] >= 10:
        df_ft_stds = df_ft.std(axis=1)
        df_ft = df_ft[df_ft_stds > 0]

    #############################################################
    # plot filter influence per cell heatmaps
    #############################################################
    # subset targets before plotting
    if options.subset_file:
        subset_mask = df_ft.columns.isin(target_subset)
        df_ft_sub = df_ft.loc[:, subset_mask]
        plot_infl_heatmaps(df_ft_sub, options.out_dir, options.heat_width, options.heat_height, options.heat_font)

    # plot all cells
    else:
        plot_infl_heatmaps(df_ft, options.out_dir, options.heat_width, options.heat_height, options.heat_font)
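# Hedged worked example (defined but never called) of the -n (norm_targets)
# influence measure above: a filter's influence becomes its mean squared
# per-target influence. Toy numbers only.
def _demo_norm_targets():
    toy_infl_targets = np.array([0.1, -0.3, 0.2])  # one filter, 3 targets
    return np.mean(toy_infl_targets**2)  # (0.01 + 0.09 + 0.04) / 3 ~= 0.0467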
def main():
    usage = 'usage: %prog [options] <model_file> <test_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='model_hdf5_file', default=None,
                      help='Pre-computed model output as HDF5.')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-m', dest='meme_db',
                      default='%s/data/motifs/Homo_sapiens.meme' % os.environ['BASSETDIR'],
                      help='MEME database used to annotate motifs')
    parser.add_option('-s', dest='sample', default=None, type='int',
                      help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='trim_filters', default=False, action='store_true',
                      help='Trim uninformative positions off the filter ends [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and test data in HDF5 format.')
    else:
        model_file = args[0]
        test_hdf5_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, 'r')
    seq_vecs = np.array(test_hdf5_in['test_in'])
    seq_targets = np.array(test_hdf5_in['test_out'])
    try:
        target_names = list(test_hdf5_in['target_labels'])
    except KeyError:
        target_names = ['t%d' % ti for ti in range(seq_targets.shape[1])]
    test_hdf5_in.close()

    #################################################################
    # sample
    #################################################################
    if options.sample is not None:
        # choose sampled indexes
        sample_i = np.array(random.sample(xrange(seq_vecs.shape[0]), options.sample))

        # filter
        seq_vecs = seq_vecs[sample_i]
        seq_targets = seq_targets[sample_i]

        # create a new HDF5 file
        sample_hdf5_file = '%s/sample.h5' % options.out_dir
        sample_hdf5_out = h5py.File(sample_hdf5_file, 'w')
        sample_hdf5_out.create_dataset('test_in', data=seq_vecs)
        sample_hdf5_out.close()

        # update test HDF5
        test_hdf5_file = sample_hdf5_file

    # convert to letters
    seqs = dna_io.vecs2dna(seq_vecs)

    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_motifs_predict.lua %s %s %s' % (model_file, test_hdf5_file, options.model_hdf5_file)
        if subprocess.call(torch_cmd, shell=True):
            message('Error running basset_motifs_predict.lua', 'error')

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    filter_weights = np.array(model_hdf5_in['weights'])
    filter_outs = np.array(model_hdf5_in['outs'])
    model_hdf5_in.close()

    # store useful variables
    num_filters = filter_weights.shape[0]
    filter_size = filter_weights.shape[2]

    #################################################################
    # individual filter plots
    #################################################################
    # also save information contents
    filters_ic = []
    meme_out = meme_intro('%s/filters_meme.txt' % options.out_dir, seqs)

    for f in range(num_filters):
        print 'Filter %d' % f

        # plot filter parameters as a heatmap
        plot_filter_heat(filter_weights[f, :, :],
                         '%s/filter%d_heat.pdf' % (options.out_dir, f))

        # plot weblogo of high scoring outputs
        plot_filter_logo(filter_outs[:, f, :], filter_size, seqs,
                         '%s/filter%d_logo' % (options.out_dir, f), maxpct_t=0.5)

        # make a PWM for the filter
        filter_pwm, nsites = make_filter_pwm('%s/filter%d_logo.fa' % (options.out_dir, f))

        if nsites < 10:
            # no information
            filters_ic.append(0)
        else:
            # compute and save information content
            filters_ic.append(info_content(filter_pwm))

            # add to the meme motif file
            meme_add(meme_out, f, filter_pwm, nsites, options.trim_filters)

    meme_out.close()

    #################################################################
    # annotate filters
    #################################################################
    # run tomtom
    tomtom_cmd = 'tomtom -dist pearson -thresh 0.1 -oc %s/tomtom %s/filters_meme.txt %s' % (options.out_dir, options.out_dir, options.meme_db)
    if subprocess.call(tomtom_cmd, shell=True):
        message('Error running tomtom', 'error')

    # read in annotations
    filter_names = name_filters(num_filters, '%s/tomtom/tomtom.txt' % options.out_dir, options.meme_db)

    #################################################################
    # print a table of information
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    # print header for later pandas reading
    header_cols = ('', 'consensus', 'annotation', 'ic', 'mean', 'std')
    print >> table_out, '%3s %19s %10s %5s %6s %6s' % header_cols

    for f in range(num_filters):
        # collapse to a consensus motif
        consensus = filter_motif(filter_weights[f, :, :])

        # grab annotation
        annotation = '.'
        name_pieces = filter_names[f].split('_')
        if len(name_pieces) > 1:
            annotation = name_pieces[1]

        # plot density of filter output scores
        fmean, fstd = plot_score_density(np.ravel(filter_outs[:, f, :]),
                                         '%s/filter%d_dens.pdf' % (options.out_dir, f))

        row_cols = (f, consensus, annotation, filters_ic[f], fmean, fstd)
        print >> table_out, '%-3d %19s %10s %5.2f %6.4f %6.4f' % row_cols

    table_out.close()

    #################################################################
    # global filter plots
    #################################################################
    # plot filter-sequence heatmap
    plot_filter_seq_heat(filter_outs, '%s/filter_seqs.pdf' % options.out_dir)

    # plot filter-segment heatmap
    plot_filter_seg_heat(filter_outs, '%s/filter_segs.pdf' % options.out_dir)
    plot_filter_seg_heat(filter_outs, '%s/filter_segs_raw.pdf' % options.out_dir, whiten=False)

    # plot filter-target correlation heatmap
    plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                     '%s/filter_target_cors_mean.pdf' % options.out_dir, 'mean')
    plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                     '%s/filter_target_cors_max.pdf' % options.out_dir, 'max')
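# ----------------------------------------------------------------------
# info_content() above is defined elsewhere in Basset. For intuition, the
# standard information content of a PWM against a uniform ACGT background
# is the sum over positions of 2 + sum_n p_n * log2(p_n). A minimal sketch
# under that assumption (the real Basset version may handle pseudocounts
# or background frequencies differently, hence the _sketch suffix):

import numpy as np

def info_content_sketch(pwm, pseudocount=1e-9):
    # pwm: (positions x 4) array of probabilities summing to 1 per row
    pwm = np.array(pwm, dtype=float) + pseudocount
    pwm /= pwm.sum(axis=1, keepdims=True)
    # 2 bits is the per-position maximum for a 4-letter alphabet
    return float(np.sum(2 + np.sum(pwm * np.log2(pwm), axis=1)))

# e.g., a perfectly conserved 3-mer scores ~6.0 bits:
# info_content_sketch([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1]])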
def main():
    usage = 'usage: %prog [options] <model_th> <vcf_file>'
    parser = OptionParser(usage)
    parser.add_option('--cuda', dest='cuda', default=False, action='store_true',
                      help='Predict on the GPU [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None,
                      help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-f', dest='genome_fasta',
                      default='%s/data/genomes/hg19.fa' % os.environ['BASSETDIR'],
                      help='Genome FASTA from which sequences will be drawn [Default: %default]')
    parser.add_option('-i', dest='index_snp', default=False, action='store_true',
                      help='SNPs are labeled with their index SNP as column 6 [Default: %default]')
    parser.add_option('-l', dest='seq_len', type='int', default=600,
                      help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float',
                      help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='sad',
                      help='Output directory for tables and plots [Default: %default]')
    parser.add_option('-s', dest='score', default=False, action='store_true',
                      help='SNPs are labeled with scores as column 7 [Default: %default]')
    parser.add_option('-t', dest='targets_file', default=None,
                      help='File specifying target indexes and labels in table format')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Torch model and VCF file')
    else:
        model_th = args[0]
        vcf_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # prep SNP sequences
    #################################################################
    # load SNPs
    snps = vcf.vcf_snps(vcf_file, options.index_snp, options.score)

    if options.model_hdf5_file is None:
        # get one hot coded input sequences
        seq_vecs, seqs, seq_headers = vcf.snps_seq1(snps, options.genome_fasta, options.seq_len)

        # reshape sequences for torch
        seq_vecs = seq_vecs.reshape((seq_vecs.shape[0], 4, 1, seq_vecs.shape[1] / 4))

        # write to HDF5
        h5f = h5py.File('%s/model_in.h5' % options.out_dir, 'w')
        h5f.create_dataset('test_in', data=seq_vecs)
        h5f.close()

    #################################################################
    # predict in Torch
    #################################################################
    if options.model_hdf5_file is None:
        if options.cuda:
            cuda_str = '-cuda'
        else:
            cuda_str = ''

        options.model_hdf5_file = '%s/model_out.txt' % options.out_dir
        cmd = 'basset_predict.lua -norm %s %s %s/model_in.h5 %s' % (cuda_str, model_th, options.out_dir, options.model_hdf5_file)
        if subprocess.call(cmd, shell=True):
            message('Error running basset_predict.lua', 'error')

    # read in predictions
    seq_preds = []
    for line in open(options.model_hdf5_file):
        seq_preds.append(np.array([float(p) for p in line.split()]))
    seq_preds = np.array(seq_preds)

    #################################################################
    # collect and print SADs
    #################################################################
    if options.targets_file is not None:
        target_labels = [line.split()[0] for line in open(options.targets_file)]
    else:
        # default target labels when no targets file is given, following the
        # 't%d' convention used elsewhere; without this, the table lines
        # below reference an undefined name
        target_labels = ['t%d' % ti for ti in range(seq_preds.shape[1])]

    sad_out = open('%s/sad_table.txt' % options.out_dir, 'w')
    header_cols = ('rsid', 'index', 'score', 'ref', 'alt', 'target', 'ref_pred', 'alt_pred', 'sad')
    print >> sad_out, ' '.join(header_cols)

    # hash by index snp
    sad_matrices = {}
    sad_labels = {}
    sad_scores = {}

    pi = 0
    for snp in snps:
        # get reference prediction
        ref_preds = seq_preds[pi, :]
        pi += 1

        for alt_al in snp.alt_alleles:
            # get alternate prediction
            alt_preds = seq_preds[pi, :]
            pi += 1

            # normalize by reference
            alt_sad = alt_preds - ref_preds
            sad_matrices.setdefault(snp.index_snp, []).append(alt_sad)

            # label as mutation from reference
            alt_label = '%s_%s>%s' % (snp.rsid, vcf.cap_allele(snp.ref_allele), vcf.cap_allele(alt_al))
            sad_labels.setdefault(snp.index_snp, []).append(alt_label)

            # save scores
            sad_scores.setdefault(snp.index_snp, []).append(snp.score)

            # print table lines
            for ti in range(len(alt_sad)):
                if options.index_snp and options.score:
                    cols = (snp.rsid, snp.index_snp, snp.score,
                            vcf.cap_allele(snp.ref_allele), vcf.cap_allele(alt_al),
                            target_labels[ti], ref_preds[ti], alt_preds[ti], alt_sad[ti])
                    print >> sad_out, '%-13s %-13s %5.3f %6s %6s %12s %6.4f %6.4f %7.4f' % cols
                elif options.index_snp:
                    cols = (snp.rsid, snp.index_snp,
                            vcf.cap_allele(snp.ref_allele), vcf.cap_allele(alt_al),
                            target_labels[ti], ref_preds[ti], alt_preds[ti], alt_sad[ti])
                    print >> sad_out, '%-13s %-13s %6s %6s %12s %6.4f %6.4f %7.4f' % cols
                elif options.score:
                    cols = (snp.rsid, snp.score,
                            vcf.cap_allele(snp.ref_allele), vcf.cap_allele(alt_al),
                            target_labels[ti], ref_preds[ti], alt_preds[ti], alt_sad[ti])
                    print >> sad_out, '%-13s %5.3f %6s %6s %12s %6.4f %6.4f %7.4f' % cols
                else:
                    cols = (snp.rsid,
                            vcf.cap_allele(snp.ref_allele), vcf.cap_allele(alt_al),
                            target_labels[ti], ref_preds[ti], alt_preds[ti], alt_sad[ti])
                    print >> sad_out, '%-13s %6s %6s %12s %6.4f %6.4f %7.4f' % cols

    sad_out.close()

    #################################################################
    # plot SAD heatmaps
    #################################################################
    for ii in sad_matrices:
        # convert fully to numpy arrays
        sad_matrix = abs(np.array(sad_matrices[ii]))
        print ii, sad_matrix.shape

        if sad_matrix.shape[0] > 1:
            vlim = max(options.min_limit, sad_matrix.max())
            score_mat = np.reshape(np.array(sad_scores[ii]), (-1, 1))

            if options.targets_file is None:
                # plot heatmap
                plt.figure(figsize=(20, 0.5 * sad_matrix.shape[0]))

                # lay out scores
                cols = 12
                ax_score = plt.subplot2grid((1, cols), (0, 0))
                ax_sad = plt.subplot2grid((1, cols), (0, 1), colspan=(cols - 1))

                sns.heatmap(score_mat, xticklabels=False, yticklabels=False,
                            vmin=0, vmax=1, cmap='Reds', cbar=False, ax=ax_score)
                sns.heatmap(sad_matrix, xticklabels=False, yticklabels=sad_labels[ii],
                            vmin=0, vmax=vlim, ax=ax_sad)
            else:
                # plot heatmap
                plt.figure(figsize=(20, 0.5 + 0.5 * sad_matrix.shape[0]))

                # lay out scores
                cols = 12
                ax_score = plt.subplot2grid((1, cols), (0, 0))
                ax_sad = plt.subplot2grid((1, cols), (0, 1), colspan=(cols - 1))

                sns.heatmap(score_mat, xticklabels=False, yticklabels=False,
                            vmin=0, vmax=1, cmap='Reds', cbar=False, ax=ax_score)
                sns.heatmap(sad_matrix, xticklabels=target_labels, yticklabels=sad_labels[ii],
                            vmin=0, vmax=vlim, ax=ax_sad)

            for tick in ax_sad.get_xticklabels():
                tick.set_rotation(-45)
                tick.set_horizontalalignment('left')
                tick.set_fontsize(5)

            plt.tight_layout()
            if ii == '.':
                out_pdf = '%s/sad_heat.pdf' % options.out_dir
            else:
                out_pdf = '%s/sad_%s_heat.pdf' % (options.out_dir, ii)
            plt.savefig(out_pdf)
            plt.close()
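# ----------------------------------------------------------------------
# The SAD (SNP accessibility difference) score computed above is simply the
# model's prediction for the alternate allele minus its prediction for the
# reference allele, per target. A toy illustration with hypothetical numbers
# (not real model output):

import numpy as np

ref_preds_toy = np.array([0.80, 0.10, 0.45])  # predictions for the ref allele
alt_preds_toy = np.array([0.30, 0.12, 0.46])  # predictions for one alt allele
sad_toy = alt_preds_toy - ref_preds_toy       # [-0.50, 0.02, 0.01]

# The large-magnitude SAD in the first target suggests the variant disrupts
# accessibility in that cell type; note the heatmaps above plot abs(SAD).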