Example #1
def plot_filter_logo(filter_outs,
                     filter_size,
                     seqs,
                     out_prefix,
                     raw_t=0,
                     maxpct_t=None):
    if maxpct_t:
        all_outs = np.ravel(filter_outs)
        all_outs_mean = all_outs.mean()
        all_outs_norm = all_outs - all_outs_mean
        raw_t = maxpct_t * all_outs_norm.max() + all_outs_mean

    # print fasta file of positive outputs
    filter_fasta_out = open('%s.fa' % out_prefix, 'w')
    filter_count = 0
    for i in range(filter_outs.shape[0]):
        for j in range(filter_outs.shape[1]):
            if filter_outs[i, j] > raw_t:
                kmer = seqs[i][j:j + filter_size]
                print >> filter_fasta_out, '>%d_%d' % (i, j)
                print >> filter_fasta_out, kmer
                filter_count += 1
    filter_fasta_out.close()

    # make weblogo
    if filter_count > 0:
        weblogo_cmd = 'weblogo %s < %s.fa > %s.eps' % (weblogo_opts,
                                                       out_prefix, out_prefix)
        if subprocess.call(weblogo_cmd, shell=True):
            message('Error running weblogo', 'error')
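A minimal usage sketch for plot_filter_logo, assuming the module-level weblogo_opts and message names the body relies on; the values below are stand-ins, with the WebLogo flags borrowed from the seq_logo command in Example #2:

import subprocess

import numpy as np

weblogo_opts = '--errorbars NO --show-xaxis NO --show-yaxis NO --fineprint "" -c classic'

def message(msg, level='status'):
    print '%s: %s' % (level.upper(), msg)

# Toy inputs: 10 sequences of length 20 scanned by one filter of width 8,
# giving 20 - 8 + 1 = 13 activation positions per sequence.
filter_size = 8
seqs = ['ACGTACGTACGTACGTACGT'] * 10
filter_outs = np.random.rand(10, 20 - filter_size + 1)

# Keep k-mers whose activation exceeds half of the mean-normalized maximum.
plot_filter_logo(filter_outs, filter_size, seqs, 'filter0', maxpct_t=0.5)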
Example #2
def seq_logo(seq, heights, out_eps, weblogo_args=''):
    # print the sequence to a temp fasta file
    fasta_fd, fasta_file = tempfile.mkstemp()
    fasta_out = open(fasta_file, 'w')
    print >> fasta_out, '>seq\n%s' % seq
    fasta_out.close()

    # print figure to a temp eps file
    eps_fd, eps_file = tempfile.mkstemp()
    weblogo_cmd = 'weblogo --errorbars NO --show-xaxis NO --show-yaxis NO --fineprint "" -c classic -n %d %s < %s > %s' % (
        len(seq), weblogo_args, fasta_file, eps_file)
    if subprocess.call(weblogo_cmd, shell=True):
        message('Error running weblogo', 'error')

    # copy eps file over and write in my own heights
    start_stack_re = re.compile(r'^\(\d*\) StartStack')
    out_eps_open = open(out_eps, 'w')
    weblogo_eps_in = open(eps_file)
    line = weblogo_eps_in.readline()
    si = 0
    while line:
        start_stack_match = start_stack_re.search(line)

        # nt column begins
        if start_stack_match:
            print >> out_eps_open, line,

            # loop over 4 nt's
            for i in range(4):
                line = weblogo_eps_in.readline()
                a = line.split()

                nt = a[2][1:-1]
                if nt != seq[si]:
                    print >> out_eps_open, line,
                else:
                    # change the nt of seq
                    a[1] = '%.6f' % heights[si]
                    print >> out_eps_open, ' %s' % ' '.join(a)

            # move to next nucleotide
            si += 1
        else:
            print >> out_eps_open, line,

        # advance to next line
        line = weblogo_eps_in.readline()

    # clean
    os.close(fasta_fd)
    os.remove(fasta_file)
    os.close(eps_fd)
    os.remove(eps_file)
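seq_logo rescales the per-column stack heights in WebLogo's EPS output to caller-supplied values. A small driving sketch, assuming heights on the 0.25-2.0 scale the plotting code later in this section uses:

# Draw a logo for a short sequence, exaggerating positions 3-5
# (the output path is a placeholder).
seq = 'ACGTACGT'
heights = [0.25] * len(seq)
for i in (3, 4, 5):
    heights[i] = 2.0
seq_logo(seq, heights, 'example_seq.eps')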
Example #3
def seq_logo(seq, heights, out_eps, weblogo_args=''):
	# print the sequence to a temp fasta file
	fasta_fd, fasta_file = tempfile.mkstemp()
	fasta_out = open(fasta_file, 'w')
	print >> fasta_out, '>seq\n%s' % seq
	fasta_out.close()

	# print figure to a temp eps file
	eps_fd, eps_file = tempfile.mkstemp()
	weblogo_cmd = 'weblogo --errorbars NO --show-xaxis NO --show-yaxis NO --fineprint "" -c classic -n %d %s < %s > %s' % (len(seq), weblogo_args, fasta_file, eps_file)
	if subprocess.call(weblogo_cmd, shell=True):
		message('Error running weblogo', 'error')

	# copy eps file over and write in my own heights
	start_stack_re = re.compile(r'^\(\d*\) StartStack')
	out_eps_open = open(out_eps, 'w')
	weblogo_eps_in = open(eps_file)
	line = weblogo_eps_in.readline()
	si = 0
	while line:
		start_stack_match = start_stack_re.search(line)

		# nt column begins
		if start_stack_match:
			print >> out_eps_open, line,

			# loop over 4 nt's
			for i in range(4):
				line = weblogo_eps_in.readline()
				a = line.split()

				nt = a[2][1:-1]
				if nt != seq[si]:
					print >> out_eps_open, line,
				else:
					# change the nt of seq
					a[1] = '%.6f' % heights[si]
					print >> out_eps_open, ' %s' % ' '.join(a)

			# move to next nucleotide
			si += 1
		else:
			print >> out_eps_open, line,

		# advance to next line
		line = weblogo_eps_in.readline()

	# clean
	os.close(fasta_fd)
	os.remove(fasta_file)
	os.close(eps_fd)
	os.remove(eps_file)
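Example #4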
def prep_snp_seqs(vcf_file, out_dir, seq_len, genome_fasta, from_=None, to_=None):
    message('prep SNP sequences')

    # Prepare hdf5 file
    h5f = h5py.File('%s/model_in.h5'%out_dir, 'w')
    dset = h5f.create_dataset('test_in', (1, 4, 1, seq_len), maxshape=(None, 4, 1, seq_len))

    # Read through VCF
    current_shape = 0  # rows filled so far (the dataset above is created with a first dim of 1 because h5py errored on 0)
    with open(vcf_file, "r") as f, gzip.open(vcf_file, 'rb') as fz:
        # both handles open lazily; switch to the gzip handle for compressed input
        if vcf_file.endswith(".gz"): f = fz
        for line in f:
            # Get one hot coded sequence
            snp = vcf.SNP(line)
            if from_ is not None and snp.pos < from_: continue
            if to_ is not None and snp.pos > to_: break
            seq_vecs, seqs, seq_headers = vcf.snps_seq1([snp], genome_fasta, seq_len)
            seq_vecs = seq_vecs.reshape((seq_vecs.shape[0],4,1,seq_vecs.shape[1]/4))
            # Add to hd5 file
            dset.resize(current_shape+seq_vecs.shape[0], axis=0)
            dset[current_shape:,...] = seq_vecs
            current_shape = dset.shape[0]
    h5f.close()
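A hedged invocation sketch for prep_snp_seqs; the file paths are placeholders, while the sequence length of 600 and the BASSETDIR-based genome path mirror the option defaults used elsewhere in this section:

import os

# Write sad_out/model_in.h5 for SNPs inside a coordinate window of a sorted VCF.
prep_snp_seqs('snps.vcf.gz', 'sad_out', 600,
              '%s/data/genomes/hg19.fa' % os.environ['BASSETDIR'],
              from_=1000000, to_=2000000)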
Example #5
def plot_filter_logo(filter_outs, filter_size, seqs, out_prefix, raw_t=0, maxpct_t=None):
    if maxpct_t:
        all_outs = np.ravel(filter_outs)
        all_outs_mean = all_outs.mean()
        all_outs_norm = all_outs - all_outs_mean
        raw_t = maxpct_t * all_outs_norm.max() + all_outs_mean

    # print fasta file of positive outputs
    filter_fasta_out = open('%s.fa' % out_prefix, 'w')
    filter_count = 0
    for i in range(filter_outs.shape[0]):
        for j in range(filter_outs.shape[1]):
            if filter_outs[i,j] > raw_t:
                kmer = seqs[i][j:j+filter_size]
                print >> filter_fasta_out, '>%d_%d' % (i,j)
                print >> filter_fasta_out, kmer
                filter_count += 1
    filter_fasta_out.close()

    # make weblogo
    if filter_count > 0:
        weblogo_cmd = 'weblogo %s < %s.fa > %s.eps' % (weblogo_opts, out_prefix, out_prefix)
        if subprocess.call(weblogo_cmd, shell=True):
            message('Error running weblogo', 'error')
Example #6
def main():
    usage = 'usage: %prog [options] <model_file> <vcf_file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-f', dest='genome_fasta', default='%s/data/genomes/hg19.fa'%os.environ['BASSETDIR'], help='Genome FASTA from which sequences will be drawn [Default: %default]')
    parser.add_option('-g', dest='gain_height', default=False, action='store_true', help='Nucleotide heights determined by the max of loss and gain [Default: %default]')
    parser.add_option('-l', dest='seq_len', type='int', default=600, help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float', help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-n', dest='center_nt', default=200, type='int', help='Nt around the SNP to mutate and plot in the heatmap [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input SNPs in VCF format')
    else:
        model_file = args[0]
        vcf_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # prep SNP sequences
    #################################################################
    # load SNPs
    snps = vcf.vcf_snps(vcf_file)

    # get one hot coded input sequences
    seqs_1hot, seqs, seq_headers = vcf.snps_seq1(snps, options.genome_fasta, options.seq_len)

    # reshape sequences for torch
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,seqs_1hot.shape[1]/4))

    # write to HDF5
    model_input_hdf5 = '%s/model_in.h5'%options.out_dir
    h5f = h5py.File(model_input_hdf5, 'w')
    h5f.create_dataset('test_in', data=seqs_1hot)
    h5f.close()


    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_sat_predict.lua -center_nt %d %s %s %s' % (options.center_nt, model_file, model_input_hdf5, options.model_hdf5_file)
        if subprocess.call(torch_cmd, shell=True):
            message('Error running basset_sat_predict.lua', 'error')

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < options.seq_len:
        delta_start = (options.seq_len - delta_len)/2
        for si in range(len(seqs)):
            seqs[si] = seqs[si][delta_start:delta_start+delta_len]

    # decide which cells to plot
    if options.targets == '-1':
        plot_cells = xrange(seq_mod_preds.shape[3])
    else:
        plot_cells = [int(ci) for ci in options.targets.split(',')]


    #################################################################
    # plot
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    rdbu = sns.color_palette("RdBu_r", 10)

    nts = 'ACGT'
    for si in range(seq_mod_preds.shape[0]):
        header = seq_headers[si]
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_cells:
            seq_mod_preds_cell = seq_mod_preds[si,:,:,ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell])

            # prepare figure
            sns.set(style='white', font_scale=0.5)
            sns.axes_style({'axes.linewidth':1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20,3))
            ax_logo = plt.subplot2grid((3,heat_cols), (0,logo_start), colspan=(logo_end-logo_start))
            ax_sad = plt.subplot2grid((3,heat_cols), (1,sad_start), colspan=(sad_end-sad_start))
            ax_heat = plt.subplot2grid((3,heat_cols), (2,0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75/vlim*(abs(minmax_matrix).max(axis=0))
            else:
                seq_heights = 0.25 + 1.75/vlim*(-minmax_matrix[0])
            logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir, header.replace(':','_'), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = '%s.png' % logo_eps[:-4]
            logo_cmd = 'convert -density 300 %s %s' % (logo_eps, logo_png)
            if subprocess.call(logo_cmd, shell=True):
                message('Error running convert', 'error')
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label='loss', linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label='gain', linewidth=1)
            ax_sad.set_xlim(0,minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ['top','bottom','left','right']:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap='RdBu_r', vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels('TGCA', rotation='horizontal') # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig('%s/%s_c%d_heat.pdf' % (options.out_dir,header.replace(':','_'), ci), dpi=300)
            plt.close()


        #################################################################
        # print table of nt variability for each cell
        #################################################################
        for ci in range(seq_mod_preds.shape[3]):
            seq_mod_preds_cell = seq_mod_preds[si,:,:,ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)

            loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
            gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start+pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, '\t'.join([str(c) for c in cols])

    table_out.close()
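get_real_pred is called throughout these mains but never defined in this section. A plausible reconstruction, assuming seq_mod_preds_cell is the 4 x L matrix in which the row matching the actual base at a position holds the unmutated prediction:

def get_real_pred(seq_mod_preds_cell, seq):
    # At any position whose base is a definite A/C/G/T, the matching row
    # carries the real (unmutated) prediction; use the first such position.
    nts = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    si = 0
    while seq[si] not in nts:  # skip leading Ns
        si += 1
    return seq_mod_preds_cell[nts[seq[si]], si]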
Example #7
def main():
    usage = 'usage: %prog [options] <target_beds_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='db_act_file', help='Existing database activity table.')
    parser.add_option('-b', dest='db_bed', help='Existing database BED file.')
    parser.add_option('-c', dest='chrom_lengths_file', help='Table of chromosome lengths')
    parser.add_option('-m', dest='merge_overlap', default=200, type='int', help='Overlap length (after extension to feature_size) above which to merge features [Default: %default]')
    parser.add_option('-n', dest='no_db_activity', default=False, action='store_true', help='Do not pass along the activities of the database sequences [Default: %default]')
    parser.add_option('-o', dest='out_prefix', default='features', help='Output file prefix [Default: %default]')
    parser.add_option('-s', dest='feature_size', default=600, type='int', help='Extend features to this size [Default: %default]')
    parser.add_option('-y', dest='ignore_y', default=False, action='store_true', help='Ignore Y chromosome features [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a file labeling the targets and providing BED file paths.')
    else:
        target_beds_file = args[0]

    # determine whether we'll add to an existing DB
    db_targets = []
    db_add = False
    if options.db_bed:
        db_add = True
        if not options.no_db_activity:
            if options.db_act_file is None:
                parser.error('Must provide an activity table with -a, or specify -n, to add to an existing database')
            else:
                # read db target names
                db_act_in = open(options.db_act_file)
                db_targets = db_act_in.readline().strip().split('\t')
                db_act_in.close()

    # read in targets and assign them indexes into the db
    target_beds = []
    target_dbi = []
    for line in open(target_beds_file):
        a = line.split()
        if len(a) != 2:
            print a
            print >> sys.stderr, 'Each row of the target BEDs file must contain a label and a BED file path separated by whitespace'
            exit(1)
        target_dbi.append(len(db_targets))
        db_targets.append(a[0])
        target_beds.append(a[1])

    # read in chromosome lengths
    chrom_lengths = {}
    if options.chrom_lengths_file:
        for line in open(options.chrom_lengths_file):
            a = line.split()
            chrom_lengths[a[0]] = int(a[1])
    else:
        print >> sys.stderr, 'Warning: chromosome lengths not provided, so regions near ends may be incorrect.'

    #################################################################
    # print peaks to chromosome-specific files
    #################################################################
    chrom_files = {}
    chrom_outs = {}

    peak_beds = target_beds
    if db_add:
        peak_beds.append(options.db_bed)

    for bi in range(len(peak_beds)):
        if peak_beds[bi][-3:] == '.gz':
            peak_bed_in = gzip.open(peak_beds[bi])
        else:
            peak_bed_in = open(peak_beds[bi])

        for line in peak_bed_in:
            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            chrom = a[0]
            strand = '+'
            if len(a) > 5 and a[5] in '+-':
                strand = a[5]
            chrom_key = (chrom,strand)

            # open chromosome file
            if chrom_key not in chrom_outs:
                chrom_files[chrom_key] = '%s_%s_%s.bed' % (options.out_prefix, chrom, strand)
                chrom_outs[chrom_key] = open(chrom_files[chrom_key], 'w')

            # if it's the db bed
            if db_add and bi == len(peak_beds)-1:
                if options.no_db_activity:
                    # set activity to null
                    a[6] = '.'
                    print >> chrom_outs[chrom_key], '\t'.join(a[:7])
                else:
                    print >> chrom_outs[chrom_key], line,

            # if it's a new bed
            else:
                # specify the target index
                while len(a) < 7:
                    a.append('')
                a[5] = strand
                a[6] = str(target_dbi[bi])
                print >> chrom_outs[chrom_key], '\t'.join(a[:7])

        peak_bed_in.close()

    # close chromosome-specific files
    for chrom_key in chrom_outs:
        chrom_outs[chrom_key].close()

    # ignore Y
    if options.ignore_y:
        for orient in '+-':
            chrom_key = ('chrY',orient)
            if chrom_key in chrom_files:
                os.remove(chrom_files[chrom_key])
                del chrom_files[chrom_key]

    #################################################################
    # sort chromosome-specific files
    #################################################################
    for chrom_key in chrom_files:
        chrom,strand = chrom_key
        chrom_sbed = '%s_%s_%s_sort.bed' % (options.out_prefix,chrom,strand)
        sort_cmd = 'sortBed -i %s > %s' % (chrom_files[chrom_key], chrom_sbed)
        if subprocess.call(sort_cmd, shell=True):
            message('Error running SortBed. Is bedtools installed?', 'error')
        os.remove(chrom_files[chrom_key])
        chrom_files[chrom_key] = chrom_sbed


    #################################################################
    # parse chromosome-specific files
    #################################################################
    final_bed_out = open('%s.bed' % options.out_prefix, 'w')

    for chrom_key in chrom_files:
        chrom, strand = chrom_key

        open_peaks = []
        for line in open(chrom_files[chrom_key]):
            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            # construct Peak
            peak_start = int(a[1])
            peak_end = int(a[2])
            peak_act = activity_set(a[6])
            peak = Peak(peak_start, peak_end, peak_act)
            peak.extend(options.feature_size, chrom_lengths.get(chrom,None))

            if len(open_peaks) == 0:
                # initialize open peak
                open_end = peak.end
                open_peaks = [peak]

            else:
                # operate on existing open peak

                # if beyond existing open peak
                if open_end - options.merge_overlap <= peak.start:
                    # close open peak
                    mpeaks = merge_peaks(open_peaks, options.feature_size, options.merge_overlap, chrom_lengths.get(chrom,None))

                    # print to file
                    for mpeak in mpeaks:
                        print >> final_bed_out, mpeak.bed_str(chrom, strand)

                    # initialize open peak
                    open_end = peak.end
                    open_peaks = [peak]

                else:
                    # extend open peak
                    open_peaks.append(peak)
                    open_end = max(open_end, peak.end)

        if len(open_peaks) > 0:
            # close open peak
            mpeaks = merge_peaks(open_peaks, options.feature_size, options.merge_overlap, chrom_lengths.get(chrom,None))

            # print to file
            for mpeak in mpeaks:
                print >> final_bed_out, mpeak.bed_str(chrom, strand)

    final_bed_out.close()

    # clean
    for chrom_key in chrom_files:
        os.remove(chrom_files[chrom_key])


    #################################################################
    # construct/update activity table
    #################################################################
    final_act_out = open('%s_act.txt' % options.out_prefix, 'w')

    # print header
    cols = [''] + db_targets
    print >> final_act_out, '\t'.join(cols)

    # print sequences
    for line in open('%s.bed' % options.out_prefix):
        a = line.rstrip().split('\t')
        # index peak
        peak_id = '%s:%s-%s(%s)' % (a[0], a[1], a[2], a[5])

        # construct full activity vector
        peak_act = [0]*len(db_targets)
        for ai in a[6].split(','):
            if ai != '.':
                peak_act[int(ai)] = 1

        # print line
        cols = [peak_id] + peak_act
        print >> final_act_out, '\t'.join([str(c) for c in cols])

    final_act_out.close()
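The <target_beds_file> parsed above is a two-column, whitespace-separated table of target label and BED path. A toy illustration with made-up labels and paths:

# Write a minimal targets file in the format this script expects.
with open('targets.txt', 'w') as targets_out:
    targets_out.write('CD4_T\t/data/beds/cd4_t_peaks.bed.gz\n')
    targets_out.write('HepG2\t/data/beds/hepg2_peaks.bed\n')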
Example #8
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file', help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='gain_height', default=False, action='store_true', help='Nucleotide heights determined by the max of loss and gain [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float', help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-n', dest='center_nt', default=200, type='int', help='Center nt to mutate and plot in the heat map [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5'%options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]  # seq_headers is a plain list here
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,seqs_1hot.shape[1]/4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try: # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5'%options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)


        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')


    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_sat_predict.lua -center_nt %d %s %s %s' % (options.center_nt, model_file, model_input_hdf5, options.model_hdf5_file)
        if subprocess.call(torch_cmd, shell=True):
            message('Error running basset_sat_predict.lua', 'error')

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len)/2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start:delta_start+delta_len]

    # decide which cells to plot
    if options.targets == '-1':
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(',')]


    #################################################################
    # plot
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    rdbu = sns.color_palette("RdBu_r", 10)

    nts = 'ACGT'
    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = 'seq%d' % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si,:,:,ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell])

            # prepare figure
            sns.set(style='white', font_scale=0.5)
            sns.axes_style({'axes.linewidth':1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20,3))
            ax_logo = plt.subplot2grid((3,heat_cols), (0,logo_start), colspan=(logo_end-logo_start))
            ax_sad = plt.subplot2grid((3,heat_cols), (1,sad_start), colspan=(sad_end-sad_start))
            ax_heat = plt.subplot2grid((3,heat_cols), (2,0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75/vlim*(abs(minmax_matrix).max(axis=0))
            else:
                seq_heights = 0.25 + 1.75/vlim*(-minmax_matrix[0])
            logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir, header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = '%s.png' % logo_eps[:-4]
            logo_cmd = 'convert -density 300 %s %s' % (logo_eps, logo_png)
            if subprocess.call(logo_cmd, shell=True):
                message('Error running convert', 'error')
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label='loss', linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label='gain', linewidth=1)
            ax_sad.set_xlim(0,minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ['top','bottom','left','right']:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap='RdBu_r', vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels('TGCA', rotation='horizontal') # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig('%s/%s_c%d_heat.pdf' % (options.out_dir,header.replace(':','_'), ci), dpi=300)
            plt.close()


        #################################################################
        # print table of nt variability for each cell
        #################################################################
        for ci in range(seq_mod_preds.shape[3]):
            seq_mod_preds_cell = seq_mod_preds[si,:,:,ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)

            loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
            gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start+pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, '\t'.join([str(c) for c in cols])

    table_out.close()
Example #9
def main():
    usage = "usage: %prog [options] <model_file> <input_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-a", dest="input_activity_file", help="Optional activity table corresponding to an input FASTA file"
    )
    parser.add_option(
        "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]"
    )
    parser.add_option(
        "-g",
        dest="gain_height",
        default=False,
        action="store_true",
        help="Nucleotide heights determined by the max of loss and gain [Default: %default]",
    )
    parser.add_option(
        "-m", dest="min_limit", default=0.1, type="float", help="Minimum heatmap limit [Default: %default]"
    )
    parser.add_option(
        "-n",
        dest="center_nt",
        default=200,
        type="int",
        help="Center nt to mutate and plot in the heat map [Default: %default]",
    )
    parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]")
    parser.add_option(
        "-s", dest="sample", default=None, type="int", help="Sample sequences from the test set [Default:%default]"
    )
    parser.add_option(
        "-t",
        dest="targets",
        default="0",
        help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file")
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == ">":
                seq_headers.append(line[1:].rstrip())
                seqs.append("")
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = "%s/model_in.h5" % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False
            )

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split("\t")

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]  # seq_headers is a plain list here
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, "w")
        h5f.create_dataset("test_in", data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, "r")
            seqs_1hot = np.array(hdf5_in["test_in"])
            targets = np.array(hdf5_in["test_out"])
            try:  # TEMP
                seq_headers = np.array(hdf5_in["test_headers"])
                target_labels = np.array(hdf5_in["target_labels"])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = "%s/model_in.h5" % options.out_dir
                h5f = h5py.File(model_input_hdf5, "w")
                h5f.create_dataset("test_in", data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error("Could not parse input file as FASTA or HDF5.")

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = "%s/model_out.h5" % options.out_dir
        torch_cmd = "basset_sat_predict.lua -center_nt %d %s %s %s" % (
            options.center_nt,
            model_file,
            model_input_hdf5,
            options.model_hdf5_file,
        )
        if subprocess.call(torch_cmd, shell=True):
            message("Error running basset_sat_predict.lua", "error")

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, "r")
    seq_mod_preds = np.array(hdf5_in["seq_mod_preds"])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) / 2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start : delta_start + delta_len]

    # decide which cells to plot
    if options.targets == "-1":
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(",")]

    #################################################################
    # plot
    #################################################################
    table_out = open("%s/table.txt" % options.out_dir, "w")

    rdbu = sns.color_palette("RdBu_r", 10)

    nts = "ACGT"
    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = "seq%d" % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell])

            # prepare figure
            sns.set(style="white", font_scale=0.5)
            sns.axes_style({"axes.linewidth": 1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20, 3))
            ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start), colspan=(logo_end - logo_start))
            ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start), colspan=(sad_end - sad_start))
            ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75 / vlim * (abs(minmax_matrix).max(axis=0))
            else:
                seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0])
            logo_eps = "%s/%s_c%d_seq.eps" % (options.out_dir, header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = "%s.png" % logo_eps[:-4]
            logo_cmd = "convert -density 300 %s %s" % (logo_eps, logo_png)
            if subprocess.call(logo_cmd, shell=True):
                message("Error running convert", "error")
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label="loss", linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label="gain", linewidth=1)
            ax_sad.set_xlim(0, minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ["top", "bottom", "left", "right"]:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap="RdBu_r", vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels("TGCA", rotation="horizontal")  # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig("%s/%s_c%d_heat.pdf" % (options.out_dir, header.replace(":", "_"), ci), dpi=300)
            plt.close()

        #################################################################
        # print table of nt variability for each cell
        #################################################################
        for ci in range(seq_mod_preds.shape[3]):
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)

            loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
            gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start + pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, "\t".join([str(c) for c in cols])

    table_out.close()
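Example #10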
def main():
    usage = "usage: %prog [options] <model_th> <vcf_file>"
    parser = OptionParser(usage)
    parser.add_option('-f', dest='genome_fasta', default='%s/data/genomes/hg19.fa'%os.environ['BASSETDIR'], help='Genome FASTA from which sequences will be drawn [Default: %default]')
    parser.add_option('-b', dest='batchsize', default=128, help='Batch size for prediction. [Default: %default]')
    parser.add_option('-i', dest='index_snp', default=False, action='store_true', help='SNPs are labeled with their index SNP as column 6 [Default: %default]')
    parser.add_option('-s', dest='score', default=False, action='store_true', help='SNPs are labeled with scores as column 7 [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='sad', help='Output directory for tables and plots [Default: %default]')
    parser.add_option('-l', dest='seq_len', type='int', default=600, help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-t', dest='targets_file', default=None, help='File specifying target indexes and labels in table format')
    parser.add_option('--from', dest='from_coord', default=None, type='int', help='Process SNPs starting from this coord. Assume VCF sorted.')
    parser.add_option('--to', dest='to_coord', default=None, type='int', help='Process SNPs ending at this coord. Assume VCF sorted.')
    parser.add_option('--chrom', dest='chrom', default=None, type='str', help='Which chromosome is being processed.')
    parser.add_option('--only-generate-inputh5', dest='only_gen_inputh5', default=False, action='store_true', help='Do not run prediction step [Default: %default]')
    parser.add_option('--only-run-pred', dest='only_run_pred', default=False, action='store_true', help='Input h5 file already generated. Only run prediction [Default: %default]')
    parser.add_option('--only-make-sad', dest='only_make_sad', default=False, action='store_true', help='Input h5 file and model already generated. Only generate output [Default: %default]')

    (options,args) = parser.parse_args()
    
    if len(args) != 2:
        parser.error('Must provide Torch model and VCF file')
    else:
        model_th = args[0]
        vcf_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.from_coord is not None and options.to_coord is not None:
        if options.to_coord <= options.from_coord:
            parser.error('to_coord must be greater than from_coord')

    #################################################################
    # prep SNP sequences
    #################################################################
    if not options.only_run_pred and not options.only_make_sad:
        prep_snp_seqs(vcf_file, options.out_dir, options.seq_len, options.genome_fasta, from_=options.from_coord, to_=options.to_coord)

    if options.only_gen_inputh5:
        sys.exit(0)

    #################################################################
    # predict in Torch
    #################################################################
    model_hdf5_file = '%s/model_out.txt' % options.out_dir  # plain-text predictions, despite the variable name
    if not options.only_make_sad:
        torch_predict(options.out_dir, options.batchsize, model_th, model_hdf5_file)

    #################################################################
    # collect and print SADs
    #################################################################
    message('collect and print SADs')

    if options.targets_file is not None:
        target_labels = [line.split()[0] for line in open(options.targets_file)]
    else:
        target_labels = []

    sad_out = open('%s/sad_scores_table.txt' % options.out_dir, 'w')
    header_cols = ['rsid', 'index', 'score', 'ref', 'alt'] + target_labels
    sad_out.write('\t'.join(header_cols)+'\n')

    # Read simultaneously from SNP and predictions file
    if vcf_file.endswith(".gz"):
        snp_reader = gzip.open(vcf_file, "r")
    else: snp_reader = open(vcf_file, "r")
    snpline = snp_reader.readline().strip()
    while snpline.startswith("#"): snpline = snp_reader.readline().strip()
    pred_reader = open(model_hdf5_file, "r")
    predline = pred_reader.readline().strip()

    # Iterate through SNPs
    while snpline != "" and predline != "":
        snp = vcf.SNP(snpline, index_snp=options.index_snp, score=options.score)
        if options.chrom is not None and snp.chrom != options.chrom:
            # advance to the next SNP line; a bare continue here would loop forever
            snpline = snp_reader.readline().strip()
            continue
        if (options.from_coord is not None and snp.pos < options.from_coord):
            snpline = snp_reader.readline().strip()
            continue
        if (options.to_coord is not None and snp.pos > options.to_coord):
            break
        ref_pred = np.array([float(p) for p in predline.split()])
        predline = pred_reader.readline().strip()
        for alt_al in snp.alt_alleles:
            alt_pred = np.array([float(p) for p in predline.split()])
            predline = pred_reader.readline().strip()
        alt_sad = alt_pred - ref_pred # TODO assuming biallelic
        sad_out.write('\t'.join(map(str, [snp.rsid, snp.index_snp, snp.score, snp.ref_allele, snp.alt_alleles[0]] + \
                                   map(lambda x: '%7.4f'%x, list(alt_sad)))) + '\n')
        snpline = snp_reader.readline().strip()
    snp_reader.close()
    pred_reader.close()
    sad_out.close()
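Example #11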
def torch_predict(out_dir, batchsize, model_th, model_hdf5_file):
    message('predict in torch')
    cuda_str = ""
    cmd = 'basset_predict_local.lua -batchsize %s -norm %s %s %s/model_in.h5 %s' % (batchsize, cuda_str, model_th, out_dir, model_hdf5_file)
    if subprocess.call(cmd, shell=True):
        message('Error running basset_predict.lua', 'error')
Example #12
def main():
    usage = 'usage: %prog [options] <model_file> <test_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-b', dest='batch_size', default=1000, type='int', help='Batch size (affects memory usage) [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5.')
    parser.add_option('-i', dest='informative_only', default=False, action='store_true', help='Plot informative filters only [Default: %default]')
    parser.add_option('-m', dest='motifs_file')
    parser.add_option('-n', dest='norm_targets', default=False, action='store_true', help='Use the norm of the target influences as the primary influence measure [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('--subset', dest='subset_file', default=None, help='Subset targets to the ones in this file')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default: %default]')
    parser.add_option('--seqs', dest='seqs', default=False, action='store_true', help='Output sequence-specific influence [Default: %default]')
    parser.add_option('-t', dest='targets_file', default=None, help='File specifying target indexes and labels in table format')
    parser.add_option('--width', dest='heat_width', default=10, type='float')
    parser.add_option('--height', dest='heat_height', default=20, type='float')
    parser.add_option('--font', dest='heat_font', default=0.4, type='float')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and test data in HDF5 format.')
    else:
        model_file = args[0]
        test_hdf5_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, 'r')
    seq_vecs = np.array(test_hdf5_in['test_in'])
    seq_targets = np.array(test_hdf5_in['test_out'])
    seq_headers = np.array(test_hdf5_in['test_headers'])
    test_hdf5_in.close()

    # name the targets
    target_names = name_targets(seq_targets.shape[1], options.targets_file)

    if options.subset_file:
        target_subset = set([line.rstrip() for line in open(options.subset_file)])

    # get additional motif information
    df_motifs = None
    if options.motifs_file:
        df_motifs = pd.read_table(options.motifs_file, delim_whitespace=True)


    #################################################################
    # sample
    #################################################################
    if options.sample is not None:
        # choose sampled indexes
        sample_i = np.array(random.sample(xrange(seq_vecs.shape[0]), options.sample))

        # filter
        seq_vecs = seq_vecs[sample_i]
        seq_targets = seq_targets[sample_i]
        seq_headers = seq_headers[sample_i]

        # create a new HDF5 file
        sample_hdf5_file = '%s/sample.h5' % options.out_dir
        sample_hdf5_out = h5py.File(sample_hdf5_file, 'w')
        sample_hdf5_out.create_dataset('test_in', data=seq_vecs)
        sample_hdf5_out.create_dataset('test_out', data=seq_targets)
        sample_hdf5_out.close()

        # update test HDF5
        test_hdf5_file = sample_hdf5_file


    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        torch_opts = ''
        if options.seqs:
            torch_opts += '-seqs'

        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_motifs_infl.lua -batch_size %d %s %s %s %s' % (options.batch_size, torch_opts, model_file, test_hdf5_file, options.model_hdf5_file)
        if subprocess.call(torch_cmd, shell=True):
            message('Error running basset_motifs_infl.lua', 'error')

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    filter_means = np.array(model_hdf5_in['filter_means'])
    filter_stds = np.array(model_hdf5_in['filter_stds'])
    filter_infl = np.array(model_hdf5_in['filter_infl'])
    filter_infl_targets = np.array(model_hdf5_in['filter_infl_targets'])
    if options.seqs:
        seq_filter_targets = np.array(model_hdf5_in['seq_filter_targets'])
    model_hdf5_in.close()


    #############################################################
    # use target-based influence
    #############################################################
    if options.norm_targets:
        # save the loss-based influence
        filter_infl_loss = np.array(filter_infl, copy=True)

        # set to the target-based influence
        for fi in range(filter_infl_targets.shape[0]):
            filter_infl[fi] = np.mean(filter_infl_targets[fi]**2)

        # print to a table
        tnorm_out = open('%s/loss_target.txt' % options.out_dir, 'w')
        for fi in range(len(filter_infl)):
            cols = (fi, filter_infl_loss[fi], filter_infl[fi])
            print >> tnorm_out, '%3d  %7.4f  %7.4f' % cols
        tnorm_out.close()

        # compare the two
        xmin, xmax = coord_range(filter_infl_loss, buf_pct=0.1)
        ymin, ymax = coord_range(filter_infl, buf_pct=0.1)

        sns.set(style='ticks', font_scale=1)
        plt.figure()
        g = sns.jointplot(x=filter_infl_loss, y=filter_infl, color='black', joint_kws={'alpha':0.7})
        ax = g.ax_joint
        ax.set_xlim(xmin, xmax)
        ax.set_xlabel('loss-based influence')
        ax.xaxis.label.set_fontsize(18)
        map(lambda xl: xl.set_fontsize(15), ax.get_xticklabels())
        ax.set_ylim(ymin, ymax)
        ax.set_ylabel('target-based influence')
        ax.yaxis.label.set_fontsize(18)
        map(lambda yl: yl.set_fontsize(15), ax.get_yticklabels())
        ax.grid(True, linestyle=':')
        plt.tight_layout(w_pad=0, h_pad=0)
        plt.savefig('%s/loss_target.pdf' % options.out_dir)
        plt.close()


    #############################################################
    # print filter influence table
    #############################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')
    for i in range(len(filter_infl)):
        if df_motifs is not None:
            cols = (i, filter_infl[i], filter_means[i], filter_stds[i], df_motifs.ic.iloc[i], df_motifs.annotation.iloc[i])
            print >> table_out, '%3d  %7.4f  %6.4f  %6.3f  %4.1f  %8s' % cols
        else:
            cols = (i, filter_infl[i], filter_means[i], filter_stds[i])
            print >> table_out, '%3d  %7.4f  %6.4f  %6.3f' % cols
    table_out.close()


    #################################################################
    # plot filter influence
    #################################################################
    sb_blue = sns.color_palette('deep')[0]
    sns.set(style='ticks', font_scale=1)
    ymin, ymax = coord_range(filter_infl, buf_pct=0.1)

    if options.motifs_file:
        nonzero = np.array(df_motifs.ic > 0)
        xmin, xmax = coord_range(df_motifs.ic.loc[nonzero])
        plt.figure()
        g = sns.jointplot(x=np.array(df_motifs.ic.loc[nonzero]), y=filter_infl[nonzero], color='black', stat_func=None, joint_kws={'alpha':0.8})
        ax = g.ax_joint
        ax.set_xlim(xmin, xmax)
        ax.set_xlabel('Information content')
        ax.xaxis.label.set_fontsize(18)
        map(lambda xl: xl.set_fontsize(15), ax.get_xticklabels())
        ax.set_ylim(ymin, ymax)
        ax.set_ylabel('Influence')
        ax.yaxis.label.set_fontsize(18)
        map(lambda yl: yl.set_fontsize(15), ax.get_yticklabels())
        # ax.grid(True, linestyle=':')
        plt.tight_layout(w_pad=0, h_pad=0)
        plt.savefig('%s/ic_infl.pdf' % options.out_dir)
        plt.close()


    #############################################################
    # prep for cell-specific analyses
    #############################################################
    filter_names = name_filters(len(filter_infl), df_motifs)

    # construct a pandas data frame of the target influences
    df_ft = pd.DataFrame(filter_infl_targets, index=filter_names, columns=target_names)

    # print filter influence per target table
    table_out = open('%s/table_target.txt' % options.out_dir, 'w')
    for i in range(df_ft.shape[0]):
        for ti in range(len(target_names)):
            cols = (i, ti, target_names[ti], df_ft.iloc[i,ti])
            print >> table_out, '%-3d  %3d  %20s  %7.4f' % cols
    table_out.close()
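    # the table above is written without a header; downstream it could be
    # reloaded with pandas, for example (column names are illustrative):
    #   df_tt = pd.read_table('%s/table_target.txt' % options.out_dir,
    #                         delim_whitespace=True, header=None,
    #                         names=['filter', 'target_i', 'target', 'infl'])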

    # print sequence-specific filter influence per target table
    if options.seqs:
        table_out = open('%s/table_seqs.txt' % options.out_dir, 'w')
        for si in range(seq_filter_targets.shape[0]):
            for fi in range(seq_filter_targets.shape[1]):
                for ti in range(seq_filter_targets.shape[2]):
                    cols = (seq_headers[si], fi, ti, seq_filter_targets[si][fi][ti])
                    print >> table_out, '%-25s  %3d  %3d  %7.4f' % cols
        table_out.close()

    # use only high information filters
    if options.informative_only and df_motifs is not None:
        # boolean mask as an array, since df_ft is indexed by filter names
        df_ft = df_ft[np.array(df_motifs.ic > 6)]
    elif df_ft.shape[1] >= 10:
        df_ft_stds = df_ft.std(axis=1)
        df_ft = df_ft[df_ft_stds > 0]

    #############################################################
    # plot filter influence per cell heatmaps
    #############################################################
    # subset targets before plotting
    if options.subset_file:
        subset_mask = df_ft.columns.isin(target_subset)
        df_ft_sub = df_ft.loc[:,subset_mask]

        plot_infl_heatmaps(df_ft_sub, options.out_dir, options.heat_width, options.heat_height, options.heat_font)

    # plot all cells
    else:
        plot_infl_heatmaps(df_ft, options.out_dir, options.heat_width, options.heat_height, options.heat_font)
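
# Hypothetical command line for this script (file names are illustrative, not
# from the source):
#   python basset_motifs_infl.py -n -m motifs_table.txt -t targets.txt \
#       -o infl_out model_best.th test_data.h5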
Example #13
def main():
    usage = 'usage: %prog [options] <model_file> <test_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-b',
        dest='batch_size',
        default=1000,
        type='int',
        help='Batch size (affects memory usage) [Default: %default]')
    parser.add_option('-d',
                      dest='model_hdf5_file',
                      default=None,
                      help='Pre-computed model output as HDF5.')
    parser.add_option('-i',
                      dest='informative_only',
                      default=False,
                      action='store_true',
                      help='Plot informative filters only [Default: %default]')
    parser.add_option('-m', dest='motifs_file')
    parser.add_option(
        '-n',
        dest='norm_targets',
        default=False,
        action='store_true',
        help=
        'Use the norm of the target influences as the primary influence measure [Default: %default]'
    )
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('--subset',
                      dest='subset_file',
                      default=None,
                      help='Subset targets to the ones in this file')
    parser.add_option(
        '-s',
        dest='sample',
        default=None,
        type='int',
        help='Sample sequences from the test set [Default: %default]')
    parser.add_option(
        '--seqs',
        dest='seqs',
        default=False,
        action='store_true',
        help='Output sequence-specific influence [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        help='File specifying target indexes and labels in table format')
    parser.add_option('--width', dest='heat_width', default=10, type='float')
    parser.add_option('--height', dest='heat_height', default=20, type='float')
    parser.add_option('--font', dest='heat_font', default=0.4, type='float')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide Basset model file and test data in HDF5 format.')
    else:
        model_file = args[0]
        test_hdf5_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, 'r')
    seq_vecs = np.array(test_hdf5_in['test_in'])
    seq_targets = np.array(test_hdf5_in['test_out'])
    seq_headers = np.array(test_hdf5_in['test_headers'])
    test_hdf5_in.close()

    # name the targets
    target_names = name_targets(seq_targets.shape[1], options.targets_file)

    if options.subset_file:
        target_subset = set(
            [line.rstrip() for line in open(options.subset_file)])

    # get additional motif information
    df_motifs = None
    if options.motifs_file:
        df_motifs = pd.read_table(options.motifs_file, delim_whitespace=True)

    #################################################################
    # sample
    #################################################################
    if options.sample is not None:
        # choose sampled indexes
        sample_i = np.array(
            random.sample(xrange(seq_vecs.shape[0]), options.sample))
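        # note: the sample is not seeded, so repeated runs draw different
        # subsets; for reproducibility one could call, e.g., random.seed(1)
        # before sampling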

        # filter
        seq_vecs = seq_vecs[sample_i]
        seq_targets = seq_targets[sample_i]
        seq_headers = seq_headers[sample_i]

        # create a new HDF5 file
        sample_hdf5_file = '%s/sample.h5' % options.out_dir
        sample_hdf5_out = h5py.File(sample_hdf5_file, 'w')
        sample_hdf5_out.create_dataset('test_in', data=seq_vecs)
        sample_hdf5_out.create_dataset('test_out', data=seq_targets)
        sample_hdf5_out.close()

        # update test HDF5
        test_hdf5_file = sample_hdf5_file

    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        torch_opts = ''
        if options.seqs:
            torch_opts += '-seqs'

        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_motifs_infl.lua -batch_size %d %s %s %s %s' % (
            options.batch_size, torch_opts, model_file, test_hdf5_file,
            options.model_hdf5_file)
        if subprocess.call(torch_cmd, shell=True):
            message('Error running basset_motifs_infl.lua', 'error')

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    filter_means = np.array(model_hdf5_in['filter_means'])
    filter_stds = np.array(model_hdf5_in['filter_stds'])
    filter_infl = np.array(model_hdf5_in['filter_infl'])
    filter_infl_targets = np.array(model_hdf5_in['filter_infl_targets'])
    if options.seqs:
        seq_filter_targets = np.array(model_hdf5_in['seq_filter_targets'])
    model_hdf5_in.close()

    #############################################################
    # use target-based influence
    #############################################################
    if options.norm_targets:
        # save the loss-based influence
        filter_infl_loss = np.array(filter_infl, copy=True)

        # set to the target-based influence
        for fi in range(filter_infl_targets.shape[0]):
            filter_infl[fi] = np.mean(filter_infl_targets[fi]**2)

        # print to a table
        tnorm_out = open('%s/loss_target.txt' % options.out_dir, 'w')
        for fi in range(len(filter_infl)):
            cols = (fi, filter_infl_loss[fi], filter_infl[fi])
            print >> tnorm_out, '%3d  %7.4f  %7.4f' % cols
        tnorm_out.close()

        # compare the two
        xmin, xmax = coord_range(filter_infl_loss, buf_pct=0.1)
        ymin, ymax = coord_range(filter_infl, buf_pct=0.1)

        sns.set(style='ticks', font_scale=1)
        plt.figure()
        g = sns.jointplot(x=filter_infl_loss,
                          y=filter_infl,
                          color='black',
                          joint_kws={'alpha': 0.7})
        ax = g.ax_joint
        ax.set_xlim(xmin, xmax)
        ax.set_xlabel('loss-based influence')
        ax.xaxis.label.set_fontsize(18)
        for xl in ax.get_xticklabels():
            xl.set_fontsize(15)
        ax.set_ylim(ymin, ymax)
        ax.set_ylabel('target-based influence')
        ax.yaxis.label.set_fontsize(18)
        for yl in ax.get_yticklabels():
            yl.set_fontsize(15)
        ax.grid(True, linestyle=':')
        plt.tight_layout(w_pad=0, h_pad=0)
        plt.savefig('%s/loss_target.pdf' % options.out_dir)
        plt.close()

    #############################################################
    # print filter influence table
    #############################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')
    for i in range(len(filter_infl)):
        if df_motifs is not None:
            cols = (i, filter_infl[i], filter_means[i], filter_stds[i],
                    df_motifs.ic.iloc[i], df_motifs.annotation.iloc[i])
            print >> table_out, '%3d  %7.4f  %6.4f  %6.3f  %4.1f  %8s' % cols
        else:
            cols = (i, filter_infl[i], filter_means[i], filter_stds[i])
            print >> table_out, '%3d  %7.4f  %6.4f  %6.3f' % cols
    table_out.close()

    #################################################################
    # plot filter influence
    #################################################################
    sb_blue = sns.color_palette('deep')[0]
    sns.set(style='ticks', font_scale=1)
    ymin, ymax = coord_range(filter_infl, buf_pct=0.1)

    if options.motifs_file:
        nonzero = np.array(df_motifs.ic > 0)
        xmin, xmax = coord_range(df_motifs.ic.loc[nonzero])
        plt.figure()
        g = sns.jointplot(x=np.array(df_motifs.ic.loc[nonzero]),
                          y=filter_infl[nonzero],
                          color='black',
                          stat_func=None,
                          joint_kws={'alpha': 0.8})
        ax = g.ax_joint
        ax.set_xlim(xmin, xmax)
        ax.set_xlabel('Information content')
        ax.xaxis.label.set_fontsize(18)
        for xl in ax.get_xticklabels():
            xl.set_fontsize(15)
        ax.set_ylim(ymin, ymax)
        ax.set_ylabel('Influence')
        ax.yaxis.label.set_fontsize(18)
        for yl in ax.get_yticklabels():
            yl.set_fontsize(15)
        # ax.grid(True, linestyle=':')
        plt.tight_layout(w_pad=0, h_pad=0)
        plt.savefig('%s/ic_infl.pdf' % options.out_dir)
        plt.close()

    #############################################################
    # prep for cell-specific analyses
    #############################################################
    filter_names = name_filters(len(filter_infl), df_motifs)

    # construct a pandas data frame of the target influences
    df_ft = pd.DataFrame(filter_infl_targets,
                         index=filter_names,
                         columns=target_names)

    # print filter influence per target table
    table_out = open('%s/table_target.txt' % options.out_dir, 'w')
    for i in range(df_ft.shape[0]):
        for ti in range(len(target_names)):
            cols = (i, ti, target_names[ti], df_ft.iloc[i, ti])
            print >> table_out, '%-3d  %3d  %20s  %7.4f' % cols
    table_out.close()

    # print sequence-specific filter influence per target table
    if options.seqs:
        table_out = open('%s/table_seqs.txt' % options.out_dir, 'w')
        for si in range(seq_filter_targets.shape[0]):
            for fi in range(seq_filter_targets.shape[1]):
                for ti in range(seq_filter_targets.shape[2]):
                    cols = (seq_headers[si], fi, ti,
                            seq_filter_targets[si][fi][ti])
                    print >> table_out, '%-25s  %3d  %3d  %7.4f' % cols
        table_out.close()

    # use only high information filters
    if options.informative_only and df_motifs is not None:
        # boolean mask as an array, since df_ft is indexed by filter names
        df_ft = df_ft[np.array(df_motifs.ic > 6)]
    elif df_ft.shape[1] >= 10:
        df_ft_stds = df_ft.std(axis=1)
        df_ft = df_ft[df_ft_stds > 0]

    #############################################################
    # plot filter influence per cell heatmaps
    #############################################################
    # subset targets before plotting
    if options.subset_file:
        subset_mask = df_ft.columns.isin(target_subset)
        df_ft_sub = df_ft.loc[:, subset_mask]

        plot_infl_heatmaps(df_ft_sub, options.out_dir, options.heat_width,
                           options.heat_height, options.heat_font)

    # plot all cells
    else:
        plot_infl_heatmaps(df_ft, options.out_dir, options.heat_width,
                           options.heat_height, options.heat_font)
Example #14
def main():
    usage = 'usage: %prog [options] <model_file> <test_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5.')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-m', dest='meme_db', default='%s/data/motifs/Homo_sapiens.meme' % os.environ['BASSETDIR'], help='MEME database used to annotate motifs')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='trim_filters', default=False, action='store_true', help='Trim uninformative positions off the filter ends [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and test data in HDF5 format.')
    else:
        model_file = args[0]
        test_hdf5_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, 'r')
    seq_vecs = np.array(test_hdf5_in['test_in'])
    seq_targets = np.array(test_hdf5_in['test_out'])
    try:
        target_names = list(test_hdf5_in['target_labels'])
    except KeyError:
        target_names = ['t%d'%ti for ti in range(seq_targets.shape[1])]
    test_hdf5_in.close()


    #################################################################
    # sample
    #################################################################
    if options.sample is not None:
        # choose sampled indexes
        sample_i = np.array(random.sample(xrange(seq_vecs.shape[0]), options.sample))

        # filter
        seq_vecs = seq_vecs[sample_i]
        seq_targets = seq_targets[sample_i]

        # create a new HDF5 file
        sample_hdf5_file = '%s/sample.h5' % options.out_dir
        sample_hdf5_out = h5py.File(sample_hdf5_file, 'w')
        sample_hdf5_out.create_dataset('test_in', data=seq_vecs)
        sample_hdf5_out.close()

        # update test HDF5
        test_hdf5_file = sample_hdf5_file

    # convert to letters
    seqs = dna_io.vecs2dna(seq_vecs)
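    # dna_io.vecs2dna is assumed to invert the one-hot encoding, mapping each
    # length-4 column back to a base; a minimal sketch of that idea:
    #   def onehot2dna(v):  # v: (4, seq_len) one-hot array
    #       return ''.join('ACGT'[i] for i in v.argmax(axis=0))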


    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_motifs_predict.lua %s %s %s' % (model_file, test_hdf5_file, options.model_hdf5_file)
        if subprocess.call(torch_cmd, shell=True):
            message('Error running basset_motifs_predict.lua', 'error')

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    filter_weights = np.array(model_hdf5_in['weights'])
    filter_outs = np.array(model_hdf5_in['outs'])
    model_hdf5_in.close()

    # store useful variables
    num_filters = filter_weights.shape[0]
    filter_size = filter_weights.shape[2]


    #################################################################
    # individual filter plots
    #################################################################
    # also save information contents
    filters_ic = []
    meme_out = meme_intro('%s/filters_meme.txt'%options.out_dir, seqs)

    for f in range(num_filters):
        print 'Filter %d' % f

        # plot filter parameters as a heatmap
        plot_filter_heat(filter_weights[f,:,:], '%s/filter%d_heat.pdf' % (options.out_dir,f))

        # plot weblogo of high scoring outputs
        plot_filter_logo(filter_outs[:,f,:], filter_size, seqs, '%s/filter%d_logo'%(options.out_dir,f), maxpct_t=0.5)

        # make a PWM for the filter
        filter_pwm, nsites = make_filter_pwm('%s/filter%d_logo.fa'%(options.out_dir,f))

        if nsites < 10:
            # no information
            filters_ic.append(0)
        else:
            # compute and save information content
            filters_ic.append(info_content(filter_pwm))

            # add to the meme motif file
            meme_add(meme_out, f, filter_pwm, nsites, options.trim_filters)

    meme_out.close()
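    # info_content is assumed to compute the standard per-column information
    # content of the PWM; a minimal sketch under that assumption:
    #   def pwm_ic(pwm, pseudo=1e-9):  # pwm: (filter_size, 4), rows sum to 1
    #       return sum(2 + (p * np.log2(p + pseudo)).sum() for p in pwm)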


    #################################################################
    # annotate filters
    #################################################################
    # run tomtom
    tomtom_cmd = 'tomtom -dist pearson -thresh 0.1 -oc %s/tomtom %s/filters_meme.txt %s' % (options.out_dir, options.out_dir, options.meme_db)
    if subprocess.call(tomtom_cmd, shell=True):
        message('Error running tomtom', 'error')

    # read in annotations
    filter_names = name_filters(num_filters, '%s/tomtom/tomtom.txt'%options.out_dir, options.meme_db)


    #################################################################
    # print a table of information
    #################################################################
    table_out = open('%s/table.txt'%options.out_dir, 'w')

    # print header for later pandas reading
    header_cols = ('', 'consensus', 'annotation', 'ic', 'mean', 'std')
    print >> table_out, '%3s  %19s  %10s  %5s  %6s  %6s' % header_cols

    for f in range(num_filters):
        # collapse to a consensus motif
        consensus = filter_motif(filter_weights[f,:,:])

        # grab annotation
        annotation = '.'
        name_pieces = filter_names[f].split('_')
        if len(name_pieces) > 1:
            annotation = name_pieces[1]

        # plot density of filter output scores
        fmean, fstd = plot_score_density(np.ravel(filter_outs[:,f,:]), '%s/filter%d_dens.pdf' % (options.out_dir,f))

        row_cols = (f, consensus, annotation, filters_ic[f], fmean, fstd)
        print >> table_out, '%-3d  %19s  %10s  %5.2f  %6.4f  %6.4f' % row_cols

    table_out.close()


    #################################################################
    # global filter plots
    #################################################################
    # plot filter-sequence heatmap
    plot_filter_seq_heat(filter_outs, '%s/filter_seqs.pdf'%options.out_dir)

    # plot filter-segment heatmap
    plot_filter_seg_heat(filter_outs, '%s/filter_segs.pdf'%options.out_dir)
    plot_filter_seg_heat(filter_outs, '%s/filter_segs_raw.pdf'%options.out_dir, whiten=False)

    # plot filter-target correlation heatmap
    plot_target_corr(filter_outs, seq_targets, filter_names, target_names, '%s/filter_target_cors_mean.pdf'%options.out_dir, 'mean')
    plot_target_corr(filter_outs, seq_targets, filter_names, target_names, '%s/filter_target_cors_max.pdf'%options.out_dir, 'max')
Example #15
def main():
    usage = 'usage: %prog [options] <model_file> <test_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-d',
                      dest='model_hdf5_file',
                      default=None,
                      help='Pre-computed model output as HDF5.')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-m',
                      dest='meme_db',
                      default='%s/data/motifs/Homo_sapiens.meme' %
                      os.environ['BASSETDIR'],
                      help='MEME database used to annotate motifs')
    parser.add_option(
        '-s',
        dest='sample',
        default=None,
        type='int',
        help='Sample sequences from the test set [Default: %default]')
    parser.add_option(
        '-t',
        dest='trim_filters',
        default=False,
        action='store_true',
        help=
        'Trim uninformative positions off the filter ends [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide Basset model file and test data in HDF5 format.')
    else:
        model_file = args[0]
        test_hdf5_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, 'r')
    seq_vecs = np.array(test_hdf5_in['test_in'])
    seq_targets = np.array(test_hdf5_in['test_out'])
    try:
        target_names = list(test_hdf5_in['target_labels'])
    except KeyError:
        target_names = ['t%d' % ti for ti in range(seq_targets.shape[1])]
    test_hdf5_in.close()

    #################################################################
    # sample
    #################################################################
    if options.sample is not None:
        # choose sampled indexes
        sample_i = np.array(
            random.sample(xrange(seq_vecs.shape[0]), options.sample))

        # filter
        seq_vecs = seq_vecs[sample_i]
        seq_targets = seq_targets[sample_i]

        # create a new HDF5 file
        sample_hdf5_file = '%s/sample.h5' % options.out_dir
        sample_hdf5_out = h5py.File(sample_hdf5_file, 'w')
        sample_hdf5_out.create_dataset('test_in', data=seq_vecs)
        sample_hdf5_out.close()

        # update test HDF5
        test_hdf5_file = sample_hdf5_file

    # convert to letters
    seqs = dna_io.vecs2dna(seq_vecs)

    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_motifs_predict.lua %s %s %s' % (
            model_file, test_hdf5_file, options.model_hdf5_file)
        if subprocess.call(torch_cmd, shell=True):
            message('Error running basset_motifs_predict.lua', 'error')

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    filter_weights = np.array(model_hdf5_in['weights'])
    filter_outs = np.array(model_hdf5_in['outs'])
    model_hdf5_in.close()

    # store useful variables
    num_filters = filter_weights.shape[0]
    filter_size = filter_weights.shape[2]
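    # the weight tensor is assumed to be laid out as
    # (num_filters, 4, filter_size): one row per base A/C/G/T and one column
    # per position within the convolutional filter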

    #################################################################
    # individual filter plots
    #################################################################
    # also save information contents
    filters_ic = []
    meme_out = meme_intro('%s/filters_meme.txt' % options.out_dir, seqs)

    for f in range(num_filters):
        print 'Filter %d' % f

        # plot filter parameters as a heatmap
        plot_filter_heat(filter_weights[f, :, :],
                         '%s/filter%d_heat.pdf' % (options.out_dir, f))

        # plot weblogo of high scoring outputs
        plot_filter_logo(filter_outs[:, f, :],
                         filter_size,
                         seqs,
                         '%s/filter%d_logo' % (options.out_dir, f),
                         maxpct_t=0.5)

        # make a PWM for the filter
        filter_pwm, nsites = make_filter_pwm('%s/filter%d_logo.fa' %
                                             (options.out_dir, f))

        if nsites < 10:
            # no information
            filters_ic.append(0)
        else:
            # compute and save information content
            filters_ic.append(info_content(filter_pwm))

            # add to the meme motif file
            meme_add(meme_out, f, filter_pwm, nsites, options.trim_filters)

    meme_out.close()

    #################################################################
    # annotate filters
    #################################################################
    # run tomtom
    tomtom_cmd = 'tomtom -dist pearson -thresh 0.1 -oc %s/tomtom %s/filters_meme.txt %s' % (
        options.out_dir, options.out_dir, options.meme_db)
    if subprocess.call(tomtom_cmd, shell=True):
        message('Error running tomtom', 'error')

    # read in annotations
    filter_names = name_filters(num_filters,
                                '%s/tomtom/tomtom.txt' % options.out_dir,
                                options.meme_db)

    #################################################################
    # print a table of information
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    # print header for later pandas reading
    header_cols = ('', 'consensus', 'annotation', 'ic', 'mean', 'std')
    print >> table_out, '%3s  %19s  %10s  %5s  %6s  %6s' % header_cols

    for f in range(num_filters):
        # collapse to a consensus motif
        consensus = filter_motif(filter_weights[f, :, :])

        # grab annotation
        annotation = '.'
        name_pieces = filter_names[f].split('_')
        if len(name_pieces) > 1:
            annotation = name_pieces[1]

        # plot density of filter output scores
        fmean, fstd = plot_score_density(
            np.ravel(filter_outs[:, f, :]),
            '%s/filter%d_dens.pdf' % (options.out_dir, f))

        row_cols = (f, consensus, annotation, filters_ic[f], fmean, fstd)
        print >> table_out, '%-3d  %19s  %10s  %5.2f  %6.4f  %6.4f' % row_cols

    table_out.close()

    #################################################################
    # global filter plots
    #################################################################
    # plot filter-sequence heatmap
    plot_filter_seq_heat(filter_outs, '%s/filter_seqs.pdf' % options.out_dir)

    # plot filter-segment heatmap
    plot_filter_seg_heat(filter_outs, '%s/filter_segs.pdf' % options.out_dir)
    plot_filter_seg_heat(filter_outs,
                         '%s/filter_segs_raw.pdf' % options.out_dir,
                         whiten=False)

    # plot filter-target correlation heatmap
    plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                     '%s/filter_target_cors_mean.pdf' % options.out_dir,
                     'mean')
    plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                     '%s/filter_target_cors_max.pdf' % options.out_dir, 'max')
Example #16
def main():
    usage = 'usage: %prog [options] <model_th> <vcf_file>'
    parser = OptionParser(usage)
    parser.add_option('--cuda', dest='cuda', default=False, action='store_true', help='Predict on the GPU [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-f', dest='genome_fasta', default='%s/data/genomes/hg19.fa'%os.environ['BASSETDIR'], help='Genome FASTA from which sequences will be drawn [Default: %default]')
    parser.add_option('-i', dest='index_snp', default=False, action='store_true', help='SNPs are labeled with their index SNP as column 6 [Default: %default]')
    parser.add_option('-l', dest='seq_len', type='int', default=600, help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float', help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='sad', help='Output directory for tables and plots [Default: %default]')
    parser.add_option('-s', dest='score', default=False, action='store_true', help='SNPs are labeled with scores as column 7 [Default: %default]')
    parser.add_option('-t', dest='targets_file', default=None, help='File specifying target indexes and labels in table format')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Torch model and VCF file')
    else:
        model_th = args[0]
        vcf_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # prep SNP sequences
    #################################################################
    # load SNPs
    snps = vcf.vcf_snps(vcf_file, options.index_snp, options.score)

    if options.model_hdf5_file is None:
        # get one hot coded input sequences
        seq_vecs, seqs, seq_headers = vcf.snps_seq1(snps, options.genome_fasta, options.seq_len)

        # reshape sequences for torch
        seq_vecs = seq_vecs.reshape((seq_vecs.shape[0],4,1,seq_vecs.shape[1]/4))
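        # Torch expects 4D input (batch, channels, height, width); here each
        # one-hot sequence becomes (4 bases, height 1, seq_len), e.g.
        # (N, 4, 1, 600) for the default 600 bp sequences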

        # write to HDF5
        h5f = h5py.File('%s/model_in.h5'%options.out_dir, 'w')
        h5f.create_dataset('test_in', data=seq_vecs)
        h5f.close()


    #################################################################
    # predict in Torch
    #################################################################
    if options.model_hdf5_file is None:
        if options.cuda:
            cuda_str = '-cuda'
        else:
            cuda_str = ''

        options.model_hdf5_file = '%s/model_out.txt' % options.out_dir
        cmd = 'basset_predict.lua -norm %s %s %s/model_in.h5 %s' % (cuda_str, model_th, options.out_dir, options.model_hdf5_file)
        if subprocess.call(cmd, shell=True):
            message('Error running basset_predict.lua', 'error')

    # read in predictions
    seq_preds = []
    for line in open(options.model_hdf5_file):
        seq_preds.append(np.array([float(p) for p in line.split()]))
    seq_preds = np.array(seq_preds)
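    # equivalently, the whitespace-delimited predictions could be loaded with
    #   seq_preds = np.loadtxt(options.model_hdf5_file)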


    #################################################################
    # collect and print SADs
    #################################################################
    if options.targets_file is not None:
        target_labels = [line.split()[0] for line in open(options.targets_file)]
    else:
        # without a targets file, fall back to generic labels so the
        # table printing below still works
        target_labels = ['t%d' % ti for ti in range(seq_preds.shape[1])]

    sad_out = open('%s/sad_table.txt' % options.out_dir, 'w')

    header_cols = ('rsid', 'index', 'score', 'ref', 'alt', 'target', 'ref_pred', 'alt_pred', 'sad')
    print >> sad_out, ' '.join(header_cols)

    # hash by index snp
    sad_matrices = {}
    sad_labels = {}
    sad_scores = {}

    pi = 0
    for snp in snps:
        # get reference prediction
        ref_preds = seq_preds[pi,:]
        pi += 1

        for alt_al in snp.alt_alleles:
            # get alternate prediction
            alt_preds = seq_preds[pi,:]
            pi += 1

            # SNP accessibility difference (SAD): alternate minus reference prediction
            alt_sad = alt_preds - ref_preds
            sad_matrices.setdefault(snp.index_snp,[]).append(alt_sad)

            # label as mutation from reference
            alt_label = '%s_%s>%s' % (snp.rsid, vcf.cap_allele(snp.ref_allele), vcf.cap_allele(alt_al))
            sad_labels.setdefault(snp.index_snp,[]).append(alt_label)

            # save scores
            sad_scores.setdefault(snp.index_snp,[]).append(snp.score)

            # print table lines
            for ti in range(len(alt_sad)):
                if options.index_snp and options.score:
                    cols = (snp.rsid, snp.index_snp, snp.score, vcf.cap_allele(snp.ref_allele), vcf.cap_allele(alt_al), target_labels[ti], ref_preds[ti], alt_preds[ti], alt_sad[ti])
                    print >> sad_out, '%-13s %-13s %5.3f %6s %6s %12s %6.4f %6.4f %7.4f' % cols
                elif options.index_snp:
                    cols = (snp.rsid, snp.index_snp, vcf.cap_allele(snp.ref_allele), vcf.cap_allele(alt_al), target_labels[ti], ref_preds[ti], alt_preds[ti], alt_sad[ti])
                    print >> sad_out, '%-13s %-13s %6s %6s %12s %6.4f %6.4f %7.4f' % cols
                elif options.score:
                    cols = (snp.rsid, snp.score, vcf.cap_allele(snp.ref_allele), vcf.cap_allele(alt_al), target_labels[ti], ref_preds[ti], alt_preds[ti], alt_sad[ti])
                    print >> sad_out, '%-13s %5.3f %6s %6s %12s %6.4f %6.4f %7.4f' % cols
                else:
                    cols = (snp.rsid, vcf.cap_allele(snp.ref_allele), vcf.cap_allele(alt_al), target_labels[ti], ref_preds[ti], alt_preds[ti], alt_sad[ti])
                    print >> sad_out, '%-13s %6s %6s %12s %6.4f %6.4f %7.4f' % cols

    sad_out.close()
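    # when both the index SNP and score options are used, the rows align with
    # the header printed above, and the table could be reloaded with pandas:
    #   sad_df = pd.read_table('%s/sad_table.txt' % options.out_dir,
    #                          delim_whitespace=True)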


    #################################################################
    # plot SAD heatmaps
    #################################################################
    for ii in sad_matrices:
        # convert fully to numpy arrays
        sad_matrix = abs(np.array(sad_matrices[ii]))
        print ii, sad_matrix.shape

        if sad_matrix.shape[0] > 1:
            vlim = max(options.min_limit, sad_matrix.max())
            score_mat = np.reshape(np.array(sad_scores[ii]), (-1, 1))

            if options.targets_file is None:
                # plot heatmap
                plt.figure(figsize=(20, 0.5*sad_matrix.shape[0]))

                # lay out scores
                cols = 12
                ax_score = plt.subplot2grid((1,cols), (0,0))
                ax_sad = plt.subplot2grid((1,cols), (0,1), colspan=(cols-1))

                sns.heatmap(score_mat, xticklabels=False, yticklabels=False, vmin=0, vmax=1, cmap='Reds', cbar=False, ax=ax_score)
                sns.heatmap(sad_matrix, xticklabels=False, yticklabels=sad_labels[ii], vmin=0, vmax=vlim, ax=ax_sad)

            else:
                # plot heatmap
                plt.figure(figsize=(20, 0.5 + 0.5*sad_matrix.shape[0]))

                # lay out scores
                cols = 12
                ax_score = plt.subplot2grid((1,cols), (0,0))
                ax_sad = plt.subplot2grid((1,cols), (0,1), colspan=(cols-1))

                sns.heatmap(score_mat, xticklabels=False, yticklabels=False, vmin=0, vmax=1, cmap='Reds', cbar=False, ax=ax_score)
                sns.heatmap(sad_matrix, xticklabels=target_labels, yticklabels=sad_labels[ii], vmin=0, vmax=vlim, ax=ax_sad)

                for tick in ax_sad.get_xticklabels():
                    tick.set_rotation(-45)
                    tick.set_horizontalalignment('left')
                    tick.set_fontsize(5)

            plt.tight_layout()
            if ii == '.':
                out_pdf = '%s/sad_heat.pdf' % options.out_dir
            else:
                out_pdf = '%s/sad_%s_heat.pdf' % (options.out_dir, ii)
            plt.savefig(out_pdf)
            plt.close()