Example No. 1
def main():
    usage = 'usage: %prog [options] <hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-s', dest='set', default='test', help='Set (train/valid/test) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide data HDF5 file')
    else:
        hdf5_file = args[0]

    # load 1-hot coded sequences from HDF5
    hdf5_in = h5py.File(hdf5_file, 'r')
    seqs_1hot = np.array(hdf5_in['%s_in' % options.set])
    try:
        seq_headers = np.array(hdf5_in['test_headers'])
    except KeyError:
        seq_headers = None
    hdf5_in.close()

    # convert to ACGT sequences
    seqs = dna_io.vecs2dna(seqs_1hot)

    for i, seq in enumerate(seqs):
        if seq_headers is None:
            header = 'seq%d' % i
        else:
            header = seq_headers[i]
        print '>%s\n%s' % (header, seq)
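
All of these examples lean on dna_io.vecs2dna to decode Basset's one-hot encoding back into ACGT strings. A minimal sketch of that decoding, assuming the (N, 4, 1, L) layout used throughout with rows ordered A, C, G, T (toy_vecs2dna is a hypothetical stand-in, not the library function):

import numpy as np

def toy_vecs2dna(seq_vecs):
    # assumes Basset's (N, 4, 1, L) one-hot layout, row order A, C, G, T
    nts = 'ACGT'
    seqs = []
    for vec in seq_vecs:
        # vec has shape (4, 1, L); argmax over the nucleotide axis
        idx = vec[:, 0, :].argmax(axis=0)
        seqs.append(''.join(nts[i] for i in idx))
    return seqs

# round-trip check on a two-sequence toy batch
toy = np.zeros((2, 4, 1, 3))
toy[0, 0, 0, 0] = toy[0, 1, 0, 1] = toy[0, 2, 0, 2] = 1  # ACG
toy[1, 3, 0, :] = 1                                      # TTT
assert toy_vecs2dna(toy) == ['ACG', 'TTT']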
Example No. 2
def main():
    usage = 'usage: %prog [options] <hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-s',
                      dest='set',
                      default='test',
                      help='Set (train/valid/test) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide data HDF5 file')
    else:
        hdf5_file = args[0]

    # load 1-hot coded sequences from HDF5
    hdf5_in = h5py.File(hdf5_file, 'r')
    seqs_1hot = np.array(hdf5_in['%s_in' % options.set])
    try:
        seq_headers = np.array(hdf5_in['test_headers'])
    except KeyError:
        seq_headers = None
    hdf5_in.close()

    # convert to ACGT sequences
    seqs = dna_io.vecs2dna(seqs_1hot)

    for i, seq in enumerate(seqs):
        if seq_headers is None:
            header = 'seq%d' % i
        else:
            header = seq_headers[i]
        print '>%s\n%s' % (header, seq)
Example No. 3
def main():
    usage = "usage: %prog [options] <hdf5_file>"
    parser = OptionParser(usage)
    parser.add_option("-s", dest="set", default="test", help="Set (train/valid/test) [Default: %default]")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file")
    else:
        hdf5_file = args[0]

    # load 1-hot coded sequences from HDF5
    hdf5_in = h5py.File(hdf5_file, "r")
    seqs_1hot = np.array(hdf5_in["%s_in" % options.set])
    try:
        seq_headers = np.array(hdf5_in["test_headers"])
    except KeyError:
        seq_headers = None
    hdf5_in.close()

    # convert to ACGT sequences
    seqs = dna_io.vecs2dna(seqs_1hot)

    for i, seq in enumerate(seqs):
        if seq_headers is None:
            header = "seq%d" % i
        else:
            header = seq_headers[i]
        print ">%s\n%s" % (header, seq)
Example No. 4
def plot_weight_logo(filter_weights,
                     seq_tensor,
                     norm_beta,
                     norm_gamma,
                     norm_mean,
                     norm_var,
                     out_prefix,
                     maxpct_t=0.0):
    # convolution: one score per k-mer (1D result)
    kmer_conv = np.tensordot(filter_weights, seq_tensor,
                             axes=((0, 1), (1, 2)))

    # normalization using trained batch-norm parameters
    kmer_conv_norm = ((kmer_conv - norm_mean) * norm_beta
                      / np.sqrt(norm_var + 0.00001) + norm_gamma)

    #ReLU
    kmer_conv_relu = [0 if val < 0 else val for val in kmer_conv_norm]

    #calculate a ReLU cutoff to plot weblogo
    if maxpct_t == 0:
        relu_act = 0
    else:
        all_outs = np.ravel(kmer_conv_relu)
        all_outs_mean = all_outs.mean()
        all_outs_norm = all_outs - all_outs_mean
        relu_act = maxpct_t * all_outs_norm.max() + all_outs_mean

    # find which k-mers pass the cutoff
    kmer_ok_index = [
        i for i in range(len(kmer_conv_relu)) if kmer_conv_relu[i] > relu_act
    ]
    if len(kmer_ok_index) > 0:
        kmer_ok = np.array([seq_tensor[i, :, :]
                            for i in kmer_ok_index])  #shape (50,4,5)
        kmer_ok_4D = np.expand_dims(kmer_ok, axis=2)  #shape (50,4,1,5)
        kmer_seq = dna_io.vecs2dna(kmer_ok_4D)
        kmer_fasta = [
            ">" + str(i) + "\n" + seq + "\n"
            for i, seq in zip(range(len(kmer_seq)), kmer_seq)
        ]
        kmer_fasta_out = open("%s_activated_kmers.fa" % out_prefix, "w")
        kmer_fasta_out.writelines(kmer_fasta)
        kmer_fasta_out.close()
        if relu_act > 0:
            subprocess.call(
                "weblogo -X NO -Y NO -F pdf --resolution 300 --errorbars NO --fineprint '' -C '#CB2026' A A -C '#34459C' C C -C '#FBB116' G G -C '#0C8040' T T < %s_activated_kmers.fa > %s_weight_LOGO.pdf"
                % (out_prefix, out_prefix),
                shell=True)
        else:
            subprocess.call(
                "weblogo -U probability -F pdf --resolution 300 --errorbars NO --fineprint '' -C '#CB2026' A A -C '#34459C' C C -C '#FBB116' G G -C '#0C8040' T T < %s_activated_kmers.fa > %s_weight_LOGO.pdf"
                % (out_prefix, out_prefix),
                shell=True)
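
The tensordot above contracts both the nucleotide and position axes, so a (4, K) filter against an (N, 4, K) k-mer tensor yields one activation per k-mer. A shape check under those assumed shapes (consistent with the (50, 4, 5) comments), applying the normalization step exactly as this code does, with norm_beta as the scale and norm_gamma as the shift:

import numpy as np

# hypothetical shapes: one filter of width K over N pre-extracted k-mers
N, K = 50, 5
filter_weights = np.random.randn(4, K)  # (nucleotide, position)
seq_tensor = np.random.rand(N, 4, K)    # (kmer, nucleotide, position)

# contract the nucleotide and position axes: one score per k-mer
kmer_conv = np.tensordot(filter_weights, seq_tensor, axes=((0, 1), (1, 2)))
assert kmer_conv.shape == (N,)

# the normalization step from above, with toy parameters
beta, gamma, mean, var = 1.0, 0.0, kmer_conv.mean(), kmer_conv.var()
kmer_conv_norm = (kmer_conv - mean) * beta / np.sqrt(var + 1e-5) + gamma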
Example No. 5
def parse_filter_scores_multiple_base_hdf5(scores_hdf5_file):
    ### single motif scores = [activation for each filter][base_seq]
    ### paired motif scores = activation for [filter1][filter2][base_seq]
    ### seqs = motif that is used to represent each filter
    scores_hdf5_in = h5py.File(scores_hdf5_file, 'r')
    preds = np.array(scores_hdf5_in['preds'])
    seq_vecs = scores_hdf5_in['seqs']
    # print preds.shape
    # print seq_vecs.shape
    seqs = dna_io.vecs2dna(seq_vecs)
    scores_hdf5_in.close()

    assert(NUM_BASE_SEQS + NUM_BASE_SEQS * NUM_SEQS + NUM_BASE_SEQS * NUM_SEQS * NUM_SEQS == len(preds))
    num_seqs = NUM_SEQS

    base_scores = []
    base_seqs = []
    single_motif_scores = []
    paired_motif_scores = []

    for i in range(num_seqs):
        single_motif_scores.append([False] * NUM_BASE_SEQS)
        paired_motif_scores.append([])
        for j in range(num_seqs):
            paired_motif_scores[i].append([False] * NUM_BASE_SEQS)

    z = 0

    for i in range(NUM_BASE_SEQS):
        base_scores.append(preds[i])
        base_seqs.append(seqs[i])
        z += 1

    motif_seqs = []
    for i in range(NUM_SEQS):
        motif_seqs.append(seqs[z][300:319])
        for j in range(NUM_BASE_SEQS):
            single_motif_scores[i][j] = preds[z]
            z += 1

    for i in range(num_seqs):
        for j in range(num_seqs):
            for k in range(NUM_BASE_SEQS):
                paired_motif_scores[i][j][k] = preds[z]
                z += 1

    return (base_seqs, base_scores, single_motif_scores, paired_motif_scores, motif_seqs)
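
The parser above walks preds in a fixed order: all base sequences first, then one block per single motif, then one block per motif pair. A closed-form index sketch with hypothetical constants (the real NUM_BASE_SEQS and NUM_SEQS are module-level globals not shown in these listings):

NUM_BASE_SEQS, NUM_SEQS = 3, 4  # hypothetical values for illustration

def pred_index(kind, i=0, j=0, k=0):
    # flat index into preds matching the loops above:
    # base seqs, then single-motif blocks, then paired-motif blocks
    if kind == 'base':
        return k
    if kind == 'single':
        return NUM_BASE_SEQS + i * NUM_BASE_SEQS + k
    if kind == 'pair':
        return (NUM_BASE_SEQS + NUM_SEQS * NUM_BASE_SEQS
                + (i * NUM_SEQS + j) * NUM_BASE_SEQS + k)
    raise ValueError(kind)

total = NUM_BASE_SEQS * (1 + NUM_SEQS + NUM_SEQS ** 2)
assert pred_index('pair', NUM_SEQS - 1, NUM_SEQS - 1, NUM_BASE_SEQS - 1) == total - 1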
Example No. 6
def parse_filter_scores_hdf5(scores_hdf5_file):

    ### single motif scores = [activation for each filter]
    ### paired motif scores = activation for [filter1][filter2][offset]
    ### seqs = motif that is used to represent each filter
    scores_hdf5_in = h5py.File(scores_hdf5_file, 'r')
    preds = np.array(scores_hdf5_in['preds'])
    seq_vecs = scores_hdf5_in['seqs']
    print preds.shape
    print seq_vecs.shape
    seqs = dna_io.vecs2dna(seq_vecs)
    scores_hdf5_in.close()

    # num_seqs = len(seqs)
    assert(NUM_SEQS + WINDOW_SIZE * NUM_SEQS * NUM_SEQS == len(preds))
    num_seqs = NUM_SEQS
    window_size = WINDOW_SIZE
    # window_size = (len(preds) - num_seqs) / (num_seqs * num_seqs)

    single_motif_scores = []
    paired_motif_scores = []


    for i in range(num_seqs):
        paired_motif_scores.append([])
        for j in range(num_seqs):
            paired_motif_scores[i].append([False] * window_size)

    for i in range(num_seqs):
        single_motif_scores.append(preds[i])

    z = num_seqs
    for i in range(num_seqs):
        for j in range(num_seqs):
            for k in range(window_size):
                paired_motif_scores[i][j][k] = preds[z]
                z += 1

    return (single_motif_scores, paired_motif_scores, seqs)
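
This parser uses the same flat preds layout as Example No. 5 but keyed by offset: the first NUM_SEQS entries are single-motif scores, then each (filter1, filter2) pair contributes WINDOW_SIZE entries. The equivalent closed-form index, again with hypothetical constants:

NUM_SEQS, WINDOW_SIZE = 4, 7  # hypothetical values for illustration

def paired_index(i, j, k):
    # flat preds index for filter pair (i, j) at offset k,
    # matching the triple loop above
    return NUM_SEQS + (i * NUM_SEQS + j) * WINDOW_SIZE + k

assert (paired_index(NUM_SEQS - 1, NUM_SEQS - 1, WINDOW_SIZE - 1)
        == NUM_SEQS + WINDOW_SIZE * NUM_SEQS * NUM_SEQS - 1)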
Example No. 7
def main():
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='targets_file',
        default=None,
        help='File labeling targets in the second column [Default: %default]')
    parser.add_option(
        '-c',
        dest='center_nt',
        default=50,
        type='int',
        help='Center nt to consider kmers from [Default: %default]')
    parser.add_option('-d',
                      dest='model_out_file',
                      default=None,
                      help='Pre-computed model output table.')
    parser.add_option('-k',
                      dest='kmer',
                      default=8,
                      type='int',
                      help='K-mer length [Default: %default]')
    parser.add_option('-l',
                      dest='seq_len',
                      default=1000,
                      type='int',
                      help='Input sequence length [Default: %default]')
    parser.add_option(
        '-n',
        dest='num_seqs',
        default=100000,
        type='int',
        help='Number of sequences to predict [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option(
        '-r',
        dest='rc',
        default=False,
        action='store_true',
        help='Consider k-mers w/ their reverse complements [Default: %default]'
    )
    parser.add_option(
        '-t',
        dest='targets',
        default=None,
        help=
        'Comma-separated list of targets to analyze in more depth [Default: %default]'
    )
    parser.add_option(
        '--top',
        dest='top_num',
        default=100,
        type='int',
        help=
        'Number of sequences with which to make a multiple sequence alignment')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file.')
    else:
        model_file = args[0]

    random.seed(2)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.model_out_file is not None:
        seq_dna = []
        for line in open('%s/seqs.fa' % options.out_dir):
            if line[0] == '>':
                seq_dna.append('')
            else:
                seq_dna[-1] += line.rstrip()

    else:
        #################################################################
        # generate random sequences
        #################################################################
        # random sequences
        seq_vecs = np.zeros((options.num_seqs, 4, 1, options.seq_len),
                            dtype='float16')
        for si in range(options.num_seqs):
            for li in range(options.seq_len):
                ni = random.randint(0, 3)
                seq_vecs[si, ni, 0, li] = 1

        # create a new HDF5 file
        seq_hdf5_file = '%s/seqs.h5' % options.out_dir
        seq_hdf5_out = h5py.File(seq_hdf5_file, 'w')
        seq_hdf5_out.create_dataset('test_in', data=seq_vecs)
        seq_hdf5_out.close()

        # get fasta
        seq_dna = vecs2dna(seq_vecs)

        # print to file
        fasta_out = open('%s/seqs.fa' % options.out_dir, 'w')
        for i in range(len(seq_dna)):
            print >> fasta_out, '>%d\n%s' % (i, seq_dna[i])
        fasta_out.close()

        #################################################################
        # Torch predict
        #################################################################
        options.model_out_file = '%s/model_out.txt' % options.out_dir

        torch_cmd = 'basset_predict.lua -scores %s %s %s' % (
            model_file, seq_hdf5_file, options.model_out_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

        # clean up sequence HDF5
        os.remove(seq_hdf5_file)

    # load scores
    seq_scores = np.loadtxt(options.model_out_file, dtype='float32')

    # read target labels
    if options.targets_file:
        target_labels = [
            line.split()[1] for line in open(options.targets_file)
        ]
    else:
        target_labels = ['t%d' % (ti + 1) for ti in range(seq_scores.shape[1])]

    if options.targets is None:
        options.targets = range(seq_scores.shape[1])
    else:
        options.targets = [int(ti) for ti in options.targets.split(',')]

    #################################################################
    # process and output
    #################################################################
    kmers_start = (options.seq_len - options.center_nt) / 2

    for ti in options.targets:
        print 'Working on target %d' % ti

        ##############################################
        # hash scores by k-mer
        ##############################################
        kmer_scores_raw = {}

        for si in range(len(seq_dna)):
            # get score
            sscore = seq_scores[si, ti]

            # hash to each center kmer
            for ki in range(kmers_start, kmers_start + options.center_nt):
                kmer = seq_dna[si][ki:ki + options.kmer]
                if options.rc:
                    kmer = consider_rc(kmer)

                kmer_scores_raw.setdefault(kmer, []).append(sscore)

        ##############################################
        # compute means and print table
        ##############################################
        table_out = open('%s/table%d.txt' % (options.out_dir, ti), 'w')

        kmer_means_raw = {}
        for kmer in kmer_scores_raw:
            kmer_means_raw[kmer] = np.mean(kmer_scores_raw[kmer])
            kmer_n = len(kmer_scores_raw[kmer])
            cols = (kmer, kmer_n, kmer_means_raw[kmer],
                    np.std(kmer_scores_raw[kmer]) / math.sqrt(kmer_n))
            print >> table_out, '%s  %4d  %6.3f  %6.3f' % cols

        table_out.close()

        ##############################################
        # plot density
        ##############################################
        plt.figure()
        sns.distplot(kmer_means_raw.values(), kde=False)
        plt.savefig('%s/density%d.pdf' % (options.out_dir, ti))
        plt.close()

        ##############################################
        # top k-mers distance matrix
        ##############################################
        kmer_means = {}
        kmer_means_mean = np.mean(kmer_means_raw.values())
        for kmer in kmer_means_raw:
            kmer_means[kmer] = kmer_means_raw[kmer] - kmer_means_mean

        # score by score
        scores_kmers = [(kmer_means[kmer], kmer) for kmer in kmer_means]
        scores_kmers.sort(reverse=True)

        # take top k-mers
        top_kmers = []
        top_kmers_scores = []
        for score, kmer in scores_kmers[:options.top_num]:
            top_kmers.append(kmer)
            top_kmers_scores.append(score)
        top_kmers = np.array(top_kmers)
        top_kmers_scores = np.array(top_kmers_scores)

        # compute distance matrix
        top_kmers_dists = np.zeros((options.top_num, options.top_num))
        for i in range(options.top_num):
            for j in range(i + 1, options.top_num):
                if options.rc:
                    top_kmers_dists[i, j] = kmer_distance_rc(
                        top_kmers[i], top_kmers[j])
                else:
                    top_kmers_dists[i, j] = kmer_distance(
                        top_kmers[i], top_kmers[j])
                top_kmers_dists[j, i] = top_kmers_dists[i, j]

        # clip the distances
        np.clip(top_kmers_dists, 0, 3, out=top_kmers_dists)

        # plot
        plot_kmer_dists(top_kmers_dists, top_kmers_scores, top_kmers,
                        '%s/top_kmers_heat%d.pdf' % (options.out_dir, ti))

        # cluster and plot
        cluster_kmer_dists(top_kmers_dists, top_kmers_scores, top_kmers,
                           '%s/top_kmers_clust%d.pdf' % (options.out_dir, ti))
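
consider_rc, kmer_distance, and kmer_distance_rc are called above but never defined in this listing. Plausible sketches, assuming canonical-k-mer and Hamming-distance semantics:

COMP = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

def consider_rc(kmer):
    # collapse a k-mer and its reverse complement to one canonical key
    rc = ''.join(COMP[nt] for nt in reversed(kmer))
    return min(kmer, rc)

def kmer_distance(kmer1, kmer2):
    # Hamming distance between equal-length k-mers
    return sum(a != b for a, b in zip(kmer1, kmer2))

def kmer_distance_rc(kmer1, kmer2):
    # distance up to reverse complement of the second k-mer
    rc2 = ''.join(COMP[nt] for nt in reversed(kmer2))
    return min(kmer_distance(kmer1, kmer2), kmer_distance(kmer1, rc2))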
Example No. 8
def main():
    usage = 'usage: %prog [options] <model_file> <test_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5.')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-m', dest='meme_db', default='%s/data/motifs/Homo_sapiens.meme' % os.environ['BASSETDIR'], help='MEME database used to annotate motifs')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default:%default]')
    parser.add_option('-t', dest='trim_filters', default=False, action='store_true', help='Trim uninformative positions off the filter ends [Default: %default]')
    parser.add_option('--skip-heat', dest='skip_heat', default=False, action='store_true', help="Skip plotting heat maps of filters")
    parser.add_option('--skip-logo', dest='skip_logo', default=False, action='store_true', help="Skip Weblogo plots for filters")
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and test data in HDF5 format.')
    else:
        model_file = args[0]
        test_hdf5_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, 'r')
    seq_vecs = np.array(test_hdf5_in['test_in'])
    # print seq_vecs.shape
    # print "with numpy", seq_vecs.nbytes
    seq_targets = np.array(test_hdf5_in['test_out'])
    try:
        target_names = list(test_hdf5_in['target_labels'])
    except KeyError:
        target_names = ['t%d'%ti for ti in range(seq_targets.shape[1])]
    test_hdf5_in.close()


    #################################################################
    # sample
    #################################################################
    if options.model_hdf5_file is not None:
        print "Model outs file specified. Do not resample, use sample sequences from model outs file."
    else:
        if options.sample is not None:
            # choose sampled indexes
            sample_i = np.array(random.sample(xrange(seq_vecs.shape[0]), options.sample))

            # filter
            seq_vecs = seq_vecs[sample_i]
            seq_targets = seq_targets[sample_i]

            # create a new HDF5 file
            sample_hdf5_file = '%s/sample.h5' % options.out_dir
            sample_hdf5_out = h5py.File(sample_hdf5_file, 'w')
            print seq_vecs.shape
            sample_hdf5_out.create_dataset('test_in', data=seq_vecs)
            sample_hdf5_out.close()

            # update test HDF5
            test_hdf5_file = sample_hdf5_file

        print "Finished creating sample file"

    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        print "No model hdf5 file specified"
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_motifs_predict.lua %s %s %s' % (model_file, test_hdf5_file, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    filter_weights = np.array(model_hdf5_in['weights'])
    filter_outs = np.array(model_hdf5_in['outs'])
    seq_vecs = model_hdf5_in['sample_seqs']
    seqs = dna_io.vecs2dna(seq_vecs)
    model_hdf5_in.close()

    # store useful variables
    num_filters = filter_weights.shape[0]
    filter_size = filter_weights.shape[2]


    #################################################################
    # individual filter plots
    #################################################################
    # also save information contents
    filters_ic = []
    meme_out_file = meme_intro('%s/filters_meme.txt'%options.out_dir, seqs)

    for f in range(num_filters):
        print 'Filter %d' % f

        # plot filter parameters as a heatmap
        if not options.skip_heat:
            plot_filter_heat(filter_weights[f,:,:], '%s/filter%d_heat.pdf' % (options.out_dir,f))

        # plot weblogo of high scoring outputs
        if not options.skip_logo:
            plot_filter_logo(filter_outs[:,f,:], filter_size, seqs, '%s/filter%d_logo'%(options.out_dir,f), maxpct_t=0.5)

        # make a PWM for the filter
        filter_pwm, nsites = make_filter_pwm('%s/filter%d_logo.fa'%(options.out_dir,f))

        if nsites < 10:
            # no information
            filters_ic.append(0)
            print "No information"
        else:
            # compute and save information content
            filters_ic.append(info_content(filter_pwm))

            # add to the meme motif file
            meme_add(meme_out_file, f, filter_pwm, nsites, options.trim_filters)

    meme_out_file.close()


    #################################################################
    # annotate filters
    #################################################################
    # run tomtom
    subprocess.call('tomtom -dist pearson -thresh 0.1 -oc %s/tomtom %s/filters_meme.txt %s' % (options.out_dir, options.out_dir, options.meme_db), shell=True)

    # read in annotations
    filter_names = name_filters(num_filters, '%s/tomtom/tomtom.txt'%options.out_dir, options.meme_db)


    #################################################################
    # print a table of information
    #################################################################
    table_out = open('%s/table.txt'%options.out_dir, 'w')

    # print header for later panda reading
    header_cols = ('', 'consensus', 'annotation', 'ic', 'mean', 'std')
    print >> table_out, '%3s  %19s  %10s  %5s  %6s  %6s' % header_cols

    for f in range(num_filters):
        # collapse to a consensus motif
        consensus = filter_motif(filter_weights[f,:,:])

        # grab annotation
        annotation = '.'
        name_pieces = filter_names[f].split('_')
        if len(name_pieces) > 1:
            annotation = name_pieces[1]

        # plot density of filter output scores
        fmean, fstd = plot_score_density(np.ravel(filter_outs[:,f,:]), '%s/filter%d_dens.pdf' % (options.out_dir,f))

        row_cols = (f, consensus, annotation, filters_ic[f], fmean, fstd)
        print >> table_out, '%-3d  %19s  %10s  %5.2f  %6.4f  %6.4f' % row_cols

    table_out.close()


    #################################################################
    # global filter plots
    #################################################################
    # plot filter-sequence heatmap
    plot_filter_seq_heat(filter_outs, '%s/filter_seqs.pdf'%options.out_dir)

    # plot filter-segment heatmap
    plot_filter_seg_heat(filter_outs, '%s/filter_segs.pdf'%options.out_dir)
    plot_filter_seg_heat(filter_outs, '%s/filter_segs_raw.pdf'%options.out_dir, whiten=False)

    # plot filter-target correlation heatmap
    plot_target_corr(filter_outs, seq_targets, filter_names, target_names, '%s/filter_target_cors_mean.pdf'%options.out_dir, 'mean')
    plot_target_corr(filter_outs, seq_targets, filter_names, target_names, '%s/filter_target_cors_max.pdf'%options.out_dir, 'max')
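
info_content is called above but not shown. A plausible stand-in that scores a PWM's total information content in bits against a uniform background (the (L, 4) probability layout is an assumption):

import numpy as np

def info_content_sketch(pwm, bg=0.25):
    # hypothetical stand-in for info_content: sum of per-position
    # relative entropy (bits) versus a uniform background
    pwm = np.clip(np.asarray(pwm, dtype=float), 1e-9, 1.0)
    return float(np.sum(pwm * np.log2(pwm / bg)))

# a fully certain column carries 2 bits; a uniform column carries 0
assert abs(info_content_sketch([[1, 0, 0, 0]]) - 2.0) < 1e-6
assert abs(info_content_sketch([[0.25] * 4])) < 1e-6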
Example No. 9
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file', help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='gain_height', default=False, action='store_true', help='Nucleotide heights determined by the max of loss and gain [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float', help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-n', dest='center_nt', default=0, type='int', help='Center nt to mutate and plot in the heat map [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-p', dest='print_table_all', default=False, action='store_true', help='Print all targets to the table [Default: %default]')
    parser.add_option('-r', dest='rng_seed', default=1, type='float', help='Random number generator seed [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default:%default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF5 file).')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5'%options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            # seq_headers is a Python list, so index it elementwise
            seq_headers = [seq_headers[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,seqs_1hot.shape[1]/4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try: # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5'%options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')


    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_sat_predict.lua -center_nt %d %s %s %s' % (options.center_nt, model_file, model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)


    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len)/2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start:delta_start+delta_len]

    # decide which cells to plot
    if options.targets == '-1':
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(',')]


    #################################################################
    # plot
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    rdbu = sns.color_palette("RdBu_r", 10)

    nts = 'ACGT'
    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = 'seq%d' % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si,:,:,ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell])

            # prepare figure
            sns.set(style='white', font_scale=0.5)
            sns.axes_style({'axes.linewidth':1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20,3))
            ax_logo = plt.subplot2grid((3,heat_cols), (0,logo_start), colspan=(logo_end-logo_start))
            ax_sad = plt.subplot2grid((3,heat_cols), (1,sad_start), colspan=(sad_end-sad_start))
            ax_heat = plt.subplot2grid((3,heat_cols), (2,0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75/vlim*(abs(minmax_matrix).max(axis=0))
            else:
                seq_heights = 0.25 + 1.75/vlim*(-minmax_matrix[0])
            logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir, header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = '%s.png' % logo_eps[:-4]
            subprocess.call('convert -density 300 %s %s' % (logo_eps, logo_png), shell=True)
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label='loss', linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label='gain', linewidth=1)
            ax_sad.set_xlim(0,minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ['top','bottom','left','right']:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap='RdBu_r', vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels('TGCA', rotation='horizontal') # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig('%s/%s_c%d_heat.pdf' % (options.out_dir,header.replace(':','_'), ci), dpi=300)
            plt.close()


        #################################################################
        # print table of nt variability for each cell
        #################################################################
        print_targets = plot_targets
        if options.print_table_all:
            print_targets = range(seq_mod_preds.shape[3])

        for ci in print_targets:
            seq_mod_preds_cell = seq_mod_preds[si,:,:,ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)

            loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
            gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start+pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, '\t'.join([str(c) for c in cols])

    table_out.close()
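
get_real_pred is called above (and again in Example No. 12) but not defined in these listings. Mutating a position to its actual base leaves the sequence unchanged, so the unmutated prediction appears in the (4, L) matrix at every position's real nucleotide; a plausible sketch recovers it by averaging those entries:

import numpy as np

def get_real_pred_sketch(seq_mod_preds_cell, seq):
    # hypothetical stand-in for get_real_pred: read the prediction for
    # the unmutated sequence off the mutation matrix itself
    nt_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    real = [seq_mod_preds_cell[nt_index[nt], pos]
            for pos, nt in enumerate(seq) if nt in nt_index]
    return float(np.mean(real))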
Example No. 10
def main():
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='targets_file', default=None, help='File labeling targets in the second column [Default: %default]')
    parser.add_option('-c', dest='center_nt', default=50, type='int', help='Center nt to consider kmers from [Default: %default]')
    parser.add_option('-d', dest='model_out_file', default=None, help='Pre-computed model output table.')
    parser.add_option('-k', dest='kmer', default=8, type='int', help='K-mer length [Default: %default]')
    parser.add_option('-l', dest='seq_len', default=1000, type='int', help='Input sequence length [Default: %default]')
    parser.add_option('-n', dest='num_seqs', default=100000, type='int', help='Number of sequences to predict [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-r', dest='rc', default=False, action='store_true', help='Consider k-mers w/ their reverse complements [Default: %default]')
    parser.add_option('-t', dest='targets', default=None, help='Comma-separated list of targets to analyze in more depth [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file.')
    else:
        model_file = args[0]

    random.seed(2)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # generate random sequences
    #################################################################
    # random sequences
    seq_vecs = np.zeros((options.num_seqs,4,1,options.seq_len), dtype='float16')
    for si in range(options.num_seqs):
        for li in range(options.seq_len):
            ni = random.randint(0,3)
            seq_vecs[si,ni,0,li] = 1

    # create a new HDF5 file
    seq_hdf5_file = '%s/seqs.h5' % options.out_dir
    seq_hdf5_out = h5py.File(seq_hdf5_file, 'w')
    seq_hdf5_out.create_dataset('test_in', data=seq_vecs)
    seq_hdf5_out.close()

    # get fasta
    seq_dna = vecs2dna(seq_vecs)


    #################################################################
    # Torch predict
    #################################################################
    if options.model_out_file is None:
        options.model_out_file = '%s/model_out.txt' % options.out_dir

        torch_cmd = 'basset_predict.lua -scores %s %s %s' % (model_file, seq_hdf5_file, options.model_out_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    # load scores
    seq_scores = np.loadtxt(options.model_out_file, dtype='float32')

    # read target labels
    if options.targets_file:
        target_labels = [line.split()[1] for line in open(options.targets_file)]
    else:
        target_labels = ['t%d'%(ti+1) for ti in range(seq_scores.shape[1])]

    if options.targets is None:
        options.targets = range(seq_scores.shape[1])


    #################################################################
    # process and output
    #################################################################
    kmers_start = (options.seq_len - options.center_nt) / 2

    for ti in options.targets:
        ##############################################
        # hash scores by k-mer
        ##############################################
        kmer_scores = {}

        for si in range(len(seq_dna)):
            # get score
            sscore = seq_scores[si,ti]

            # hash to each center kmer
            for ki in range(kmers_start, kmers_start + options.center_nt):
                kmer = seq_dna[si][ki:ki+options.kmer]
                if options.rc:
                    kmer = consider_rc(kmer)

                kmer_scores.setdefault(kmer,[]).append(sscore)


        ##############################################
        # print table
        ##############################################
        table_out = open('%s/table%d.txt' % (options.out_dir,ti), 'w')

        for kmer in kmer_scores:
            cols = (kmer, len(kmer_scores[kmer]), np.mean(kmer_scores[kmer]), np.std(kmer_scores[kmer])/math.sqrt(len(kmer_scores[kmer])))
            print >> table_out, '%s  %4d  %6.3f  %6.3f' % cols

        table_out.close()
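
The nested loops above touch every (sequence, position) pair in Python, which is slow at the default num_seqs of 100,000. An equivalent vectorized construction of the same random one-hot tensor, shown with smaller toy dimensions:

import numpy as np

num_seqs, seq_len = 1000, 1000
ni = np.random.randint(0, 4, size=(num_seqs, seq_len))
seq_vecs = np.zeros((num_seqs, 4, 1, seq_len), dtype='float16')
seq_vecs[np.arange(num_seqs)[:, None], ni, 0, np.arange(seq_len)] = 1

# exactly one nucleotide set per position
assert (seq_vecs.sum(axis=1) == 1).all()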
Example No. 11
def main():
    usage = "usage: %prog [options] <motif> <model_file> <test_hdf5_file>"
    parser = OptionParser(usage)
    parser.add_option("-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5.")
    parser.add_option("-f", dest="filters", default=None, help="Filters to plot length analysis [Default: %default]")
    parser.add_option("-o", dest="out_dir", default=".")
    parser.add_option(
        "-p",
        dest="pool",
        default=False,
        action="store_true",
        help="Take representation after pooling [Default: %default]",
    )
    parser.add_option("-s", dest="sample", default=None, type="int", help="Sequences to sample [Default: %default]")
    parser.add_option(
        "-t",
        dest="targets",
        default=None,
        help="Comma-separated list of targets to analyze in more depth [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error("Must provide motif, Basset model file, and test data in HDF5 format.")
    else:
        motif = args[0]
        model_file = args[1]
        test_hdf5_file = args[2]

    random.seed(2)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, "r")
    seq_vecs = np.array(test_hdf5_in["test_in"])
    seq_targets = np.array(test_hdf5_in["test_out"])
    seq_headers = np.array(test_hdf5_in["test_headers"])
    target_labels = np.array(test_hdf5_in["target_labels"])
    test_hdf5_in.close()

    #################################################################
    # sample
    #################################################################
    if options.sample is not None and options.sample < seq_vecs.shape[0]:
        # choose sampled indexes
        sample_i = np.array(random.sample(xrange(seq_vecs.shape[0]), options.sample))

        # filter
        seq_vecs = seq_vecs[sample_i]
        seq_targets = seq_targets[sample_i]
        seq_headers = seq_headers[sample_i]

        # create a new HDF5 file
        sample_hdf5_file = "%s/sample.h5" % options.out_dir
        sample_hdf5_out = h5py.File(sample_hdf5_file, "w")
        sample_hdf5_out.create_dataset("test_in", data=seq_vecs)
        sample_hdf5_out.close()

        # update test HDF5
        test_hdf5_file = sample_hdf5_file

    #################################################################
    # write in motif
    #################################################################
    # this code must match the Torch code
    seq_len = seq_vecs.shape[3]
    seq_mid = math.floor(seq_len / 2.0 - len(motif) / 2.0) - 1
    for si in range(seq_vecs.shape[0]):
        for pi in range(len(motif)):
            one_hot_set(seq_vecs[si], seq_mid + pi, motif[pi])

    # get fasta
    seq_dna = vecs2dna(seq_vecs)

    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        pool_str = ""
        if options.pool:
            pool_str = "-pool"

        options.model_hdf5_file = "%s/model_out.h5" % options.out_dir

        torch_cmd = "basset_anchor_predict.lua %s %s %s %s %s" % (
            pool_str,
            motif,
            model_file,
            test_hdf5_file,
            options.model_hdf5_file,
        )
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, "r")
    pre_preds = np.array(model_hdf5_in["pre_preds"])
    preds = np.array(model_hdf5_in["preds"])
    scores = np.array(model_hdf5_in["scores"])
    seq_filter_outs = np.array(model_hdf5_in["filter_outs"])
    pre_seq_filter_outs = np.array(model_hdf5_in["pre_filter_outs"])
    model_hdf5_in.close()

    # pre-process
    seq_filter_means = seq_filter_outs.mean(axis=2)
    filter_means = seq_filter_means.mean(axis=0)
    filter_msds = seq_filter_means.std(axis=0) + 1e-6

    num_seqs = seq_filter_means.shape[0]
    num_filters = seq_filter_means.shape[1]
    num_targets = len(target_labels)

    if options.filters is None:
        options.filters = range(num_filters)
    else:
        options.filters = [int(fi) for fi in options.filters.split(",")]

    if options.targets is None:
        options.targets = range(num_targets)
    else:
        options.targets = [int(ti) for ti in options.targets.split(",")]

    #################################################################
    # scatter plot prediction changes
    #################################################################
    sns.set(style="ticks", font_scale=1.5)
    lim_eps = 0.02

    for ti in options.targets:
        if num_seqs > 500:
            isample = np.array(random.sample(range(num_seqs), 500))
        else:
            isample = np.array(range(num_seqs))

        plt.figure(figsize=(8, 8))

        g = sns.jointplot(pre_preds[isample, ti], preds[isample, ti], color="black", stat_func=None, alpha=0.5, space=0)

        ax = g.ax_joint
        ax.plot([0, 1], [0, 1], c="black", linewidth=1, linestyle="--")

        ax.set_xlim((0 - lim_eps, 1 + lim_eps))
        ax.set_ylim((0 - lim_eps, 1 + lim_eps))

        ax.set_xlabel("Pre-insertion accessibility")
        ax.set_ylabel("Post-insertion accessibility")
        ax.grid(True, linestyle=":")

        ax_x = g.ax_marg_x
        ax_x.set_title(target_labels[ti])

        plt.savefig("%s/scatter_t%d.pdf" % (options.out_dir, ti))
        plt.close()

    #################################################################
    # plot sequences
    #################################################################
    for ti in options.targets:
        # sort sequences by score
        seqsi = np.argsort(scores[:, ti])[::-1]

        # print a fasta file with uniformly sampled sequences
        unif_i = np.array([int(sp) for sp in np.arange(0, num_seqs, num_seqs / 200.0)])
        seqsi_uniform = seqsi[unif_i]
        fasta_out = open("%s/seqs_t%d.fa" % (options.out_dir, ti), "w")
        for si in seqsi_uniform:
            print >> fasta_out, ">%s_gc%.2f_p%.2f\n%s" % (seq_headers[si], gc(seq_dna[si]), preds[si, ti], seq_dna[si])
        fasta_out.close()

        # print their filter/pos activations to a table
        #  this is slow and big, and I only need it when I'm trying
        #  to find a specific example.
        table_out = open("%s/seqs_t%d_table.txt" % (options.out_dir, ti), "w")
        for si in seqsi_uniform:
            for fi in range(num_filters):
                for pi in range(seq_filter_outs.shape[2]):
                    cols = (seq_headers[si], fi, pi, seq_filter_outs[si, fi, pi])
                    print >> table_out, "%-25s  %3d  %3d  %5.2f" % cols
        table_out.close()

        # sample fewer for heat map
        unif_i = np.array([int(sp) for sp in np.arange(0, num_seqs, num_seqs / 200.0)])
        seqsi_uniform = seqsi[unif_i]

        """ these kinda suck
        # plot heat map
        plt.figure()
        n = 20
        ax_sf = plt.subplot2grid((1,n), (0,0), colspan=n-1)
        ax_ss = plt.subplot2grid((1,n), (0,n-1))

        # filter heat
        sf_norm = seq_filter_means[seqsi_uniform,:] - filter_means
        # sf_norm = np.divide(seq_filter_means[seqsi_uniform,:] - filter_means, filter_msds)

        sns.heatmap(sf_norm, vmin=-.04, vmax=.04, xticklabels=False, yticklabels=False, ax=ax_sf)
        # scores heat
        sns.heatmap(scores[seqsi_uniform,ti].reshape(-1,1), xticklabels=False, yticklabels=False, ax=ax_ss)

        # this crashed the program, and I don't know why
        # plt.tight_layout()
        plt.savefig('%s/seqs_t%d.pdf' % (options.out_dir, ti))
        plt.close()
        """

    #################################################################
    # filter mean correlations
    #################################################################
    # compute and print
    table_out = open("%s/table.txt" % options.out_dir, "w")
    filter_target_cors = np.zeros((num_filters, num_targets))
    for fi in range(num_filters):
        for ti in range(num_targets):
            cor, p = spearmanr(seq_filter_means[:, fi], scores[:, ti])
            cols = (fi, ti, cor, p)
            print >> table_out, "%-3d  %3d  %6.3f  %6.1e" % cols
            if np.isnan(cor):
                cor = 0
            filter_target_cors[fi, ti] = cor
    table_out.close()

    # plot
    ftc_df = pd.DataFrame(filter_target_cors, columns=target_labels)
    plt.figure()
    g = sns.clustermap(ftc_df)
    for tick in g.ax_heatmap.get_xticklabels():
        tick.set_rotation(-45)
        tick.set_horizontalalignment("left")
        tick.set_fontsize(3)
    for tick in g.ax_heatmap.get_yticklabels():
        tick.set_fontsize(3)
    plt.savefig("%s/filters_targets.pdf" % options.out_dir)
    plt.close()

    #################################################################
    # filter position correlation
    #################################################################
    sns.set(style="ticks", font_scale=1.7)

    table_out = open("%s/filter_pos.txt" % options.out_dir, "w")

    for fi in options.filters:
        for ti in options.targets:
            print "Plotting f%d versus t%d" % (fi, ti)

            # compute correlations
            pos_cors = []
            pos_cors_pre = []
            nans = 0
            for pi in range(seq_filter_outs.shape[2]):
                # motif correlation
                cor, p = spearmanr(seq_filter_outs[:, fi, pi], preds[:, ti])
                if np.isnan(cor):
                    cor = 0
                    p = 1
                    nans += 1
                pos_cors.append(cor)

                # pre correlation
                cor_pre, p_pre = spearmanr(pre_seq_filter_outs[:, fi, pi], pre_preds[:, ti])
                if np.isnan(cor_pre):
                    cor_pre = 0
                    p_pre = 1
                pos_cors_pre.append(cor_pre)

                cols = (fi, pi, ti, cor, p, cor_pre, p_pre)
                print >> table_out, "%-3d  %3d  %3d  %6.3f  %6.1e  %6.3f  %6.1e" % cols

            if nans < 50:
                # plot
                # df_pc = pd.DataFrame({'Position':range(len(pos_cors)), 'Correlation':pos_cors})
                plt.figure(figsize=(9, 6))
                plt.title(target_labels[ti])
                # sns.regplot(x='Position', y='Correlation', data=df_pc, lowess=True)
                plt.scatter(
                    range(len(pos_cors)),
                    pos_cors_pre,
                    c=sns_colors[2],
                    alpha=0.8,
                    linewidths=0,
                    label="Before motif insertion",
                )
                plt.scatter(
                    range(len(pos_cors)),
                    pos_cors,
                    c=sns_colors[1],
                    alpha=0.8,
                    linewidths=0,
                    label="After motif insertion",
                )
                plt.axhline(y=0, linestyle="--", c="grey", linewidth=1)

                ax = plt.gca()
                ax.set_xlim(0, len(pos_cors))
                ax.set_xlabel("Position")
                ax.set_ylabel("Activation vs Prediction Correlation")
                ax.grid(True, linestyle=":")

                sns.despine()
                plt.legend()
                plt.tight_layout()
                plt.savefig("%s/f%d_t%d.pdf" % (options.out_dir, fi, ti))
                plt.close()

    table_out.close()
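
one_hot_set and gc are used in this example but defined elsewhere. Plausible minimal versions, assuming the (4, 1, L) per-sequence layout (note seq_mid above is a float from math.floor, hence the int cast):

def one_hot_set(seq_vec, pos, nt):
    # overwrite one position of a (4, 1, L) one-hot matrix with nt
    nt_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    pos = int(pos)
    seq_vec[:, 0, pos] = 0
    seq_vec[nt_index[nt], 0, pos] = 1

def gc(seq):
    # GC fraction of a DNA string, as printed in the fasta headers above
    return float(seq.count('C') + seq.count('G')) / max(len(seq), 1)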
Example No. 12
def main():
    usage = "usage: %prog [options] <model_file> <input_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-a", dest="input_activity_file", help="Optional activity table corresponding to an input FASTA file"
    )
    parser.add_option(
        "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]"
    )
    parser.add_option(
        "-g",
        dest="gain_height",
        default=False,
        action="store_true",
        help="Nucleotide heights determined by the max of loss and gain [Default: %default]",
    )
    parser.add_option(
        "-m", dest="min_limit", default=0.1, type="float", help="Minimum heatmap limit [Default: %default]"
    )
    parser.add_option(
        "-n",
        dest="center_nt",
        default=200,
        type="int",
        help="Center nt to mutate and plot in the heat map [Default: %default]",
    )
    parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]")
    parser.add_option(
        "-s", dest="sample", default=None, type="int", help="Sample sequences from the test set [Default:%default]"
    )
    parser.add_option(
        "-t",
        dest="targets",
        default="0",
        help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file")
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == ">":
                seq_headers.append(line[1:].rstrip())
                seqs.append("")
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = "%s/model_in.h5" % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False
            )

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split("\t")

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            # seq_headers is a Python list, so index it elementwise
            seq_headers = [seq_headers[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, "w")
        h5f.create_dataset("test_in", data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, "r")
            seqs_1hot = np.array(hdf5_in["test_in"])
            targets = np.array(hdf5_in["test_out"])
            try:  # TEMP
                seq_headers = np.array(hdf5_in["test_headers"])
                target_labels = np.array(hdf5_in["target_labels"])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = "%s/model_in.h5" % options.out_dir
                h5f = h5py.File(model_input_hdf5, "w")
                h5f.create_dataset("test_in", data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error("Could not parse input file as FASTA or HDF5.")

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = "%s/model_out.h5" % options.out_dir
        torch_cmd = "basset_sat_predict.lua -center_nt %d %s %s %s" % (
            options.center_nt,
            model_file,
            model_input_hdf5,
            options.model_hdf5_file,
        )
        if subprocess.call(torch_cmd, shell=True):
            message("Error running basset_sat_predict.lua", "error")

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, "r")
    seq_mod_preds = np.array(hdf5_in["seq_mod_preds"])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) / 2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start : delta_start + delta_len]

    # decide which cells to plot
    if options.targets == "-1":
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(",")]

    #################################################################
    # plot
    #################################################################
    table_out = open("%s/table.txt" % options.out_dir, "w")

    rdbu = sns.color_palette("RdBu_r", 10)

    nts = "ACGT"
    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = "seq%d" % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell])

            # prepare figure
            sns.set(style="white", font_scale=0.5)
            sns.axes_style({"axes.linewidth": 1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20, 3))
            ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start), colspan=(logo_end - logo_start))
            ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start), colspan=(sad_end - sad_start))
            ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75 / vlim * (abs(minmax_matrix).max(axis=0))
            else:
                seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0])
            logo_eps = "%s/%s_c%d_seq.eps" % (options.out_dir, header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = "%s.png" % logo_eps[:-4]
            logo_cmd = "convert -density 300 %s %s" % (logo_eps, logo_png)
            if subprocess.call(logo_cmd, shell=True):
                message("Error running convert", "error")
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label="loss", linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label="gain", linewidth=1)
            ax_sad.set_xlim(0, minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ["top", "bottom", "left", "right"]:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap="RdBu_r", vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels("TGCA", rotation="horizontal")  # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig("%s/%s_c%d_heat.pdf" % (options.out_dir, header.replace(":", "_"), ci), dpi=300)
            plt.close()

        #################################################################
        # print table of nt variability for each cell
        #################################################################
        for ci in range(seq_mod_preds.shape[3]):
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)

            loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
            gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start + pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, "\t".join([str(c) for c in cols])

    table_out.close()
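
This example calls two helpers, get_real_pred and message, that the snippet never defines. The sketches below are hypothetical reconstructions, not the original code: get_real_pred assumes seq_mod_preds_cell is a 4 x L matrix with one row per substituted nucleotide in ACGT order, so at any unambiguous position the row matching the reference base holds the unmodified prediction; message is assumed to be a status printer that aborts on 'error'.

import sys

def get_real_pred(seq_mod_preds_cell, seq):
    # hypothetical sketch: "mutating" a position to its real base leaves the
    # sequence unchanged, so any non-N position yields the reference prediction
    nts = 'ACGT'
    for pos, nt in enumerate(seq):
        if nt in nts:
            return seq_mod_preds_cell[nts.index(nt), pos]
    raise ValueError('sequence contains no unambiguous nucleotide')

def message(text, mtype='status'):
    # hypothetical sketch: report progress/errors; exit on errors
    sys.stderr.write('%s: %s\n' % (mtype.upper(), text))
    if mtype == 'error':
        sys.exit(1)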
Example No. 13
0
def parse_interaction_scores_hdf5(scores_hdf5_file):
    ### single motif scores = [activation for each filter][base_seq]
    ### paired motif scores = activation for [filter1][filter2][base_seq]
    ### motif_seqs = motif that is used to represent each filter

    def get_base_seqs(seqs):
        return seqs[:NUM_BASE_SEQS]

    def get_motifs(seqs):
        motif_seqs = []
        z = NUM_BASE_SEQS
        for i in range(NUM_MOTIFS):
            motif_seqs.append(seqs[z][300:319])
            z += NUM_BASE_SEQS  # skip this motif's insertions into each base sequence
        return motif_seqs

    def get_base_scores(preds):
        base_scores = []
        for i in range(NUM_BASE_SEQS):
            base_scores.append(preds[i])
        return base_scores

    def get_single_motif_scores(preds):
        single_motif_scores = []
        z = NUM_BASE_SEQS
        for i in range(NUM_MOTIFS):
            single_motif_scores.append([False] * NUM_BASE_SEQS)
            for j in range(NUM_BASE_SEQS):
                single_motif_scores[i][j] = preds[z]
                z += 1
        return single_motif_scores

    def get_single_motif_scores_offset(preds):
        z = NUM_BASE_SEQS + NUM_BASE_SEQS * NUM_MOTIFS
        single_motif_scores = []
        for i in range(NUM_MOTIFS):
            single_motif_scores.append([False] * NUM_BASE_SEQS)
            for j in range(NUM_BASE_SEQS):
                single_motif_scores[i][j] = preds[z]
                z += 1
        return single_motif_scores

    def get_paired_motif_scores(preds):
        paired_motif_scores = []
        for i in range(NUM_MOTIFS):
            paired_motif_scores.append([])
            for j in range(NUM_MOTIFS):
                paired_motif_scores[i].append([False] * NUM_BASE_SEQS)

        z = NUM_BASE_SEQS + 2 * NUM_BASE_SEQS * NUM_MOTIFS
        for i in range(NUM_MOTIFS):
            for j in range(NUM_MOTIFS):
                for k in range(NUM_BASE_SEQS):
                    paired_motif_scores[i][j][k] = preds[z]
                    z += 1
        return paired_motif_scores

    ### Read in file
    scores_hdf5_in = h5py.File(scores_hdf5_file, "r")
    preds = np.array(scores_hdf5_in["preds"])
    seq_vecs = np.array(scores_hdf5_in["seqs"])
    seqs = dna_io.vecs2dna(seq_vecs)
    scores_hdf5_in.close()

    ### Make sure global variables are set properly
    assert NUM_BASE_SEQS + 2 * NUM_BASE_SEQS * NUM_MOTIFS + NUM_BASE_SEQS * NUM_MOTIFS * NUM_MOTIFS == len(preds)

    base_seqs = get_base_seqs(seqs)
    base_scores = get_base_scores(preds)
    single_motif_scores = get_single_motif_scores(preds)
    single_motif_scores_offset = get_single_motif_scores_offset(preds)
    paired_motif_scores = get_paired_motif_scores(preds)
    motif_seqs = get_motifs(seqs)

    return (base_seqs, base_scores, single_motif_scores, single_motif_scores_offset, paired_motif_scores, motif_seqs)
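
The parser above relies on a fixed, blocked layout of the preds vector. The sketch below spells out that index arithmetic with hypothetical values for the module globals NUM_BASE_SEQS and NUM_MOTIFS; it is illustrative only.

NUM_BASE_SEQS = 10  # hypothetical value
NUM_MOTIFS = 5      # hypothetical value
B, M = NUM_BASE_SEQS, NUM_MOTIFS

# preds is laid out in four contiguous blocks:
#   [0, B)                unmodified base sequences
#   [B, B + B*M)          single motif insertions, centered
#   [B + B*M, B + 2*B*M)  single motif insertions, offset
#   [B + 2*B*M, B + 2*B*M + B*M*M)  paired motif insertions

def paired_index(i, j, k):
    # flat index of the prediction for motifs i and j inserted into base sequence k
    return B + 2 * B * M + (i * M + j) * B + k

# the last paired entry lands on the final element of preds
assert paired_index(M - 1, M - 1, B - 1) == B + 2 * B * M + B * M * M - 1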
Example No. 14
0
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='input_activity_file',
        help='Optional activity table corresponding to an input FASTA file')
    parser.add_option(
        '-d',
        dest='model_hdf5_file',
        default=None,
        help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='heat',
                      help='Output directory [Default: %default]')
    parser.add_option('-r',
                      dest='rng_seed',
                      default=1,
                      type='float',
                      help='Random number generator seed [Default: %default]')
    parser.add_option(
        '-s',
        dest='sample',
        default=None,
        type='int',
        help='Sample sequences from the test set [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets',
        default='0',
        help=
        'Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file)'
        )
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file,
                options.input_activity_file,
                mean_norm=False,
                whiten=False,
                permute=False,
                sort=False)

            # read in target names
            target_labels = open(
                options.input_activity_file).readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(
                random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape(
            (seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:  # headers/labels absent from this HDF5
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(
                    random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_net_predict.lua %s %s %s' % (
            model_file, model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    reprs = []
    l = 1
    while 'reprs%d' % l in hdf5_in.keys():
        reprs.append(np.array(hdf5_in['reprs%d' % l]))
        l += 1
    hdf5_in.close()

    #################################################################
    # plot
    #################################################################
    print len(reprs)
    for l in range(len(reprs)):
        for si in range(len(seq_headers)):
            plt.figure()

            # just write the sequence out above it
            # or maybe I'll ultimately want to write an
            # influence version. yea probably.

            print reprs[l][si].shape
            sns.heatmap(reprs[l][si], linewidths=0, xticklabels=False)
            plt.savefig('%s/%s_l%d.pdf' %
                        (options.out_dir, header_filename(seq_headers[si]), l))
            plt.close()
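
header_filename is used here (and in later examples) to build output paths but is never shown. A plausible sketch, assuming it merely sanitizes a FASTA header for use in a filename, much as the saturation-mutagenesis example does inline with header.replace(':', '_'):

def header_filename(header):
    # hypothetical sketch: replace path-hostile characters
    for ch in ':/ ':
        header = header.replace(ch, '_')
    return header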
Example No. 15
0
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file', help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='rng_seed', default=1, type='float', help='Random number generator seed [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5'%options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,seqs_1hot.shape[1]//4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try: # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:  # headers/labels absent from this HDF5
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5'%options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')


    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_net_predict.lua %s %s %s' % (model_file, model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)


    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    reprs = []
    l = 1
    while 'reprs%d'%l in hdf5_in.keys():
        reprs.append(np.array(hdf5_in['reprs%d'%l]))
        l += 1
    hdf5_in.close()


    #################################################################
    # plot
    #################################################################
    print len(reprs)
    for l in range(len(reprs)):
        for si in range(len(seq_headers)):
            plt.figure()

            # just write the sequence out above it
            # or maybe I'll ultimately want to write an
            # influence version. yea probably.

            print reprs[l][si].shape
            sns.heatmap(reprs[l][si], linewidths=0, xticklabels=False)
            plt.savefig('%s/%s_l%d.pdf' % (options.out_dir, header_filename(seq_headers[si]), l))
            plt.close()
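
Nearly every example round-trips one-hot tensors back to strings with dna_io.vecs2dna. For reference, a minimal re-implementation under the assumption that each sequence is coded as a 4 x 1 x L array in ACGT row order; the real Basset function additionally accepts flat N x 4L input and renders all-zero columns as N.

import numpy as np

def vecs2dna_sketch(seq_vecs):
    # hypothetical sketch of dna_io.vecs2dna
    seqs = []
    for vec in seq_vecs:
        mat = vec.reshape(4, -1)  # 4 x L
        seqs.append(''.join('ACGT'[i] for i in mat.argmax(axis=0)))
    return seqs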
Example No. 16
0
def main():
    usage = 'usage: %prog [options] <motif> <model_file> <test_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-d',
                      dest='model_hdf5_file',
                      default=None,
                      help='Pre-computed model output as HDF5.')
    parser.add_option(
        '-f',
        dest='filters',
        default=None,
        help='Filters to plot length analysis [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option(
        '-p',
        dest='pool',
        default=False,
        action='store_true',
        help='Take representation after pooling [Default: %default]')
    parser.add_option('-s',
                      dest='sample',
                      default=None,
                      type='int',
                      help='Sequences to sample [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets',
        default=None,
        help=
        'Comma-separated list of targets to analyze in more depth [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            'Must provide motif, Basset model file, and test data in HDF5 format.'
        )
    else:
        motif = args[0]
        model_file = args[1]
        test_hdf5_file = args[2]

    random.seed(2)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, 'r')
    seq_vecs = np.array(test_hdf5_in['test_in'])
    seq_targets = np.array(test_hdf5_in['test_out'])
    seq_headers = np.array(test_hdf5_in['test_headers'])
    target_labels = np.array(test_hdf5_in['target_labels'])
    test_hdf5_in.close()

    #################################################################
    # sample
    #################################################################
    if options.sample is not None and options.sample < seq_vecs.shape[0]:
        # choose sampled indexes
        sample_i = np.array(
            random.sample(xrange(seq_vecs.shape[0]), options.sample))

        # filter
        seq_vecs = seq_vecs[sample_i]
        seq_targets = seq_targets[sample_i]
        seq_headers = seq_headers[sample_i]

        # create a new HDF5 file
        sample_hdf5_file = '%s/sample.h5' % options.out_dir
        sample_hdf5_out = h5py.File(sample_hdf5_file, 'w')
        sample_hdf5_out.create_dataset('test_in', data=seq_vecs)
        sample_hdf5_out.close()

        # update test HDF5
        test_hdf5_file = sample_hdf5_file

    #################################################################
    # write in motif
    #################################################################
    # this code must match the Torch code
    seq_len = seq_vecs.shape[3]
    seq_mid = int(math.floor(seq_len / 2.0 - len(motif) / 2.0)) - 1
    for si in range(seq_vecs.shape[0]):
        for pi in range(len(motif)):
            one_hot_set(seq_vecs[si], seq_mid + pi, motif[pi])

    # get fasta
    seq_dna = vecs2dna(seq_vecs)

    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        pool_str = ''
        if options.pool:
            pool_str = '-pool'

        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir

        torch_cmd = 'basset_anchor_predict.lua %s %s %s %s %s' % (
            pool_str, motif, model_file, test_hdf5_file,
            options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    pre_preds = np.array(model_hdf5_in['pre_preds'])
    preds = np.array(model_hdf5_in['preds'])
    scores = np.array(model_hdf5_in['scores'])
    seq_filter_outs = np.array(model_hdf5_in['filter_outs'])
    pre_seq_filter_outs = np.array(model_hdf5_in['pre_filter_outs'])
    model_hdf5_in.close()

    # pre-process
    seq_filter_means = seq_filter_outs.mean(axis=2)
    filter_means = seq_filter_means.mean(axis=0)
    filter_msds = seq_filter_means.std(axis=0) + 1e-6

    num_seqs = seq_filter_means.shape[0]
    num_filters = seq_filter_means.shape[1]
    num_targets = len(target_labels)

    if options.filters is None:
        options.filters = range(num_filters)
    else:
        options.filters = [int(fi) for fi in options.filters.split(',')]

    if options.targets is None:
        options.targets = range(num_targets)
    else:
        options.targets = [int(ti) for ti in options.targets.split(',')]

    #################################################################
    # scatter plot prediction changes
    #################################################################
    sns.set(style='ticks', font_scale=1.5)
    lim_eps = 0.02

    for ti in options.targets:
        if num_seqs > 500:
            isample = np.array(random.sample(range(num_seqs), 500))
        else:
            isample = np.array(range(num_seqs))

        plt.figure(figsize=(8, 8))

        g = sns.jointplot(pre_preds[isample, ti],
                          preds[isample, ti],
                          color='black',
                          stat_func=None,
                          alpha=0.5,
                          space=0)

        ax = g.ax_joint
        ax.plot([0, 1], [0, 1], c='black', linewidth=1, linestyle='--')

        ax.set_xlim((0 - lim_eps, 1 + lim_eps))
        ax.set_ylim((0 - lim_eps, 1 + lim_eps))

        ax.set_xlabel('Pre-insertion accessibility')
        ax.set_ylabel('Post-insertion accessibility')
        ax.grid(True, linestyle=':')

        ax_x = g.ax_marg_x
        ax_x.set_title(target_labels[ti])

        plt.savefig('%s/scatter_t%d.pdf' % (options.out_dir, ti))
        plt.close()

    #################################################################
    # plot sequences
    #################################################################
    for ti in options.targets:
        # sort sequences by score
        seqsi = np.argsort(scores[:, ti])[::-1]

        # print a fasta file with uniformly sampled sequences
        unif_i = np.array(
            [int(sp) for sp in np.arange(0, num_seqs, num_seqs / 200.0)])
        seqsi_uniform = seqsi[unif_i]
        fasta_out = open('%s/seqs_t%d.fa' % (options.out_dir, ti), 'w')
        for si in seqsi_uniform:
            print >> fasta_out, '>%s_gc%.2f_p%.2f\n%s' % (
                seq_headers[si], gc(seq_dna[si]), preds[si, ti], seq_dna[si])
        fasta_out.close()

        # print their filter/pos activations to a table
        #  this is slow and big, and I only need it when I'm trying
        #  to find a specific example.
        table_out = open('%s/seqs_t%d_table.txt' % (options.out_dir, ti), 'w')
        for si in seqsi_uniform:
            for fi in range(num_filters):
                for pi in range(seq_filter_outs.shape[2]):
                    cols = (seq_headers[si], fi, pi, seq_filter_outs[si, fi,
                                                                     pi])
                    print >> table_out, '%-25s  %3d  %3d  %5.2f' % cols
        table_out.close()

        # sample fewer for heat map
        unif_i = np.array(
            [int(sp) for sp in np.arange(0, num_seqs, num_seqs / 200.0)])
        seqsi_uniform = seqsi[unif_i]
        ''' these kinda suck
        # plot heat map
        plt.figure()
        n = 20
        ax_sf = plt.subplot2grid((1,n), (0,0), colspan=n-1)
        ax_ss = plt.subplot2grid((1,n), (0,n-1))

        # filter heat
        sf_norm = seq_filter_means[seqsi_uniform,:] - filter_means
        # sf_norm = np.divide(seq_filter_means[seqsi_uniform,:] - filter_means, filter_msds)

        sns.heatmap(sf_norm, vmin=-.04, vmax=.04, xticklabels=False, yticklabels=False, ax=ax_sf)
        # scores heat
        sns.heatmap(scores[seqsi_uniform,ti].reshape(-1,1), xticklabels=False, yticklabels=False, ax=ax_ss)

        # this crashed the program, and I don't know why
        # plt.tight_layout()
        plt.savefig('%s/seqs_t%d.pdf' % (options.out_dir, ti))
        plt.close()
        '''

    #################################################################
    # filter mean correlations
    #################################################################
    # compute and print
    table_out = open('%s/table.txt' % options.out_dir, 'w')
    filter_target_cors = np.zeros((num_filters, num_targets))
    for fi in range(num_filters):
        for ti in range(num_targets):
            cor, p = spearmanr(seq_filter_means[:, fi], scores[:, ti])
            cols = (fi, ti, cor, p)
            print >> table_out, '%-3d  %3d  %6.3f  %6.1e' % cols
            if np.isnan(cor):
                cor = 0
            filter_target_cors[fi, ti] = cor
    table_out.close()

    # plot
    ftc_df = pd.DataFrame(filter_target_cors, columns=target_labels)
    plt.figure()
    g = sns.clustermap(ftc_df)
    for tick in g.ax_heatmap.get_xticklabels():
        tick.set_rotation(-45)
        tick.set_horizontalalignment('left')
        tick.set_fontsize(3)
    for tick in g.ax_heatmap.get_yticklabels():
        tick.set_fontsize(3)
    plt.savefig('%s/filters_targets.pdf' % options.out_dir)
    plt.close()

    #################################################################
    # filter position correlation
    #################################################################
    sns.set(style='ticks', font_scale=1.7)

    table_out = open('%s/filter_pos.txt' % options.out_dir, 'w')

    for fi in options.filters:
        for ti in options.targets:
            print 'Plotting f%d versus t%d' % (fi, ti)

            # compute correlations
            pos_cors = []
            pos_cors_pre = []
            nans = 0
            for pi in range(seq_filter_outs.shape[2]):
                # motif correlation
                cor, p = spearmanr(seq_filter_outs[:, fi, pi], preds[:, ti])
                if np.isnan(cor):
                    cor = 0
                    p = 1
                    nans += 1
                pos_cors.append(cor)

                # pre correlation
                cor_pre, p_pre = spearmanr(pre_seq_filter_outs[:, fi, pi],
                                           pre_preds[:, ti])
                if np.isnan(cor_pre):
                    cor_pre = 0
                    p_pre = 1
                pos_cors_pre.append(cor_pre)

                cols = (fi, pi, ti, cor, p, cor_pre, p_pre)
                print >> table_out, '%-3d  %3d  %3d  %6.3f  %6.1e  %6.3f  %6.1e' % cols

            if nans < 50:
                # plot
                # df_pc = pd.DataFrame({'Position':range(len(pos_cors)), 'Correlation':pos_cors})
                plt.figure(figsize=(9, 6))
                plt.title(target_labels[ti])
                # sns.regplot(x='Position', y='Correlation', data=df_pc, lowess=True)
                plt.scatter(range(len(pos_cors)),
                            pos_cors_pre,
                            c=sns_colors[2],
                            alpha=0.8,
                            linewidths=0,
                            label='Before motif insertion')
                plt.scatter(range(len(pos_cors)),
                            pos_cors,
                            c=sns_colors[1],
                            alpha=0.8,
                            linewidths=0,
                            label='After motif insertion')
                plt.axhline(y=0, linestyle='--', c='grey', linewidth=1)

                ax = plt.gca()
                ax.set_xlim(0, len(pos_cors))
                ax.set_xlabel('Position')
                ax.set_ylabel('Activation vs Prediction Correlation')
                ax.grid(True, linestyle=':')

                sns.despine()
                plt.legend()
                plt.tight_layout()
                plt.savefig('%s/f%d_t%d.pdf' % (options.out_dir, fi, ti))
                plt.close()

    table_out.close()
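
Two helpers in this example, one_hot_set and gc, are not defined in the snippet. Plausible sketches, assuming seq_vec is a 4 x 1 x L one-hot array in ACGT row order and gc returns a GC fraction:

def one_hot_set(seq_vec, pos, nt):
    # hypothetical sketch: overwrite position pos with nucleotide nt
    pos = int(pos)
    seq_vec[:, 0, pos] = 0
    seq_vec['ACGT'.index(nt.upper()), 0, pos] = 1

def gc(seq):
    # hypothetical sketch: GC content as a fraction of sequence length
    return float(seq.count('C') + seq.count('G')) / len(seq)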
Example No. 17
0
def main():
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='targets_file', default=None, help='File labeling targets in the second column [Default: %default]')
    parser.add_option('-c', dest='center_nt', default=50, type='int', help='Center nt to consider kmers from [Default: %default]')
    parser.add_option('-d', dest='model_out_file', default=None, help='Pre-computed model output table.')
    parser.add_option('-k', dest='kmer', default=8, type='int', help='K-mer length [Default: %default]')
    parser.add_option('-l', dest='seq_len', default=1000, type='int', help='Input sequence length [Default: %default]')
    parser.add_option('-n', dest='num_seqs', default=100000, type='int', help='Number of sequences to predict [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-r', dest='rc', default=False, action='store_true', help='Consider k-mers w/ their reverse complements [Default: %default]')
    parser.add_option('-t', dest='targets', default=None, help='Comma-separated list of targets to analyze in more depth [Default: %default]')
    parser.add_option('--top', dest='top_num', default=100, type='int', help='Number of sequences with which to make a multiple sequence alignment')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file.')
    else:
        model_file = args[0]

    random.seed(2)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.model_out_file is not None:
        seq_dna = []
        for line in open('%s/seqs.fa' % options.out_dir):
            if line[0] == '>':
                seq_dna.append('')
            else:
                seq_dna[-1] += line.rstrip()

    else:
        #################################################################
        # generate random sequences
        #################################################################
        # random sequences
        seq_vecs = np.zeros((options.num_seqs,4,1,options.seq_len), dtype='float16')
        for si in range(options.num_seqs):
            for li in range(options.seq_len):
                ni = random.randint(0,3)
                seq_vecs[si,ni,0,li] = 1

        # create a new HDF5 file
        seq_hdf5_file = '%s/seqs.h5' % options.out_dir
        seq_hdf5_out = h5py.File(seq_hdf5_file, 'w')
        seq_hdf5_out.create_dataset('test_in', data=seq_vecs)
        seq_hdf5_out.close()

        # get fasta
        seq_dna = vecs2dna(seq_vecs)

        # print to file
        fasta_out = open('%s/seqs.fa' % options.out_dir, 'w')
        for i in range(len(seq_dna)):
            print >> fasta_out, '>%d\n%s' % (i,seq_dna[i])
        fasta_out.close()

        #################################################################
        # Torch predict
        #################################################################
        options.model_out_file = '%s/model_out.txt' % options.out_dir

        torch_cmd = 'basset_predict.lua -scores %s %s %s' % (model_file, seq_hdf5_file, options.model_out_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

        # clean up sequence HDF5
        os.remove(seq_hdf5_file)

    # load scores
    seq_scores = np.loadtxt(options.model_out_file, dtype='float32')

    # read target labels
    if options.targets_file:
        target_labels = [line.split()[1] for line in open(options.targets_file)]
    else:
        target_labels = ['t%d'%(ti+1) for ti in range(seq_scores.shape[1])]

    if options.targets is None:
        options.targets = range(seq_scores.shape[1])
    else:
        options.targets = [int(ti) for ti in options.targets.split(',')]


    #################################################################
    # process and output
    #################################################################
    kmers_start = (options.seq_len - options.center_nt) // 2

    for ti in options.targets:
        print 'Working on target %d' % ti

        ##############################################
        # hash scores by k-mer
        ##############################################
        kmer_scores_raw = {}

        for si in range(len(seq_dna)):
            # get score
            sscore = seq_scores[si,ti]

            # hash to each center kmer
            for ki in range(kmers_start, kmers_start + options.center_nt):
                kmer = seq_dna[si][ki:ki+options.kmer]
                if options.rc:
                    kmer = consider_rc(kmer)

                kmer_scores_raw.setdefault(kmer,[]).append(sscore)

        ##############################################
        # compute means and print table
        ##############################################
        table_out = open('%s/table%d.txt' % (options.out_dir,ti), 'w')

        kmer_means_raw = {}
        for kmer in kmer_scores_raw:
            kmer_means_raw[kmer] = np.mean(kmer_scores_raw[kmer])
            kmer_n = len(kmer_scores_raw[kmer])
            cols = (kmer, kmer_n, kmer_means_raw[kmer], np.std(kmer_scores_raw[kmer])/math.sqrt(kmer_n))
            print >> table_out, '%s  %4d  %6.3f  %6.3f' % cols

        table_out.close()

        ##############################################
        # plot density
        ##############################################
        plt.figure()
        sns.distplot(kmer_means_raw.values(), kde=False)
        plt.savefig('%s/density%d.pdf' % (options.out_dir,ti))
        plt.close()

        ##############################################
        # top k-mers distance matrix
        ##############################################
        kmer_means = {}
        kmer_means_mean = np.mean(kmer_means_raw.values())
        for kmer in kmer_means_raw:
            kmer_means[kmer] = kmer_means_raw[kmer] - kmer_means_mean

        # score by score
        scores_kmers = [(kmer_means[kmer],kmer) for kmer in kmer_means]
        scores_kmers.sort(reverse=True)

        # take top k-mers
        top_kmers = []
        top_kmers_scores = []
        for score, kmer in scores_kmers[:options.top_num]:
            top_kmers.append(kmer)
            top_kmers_scores.append(score)
        top_kmers = np.array(top_kmers)
        top_kmers_scores = np.array(top_kmers_scores)

        # compute distance matrix
        top_kmers_dists = np.zeros((options.top_num, options.top_num))
        for i in range(options.top_num):
            for j in range(i+1,options.top_num):
                if options.rc:
                    top_kmers_dists[i,j] = kmer_distance_rc(top_kmers[i], top_kmers[j])
                else:
                    top_kmers_dists[i,j] = kmer_distance(top_kmers[i], top_kmers[j])
                top_kmers_dists[j,i] = top_kmers_dists[i,j]

        # clip the distances
        np.clip(top_kmers_dists, 0, 3, out=top_kmers_dists)

        # plot
        plot_kmer_dists(top_kmers_dists, top_kmers_scores, top_kmers, '%s/top_kmers_heat%d.pdf'%(options.out_dir,ti))

        # cluster and plot
        cluster_kmer_dists(top_kmers_dists, top_kmers_scores, top_kmers, '%s/top_kmers_clust%d.pdf'%(options.out_dir,ti))
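
The k-mer helpers consider_rc, kmer_distance, and kmer_distance_rc are undefined in the snippet. The sketches below are one reasonable reading: consider_rc canonicalizes a k-mer to the lexicographically smaller of itself and its reverse complement, kmer_distance is a Hamming distance, and kmer_distance_rc takes the minimum over both strands.

REV_COMP = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

def reverse_complement(kmer):
    return ''.join(REV_COMP[nt] for nt in reversed(kmer))

def consider_rc(kmer):
    # hypothetical sketch: canonical strand representative
    kmer_rc = reverse_complement(kmer)
    return kmer if kmer < kmer_rc else kmer_rc

def kmer_distance(kmer1, kmer2):
    # hypothetical sketch: Hamming distance between equal-length k-mers
    return sum(nt1 != nt2 for nt1, nt2 in zip(kmer1, kmer2))

def kmer_distance_rc(kmer1, kmer2):
    # hypothetical sketch: strand-insensitive distance
    return min(kmer_distance(kmer1, kmer2),
               kmer_distance(kmer1, reverse_complement(kmer2)))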
Example No. 18
0
def main():
    usage = "usage: %prog [options] <model_file> <profile_file> <input_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-a", dest="input_activity_file", help="Optional activity table corresponding to an input FASTA file"
    )
    parser.add_option(
        "--all",
        dest="all_data",
        default=False,
        action="store_true",
        help="Search all training/valid/test sequences. By default we search only the test set. [Default: %default]",
    )
    parser.add_option(
        "--cuda", dest="cuda", default=False, action="store_true", help="Run on GPGPU [Default: %default]"
    )
    parser.add_option(
        "--cudnn", dest="cudnn", default=False, action="store_true", help="Run on GPGPU w/cuDNN [Default: %default]"
    )
    parser.add_option(
        "-d",
        dest="model_out_file",
        default=None,
        help="Pre-computed model predictions output table [Default: %default]",
    )
    parser.add_option(
        "-e",
        dest="norm_even",
        default=False,
        action="store_true",
        help="Normalize the weights for the positive and negative datasets to be even [Default: %default]",
    )
    parser.add_option("-f", dest="font_heat", default=6, type="int", help="Heat map axis font size [Default: %default]")
    parser.add_option(
        "-n", dest="num_dissect", default=10, type="int", help="Dissect the top n hits [Default: %default]"
    )
    parser.add_option("-o", dest="out_dir", default="profile", help="Output directory [Default: %default]")
    parser.add_option(
        "-r",
        dest="norm_preds",
        default=False,
        action="store_true",
        help="Normalize predictions to have equal frequency [Default: %default]",
    )
    parser.add_option(
        "-z",
        dest="weight_zero",
        default=1.0,
        type="float",
        help="Adjust the weights for the zero samples by this value [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            "Must provide Basset model file, activity profile file, and input sequences (as a FASTA file or test data in an HDF file)"
        )
    else:
        model_file = args[0]
        profile_file = args[1]
        input_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == ">":
                seq_headers.append(line[1:].rstrip())
                seqs.append("")
            else:
                seqs[-1] += line.rstrip()

        # convert to arrays
        seqs = np.array(seqs)
        seq_headers = np.array(seq_headers)

        model_input_hdf5 = "%s/model_in.h5" % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False
            )

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, "w")
        h5f.create_dataset("test_in", data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError, UnicodeDecodeError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, "r")

            seqs_1hot = np.array(hdf5_in["test_in"])
            targets = np.array(hdf5_in["test_out"])
            seq_headers = np.array([h.decode("UTF-8") for h in hdf5_in["test_headers"]])

            hdf5_in.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error("Could not parse input file as FASTA or HDF5.")

    #################################################################
    # Torch predict modifications
    #################################################################
    # GPU options (needed below, too)
    gpgpu_str = ""
    if options.cudnn:
        gpgpu_str = "-cudnn"
    elif options.cuda:
        gpgpu_str = "-cuda"

    if options.model_out_file is None:
        options.model_out_file = "%s/preds.txt" % options.out_dir

        torch_cmd = "basset_predict.lua -mc_n 10 -rc %s %s %s %s" % (
            gpgpu_str,
            model_file,
            model_input_hdf5,
            options.model_out_file,
        )
        print(torch_cmd)
        subprocess.call(torch_cmd, shell=True)

    # read in predictions
    seqs_preds = np.loadtxt(options.model_out_file)

    num_targets = seqs_preds.shape[1]

    #################################################################
    # parse profile file
    #################################################################
    activity_profile, profile_weights, profile_mask, target_labels = load_profile(
        profile_file, num_targets, options.norm_even, options.weight_zero
    )

    # normalize predictions
    if options.norm_preds:
        pred_means = seqs_preds.mean(axis=0)

        # save to file for basset_refine.py
        np.save("%s/pred_means" % options.out_dir, pred_means)

        # aim for profile weighted average
        aim_mean = np.average(pred_means[profile_mask], weights=profile_weights[profile_mask])

        # normalize
        for ti in range(seqs_preds.shape[1]):
            ratio_ti = pred_means[ti] / aim_mean
            if profile_mask[ti] and (ratio_ti < 1 / 4 or ratio_ti > 4):
                print(
                    "WARNING: target %d with mean %.4f differs 4-fold from the median %.3f"
                    % (ti, pred_means[ti], aim_mean),
                    file=sys.stderr,
                )
            seqs_preds[:, ti] = znorm(seqs_preds[:, ti], pred_means[ti], aim_mean)

    #################################################################
    # plot clustered heat map limited to relevant targets
    #################################################################
    seqs_preds_prof = seqs_preds[:, profile_mask]
    seqs_preds_var = seqs_preds_prof.var(axis=1)
    seqs_sort_var = np.argsort(seqs_preds_var)[::-1]

    # heat map
    plt.figure()
    g = sns.clustermap(
        np.transpose(seqs_preds_prof[seqs_sort_var[:1500]]),
        metric="cosine",
        linewidths=0,
        yticklabels=target_labels[profile_mask],
        xticklabels=False,
    )
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig("%s/heat_clust.pdf" % options.out_dir)
    plt.close()

    # dimension reduction
    # model_pca = PCA(n_components=50)
    # spp_pca = model.fit_transform(np.transpose(seqs_preds_prof))
    # model = TSNE(n_components=2, perplexity=5, metric='euclidean')
    # spp_dr = model.fit_transform(spp_pca)
    model = PCA(n_components=2)
    spp_dr = model.fit_transform(np.transpose(seqs_preds_prof))
    plt.figure()
    plt.scatter(spp_dr[:, 0], spp_dr[:, 1], c="black", s=5)
    target_labels_prof_concise = [tl.split(":")[-1] for tl in target_labels[profile_mask]]
    for label, x, y, activity in zip(
        target_labels_prof_concise, spp_dr[:, 0], spp_dr[:, 1], activity_profile[profile_mask]
    ):
        plt.annotate(label, xy=(x, y), size=10, color=sns.color_palette("deep")[int(activity)])
    plt.savefig("%s/dim_red.pdf" % options.out_dir)
    plt.close()

    #################################################################
    # compute profile distances
    #################################################################
    # compute prediction distances
    seqs_pdists = []
    for si in range(seqs_preds.shape[0]):
        # sd = np.power(seqs_preds[si,profile_mask]-activity_profile[profile_mask], 2).sum()
        sd = log_loss(
            activity_profile[profile_mask], seqs_preds[si, profile_mask], sample_weight=profile_weights[profile_mask]
        )
        seqs_pdists.append(sd)
    seqs_pdists = np.array(seqs_pdists)

    # obtain sorted indexes
    seqs_sort_dist = np.argsort(seqs_pdists)

    # compute target distances
    seqs_tdists = []
    for si in range(seqs_preds.shape[0]):
        tdists = np.absolute(targets[si, profile_mask] - activity_profile[profile_mask])
        tdists_weight = np.multiply(tdists, profile_weights[profile_mask])
        td = tdists_weight.sum()
        seqs_tdists.append(td)
    seqs_tdists = np.array(seqs_tdists)

    # print as table
    table_out = open("%s/table.txt" % options.out_dir, "w")
    for si in seqs_sort_dist:
        cols = [si, seqs_pdists[si], seqs_tdists[si]] + list(seqs_preds[si, profile_mask])
        print("\t".join([str(c) for c in cols]), file=table_out)
    table_out.close()

    #################################################################
    # plot sorted heat map
    #################################################################
    plt.figure()
    g = sns.clustermap(
        np.transpose(seqs_preds_prof[seqs_sort_dist[:1000]]),
        col_cluster=False,
        metric="cosine",
        linewidths=0,
        yticklabels=target_labels[profile_mask],
        xticklabels=False,
    )
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig("%s/heat_rank.pdf" % options.out_dir)
    plt.close()

    #################################################################
    # dissect the top hits
    #################################################################
    satmut_targets = ",".join([str(ti) for ti in range(len(activity_profile)) if profile_mask[ti]])

    if gpgpu_str != "":
        gpgpu_str = "-%s" % gpgpu_str

    for ni in range(options.num_dissect):
        si = seqs_sort_dist[ni]

        # print FASTA
        fasta_file = "%s/seq%d.fa" % (options.out_dir, ni)
        fasta_out = open(fasta_file, "w")
        print(">%s\n%s" % (seq_headers[si], seqs[si]), file=fasta_out)
        fasta_out.close()

        # saturated mutagenesis
        cmd = "basset_sat.py %s --mc_n 10 -n 500 -o %s/satmut%d -t %s %s %s" % (
            gpgpu_str,
            options.out_dir,
            ni,
            satmut_targets,
            model_file,
            fasta_file,
        )
        subprocess.call(cmd, shell=True)

        # predictions and targets heat
        profile_sort = np.argsort(activity_profile[profile_mask])
        heat_mat = np.array([activity_profile[profile_mask], targets[si, profile_mask], seqs_preds_prof[si]])
        heat_mat = heat_mat[:, profile_sort]

        plt.figure()
        ax = sns.heatmap(
            np.transpose(heat_mat),
            yticklabels=target_labels[profile_mask][profile_sort],
            xticklabels=["Desired", "Experiment", "Prediction"],
        )
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=-45)
        plt.setp(ax.yaxis.get_majorticklabels(), rotation=-0)
        for label in ax.yaxis.get_majorticklabels():
            label.set_fontsize(options.font_heat)
        plt.savefig("%s/heat%d.pdf" % (options.out_dir, ni))
        plt.close()
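
znorm is called above to pull each target's prediction column toward the profile-weighted mean, but its body is not included. One plausible reading of the call znorm(preds, pred_mean, aim_mean), offered purely as an illustrative sketch, is a mean-matching rescale clipped back into [0, 1]:

import numpy as np

def znorm(preds, pred_mean, aim_mean):
    # hypothetical sketch: move the column mean from pred_mean to aim_mean
    # while keeping probability-like predictions in [0, 1]
    return np.clip(preds * (aim_mean / pred_mean), 0.0, 1.0)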
Example No. 19
0
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='input_activity_file',
        help='Optional activity table corresponding to an input FASTA file')
    parser.add_option(
        '-d',
        dest='model_hdf5_file',
        default=None,
        help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option(
        '-g',
        dest='gain_height',
        default=False,
        action='store_true',
        help=
        'Nucleotide heights determined by the max of loss and gain [Default: %default]'
    )
    parser.add_option('-m',
                      dest='min_limit',
                      default=0.1,
                      type='float',
                      help='Minimum heatmap limit [Default: %default]')
    parser.add_option(
        '-n',
        dest='center_nt',
        default=200,
        type='int',
        help='Center nt to mutate and plot in the heat map [Default: %default]'
    )
    parser.add_option('-o',
                      dest='out_dir',
                      default='heat',
                      help='Output directory [Default: %default]')
    parser.add_option(
        '-s',
        dest='sample',
        default=None,
        type='int',
        help='Sample sequences from the test set [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets',
        default='0',
        help=
        'Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file)'
        )
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file,
                options.input_activity_file,
                mean_norm=False,
                whiten=False,
                permute=False,
                sort=False)

            # read in target names
            target_labels = open(
                options.input_activity_file).readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(
                random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape(
            (seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:  # headers/labels absent from this HDF5
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(
                    random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_sat_predict.lua -center_nt %d %s %s %s' % (
            options.center_nt, model_file, model_input_hdf5,
            options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) // 2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start:delta_start + delta_len]

    # decide which cells to plot
    if options.targets == '-1':
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(',')]

    #################################################################
    # plot
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    rdbu = sns.color_palette("RdBu_r", 10)

    nts = 'ACGT'
    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = 'seq%d' % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute per-substitution deltas vs. the reference prediction,
            # plus per-position min/max deltas (loss and gain below)
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack(
                [min_scores - real_pred_cell, max_scores - real_pred_cell])

            # prepare figure
            sns.set(style='white', font_scale=0.5)
            sns.axes_style({'axes.linewidth': 1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20, 3))
            ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start),
                                       colspan=(logo_end - logo_start))
            ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start),
                                      colspan=(sad_end - sad_start))
            ax_heat = plt.subplot2grid((3, heat_cols), (2, 0),
                                       colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75 / vlim * (abs(minmax_matrix).max(
                    axis=0))
            else:
                seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0])
            logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir,
                                              header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = '%s.png' % logo_eps[:-4]
            subprocess.call('convert -density 300 %s %s' %
                            (logo_eps, logo_png),
                            shell=True)
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0],
                        c=rdbu[0],
                        label='loss',
                        linewidth=1)
            ax_sad.plot(minmax_matrix[1],
                        c=rdbu[-1],
                        label='gain',
                        linewidth=1)
            ax_sad.set_xlim(0, minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ['top', 'bottom', 'left', 'right']:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix,
                        linewidths=0,
                        cmap='RdBu_r',
                        vmin=-vlim,
                        vmax=vlim,
                        xticklabels=False,
                        ax=ax_heat)
            ax_heat.yaxis.set_ticklabels('TGCA',
                                         rotation='horizontal')  # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig('%s/%s_c%d_heat.pdf' %
                        (options.out_dir, header_filename(header), ci),
                        dpi=300)
            plt.close()

        #################################################################
        # print table of nt variability for each cell
        #################################################################
        for ci in range(seq_mod_preds.shape[3]):
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)

            loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
            gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [
                    header, delta_start + pos, ci, loss_matrix[pos],
                    gain_matrix[pos]
                ]
                print >> table_out, '\t'.join([str(c) for c in cols])

    table_out.close()
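
Note: get_real_pred() above is defined elsewhere in the repository. A minimal
sketch of the contract its usage implies, assuming seq_mod_preds is the 4 x L
matrix of predictions (rows in ACGT order) for every single-nucleotide
substitution: substituting a position with its own reference base leaves the
sequence unchanged, so that entry recovers the reference prediction.

def get_real_pred(seq_mod_preds, seq):
    # sketch, not necessarily the repository's implementation
    nts = 'ACGT'
    pos = 0
    while seq[pos] not in nts:  # skip ambiguous leading bases (e.g. N)
        pos += 1
    return seq_mod_preds[nts.index(seq[pos]), pos]
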
Exemplo n.º 20
0
def main():
    usage = 'usage: %prog [options] <model_file> <profile_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='input_activity_file',
        help='Optional activity table corresponding to an input FASTA file')
    parser.add_option(
        '--all',
        dest='all_data',
        default=False,
        action='store_true',
        help=
        'Search all training/valid/test sequences. By default we search only the test set. [Default: %default]'
    )
    parser.add_option('--cuda',
                      dest='cuda',
                      default=False,
                      action='store_true',
                      help='Run on GPGPU [Default: %default]')
    parser.add_option('--cudnn',
                      dest='cudnn',
                      default=False,
                      action='store_true',
                      help='Run on GPGPU w/cuDNN [Default: %default]')
    parser.add_option(
        '-d',
        dest='model_out_file',
        default=None,
        help='Pre-computed model predictions output table [Default: %default]')
    parser.add_option(
        '-e',
        dest='norm_even',
        default=False,
        action='store_true',
        help=
        'Normalize the weights for the positive and negative datasets to be even [Default: %default]'
    )
    parser.add_option('-f',
                      dest='font_heat',
                      default=6,
                      type='int',
                      help='Heat map axis font size [Default: %default]')
    parser.add_option('-n',
                      dest='num_dissect',
                      default=10,
                      type='int',
                      help='Dissect the top n hits [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='profile',
                      help='Output directory [Default: %default]')
    parser.add_option(
        '-r',
        dest='norm_preds',
        default=False,
        action='store_true',
        help='Normalize predictions to have equal frequency [Default: %default]'
    )
    parser.add_option(
        '-z',
        dest='weight_zero',
        default=1.0,
        type='float',
        help=
        'Adjust the weights for the zero samples by this value [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            'Must provide Basset model file, activity profile file, and input sequences (as a FASTA file or test data in an HDF file)'
        )
    else:
        model_file = args[0]
        profile_file = args[1]
        input_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        # convert to arrays
        seqs = np.array(seqs)
        seq_headers = np.array(seq_headers)

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file,
                options.input_activity_file,
                mean_norm=False,
                whiten=False,
                permute=False,
                sort=False)

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None

        # reshape sequences for torch
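        # each 1-hot sequence arrives as a flat length-4L vector; the model
        # treats it as a 4-channel, height-1 image, hence (N, 4, 1, L)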
        seqs_1hot = seqs_1hot.reshape(
            (seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to an HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError, UnicodeDecodeError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')

            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            seq_headers = np.array(
                [h.decode('UTF-8') for h in hdf5_in['test_headers']])

            hdf5_in.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    # GPU options (needed below, too)
    gpgpu_str = ''
    if options.cudnn:
        gpgpu_str = '-cudnn'
    elif options.cuda:
        gpgpu_str = '-cuda'

    if options.model_out_file is None:
        options.model_out_file = '%s/preds.txt' % options.out_dir

        torch_cmd = 'basset_predict.lua -mc_n 10 -rc %s %s %s %s' % (
            gpgpu_str, model_file, model_input_hdf5, options.model_out_file)
        print(torch_cmd)
        subprocess.call(torch_cmd, shell=True)

    # read in predictions
    seqs_preds = np.loadtxt(options.model_out_file)

    num_targets = seqs_preds.shape[1]

    #################################################################
    # parse profile file
    #################################################################
    activity_profile, profile_weights, profile_mask, target_labels = load_profile(
        profile_file, num_targets, options.norm_even, options.weight_zero)

    # normalize predictions
    if options.norm_preds:
        pred_means = seqs_preds.mean(axis=0)

        # save to file for basset_refine.py
        np.save('%s/pred_means' % options.out_dir, pred_means)

        # aim for profile weighted average
        aim_mean = np.average(pred_means[profile_mask],
                              weights=profile_weights[profile_mask])

        # normalize
        for ti in range(seqs_preds.shape[1]):
            ratio_ti = pred_means[ti] / aim_mean
            if profile_mask[ti] and (ratio_ti < 0.25 or ratio_ti > 4):
                print(
                    'WARNING: target %d with mean %.4f differs 4-fold from the aim mean %.3f'
                    % (ti, pred_means[ti], aim_mean),
                    file=sys.stderr)
            seqs_preds[:, ti] = znorm(seqs_preds[:, ti], pred_means[ti],
                                      aim_mean)

    #################################################################
    # plot clustered heat map limited to relevant targets
    #################################################################
    seqs_preds_prof = seqs_preds[:, profile_mask]
    seqs_preds_var = seqs_preds_prof.var(axis=1)
    seqs_sort_var = np.argsort(seqs_preds_var)[::-1]
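    # descending-variance order: the clustered heat map below shows the 1500
    # sequences whose predictions differ most across the profiled targets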

    # heat map
    plt.figure()
    g = sns.clustermap(np.transpose(seqs_preds_prof[seqs_sort_var[:1500]]),
                       metric='cosine',
                       linewidths=0,
                       yticklabels=target_labels[profile_mask],
                       xticklabels=False)
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig('%s/heat_clust.pdf' % options.out_dir)
    plt.close()

    # dimension reduction
    # model_pca = PCA(n_components=50)
    # spp_pca = model.fit_transform(np.transpose(seqs_preds_prof))
    # model = TSNE(n_components=2, perplexity=5, metric='euclidean')
    # spp_dr = model.fit_transform(spp_pca)
    model = PCA(n_components=2)
    spp_dr = model.fit_transform(np.transpose(seqs_preds_prof))
    plt.figure()
    plt.scatter(spp_dr[:, 0], spp_dr[:, 1], c='black', s=5)
    target_labels_prof_concise = [
        tl.split(':')[-1] for tl in target_labels[profile_mask]
    ]
    for label, x, y, activity in zip(target_labels_prof_concise, spp_dr[:, 0],
                                     spp_dr[:, 1],
                                     activity_profile[profile_mask]):
        plt.annotate(label,
                     xy=(x, y),
                     size=10,
                     color=sns.color_palette('deep')[int(activity)])
    plt.savefig('%s/dim_red.pdf' % options.out_dir)
    plt.close()

    #################################################################
    # compute profile distances
    #################################################################
    # compute prediction distances
    seqs_pdists = []
    for si in range(seqs_preds.shape[0]):
        # sd = np.power(seqs_preds[si,profile_mask]-activity_profile[profile_mask], 2).sum()
        sd = log_loss(activity_profile[profile_mask],
                      seqs_preds[si, profile_mask],
                      sample_weight=profile_weights[profile_mask])
        seqs_pdists.append(sd)
    seqs_pdists = np.array(seqs_pdists)

    # obtain sorted indexes
    seqs_sort_dist = np.argsort(seqs_pdists)

    # compute target distances
    seqs_tdists = []
    for si in range(seqs_preds.shape[0]):
        tdists = np.absolute(targets[si, profile_mask] -
                             activity_profile[profile_mask])
        tdists_weight = np.multiply(tdists, profile_weights[profile_mask])
        td = tdists_weight.sum()
        seqs_tdists.append(td)
    seqs_tdists = np.array(seqs_tdists)

    # print as table
    table_out = open('%s/table.txt' % options.out_dir, 'w')
    for si in seqs_sort_dist:
        cols = [si, seqs_pdists[si], seqs_tdists[si]] + list(
            seqs_preds[si, profile_mask])
        print('\t'.join([str(c) for c in cols]), file=table_out)
    table_out.close()

    #################################################################
    # plot sorted heat map
    #################################################################
    plt.figure()
    g = sns.clustermap(np.transpose(seqs_preds_prof[seqs_sort_dist[:1000]]),
                       col_cluster=False,
                       metric='cosine',
                       linewidths=0,
                       yticklabels=target_labels[profile_mask],
                       xticklabels=False)
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig('%s/heat_rank.pdf' % options.out_dir)
    plt.close()

    #################################################################
    # dissect the top hits
    #################################################################
    satmut_targets = ','.join(
        [str(ti) for ti in range(len(activity_profile)) if profile_mask[ti]])
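    # comma-separated indices of the profiled targets, forwarded below to
    # basset_sat.py via its -t option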

    if gpgpu_str != '':
        gpgpu_str = '-%s' % gpgpu_str

    for ni in range(options.num_dissect):
        si = seqs_sort_dist[ni]

        # print FASTA
        fasta_file = '%s/seq%d.fa' % (options.out_dir, ni)
        fasta_out = open(fasta_file, 'w')
        print('>%s\n%s' % (seq_headers[si], seqs[si]), file=fasta_out)
        fasta_out.close()

        # saturated mutagenesis
        cmd = 'basset_sat.py %s --mc_n 10 -n 500 -o %s/satmut%d -t %s %s %s' % (
            gpgpu_str, options.out_dir, ni, satmut_targets, model_file,
            fasta_file)
        subprocess.call(cmd, shell=True)

        # predictions and targets heat
        profile_sort = np.argsort(activity_profile[profile_mask])
        heat_mat = np.array([
            activity_profile[profile_mask], targets[si, profile_mask],
            seqs_preds_prof[si]
        ])
        heat_mat = heat_mat[:, profile_sort]

        plt.figure()
        ax = sns.heatmap(np.transpose(heat_mat),
                         yticklabels=target_labels[profile_mask][profile_sort],
                         xticklabels=['Desired', 'Experiment', 'Prediction'])
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=-45)
        plt.setp(ax.yaxis.get_majorticklabels(), rotation=-0)
        for label in ax.yaxis.get_majorticklabels():
            label.set_fontsize(options.font_heat)
        plt.savefig('%s/heat%d.pdf' % (options.out_dir, ni))
        plt.close()
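
Note: znorm() above is imported from elsewhere; the call only fixes its
contract (rescale a prediction column whose observed mean is pred_means[ti]
toward aim_mean). A hypothetical sketch consistent with that signature; the
repository's implementation may normalize differently.

import numpy as np

def znorm(preds, obs_mean, aim_mean):
    # hypothetical: scale the column so its mean moves to aim_mean,
    # clipping back into the [0, 1] probability range
    return np.clip(preds * (aim_mean / obs_mean), 0.0, 1.0)
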
Exemplo n.º 21
0
def main():
    usage = 'usage: %prog [options] <model_file> <test_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='act_t',
        default=0.5,
        type='float',
        help=
        'Activation threshold (as proportion of max) to consider for PWM [Default: %default]'
    )
    parser.add_option('-d',
                      dest='model_hdf5_file',
                      default=None,
                      help='Pre-computed model output as HDF5.')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-m',
                      dest='meme_db',
                      default='%s/data/motifs/Homo_sapiens.meme' %
                      os.environ['BASSETDIR'],
                      help='MEME database used to annotate motifs')
    parser.add_option(
        '-p',
        dest='plot_heats',
        default=False,
        action='store_true',
        help=
        'Plot heat maps describing filter activations in the test sequences [Default: %default]'
    )
    parser.add_option(
        '-s',
        dest='sample',
        default=None,
        type='int',
        help='Sample sequences from the test set [Default: %default]')
    parser.add_option(
        '-t',
        dest='trim_filters',
        default=False,
        action='store_true',
        help=
        'Trim uninformative positions off the filter ends [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide Basset model file and test data in HDF5 format.')
    else:
        model_file = args[0]
        test_hdf5_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, 'r')
    seq_vecs = np.array(test_hdf5_in['test_in'])
    seq_targets = np.array(test_hdf5_in['test_out'])
    try:
        target_names = list(test_hdf5_in['target_labels'])
    except KeyError:
        target_names = ['t%d' % ti for ti in range(seq_targets.shape[1])]
    test_hdf5_in.close()

    #################################################################
    # sample
    #################################################################
    if options.sample is not None:
        # choose sampled indexes
        sample_i = np.array(
            random.sample(xrange(seq_vecs.shape[0]), options.sample))

        # filter
        seq_vecs = seq_vecs[sample_i]
        seq_targets = seq_targets[sample_i]

        # create a new HDF5 file
        sample_hdf5_file = '%s/sample.h5' % options.out_dir
        sample_hdf5_out = h5py.File(sample_hdf5_file, 'w')
        sample_hdf5_out.create_dataset('test_in', data=seq_vecs)
        sample_hdf5_out.close()

        # update test HDF5
        test_hdf5_file = sample_hdf5_file

    # convert to letters
    seqs = dna_io.vecs2dna(seq_vecs)

    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_motifs_predict.lua %s %s %s' % (
            model_file, test_hdf5_file, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    filter_weights = np.array(model_hdf5_in['weights'])
    filter_outs = np.array(model_hdf5_in['outs'])
    model_hdf5_in.close()

    # store useful variables
    num_filters = filter_weights.shape[0]
    filter_size = filter_weights.shape[2]

    #################################################################
    # individual filter plots
    #################################################################
    # also save information contents
    filters_ic = []
    meme_out = meme_intro('%s/filters_meme.txt' % options.out_dir, seqs)
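    # meme_intro() presumably opens filters_meme.txt and writes the MEME
    # header, estimating background nucleotide frequencies from seqs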

    for f in range(num_filters):
        print 'Filter %d' % f

        # plot filter parameters as a heatmap
        plot_filter_heat(filter_weights[f, :, :],
                         '%s/filter%d_heat.pdf' % (options.out_dir, f))

        # write possum motif file
        filter_possum(filter_weights[f, :, :], 'filter%d' % f,
                      '%s/filter%d_possum.txt' % (options.out_dir, f),
                      options.trim_filters)

        # plot weblogo of high scoring outputs
        plot_filter_logo(filter_outs[:, f, :],
                         filter_size,
                         seqs,
                         '%s/filter%d_logo' % (options.out_dir, f),
                         maxpct_t=options.act_t)

        # make a PWM for the filter
        filter_pwm, nsites = make_filter_pwm('%s/filter%d_logo.fa' %
                                             (options.out_dir, f))

        if nsites < 10:
            # no information
            filters_ic.append(0)
        else:
            # compute and save information content
            filters_ic.append(info_content(filter_pwm))

            # add to the meme motif file
            meme_add(meme_out, f, filter_pwm, nsites, options.trim_filters)

    meme_out.close()

    #################################################################
    # annotate filters
    #################################################################
    # run tomtom
    subprocess.call(
        'tomtom -dist pearson -thresh 0.1 -oc %s/tomtom %s/filters_meme.txt %s'
        % (options.out_dir, options.out_dir, options.meme_db),
        shell=True)

    # read in annotations
    filter_names = name_filters(num_filters,
                                '%s/tomtom/tomtom.txt' % options.out_dir,
                                options.meme_db)

    #################################################################
    # print a table of information
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    # print header for later pandas reading
    header_cols = ('', 'consensus', 'annotation', 'ic', 'mean', 'std')
    print >> table_out, '%3s  %19s  %10s  %5s  %6s  %6s' % header_cols

    for f in range(num_filters):
        # collapse to a consensus motif
        consensus = filter_motif(filter_weights[f, :, :])

        # grab annotation
        annotation = '.'
        name_pieces = filter_names[f].split('_')
        if len(name_pieces) > 1:
            annotation = name_pieces[1]

        # plot density of filter output scores
        fmean, fstd = plot_score_density(
            np.ravel(filter_outs[:, f, :]),
            '%s/filter%d_dens.pdf' % (options.out_dir, f))

        row_cols = (f, consensus, annotation, filters_ic[f], fmean, fstd)
        print >> table_out, '%-3d  %19s  %10s  %5.2f  %6.4f  %6.4f' % row_cols

    table_out.close()

    #################################################################
    # global filter plots
    #################################################################
    if options.plot_heats:
        # plot filter-sequence heatmap
        plot_filter_seq_heat(filter_outs,
                             '%s/filter_seqs.pdf' % options.out_dir)

        # plot filter-segment heatmap
        plot_filter_seg_heat(filter_outs,
                             '%s/filter_segs.pdf' % options.out_dir)
        plot_filter_seg_heat(filter_outs,
                             '%s/filter_segs_raw.pdf' % options.out_dir,
                             whiten=False)

        # plot filter-target correlation heatmap
        plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                         '%s/filter_target_cors_mean.pdf' % options.out_dir,
                         'mean')
        plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                         '%s/filter_target_cors_max.pdf' % options.out_dir,
                         'max')
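
Note: info_content() above is defined elsewhere. Under the standard definition
for DNA motifs with a uniform background, per-position information is
2 - H(p) bits; a sketch assuming the PWM has one row per position with columns
in ACGT order.

import numpy as np

def info_content(pwm, pseudo=1e-9):
    # total information content of a PWM in bits; the pseudocount
    # guards log2(0) at degenerate positions
    p = np.array(pwm, dtype=float) + pseudo
    p /= p.sum(axis=1, keepdims=True)
    return float(np.sum(2 + np.sum(p * np.log2(p), axis=1)))
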
Exemplo n.º 22
0
def main():
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='targets_file',
        default=None,
        help='File labeling targets in the second column [Default: %default]')
    parser.add_option(
        '-c',
        dest='center_nt',
        default=50,
        type='int',
        help='Center nt to consider kmers from [Default: %default]')
    parser.add_option('-d',
                      dest='model_out_file',
                      default=None,
                      help='Pre-computed model output table.')
    parser.add_option('-k',
                      dest='kmer',
                      default=8,
                      type='int',
                      help='K-mer length [Default: %default]')
    parser.add_option('-l',
                      dest='seq_len',
                      default=1000,
                      type='int',
                      help='Input sequence length [Default: %default]')
    parser.add_option(
        '-n',
        dest='num_seqs',
        default=100000,
        type='int',
        help='Number of sequences to predict [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option(
        '-r',
        dest='rc',
        default=False,
        action='store_true',
        help='Consider k-mers w/ their reverse complements [Default: %default]'
    )
    parser.add_option(
        '-t',
        dest='targets',
        default=None,
        help=
        'Comma-separated list of targets to analyze in more depth [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file.')
    else:
        model_file = args[0]

    random.seed(2)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # generate random sequences
    #################################################################
    # random sequences
    seq_vecs = np.zeros((options.num_seqs, 4, 1, options.seq_len),
                        dtype='float16')
    for si in range(options.num_seqs):
        for li in range(options.seq_len):
            ni = random.randint(0, 3)
            seq_vecs[si, ni, 0, li] = 1

    # create a new HDF5 file
    seq_hdf5_file = '%s/seqs.h5' % options.out_dir
    seq_hdf5_out = h5py.File(seq_hdf5_file, 'w')
    seq_hdf5_out.create_dataset('test_in', data=seq_vecs)
    seq_hdf5_out.close()

    # get fasta
    seq_dna = vecs2dna(seq_vecs)

    #################################################################
    # Torch predict
    #################################################################
    if options.model_out_file is None:
        options.model_out_file = '%s/model_out.txt' % options.out_dir

        torch_cmd = 'basset_predict.lua -scores %s %s %s' % (
            model_file, seq_hdf5_file, options.model_out_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    # load scores
    seq_scores = np.loadtxt(options.model_out_file, dtype='float32')

    # read target labels
    if options.targets_file:
        target_labels = [
            line.split()[1] for line in open(options.targets_file)
        ]
    else:
        target_labels = ['t%d' % (ti + 1) for ti in range(seq_scores.shape[1])]

    if options.targets is None:
        options.targets = range(seq_scores.shape[1])
    else:
        options.targets = [int(ti) for ti in options.targets.split(',')]

    #################################################################
    # process and output
    #################################################################
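    # k-mers are scored only within the central center_nt window of each
    # random sequence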
    kmers_start = (options.seq_len - options.center_nt) // 2

    for ti in options.targets:
        ##############################################
        # hash scores by k-mer
        ##############################################
        kmer_scores = {}

        for si in range(len(seq_dna)):
            # get score
            sscore = seq_scores[si, ti]

            # hash to each center kmer
            for ki in range(kmers_start, kmers_start + options.center_nt):
                kmer = seq_dna[si][ki:ki + options.kmer]
                if options.rc:
                    kmer = consider_rc(kmer)

                kmer_scores.setdefault(kmer, []).append(sscore)

        ##############################################
        # print table
        ##############################################
        table_out = open('%s/table%d.txt' % (options.out_dir, ti), 'w')

        for kmer in kmer_scores:
            cols = (kmer, len(kmer_scores[kmer]), np.mean(kmer_scores[kmer]),
                    np.std(kmer_scores[kmer]) /
                    math.sqrt(len(kmer_scores[kmer])))
            print >> table_out, '%s  %4d  %6.3f  %6.3f' % cols

        table_out.close()
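
Note: consider_rc() above is imported from elsewhere; its usage only requires
that a k-mer and its reverse complement hash to the same key. A plausible
Python 2 sketch (matching this example) that canonicalizes to the
lexicographically smaller of the pair:

import string

def consider_rc(kmer):
    # fold a k-mer and its reverse complement onto one canonical
    # representative so both strands share a score list
    rc = kmer.translate(string.maketrans('ACGT', 'TGCA'))[::-1]
    return min(kmer, rc)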