示例#1
0
def bigwig_write(snp, seq_len, preds, model, bw_file, genome_file):
    """Write per-bin predictions for a SNP-centered sequence to a BigWig.

    Args:
      snp: SNP object exposing .chrom and .pos; the model input window is
        centered on this variant.
      seq_len: model input sequence length (bp).
      preds: iterable of per-bin prediction values.
      model: model object exposing batch_buffer and target_pool.
      bw_file: output BigWig file path.
      genome_file: chromosome lengths file used for the BigWig header.
    """
    bw = bigwig_open(bw_file, genome_file)

    # genomic start of the model input window, centered on the SNP
    window_start = snp.pos - seq_len // 2

    num_bins = len(preds)
    chroms = [snp.chrom] * num_bins

    # each bin spans target_pool bp, offset past the batch buffer
    starts = []
    for bi in range(num_bins):
        starts.append(
            int(window_start + model.batch_buffer + bi * model.target_pool))
    ends = [int(s + model.target_pool) for s in starts]

    bw.addEntries(chroms, starts, ends=ends,
                  values=[float(p) for p in preds])
    bw.close()
示例#2
0
def score_write(sess, model, options, seqs_1hot, seqs_chrom, seqs_start):
    """Compute gradient x representation scores and write them as BigWigs.

    For each sequence, gradients w.r.t. one intermediate layer are dotted
    with that layer's representation (optionally via a p-norm across units),
    averaged over the ensemble axis, and written as one BigWig per target.

    Args:
      sess: TensorFlow session with model variables restored.
      model: built model exposing gradients(), batch_size, target_pool.
      options: parsed options (rc, shifts, mc_n, norm, target_indexes,
        out_dir, genome_file).
      seqs_1hot: one-hot encoded sequences, indexed by sequence on axis 0.
      seqs_chrom: per-sequence chromosome names.
      seqs_start: per-sequence genomic start coordinates.
    """
    for si in range(seqs_1hot.shape[0]):
        # initialize batcher for this single sequence
        batcher = basenji.batcher.Batcher(seqs_1hot[si:si + 1],
                                          batch_size=model.batch_size,
                                          pool_width=model.target_pool)

        # get layer gradients and representations
        t0 = time.time()
        print('Computing gradients.', end='', flush=True)
        _, _, _, batch_grads, batch_reprs, _ = model.gradients(
            sess,
            batcher,
            rc=options.rc,
            shifts=options.shifts,
            mc_n=options.mc_n,
            return_all=True)
        print(' Done in %ds.' % (time.time() - t0), flush=True)

        # only one layer was requested; unwrap it
        batch_reprs = batch_reprs[0]
        batch_grads = batch_grads[0]

        # increase resolution
        batch_reprs = batch_reprs.astype('float32')
        batch_grads = batch_grads.astype('float32')

        # S (sequences) x T (targets) x P (seq position) x U (units layer i) x E (ensembles)
        print('batch_grads', batch_grads.shape)
        pooled_length = batch_grads.shape[2]

        # S (sequences) x P (seq position) x U (Units layer i) x E (ensembles)
        print('batch_reprs', batch_reprs.shape)

        # write bigwigs
        t0 = time.time()
        print('Writing BigWigs.', end='', flush=True)

        # for each target
        for tii in range(len(options.target_indexes)):
            ti = options.target_indexes[tii]

            # compute scores: dot representation and gradient across units
            if options.norm is None:
                batch_grads_scores = np.multiply(
                    batch_reprs[0], batch_grads[0, tii, :, :, :]).sum(axis=1)
            else:
                # p-norm across the units axis instead of a plain sum
                batch_grads_scores = np.multiply(batch_reprs[0],
                                                 batch_grads[0, tii, :, :, :])
                batch_grads_scores = np.power(np.abs(batch_grads_scores),
                                              options.norm)
                batch_grads_scores = batch_grads_scores.sum(axis=1)
                batch_grads_scores = np.power(batch_grads_scores,
                                              1. / options.norm)

            # mean score across the ensemble axis
            batch_grads_mean = batch_grads_scores.mean(axis=1)

            # one-sample t-test p-value across the ensemble axis
            # (previously this identical call was duplicated in both branches)
            batch_grads_pval = ttest_1samp(batch_grads_scores, 0, axis=1)[1]
            if options.norm is not None:
                # NOTE(review): halving presumably converts two-sided to
                # one-sided since p-norm scores are non-negative -- confirm.
                # batch_grads_pval = chi2(df=)
                batch_grads_pval /= 2

            # open bigwig (p-value track output is currently disabled)
            bws_file = '%s/s%d_t%d_scores.bw' % (options.out_dir, si, ti)
            bwp_file = '%s/s%d_t%d_pvals.bw' % (options.out_dir, si, ti)
            bws_open = bigwig_open(bws_file, options.genome_file)
            # bwp_open = bigwig_open(bwp_file, options.genome_file)

            # specify bigwig locations and values
            bw_chroms = [seqs_chrom[si]] * pooled_length
            bw_starts = [
                int(seqs_start[si] + pi * model.target_pool)
                for pi in range(pooled_length)
            ]
            bw_ends = [int(bws + model.target_pool) for bws in bw_starts]
            bws_values = [float(bgs) for bgs in batch_grads_mean]
            # bwp_values = [float(bgp) for bgp in batch_grads_pval]

            # write
            bws_open.addEntries(bw_chroms,
                                bw_starts,
                                ends=bw_ends,
                                values=bws_values)
            # bwp_open.addEntries(bw_chroms, bw_starts, ends=bw_ends, values=bwp_values)

            # close
            bws_open.close()
            # bwp_open.close()

        print(' Done in %ds.' % (time.time() - t0), flush=True)
        gc.collect()
示例#3
0
def main():
    """Map gradient x representation scores per gene TSS to BigWig tracks.

    Usage: %prog [options] <params_file> <model_file> <genes_hdf5_file>

    Builds a SeqNN model, computes gradients of each TSS/target w.r.t. an
    early convolutional layer, dots them with that layer's representation,
    and writes one BigWig per (gene TSS, target index).
    """
    usage = "usage: %prog [options] <params_file> <model_file> <genes_hdf5_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-g",
        dest="genome_file",
        default="%s/data/human.hg19.genome" % os.environ["BASENJIDIR"],
        help="Chromosome lengths file [Default: %default]",
    )
    parser.add_option("-l",
                      dest="gene_list",
                      help="Process only gene ids in the given file")
    parser.add_option(
        "-o",
        dest="out_dir",
        default="grad_mapg",
        help="Output directory [Default: %default]",
    )
    parser.add_option("-t",
                      dest="target_indexes",
                      default=None,
                      help="Target indexes to plot")
    (options, args) = parser.parse_args()

    if len(args) != 3:
        # NOTE(review): message says "genomic position" but the third
        # argument parsed is the genes HDF5 file -- likely a stale message.
        parser.error("Must provide parameters, model, and genomic position")
    else:
        params_file = args[0]
        model_file = args[1]
        genes_hdf5_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # reads in genes HDF5

    gene_data = genedata.GeneData(genes_hdf5_file)

    # subset gene sequences to those whose ids are listed in -l
    genes_subset = set()
    if options.gene_list:
        for line in open(options.gene_list):
            genes_subset.add(line.rstrip())

        gene_data.subset_genes(genes_subset)
        print("Filtered to %d sequences" % gene_data.num_seqs)

    #######################################################
    # model parameters and placeholders

    job = params.read_job_params(params_file)

    # override data dimensions with those recorded in the genes HDF5
    job["seq_length"] = gene_data.seq_length
    job["seq_depth"] = gene_data.seq_depth
    job["target_pool"] = gene_data.pool_width

    if "num_targets" not in job:
        print(
            "Must specify number of targets (num_targets) in the parameters file.",
            file=sys.stderr,
        )
        exit(1)

    # set target indexes (all targets when -t is omitted)
    if options.target_indexes is not None:
        options.target_indexes = [
            int(ti) for ti in options.target_indexes.split(",")
        ]
        target_subset = options.target_indexes
    else:
        options.target_indexes = list(range(job["num_targets"]))
        target_subset = None

    # build model
    model = seqnn.SeqNN()
    model.build(job, target_subset=target_subset)

    # determine latest pre-dilated layer
    # NOTE(review): np.min selects the index of the FIRST layer whose
    # dilation exceeds 1, not the layer before it -- confirm intent.
    cnn_dilation = np.array([cp.dilation for cp in model.hp.cnn_params])
    dilated_mask = cnn_dilation > 1
    dilated_indexes = np.where(dilated_mask)[0]
    pre_dilated_layer = np.min(dilated_indexes)
    print("Pre-dilated layer: %d" % pre_dilated_layer)

    # build gradients ops
    t0 = time.time()
    print("Building target/position-specific gradient ops.", end="")
    model.build_grads_genes(gene_data.gene_seqs, layers=[pre_dilated_layer])
    print(" Done in %ds" % (time.time() - t0), flush=True)

    #######################################################
    # acquire gradients

    # initialize saver
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # load variables into session
        saver.restore(sess, model_file)

        for si in range(gene_data.num_seqs):
            # initialize batcher for this single sequence
            batcher_si = batcher.Batcher(
                gene_data.seqs_1hot[si:si + 1],
                batch_size=model.hp.batch_size,
                pool_width=model.hp.target_pool,
            )

            # get layer representations
            t0 = time.time()
            print("Computing gradients.", end="", flush=True)
            batch_grads, batch_reprs = model.gradients_genes(
                sess, batcher_si, gene_data.gene_seqs[si:si + 1])
            print(" Done in %ds." % (time.time() - t0), flush=True)

            # only one layer was requested; unwrap it
            batch_reprs = batch_reprs[0]
            batch_grads = batch_grads[0]

            # G (TSSs) x T (targets) x P (seq position) x U (Units layer i)
            print("batch_grads", batch_grads.shape)
            pooled_length = batch_grads.shape[2]

            # S (sequences) x P (seq position) x U (Units layer i)
            print("batch_reprs", batch_reprs.shape)

            # write bigwigs
            t0 = time.time()
            print("Writing BigWigs.", end="", flush=True)

            # for each TSS
            for tss_i in range(batch_grads.shape[0]):
                tss = gene_data.gene_seqs[si].tss_list[tss_i]

                # for each target
                for tii in range(len(options.target_indexes)):
                    ti = options.target_indexes[tii]

                    # dot representation and gradient across the units axis
                    batch_grads_score = np.multiply(
                        batch_reprs[0], batch_grads[tss_i,
                                                    tii, :, :]).sum(axis=1)

                    # open bigwig, named by gene, TSS, and target index
                    bw_file = "%s/%s-%s_t%d.bw" % (
                        options.out_dir,
                        tss.gene_id,
                        tss.identifier,
                        ti,
                    )
                    bw_open = bigwig_open(bw_file, options.genome_file)

                    # access gene sequence information
                    seq_chrom = gene_data.gene_seqs[si].chrom
                    seq_start = gene_data.gene_seqs[si].start

                    # specify bigwig locations and values; each bin spans
                    # target_pool bp from the sequence start
                    bw_chroms = [seq_chrom] * pooled_length
                    bw_starts = [
                        int(seq_start + li * model.hp.target_pool)
                        for li in range(pooled_length)
                    ]
                    bw_ends = [
                        int(bws + model.hp.target_pool) for bws in bw_starts
                    ]
                    bw_values = [float(bgs) for bgs in batch_grads_score]

                    # write
                    bw_open.addEntries(bw_chroms,
                                       bw_starts,
                                       ends=bw_ends,
                                       values=bw_values)

                    # close
                    bw_open.close()

            print(" Done in %ds." % (time.time() - t0), flush=True)
            gc.collect()
示例#4
0
def main():
    """Compute SNP expression difference (SED) scores for genes.

    Usage: %prog [options] <params_file> <model_file> <genes_hdf5_file> <vcf_file>

    Predicts gene expression for reference and alternate alleles of each
    VCF SNP intersecting a gene sequence, writes per-gene (and optionally
    per-transcript) SED/SER tables, and optionally per-target BigWig tracks.
    Supports a multi-worker mode driven by a pickled options file.
    """
    usage = 'usage: %prog [options] <params_file> <model_file> <genes_hdf5_file> <vcf_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='all_sed',
        default=False,
        action='store_true',
        help=
        'Print all variant-gene pairs, as opposed to only nonzero [Default: %default]'
    )
    parser.add_option('-b',
                      dest='batch_size',
                      default=None,
                      type='int',
                      help='Batch size [Default: %default]')
    parser.add_option('-c',
                      dest='csv',
                      default=False,
                      action='store_true',
                      help='Print table as CSV [Default: %default]')
    parser.add_option('-g',
                      dest='genome_file',
                      default='%s/assembly/human.hg19.genome' %
                      os.environ['HG19'],
                      help='Chromosome lengths file [Default: %default]')
    parser.add_option(
        '-o',
        dest='out_dir',
        default='sed',
        help='Output directory for tables and plots [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of processes, passed by multi script')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Average the forward and reverse complement predictions when testing [Default: %default]'
    )
    parser.add_option(
        '--ti',
        dest='track_indexes',
        help='Comma-separated list of target indexes to output BigWig tracks')
    parser.add_option(
        '-x',
        dest='transcript_table',
        default=False,
        action='store_true',
        help='Print transcript table in addition to gene [Default: %default]')
    parser.add_option(
        '-w',
        dest='tss_width',
        default=1,
        type='int',
        help=
        'Width of bins considered to quantify TSS transcription [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) == 4:
        # single worker
        params_file = args[0]
        model_file = args[1]
        genes_hdf5_file = args[2]
        vcf_file = args[3]

    elif len(args) == 6:
        # multi worker: options arrive pickled from the dispatching script
        options_pkl_file = args[0]
        params_file = args[1]
        model_file = args[2]
        genes_hdf5_file = args[3]
        vcf_file = args[4]
        worker_index = int(args[5])

        # load options
        options_pkl = open(options_pkl_file, 'rb')
        options = pickle.load(options_pkl)
        options_pkl.close()

        # update output directory so each worker writes its own subdir
        options.out_dir = '%s/job%d' % (options.out_dir, worker_index)

    else:
        parser.error(
            'Must provide parameters and model files, genes HDF5 file, and QTL VCF file'
        )

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.track_indexes is None:
        options.track_indexes = []
    else:
        options.track_indexes = [
            int(ti) for ti in options.track_indexes.split(',')
        ]
        if not os.path.isdir('%s/tracks' % options.out_dir):
            os.mkdir('%s/tracks' % options.out_dir)

    #################################################################
    # reads in genes HDF5

    gene_data = basenji.genes.GeneData(genes_hdf5_file)

    # filter for worker sequences
    if options.processes is not None:
        gene_data.worker(worker_index, options.processes)

    #################################################################
    # prep SNPs

    # load SNPs
    snps = basenji.vcf.vcf_snps(vcf_file)

    # intersect w/ segments
    print('Intersecting gene sequences with SNPs...', flush=True, end='')
    seqs_snps = basenji.vcf.intersect_seqs_snps(vcf_file,
                                                gene_data.seq_coords,
                                                vision_p=0.5)
    print('done', flush=True)

    #################################################################
    # determine SNP sequences to be needed

    # build a flat work list: one entry per (sequence, allele) prediction.
    # (seq_i, None, None) is the major-allele baseline; each
    # (seq_i, pos, alt) entry is a single-SNP alternate allele.
    seqs_snps_list = []
    for seq_i in range(gene_data.num_seqs):
        seq_chrom, seq_start, seq_end = gene_data.seq_coords[seq_i]

        if seqs_snps[seq_i]:
            # add major allele
            seqs_snps_list.append((seq_i, None, None))

            # add minor alleles
            for snp_i in seqs_snps[seq_i]:
                # determine SNP position wrt sequence (VCF pos is 1-based)
                snp_seq_pos = snps[snp_i].pos - 1 - seq_start

                # update primary sequence to use major allele
                basenji.dna_io.hot1_set(gene_data.seqs_1hot[seq_i],
                                        snp_seq_pos, snps[snp_i].ref_allele)
                assert (basenji.dna_io.hot1_get(
                    gene_data.seqs_1hot[seq_i],
                    snp_seq_pos) == snps[snp_i].ref_allele)

                # append descriptive tuple to list
                seqs_snps_list.append(
                    (seq_i, snp_seq_pos, snps[snp_i].alt_alleles[0]))

    #################################################################
    # setup model

    job = basenji.dna_io.read_job_params(params_file)

    # override data dimensions with those recorded in the genes HDF5
    job['batch_length'] = gene_data.seq_length
    job['seq_depth'] = gene_data.seq_depth
    job['target_pool'] = gene_data.pool_width

    if 'num_targets' not in job and gene_data.num_targets is not None:
        job['num_targets'] = gene_data.num_targets

    if 'num_targets' not in job:
        print(
            "Must specify number of targets (num_targets) in the parameters file. I know, it's annoying. Sorry.",
            file=sys.stderr)
        exit(1)

    # build model
    model = basenji.seqnn.SeqNN()
    model.build(job)

    #################################################################
    # compute, collect, and print SEDs

    header_cols = ('rsid', 'ref', 'alt', 'gene', 'tss_dist', 'target',
                   'ref_pred', 'alt_pred', 'sed', 'ser')
    if options.csv:
        sed_gene_out = open('%s/sed_gene.csv' % options.out_dir, 'w')
        print(','.join(header_cols), file=sed_gene_out)
        if options.transcript_table:
            sed_tx_out = open('%s/sed_tx.csv' % options.out_dir, 'w')
            print(','.join(header_cols), file=sed_tx_out)

    else:
        sed_gene_out = open('%s/sed_gene.txt' % options.out_dir, 'w')
        print(' '.join(header_cols), file=sed_gene_out)
        if options.transcript_table:
            sed_tx_out = open('%s/sed_tx.txt' % options.out_dir, 'w')
            print(' '.join(header_cols), file=sed_tx_out)

    # helper variables
    # adj: half-width of the TSS bin window; pred_buffer: bins trimmed
    # from each end of the prediction by the model's batch buffer
    adj = options.tss_width // 2
    pred_buffer = model.batch_buffer // model.target_pool

    # initialize saver
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # load variables into session
        saver.restore(sess, model_file)

        # initialize prediction stream
        seq_preds = PredStream(sess, model, gene_data.seqs_1hot,
                               seqs_snps_list, 128, options.rc)

        # prediction index: must advance in the exact order that
        # seqs_snps_list was built above (ref first, then each alt)
        pi = 0

        for seq_i in range(gene_data.num_seqs):
            if seqs_snps[seq_i]:
                # get reference prediction (LxT)
                ref_preds = seq_preds[pi]
                pi += 1

                for snp_i in seqs_snps[seq_i]:
                    snp = snps[snp_i]

                    # get alternate prediction (LxT)
                    alt_preds = seq_preds[pi]
                    pi += 1

                    # initialize gene data structures
                    gene_pos_preds = {}  # gene -> pos -> (ref_preds,alt_preds)
                    snp_dist_gene = {}

                    # process transcripts
                    for transcript, tx_pos in gene_data.seq_transcripts[seq_i]:
                        # get gene id
                        gene = gene_data.transcript_genes[transcript]

                        # compute distance between SNP and TSS
                        # (bin center in genomic coordinates)
                        tx_gpos = gene_data.seq_coords[seq_i][1] + (
                            tx_pos + 0.5) * model.target_pool
                        snp_dist = abs(tx_gpos - snp.pos)
                        if gene in snp_dist_gene:
                            snp_dist_gene[gene] = min(snp_dist_gene[gene],
                                                      snp_dist)
                        else:
                            snp_dist_gene[gene] = snp_dist

                        # compute transcript pos in predictions
                        tx_pos_buf = tx_pos - pred_buffer

                        # hash transcription positions and predictions to gene id
                        for tx_pos_i in range(tx_pos_buf - adj,
                                              tx_pos_buf + adj + 1):
                            gene_pos_preds.setdefault(
                                gene, {})[tx_pos_i] = (ref_preds[tx_pos_i, :],
                                                       alt_preds[tx_pos_i, :])

                        # accumulate transcript predictions by (possibly) summing adjacent positions
                        ap = alt_preds[tx_pos_buf - adj:tx_pos_buf + adj +
                                       1, :].sum(axis=0)
                        rp = ref_preds[tx_pos_buf - adj:tx_pos_buf + adj +
                                       1, :].sum(axis=0)

                        # compute SED scores
                        snp_tx_sed = ap - rp
                        snp_tx_ser = np.log2(ap + 1) - np.log2(rp + 1)

                        # print rows to transcript table
                        if options.transcript_table:
                            for ti in range(ref_preds.shape[1]):
                                if options.all_sed or not np.isclose(
                                        snp_tx_sed[ti], 0, atol=1e-4):
                                    cols = (snp.rsid,
                                            basenji.vcf.cap_allele(
                                                snp.ref_allele),
                                            basenji.vcf.cap_allele(
                                                snp.alt_alleles[0]),
                                            transcript, snp_dist,
                                            gene_data.target_labels[ti],
                                            rp[ti], ap[ti], snp_tx_sed[ti],
                                            snp_tx_ser[ti])
                                    if options.csv:
                                        print(','.join([str(c) for c in cols]),
                                              file=sed_tx_out)
                                    else:
                                        print(
                                            '%-13s %s %5s %12s %5d %12s %6.4f %6.4f %7.4f %7.4f'
                                            % cols,
                                            file=sed_tx_out)

                    # process genes
                    for gene in gene_pos_preds:
                        gene_str = gene
                        if gene in gene_data.multi_seq_genes:
                            gene_str = '%s_multi' % gene

                        # sum gene preds across positions
                        gene_rp = np.zeros(ref_preds.shape[1])
                        gene_ap = np.zeros(alt_preds.shape[1])
                        for pos_i in gene_pos_preds[gene]:
                            pos_rp, pos_ap = gene_pos_preds[gene][pos_i]
                            gene_rp += pos_rp
                            gene_ap += pos_ap

                        # compute SED scores
                        snp_gene_sed = gene_ap - gene_rp
                        snp_gene_ser = np.log2(gene_ap + 1) - np.log2(gene_rp +
                                                                      1)

                        # print rows to gene table
                        for ti in range(ref_preds.shape[1]):
                            if options.all_sed or not np.isclose(
                                    snp_gene_sed[ti], 0, atol=1e-4):
                                cols = [
                                    snp.rsid,
                                    basenji.vcf.cap_allele(snp.ref_allele),
                                    basenji.vcf.cap_allele(snp.alt_alleles[0]),
                                    gene_str, snp_dist_gene[gene],
                                    gene_data.target_labels[ti], gene_rp[ti],
                                    gene_ap[ti], snp_gene_sed[ti],
                                    snp_gene_ser[ti]
                                ]
                                if options.csv:
                                    print(','.join([str(c) for c in cols]),
                                          file=sed_gene_out)
                                else:
                                    print(
                                        '%-13s %s %5s %12s %5d %12s %6.4f %6.4f %7.4f %7.4f'
                                        % tuple(cols),
                                        file=sed_gene_out)

                    # print ref/alt/diff BigWig tracks for requested targets
                    for ti in options.track_indexes:
                        ref_bw_file = '%s/tracks/%s_%s_t%d_ref.bw' % (
                            options.out_dir, snp.rsid, seq_i, ti)
                        alt_bw_file = '%s/tracks/%s_%s_t%d_alt.bw' % (
                            options.out_dir, snp.rsid, seq_i, ti)
                        diff_bw_file = '%s/tracks/%s_%s_t%d_diff.bw' % (
                            options.out_dir, snp.rsid, seq_i, ti)
                        ref_bw_open = bigwig_open(ref_bw_file,
                                                  options.genome_file)
                        alt_bw_open = bigwig_open(alt_bw_file,
                                                  options.genome_file)
                        diff_bw_open = bigwig_open(diff_bw_file,
                                                   options.genome_file)

                        seq_chrom, seq_start, seq_end = gene_data.seq_coords[
                            seq_i]
                        bw_chroms = [seq_chrom] * ref_preds.shape[0]
                        bw_starts = [
                            int(seq_start + model.batch_buffer +
                                bi * model.target_pool)
                            for bi in range(ref_preds.shape[0])
                        ]
                        bw_ends = [
                            int(bws + model.target_pool) for bws in bw_starts
                        ]

                        ref_values = [float(p) for p in ref_preds[:, ti]]
                        ref_bw_open.addEntries(bw_chroms,
                                               bw_starts,
                                               ends=bw_ends,
                                               values=ref_values)

                        alt_values = [float(p) for p in alt_preds[:, ti]]
                        alt_bw_open.addEntries(bw_chroms,
                                               bw_starts,
                                               ends=bw_ends,
                                               values=alt_values)

                        diff_values = [
                            alt_values[vi] - ref_values[vi]
                            for vi in range(len(ref_values))
                        ]
                        diff_bw_open.addEntries(bw_chroms,
                                                bw_starts,
                                                ends=bw_ends,
                                                values=diff_values)

                        ref_bw_open.close()
                        alt_bw_open.close()
                        diff_bw_open.close()

                # clean up
                gc.collect()

    sed_gene_out.close()
    if options.transcript_table:
        sed_tx_out.close()
示例#5
0
def main():
    """Map gradient x representation scores per gene TSS to BigWig tracks.

    Usage: %prog [options] <params_file> <model_file> <genes_hdf5_file>

    Builds a SeqNN model, computes gradients of each TSS/target w.r.t. an
    early convolutional layer, dots them with that layer's representation,
    and writes one BigWig per (gene TSS, target index).
    """
    usage = 'usage: %prog [options] <params_file> <model_file> <genes_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-g',
                      dest='genome_file',
                      default='%s/assembly/human.hg19.genome' %
                      os.environ['HG19'],
                      help='Chromosome lengths file [Default: %default]')
    parser.add_option('-l',
                      dest='gene_list',
                      help='Process only gene ids in the given file')
    parser.add_option('-o',
                      dest='out_dir',
                      default='grad_map',
                      help='Output directory [Default: %default]')
    parser.add_option('-t',
                      dest='target_indexes',
                      default=None,
                      help='Target indexes to plot')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        # NOTE(review): message says "genomic position" but the third
        # argument parsed is the genes HDF5 file -- likely a stale message.
        parser.error('Must provide parameters, model, and genomic position')
    else:
        params_file = args[0]
        model_file = args[1]
        genes_hdf5_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # reads in genes HDF5

    gene_data = basenji.genedata.GeneData(genes_hdf5_file)

    # subset gene sequences to those whose ids are listed in -l
    genes_subset = set()
    if options.gene_list:
        for line in open(options.gene_list):
            genes_subset.add(line.rstrip())

        gene_data.subset_genes(genes_subset)
        print('Filtered to %d sequences' % gene_data.num_seqs)

    #######################################################
    # model parameters and placeholders

    job = basenji.dna_io.read_job_params(params_file)

    # override data dimensions with those recorded in the genes HDF5
    job['seq_length'] = gene_data.seq_length
    job['seq_depth'] = gene_data.seq_depth
    job['target_pool'] = gene_data.pool_width

    if 'num_targets' not in job:
        print(
            "Must specify number of targets (num_targets) in the parameters file.",
            file=sys.stderr)
        exit(1)

    # set target indexes (all targets when -t is omitted)
    if options.target_indexes is not None:
        options.target_indexes = [
            int(ti) for ti in options.target_indexes.split(',')
        ]
        target_subset = options.target_indexes
    else:
        options.target_indexes = list(range(job['num_targets']))
        target_subset = None

    # build model
    model = basenji.seqnn.SeqNN()
    model.build(job, target_subset=target_subset)

    # determine latest pre-dilated layer
    # NOTE(review): np.min selects the index of the FIRST layer whose
    # dilation exceeds 1, not the layer before it -- confirm intent.
    dilated_mask = np.array(model.cnn_dilation) > 1
    dilated_indexes = np.where(dilated_mask)[0]
    pre_dilated_layer = np.min(dilated_indexes)
    print('Pre-dilated layer: %d' % pre_dilated_layer)

    # build gradients ops
    t0 = time.time()
    print('Building target/position-specific gradient ops.', end='')
    model.build_grads_genes(gene_data.gene_seqs, layers=[pre_dilated_layer])
    print(' Done in %ds' % (time.time() - t0), flush=True)

    #######################################################
    # acquire gradients

    # initialize saver
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # load variables into session
        saver.restore(sess, model_file)

        for si in range(gene_data.num_seqs):
            # initialize batcher for this single sequence
            batcher = basenji.batcher.Batcher(gene_data.seqs_1hot[si:si + 1],
                                              batch_size=model.batch_size,
                                              pool_width=model.target_pool)

            # get layer representations
            t0 = time.time()
            print('Computing gradients.', end='', flush=True)
            batch_grads, batch_reprs = model.gradients_genes(
                sess, batcher, gene_data.gene_seqs[si:si + 1])
            print(' Done in %ds.' % (time.time() - t0), flush=True)

            # only one layer was requested; unwrap it
            batch_reprs = batch_reprs[0]
            batch_grads = batch_grads[0]

            # G (TSSs) x T (targets) x P (seq position) x U (Units layer i)
            print('batch_grads', batch_grads.shape)
            pooled_length = batch_grads.shape[2]

            # S (sequences) x P (seq position) x U (Units layer i)
            print('batch_reprs', batch_reprs.shape)

            # write bigwigs
            t0 = time.time()
            print('Writing BigWigs.', end='', flush=True)

            # for each TSS
            for tss_i in range(batch_grads.shape[0]):
                tss = gene_data.gene_seqs[si].tss_list[tss_i]

                # for each target
                for tii in range(len(options.target_indexes)):
                    ti = options.target_indexes[tii]

                    # dot representation and gradient across the units axis
                    batch_grads_score = np.multiply(
                        batch_reprs[0], batch_grads[tss_i,
                                                    tii, :, :]).sum(axis=1)

                    # open bigwig, named by gene, TSS, and target index
                    bw_file = '%s/%s-%s_t%d.bw' % \
                                (options.out_dir, tss.gene_id, tss.identifier, ti)
                    bw_open = bigwig_open(bw_file, options.genome_file)

                    # access gene sequence information
                    seq_chrom = gene_data.gene_seqs[si].chrom
                    seq_start = gene_data.gene_seqs[si].start

                    # specify bigwig locations and values; each bin spans
                    # target_pool bp from the sequence start
                    bw_chroms = [seq_chrom] * pooled_length
                    bw_starts = [
                        int(seq_start + li * model.target_pool)
                        for li in range(pooled_length)
                    ]
                    bw_ends = [
                        int(bws + model.target_pool) for bws in bw_starts
                    ]
                    bw_values = [float(bgs) for bgs in batch_grads_score]

                    # write
                    bw_open.addEntries(bw_chroms,
                                       bw_starts,
                                       ends=bw_ends,
                                       values=bw_values)

                    # close
                    bw_open.close()

            print(' Done in %ds.' % (time.time() - t0), flush=True)
            gc.collect()
示例#6
0
def main():
    """Load a trained Basenji model, compute gradients of transcript TSS
    predictions with respect to the final pooled convolution layer, and
    write gradient x representation scores as one BigWig per
    transcript/target pair.
    """
    usage = 'usage: %prog [options] <params_file> <model_file> <genes_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-g',
                      dest='genome_file',
                      default='%s/assembly/human.hg19.genome' %
                      os.environ['HG19'],
                      help='Chromosome lengths file [Default: %default]')
    parser.add_option('-l',
                      dest='transcript_list',
                      help='Process only transcript ids in the given file')
    parser.add_option('-o',
                      dest='out_dir',
                      default='grad_map',
                      help='Output directory [Default: %default]')
    parser.add_option('-t',
                      dest='target_indexes',
                      default=None,
                      help='Target indexes to plot')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        # message matches the three arguments named in the usage string
        parser.error('Must provide parameters, model, and genes HDF5 file')
    else:
        params_file = args[0]
        model_file = args[1]
        genes_hdf5_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # read in genes HDF5

    gene_data = basenji.genes.GeneData(genes_hdf5_file)

    # optionally subset transcripts to ids listed in the given file
    if options.transcript_list:
        transcripts_subset = set()
        for line in open(options.transcript_list):
            transcripts_subset.add(line.rstrip())

        gene_data.subset_transcripts(transcripts_subset)
        print('Filtered to %d sequences' % gene_data.num_seqs)

    #######################################################
    # model parameters and placeholders

    job = basenji.dna_io.read_job_params(params_file)

    # force model geometry to match the gene sequence data
    job['batch_length'] = gene_data.seq_length
    job['seq_depth'] = gene_data.seq_depth
    job['target_pool'] = gene_data.pool_width
    # required so layer representations are retrievable for scoring
    job['save_reprs'] = True

    if 'num_targets' not in job:
        print(
            "Must specify number of targets (num_targets) in the parameters file. I know, it's annoying. Sorry.",
            file=sys.stderr)
        exit(1)

    # build model
    model = basenji.seqnn.SeqNN()
    model.build(job)

    # determine final pooling layer
    post_pooling_layer = len(model.cnn_pool) - 1

    #######################################################
    # acquire gradients

    # set target indexes; default to all targets
    if options.target_indexes is not None:
        options.target_indexes = [
            int(ti) for ti in options.target_indexes.split(',')
        ]
    else:
        options.target_indexes = list(range(job['num_targets']))

    # initialize saver
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # load variables into session
        saver.restore(sess, model_file)

        si = 0
        while si < gene_data.num_seqs:
            # initialize batcher over a single sequence
            # (batching temporarily disabled; see commented batch_size code)
            batcher = basenji.batcher.Batcher(gene_data.seqs_1hot[si:si + 1],
                                              batch_size=model.batch_size,
                                              pool_width=model.target_pool)

            # determine unique transcript (TSS) positions in this sequence
            transcript_positions = set()
            # for bi in range(model.batch_size):   # TEMP
            for bi in range(1):
                if si + bi < len(gene_data.seq_transcripts):
                    for transcript, tx_pos in gene_data.seq_transcripts[si +
                                                                        bi]:
                        transcript_positions.add(tx_pos)
            transcript_positions = sorted(transcript_positions)

            # get layer representations and gradients at the TSS positions
            t0 = time.time()
            print('Computing gradients.', end='', flush=True)
            batch_grads, batch_reprs = model.gradients_pos(
                sess, batcher, transcript_positions, options.target_indexes,
                post_pooling_layer)
            print(' Done in %ds.' % (time.time() - t0), flush=True)

            # only one layer was requested
            batch_reprs = batch_reprs[0]
            batch_grads = batch_grads[0]

            # (B sequences) x (P pooled seq len) x (F filters) x (G gene positions) x (T targets)
            print('batch_grads', batch_grads.shape)
            print('batch_reprs', batch_reprs.shape)

            pooled_length = batch_grads.shape[1]

            # write bigwigs
            t0 = time.time()
            print('Writing BigWigs.', end='', flush=True)
            # for bi in range(model.batch_size):   # TEMP
            for bi in range(1):
                sbi = si + bi
                if sbi < gene_data.num_seqs:
                    positions_written = set()
                    for transcript, tx_pos in gene_data.seq_transcripts[sbi]:
                        # skip TSS positions already written
                        if tx_pos not in positions_written:
                            # map tx_pos to its index in the sorted positions;
                            # every tx_pos was added above, so it is present
                            gi = transcript_positions.index(tx_pos)

                            # for each target
                            for tii in range(len(options.target_indexes)):
                                ti = options.target_indexes[tii]

                                # dot representation and gradient,
                                # summing across filters
                                batch_grads_score = np.multiply(
                                    batch_reprs[bi],
                                    batch_grads[bi, :, :, gi, tii]).sum(axis=1)

                                bw_file = '%s/%s_t%d.bw' % (options.out_dir,
                                                            transcript, ti)
                                bw_open = bigwig_open(bw_file,
                                                      options.genome_file)

                                # one entry per pooled position, each spanning
                                # target_pool bp from the sequence start
                                seq_chrom, seq_start, seq_end = gene_data.seq_coords[
                                    sbi]
                                bw_chroms = [seq_chrom] * pooled_length
                                bw_starts = [
                                    int(seq_start + li * model.target_pool)
                                    for li in range(pooled_length)
                                ]
                                bw_ends = [
                                    int(bws + model.target_pool)
                                    for bws in bw_starts
                                ]
                                bw_values = [
                                    float(bgs) for bgs in batch_grads_score
                                ]

                                bw_open.addEntries(bw_chroms,
                                                   bw_starts,
                                                   ends=bw_ends,
                                                   values=bw_values)

                                bw_open.close()

                                positions_written.add(tx_pos)
            print(' Done in %ds.' % (time.time() - t0), flush=True)
            gc.collect()

            # advance through sequences one at a time
            # si += model.batch_size
            si += 1
# ---- Example 7 ----
def score_write(sess, model, options, target_indexes, seqs_1hot, seqs_chrom,
                seqs_start):
    ''' Compute gradient x representation scores for a set of sequences,
    writing score and p-value matrices to <out_dir>/scores.h5 and,
    optionally, per-sequence/target score BigWigs.

    Args:
      sess: TensorFlow session with model variables restored.
      model: SeqNN model; gradients() supplies layer representations
        and gradients.
      options: parsed options; uses rc, shifts, mc_n, norm, bigwig,
        out_dir, genome_file.
      target_indexes: list of target indexes to score.
      seqs_1hot: one-hot coded input sequences, indexed [seq, pos, nt].
      seqs_chrom: chromosome per sequence, for BigWig coordinates.
      seqs_start: genomic start per sequence, for BigWig coordinates.
    '''

    num_seqs = seqs_1hot.shape[0]
    num_targets = len(target_indexes)

    # initialize scores HDF5; the context manager guarantees the file is
    # closed even if scoring raises mid-loop
    scores_h5_file = '%s/scores.h5' % options.out_dir
    with h5py.File(scores_h5_file, 'w') as scores_h5_out:
        for si in range(num_seqs):
            # initialize batcher over this single sequence
            batcher_si = batcher.Batcher(seqs_1hot[si:si + 1],
                                         batch_size=model.hp.batch_size,
                                         pool_width=model.hp.target_pool)

            # get layer representations
            t0 = time.time()
            print('Computing gradients.', end='', flush=True)
            _, _, _, batch_grads, batch_reprs, _ = model.gradients(
                sess,
                batcher_si,
                rc=options.rc,
                shifts=options.shifts,
                mc_n=options.mc_n,
                return_all=True)
            print(' Done in %ds.' % (time.time() - t0), flush=True)

            # only one layer was requested
            batch_reprs = batch_reprs[0]
            batch_grads = batch_grads[0]

            # increase resolution
            batch_reprs = batch_reprs.astype('float32')
            batch_grads = batch_grads.astype('float32')

            # S (sequences) x T (targets) x P (seq position) x U (units layer i) x E (ensembles)
            print('batch_grads', batch_grads.shape)

            # S (sequences) x P (seq position) x U (units layer i) x E (ensembles)
            print('batch_reprs', batch_reprs.shape)

            preds_length = batch_reprs.shape[1]
            if 'score' not in scores_h5_out:
                # create score/p-value datasets on the first sequence,
                # once preds_length is known
                scores_h5_out.create_dataset('score',
                                             shape=(num_seqs, preds_length,
                                                    num_targets),
                                             dtype='float16')
                scores_h5_out.create_dataset('pvalue',
                                             shape=(num_seqs, preds_length,
                                                    num_targets),
                                             dtype='float16')

            t0 = time.time()
            print('Computing and writing scores.', end='', flush=True)

            # for each target
            for tii in range(len(target_indexes)):
                ti = target_indexes[tii]

                # representation x gradient
                batch_grads_scores = np.multiply(batch_reprs[0],
                                                 batch_grads[0, tii, :, :, :])

                if options.norm is None:
                    # sum across filters
                    batch_grads_scores = batch_grads_scores.sum(axis=1)
                else:
                    # p-norm across filters: (sum |x|^p)^(1/p)
                    batch_grads_scores = np.power(np.abs(batch_grads_scores),
                                                  options.norm)
                    batch_grads_scores = batch_grads_scores.sum(axis=1)
                    batch_grads_scores = np.power(batch_grads_scores,
                                                  1. / options.norm)

                # mean across ensemble
                batch_grads_mean = batch_grads_scores.mean(axis=1)

                # p-values vs. zero across the ensemble
                batch_grads_pval = ttest_1samp(batch_grads_scores, 0,
                                               axis=1)[1]
                if options.norm is not None:
                    # normed scores are non-negative, so halve the two-sided
                    # p-value for a one-sided test
                    batch_grads_pval /= 2

                # write to HDF5
                scores_h5_out['score'][si, :, tii] = \
                    batch_grads_mean.astype('float16')
                scores_h5_out['pvalue'][si, :, tii] = \
                    batch_grads_pval.astype('float16')

                if options.bigwig:
                    # open bigwig
                    bws_file = '%s/s%d_t%d_scores.bw' % (options.out_dir, si,
                                                         ti)
                    bws_open = bigwig_open(bws_file, options.genome_file)

                    # one entry per pooled prediction position, each spanning
                    # target_pool bp from the sequence start
                    bw_chroms = [seqs_chrom[si]] * preds_length
                    bw_starts = [
                        int(seqs_start[si] + pi * model.hp.target_pool)
                        for pi in range(preds_length)
                    ]
                    bw_ends = [
                        int(bws + model.hp.target_pool) for bws in bw_starts
                    ]
                    bws_values = [float(bgs) for bgs in batch_grads_mean]

                    # write and close
                    bws_open.addEntries(bw_chroms,
                                        bw_starts,
                                        ends=bw_ends,
                                        values=bws_values)
                    bws_open.close()

            print(' Done in %ds.' % (time.time() - t0), flush=True)
            gc.collect()