예제 #1
0
    def main(self, name, opts):
        """Analyze stored filter activations and derive motifs from them.

        Reads the activations (``/act``), the input sequences
        (``/inputs/dna``) and the filter weights (``weights/weights``) from
        ``opts.in_file``. For every selected filter it collects summary
        statistics, optionally plots activation densities and weight
        heatmaps, extracts activating k-mers, renders a sequence logo and
        appends the resulting PWM to ``<out_dir>/meme.txt``. The statistics
        are printed and written to ``<out_dir>/stats.tsv``. If
        ``opts.motif_dbs`` is set, ``tomtom`` is executed against these
        databases and ``report.tsv`` / ``report_top.tsv`` are written.

        Parameters
        ----------
        name: str
            Name of the logger to use.
        opts:
            Parsed options. Attributes read here: ``log_file``, ``verbose``,
            ``seed``, ``in_file``, ``nb_sample``, ``filters``, ``out_dir``,
            ``plot_dens``, ``plot_heat``, ``plot_pca``, ``nb_sample_pca``,
            ``motif_dbs``, ``weblogo_opts``, ``act_thr_per``, ``act_thr_max``,
            ``delete_fasta``, ``trim_thr`` and ``fdr``.

        Returns
        -------
        int
            Always 0.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        # Must follow both branches: nested under ``else`` the message would
        # always be filtered out because the level is INFO there.
        log.debug(opts)

        if opts.seed is not None:
            np.random.seed(opts.seed)

        log.info('Reading data')
        in_file = h5.File(opts.in_file, 'r')
        meme_file = None
        try:
            # Keep only a handle on the activations to reduce memory usage.
            # Possible since filters can be processed independently.
            filters_act = in_file['/act']

            nb_sample = filters_act.shape[0]
            if opts.nb_sample:
                nb_sample = min(opts.nb_sample, nb_sample)

            nb_filter = filters_act.shape[-1]
            filters_idx = opts.filters
            if filters_idx is None:
                filters_idx = range(nb_filter)
            else:
                filters_idx = ranges_to_list(filters_idx, 0, nb_filter - 1)
                nb_filter = len(filters_idx)

            seqs = in_file['/inputs/dna'][:nb_sample]
            if seqs.shape[1] != filters_act.shape[1]:
                # Trim sequence length to length of activation layer
                offset = (seqs.shape[1] - filters_act.shape[1]) // 2
                seqs = seqs[:, offset:(offset + filters_act.shape[1])]
                assert seqs.shape[1] == filters_act.shape[1]

            filters_weights = in_file['weights/weights']
            if filters_weights.ndim == 4:
                # For backward compatibility, support filter weights of shape
                # [filter_len, 1, nb_input_features, nb_output_features]
                assert filters_weights.shape[1] == 1
                filters_weights = filters_weights[:, 0, :]
            # The number of input features must match the number of
            # nucleotides.
            assert filters_weights.shape[1] == 4
            filter_len = len(filters_weights)

            print('Filters: %d' % nb_filter)
            print('Filter len: %d' % filter_len)
            print('Samples: %d' % nb_sample)

            # Create output directories
            make_dir(opts.out_dir)
            sub_dirs = dict()
            dir_names = ['logos', 'fa']
            if opts.plot_dens:
                dir_names.append('dens')
            if opts.plot_heat:
                dir_names.append('heat')
            if opts.motif_dbs:
                dir_names.append('tomtom')
            # A dedicated loop variable avoids rebinding the ``name``
            # parameter.
            for dir_name in dir_names:
                dirname = pt.join(opts.out_dir, dir_name)
                sub_dirs[dir_name] = dirname
                make_dir(dirname)

            meme_filename = pt.join(opts.out_dir, 'meme.txt')
            meme_file = open_meme(meme_filename, seqs)

            if opts.plot_pca:
                nb_pca = min(len(filters_act), opts.nb_sample_pca)
                log.info('Performing PCA on activations using %d samples',
                         nb_pca)
                # Down-sample activations to at most nb_sample_pca samples to
                # reduce memory usage and run-time.
                pca_act = filters_act[:nb_pca, :, filters_idx]

                act = pca_act.mean(axis=1)
                plot_file = self.plot_filename(opts.out_dir, 'pca_mean')
                plot_pca(act, labels=filters_idx, filename=plot_file)

                pos_weights = linear_weights(pca_act.shape[1])
                act = np.average(pca_act, axis=1, weights=pos_weights)
                plot_file = self.plot_filename(opts.out_dir, 'pca_wmean')
                plot_pca(act, labels=filters_idx, filename=plot_file)

                act = pca_act.max(axis=1)
                plot_file = self.plot_filename(opts.out_dir, 'pca_max')
                plot_pca(act, labels=filters_idx, filename=plot_file)

            log.info('Analyzing filters')
            log.info('-----------------')
            filter_stats = []
            weblogo_opts = WEBLOGO_OPTS
            if opts.weblogo_opts:
                weblogo_opts = opts.weblogo_opts
            for idx in filters_idx:
                log.info('Filter %d', idx)
                filter_act = filters_act[:nb_sample, :, idx]
                filter_weights = filters_weights[:, :, idx].T
                assert len(filter_weights) == len(ALPHABET)

                stats = OrderedDict()
                stats['idx'] = idx
                stats['motif'] = get_motif_from_weights(filter_weights)
                stats['act_mean'] = filter_act.mean()
                stats['act_std'] = filter_act.std()
                stats['ic'] = 0
                stats['nb_site'] = 0
                stats = pd.Series(stats)
                # The series is appended before it is complete on purpose:
                # skipped filters keep the zero defaults for 'ic' and
                # 'nb_site', others are updated in place further down.
                filter_stats.append(stats)

                if stats['act_mean'] == 0:
                    log.info('Dead filter -> skip')
                    continue

                if opts.plot_dens:
                    log.info('Plotting filter densities')
                    plot_file = self.plot_filename(sub_dirs['dens'],
                                                   '%03d' % idx)
                    plot_filter_densities(np.ravel(filter_act), plot_file)

                if opts.plot_heat:
                    log.info('Plotting filter heatmap')
                    plot_file = self.plot_filename(sub_dirs['heat'],
                                                   '%03d' % idx)
                    plot_filter_heatmap(filter_weights, plot_file)

                log.info('Extracting activating kmers')
                act_kmers = get_act_kmers(filter_act,
                                          filter_len,
                                          seqs,
                                          thr_per=opts.act_thr_per,
                                          thr_max=opts.act_thr_max)
                stats['nb_site'] = len(act_kmers)

                if len(act_kmers) < 10:
                    log.info('Only %d activating kmers -> skip',
                             len(act_kmers))
                    continue

                log.info('Plotting sequence logo')
                logo_file = pt.join(sub_dirs['fa'], '%03d.fa' % idx)
                write_kmers(act_kmers, logo_file)

                plot_logo(logo_file,
                          self.plot_filename(sub_dirs['logos'], '%03d' % idx),
                          options=weblogo_opts)
                if opts.delete_fasta:
                    os.remove(logo_file)

                log.info('Computing PWM')
                pwm = get_pwm(act_kmers)
                stats['ic'] = info_content(pwm)
                add_to_meme(meme_file,
                            idx,
                            pwm,
                            len(act_kmers),
                            trim_thr=opts.trim_thr)
        finally:
            # Release both files even if the analysis fails half-way. Nothing
            # below reads from them through these handles.
            if meme_file is not None:
                meme_file.close()
            in_file.close()

        filter_stats = pd.DataFrame(filter_stats)
        for column in ['idx', 'nb_site']:
            filter_stats[column] = filter_stats[column].astype(np.int32)
        filter_stats.sort_values('act_mean', ascending=False, inplace=True)
        print()
        print('\nFilter statistics:')
        print(filter_stats.to_string())
        filter_stats.to_csv(pt.join(opts.out_dir, 'stats.tsv'),
                            float_format='%.4f',
                            sep='\t',
                            index=False)

        if opts.motif_dbs:
            log.info('Running tomtom')
            tomtom_dir = pt.join(opts.out_dir, 'tomtom')
            # Pass an argument list instead of a shell string so that paths
            # containing spaces or shell meta-characters reach tomtom intact.
            cmd = ['tomtom', '-dist', 'pearson',
                   '-thresh', str(opts.fdr),
                   '-oc', tomtom_dir,
                   meme_filename]
            cmd.extend(opts.motif_dbs)
            print('\n', ' '.join(cmd))
            status = subprocess.call(cmd)
            if status != 0:
                log.warning('tomtom exited with status %d', status)

            meme_motifs = []
            for motif_db in opts.motif_dbs:
                meme_motifs.append(read_meme_db(motif_db))
            meme_motifs = pd.concat(meme_motifs)
            motifs_file = pt.join(tomtom_dir, 'meme_motifs.tsv')
            meme_motifs.to_csv(motifs_file, sep='\t', index=False)

            report = get_report(pt.join(opts.out_dir, 'stats.tsv'),
                                pt.join(tomtom_dir, 'tomtom.txt'),
                                meme_motifs)
            report.sort_values(['idx', 'q-value', 'act_mean'],
                               ascending=[True, True, False],
                               inplace=True)
            report.to_csv(pt.join(opts.out_dir, 'report.tsv'),
                          index=False,
                          sep='\t',
                          float_format='%.3f')

            # ``report`` is sorted by q-value within each filter, so the
            # first row per 'idx' is that filter's best match.
            report_top = report.groupby('idx').first().reset_index()
            report_top.sort_values(['q-value', 'act_mean'],
                                   ascending=[True, False],
                                   inplace=True)
            report_top.index = range(len(report_top))
            report_top.to_csv(pt.join(opts.out_dir, 'report_top.tsv'),
                              index=False,
                              sep='\t',
                              float_format='%.3f')

            print('\nTomtom results:')
            print(report_top.to_string())

        log.info('Done!')
        return 0
예제 #2
0
    def main(self, name, opts):
        """Compute activations of the first convolutional layer and store them.

        Loads the model from ``opts.model_files``, builds a backend function
        that maps the model's ``dna`` input to the output of the activation
        layer returned by ``mod.get_first_conv_layer`` (plus the model
        outputs if ``opts.store_preds`` is set), evaluates it batch-wise on
        ``opts.data_files`` and writes the results to the HDF5 file
        ``opts.out_file``.

        Datasets written: ``weights/weights``, ``weights/bias``, ``act``, the
        items yielded by the metadata reader (requested keys ``chromo`` and
        ``pos``), and optionally ``inputs/<name>``, ``outputs/<name>`` and
        ``preds/<name>``.

        Parameters
        ----------
        name: str
            Name of the logger to use.
        opts:
            Parsed options. Attributes read here: ``log_file``, ``verbose``,
            ``seed``, ``model_files``, ``data_files``, ``nb_sample``,
            ``batch_size``, ``shuffle``, ``out_file``, ``store_inputs``,
            ``store_outputs``, ``store_preds``, ``act_wlen`` and ``act_fun``.

        Returns
        -------
        int
            Always 0.

        Raises
        ------
        ValueError
            If no model files are given or ``opts.act_fun`` is not one of
            ``mean``, ``wmean`` or ``max``.
        IOError
            If the model has no input named ``dna``.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        # Must follow both branches: nested under ``else`` the message would
        # always be filtered out because the level is INFO there.
        log.debug(opts)

        if opts.seed is not None:
            np.random.seed(opts.seed)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        K.set_learning_phase(0)
        model = mod.load_model(opts.model_files, log=log.info)

        weight_layer, act_layer = mod.get_first_conv_layer(model.layers, True)
        log.info('Using activation layer "%s"' % act_layer.name)
        log.info('Using weight layer "%s"' % weight_layer.name)

        # Only a failed lookup is translated into IOError; KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        try:
            dna_idx = model.input_names.index('dna')
        except (AttributeError, ValueError):
            raise IOError('Model is not a valid DNA model!')

        fun_outputs = to_list(act_layer.output)
        if opts.store_preds:
            fun_outputs += to_list(model.output)
        fun = K.function([to_list(model.input)[dna_idx]], fun_outputs)

        log.info('Reading data ...')
        if opts.store_outputs or opts.store_preds:
            output_names = model.output_names
        else:
            output_names = None
        data_reader = mod.DataReader(
            output_names=output_names,
            use_dna=True,
            dna_wlen=to_list(model.input_shape)[dna_idx][1]
        )
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        # NOTE(review): data may be shuffled here while the metadata reader
        # below never is. Both are written at the same offsets, so with
        # ``opts.shuffle`` enabled 'chromo'/'pos' presumably no longer match
        # the stored activations -- confirm against the reader.
        data_reader = data_reader(opts.data_files,
                                  nb_sample=nb_sample,
                                  batch_size=opts.batch_size,
                                  loop=False,
                                  shuffle=opts.shuffle)

        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 nb_sample=nb_sample,
                                 batch_size=opts.batch_size,
                                 loop=False,
                                 shuffle=False)

        out_file = h5.File(opts.out_file, 'w')
        # Close the output file even if a batch fails so that no open handle
        # is left behind.
        try:
            out_group = out_file

            layer_weights = weight_layer.get_weights()
            out_group['weights/weights'] = layer_weights[0]
            out_group['weights/bias'] = layer_weights[1]

            def h5_dump(path, data, idx, dtype=None, compression='gzip'):
                # Create the dataset lazily with room for all samples, then
                # write the batch at its offset.
                if path not in out_group:
                    if dtype is None:
                        dtype = data.dtype
                    out_group.create_dataset(
                        name=path,
                        shape=[nb_sample] + list(data.shape[1:]),
                        dtype=dtype,
                        compression=compression
                    )
                out_group[path][idx:idx+len(data)] = data

            log.info('Computing activations')
            progbar = ProgressBar(nb_sample, log.info)
            idx = 0
            for data in data_reader:
                if isinstance(data, tuple):
                    # The third tuple element is not needed here.
                    inputs, outputs, _ = data
                else:
                    inputs = data
                if isinstance(inputs, dict):
                    inputs = list(inputs.values())
                batch_size = len(inputs[0])
                progbar.update(batch_size)

                if opts.store_inputs:
                    for i, input_name in enumerate(model.input_names):
                        h5_dump('inputs/%s' % input_name,
                                dna.onehot_to_int(inputs[i]), idx)

                if opts.store_outputs:
                    for output_name, output in six.iteritems(outputs):
                        h5_dump('outputs/%s' % output_name, output, idx)

                fun_eval = fun(inputs)
                act = fun_eval[0]

                if opts.act_wlen:
                    delta = opts.act_wlen // 2
                    ctr = act.shape[1] // 2
                    # Clamp the start: a window wider than the activations
                    # would give a negative index, which slices from the end
                    # instead of from the beginning.
                    act = act[:, max(0, ctr-delta):(ctr+delta+1)]

                if opts.act_fun:
                    if opts.act_fun == 'mean':
                        act = act.mean(axis=1)
                    elif opts.act_fun == 'wmean':
                        pos_weights = linear_weights(act.shape[1])
                        act = np.average(act, axis=1, weights=pos_weights)
                    elif opts.act_fun == 'max':
                        act = act.max(axis=1)
                    else:
                        raise ValueError('Invalid function "%s"!' %
                                         (opts.act_fun))

                h5_dump('act', act, idx)

                if opts.store_preds:
                    preds = fun_eval[1:]
                    for i, output_name in enumerate(model.output_names):
                        h5_dump('preds/%s' % output_name,
                                preds[i].squeeze(), idx)

                for key, value in six.iteritems(next(meta_reader)):
                    h5_dump(key, value, idx)

                idx += batch_size
            progbar.close()
        finally:
            out_file.close()

        log.info('Done!')

        return 0
예제 #3
0
    def main(self, name, opts):
        """Compute activations of the first convolutional layer and store them.

        Loads the model from ``opts.model_files``, builds a backend function
        that maps the model's ``dna`` input to the output of the activation
        layer returned by ``mod.get_first_conv_layer`` (plus the model
        outputs if ``opts.store_preds`` is set), evaluates it batch-wise on
        ``opts.data_files`` and writes the results to the HDF5 file
        ``opts.out_file``.

        Datasets written: ``weights/weights``, ``weights/bias``, ``act``, the
        items yielded by the metadata reader (requested keys ``chromo`` and
        ``pos``), and optionally ``inputs/<name>``, ``outputs/<name>`` and
        ``preds/<name>``.

        Parameters
        ----------
        name: str
            Name of the logger to use.
        opts:
            Parsed options. Attributes read here: ``log_file``, ``verbose``,
            ``seed``, ``model_files``, ``data_files``, ``nb_sample``,
            ``batch_size``, ``out_file``, ``store_inputs``,
            ``store_outputs``, ``store_preds``, ``act_wlen`` and ``act_fun``.

        Returns
        -------
        int
            Always 0.

        Raises
        ------
        ValueError
            If no model files are given or ``opts.act_fun`` is not one of
            ``mean``, ``wmean`` or ``max``.
        IOError
            If the model has no input named ``dna``.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        # Must follow both branches: nested under ``else`` the message would
        # always be filtered out because the level is INFO there.
        log.debug(opts)

        if opts.seed is not None:
            np.random.seed(opts.seed)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        K.set_learning_phase(0)
        model = mod.load_model(opts.model_files, log=log.info)

        weight_layer, act_layer = mod.get_first_conv_layer(model.layers, True)
        log.info('Using activation layer "%s"' % act_layer.name)
        log.info('Using weight layer "%s"' % weight_layer.name)

        # Only a failed lookup is translated into IOError; KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        try:
            dna_idx = model.input_names.index('dna')
        except (AttributeError, ValueError):
            raise IOError('Model is not a valid DNA model!')

        fun_outputs = to_list(act_layer.output)
        if opts.store_preds:
            fun_outputs += to_list(model.output)
        fun = K.function([to_list(model.input)[dna_idx]], fun_outputs)

        log.info('Reading data ...')
        if opts.store_outputs or opts.store_preds:
            output_names = model.output_names
        else:
            output_names = None
        data_reader = mod.DataReader(output_names=output_names,
                                     use_dna=True,
                                     dna_wlen=to_list(
                                         model.input_shape)[dna_idx][1])
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        # Neither reader shuffles, and both are consumed in lock-step below.
        data_reader = data_reader(opts.data_files,
                                  nb_sample=nb_sample,
                                  batch_size=opts.batch_size,
                                  loop=False,
                                  shuffle=False)

        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 nb_sample=nb_sample,
                                 batch_size=opts.batch_size,
                                 loop=False,
                                 shuffle=False)

        out_file = h5.File(opts.out_file, 'w')
        # Close the output file even if a batch fails so that no open handle
        # is left behind.
        try:
            out_group = out_file

            layer_weights = weight_layer.get_weights()
            out_group['weights/weights'] = layer_weights[0]
            out_group['weights/bias'] = layer_weights[1]

            def h5_dump(path, data, idx, dtype=None, compression='gzip'):
                # Create the dataset lazily with room for all samples, then
                # write the batch at its offset.
                if path not in out_group:
                    if dtype is None:
                        dtype = data.dtype
                    out_group.create_dataset(name=path,
                                             shape=[nb_sample] +
                                             list(data.shape[1:]),
                                             dtype=dtype,
                                             compression=compression)
                out_group[path][idx:idx + len(data)] = data

            log.info('Computing activations')
            progbar = ProgressBar(nb_sample, log.info)
            idx = 0
            for data in data_reader:
                if isinstance(data, tuple):
                    # The third tuple element is not needed here.
                    inputs, outputs, _ = data
                else:
                    inputs = data
                if isinstance(inputs, dict):
                    inputs = list(inputs.values())
                batch_size = len(inputs[0])
                progbar.update(batch_size)

                if opts.store_inputs:
                    for i, input_name in enumerate(model.input_names):
                        h5_dump('inputs/%s' % input_name,
                                dna.onehot_to_int(inputs[i]), idx)

                if opts.store_outputs:
                    for output_name, output in six.iteritems(outputs):
                        h5_dump('outputs/%s' % output_name, output, idx)

                fun_eval = fun(inputs)
                act = fun_eval[0]

                if opts.act_wlen:
                    delta = opts.act_wlen // 2
                    ctr = act.shape[1] // 2
                    # Clamp the start: a window wider than the activations
                    # would give a negative index, which slices from the end
                    # instead of from the beginning.
                    act = act[:, max(0, ctr - delta):(ctr + delta + 1)]

                if opts.act_fun:
                    if opts.act_fun == 'mean':
                        act = act.mean(axis=1)
                    elif opts.act_fun == 'wmean':
                        pos_weights = linear_weights(act.shape[1])
                        act = np.average(act, axis=1, weights=pos_weights)
                    elif opts.act_fun == 'max':
                        act = act.max(axis=1)
                    else:
                        raise ValueError('Invalid function "%s"!' %
                                         (opts.act_fun))

                h5_dump('act', act, idx)

                if opts.store_preds:
                    preds = fun_eval[1:]
                    for i, output_name in enumerate(model.output_names):
                        h5_dump('preds/%s' % output_name,
                                preds[i].squeeze(), idx)

                for key, value in six.iteritems(next(meta_reader)):
                    h5_dump(key, value, idx)

                idx += batch_size
            progbar.close()
        finally:
            out_file.close()

        log.info('Done!')

        return 0
예제 #4
0
    def main(self, name, opts):
        """Compute gradients of aggregated model outputs w.r.t. the DNA layer.

        Loads the model from ``opts.model_files`` and concatenates all model
        outputs into one matrix with one column per output. For every entry
        of ``opts.targets`` (``mean`` or ``var`` across these columns) the
        gradient with respect to the output of the layer named ``dna`` is
        evaluated batch-wise on ``opts.data_files``. The gradients are
        optionally restricted to a centred window (``opts.dna_wlen``) and
        aggregated along axis 1 (``opts.agg_effects``), then written to the
        HDF5 file ``opts.out_file`` under the target name. The items yielded
        by the metadata reader (requested keys ``chromo`` and ``pos``) are
        always stored, and the model inputs are stored under their input
        names if ``opts.store_inputs`` is set.

        Parameters
        ----------
        name: str
            Name of the logger to use.
        opts:
            Parsed options. Attributes read here: ``log_file``, ``verbose``,
            ``seed``, ``model_files``, ``targets``, ``data_files``,
            ``nb_sample``, ``replicate_names``, ``nb_replicate``,
            ``batch_size``, ``out_file``, ``dna_wlen``, ``agg_effects`` and
            ``store_inputs``.

        Returns
        -------
        int
            Always 0.

        Raises
        ------
        ValueError
            If no model files are given, the model has no layer named
            ``dna``, a target is neither ``mean`` nor ``var``, or
            ``opts.agg_effects`` is not one of ``mean``, ``wmean`` or
            ``max``.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        # Must follow both branches: nested under ``else`` the message would
        # always be filtered out because the level is INFO there.
        log.debug(opts)

        if opts.seed is not None:
            np.random.seed(opts.seed)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        K.set_learning_phase(0)
        model = mod.load_model(opts.model_files)

        # Get DNA layer.
        dna_layer = None
        for layer in model.layers:
            if layer.name == 'dna':
                dna_layer = layer
                break
        # Explicit identity test; does not depend on the truthiness of the
        # layer object.
        if dna_layer is None:
            raise ValueError('The provided model is not a DNA model!')

        # Create output vector.
        outputs = []
        for output in model.outputs:
            outputs.append(K.reshape(output, (-1, 1)))
        outputs = K.concatenate(outputs, axis=1)

        # Compute gradient of outputs wrt. DNA layer.
        grads = []
        for target_name in opts.targets:
            if target_name == 'mean':
                target = K.mean(outputs, axis=1)
            elif target_name == 'var':
                target = K.var(outputs, axis=1)
            else:
                raise ValueError('Invalid effect size "%s"!' % target_name)
            grad = K.gradients(target, dna_layer.output)
            grads.extend(grad)
        grad_fun = K.function(model.inputs, grads)

        log.info('Reading data ...')
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        replicate_names = dat.get_replicate_names(opts.data_files[0],
                                                  regex=opts.replicate_names,
                                                  nb_key=opts.nb_replicate)
        data_reader = mod.data_reader_from_model(
            model, outputs=False, replicate_names=replicate_names)
        data_reader = data_reader(opts.data_files,
                                  nb_sample=nb_sample,
                                  batch_size=opts.batch_size,
                                  loop=False,
                                  shuffle=False)

        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 nb_sample=nb_sample,
                                 batch_size=opts.batch_size,
                                 loop=False,
                                 shuffle=False)

        out_file = h5.File(opts.out_file, 'w')
        # Close the output file even if a batch fails so that no open handle
        # is left behind.
        try:
            out_group = out_file

            def h5_dump(path, data, idx, dtype=None, compression='gzip'):
                # Create the dataset lazily with room for all samples, then
                # write the batch at its offset.
                if path not in out_group:
                    if dtype is None:
                        dtype = data.dtype
                    out_group.create_dataset(name=path,
                                             shape=[nb_sample] +
                                             list(data.shape[1:]),
                                             dtype=dtype,
                                             compression=compression)
                out_group[path][idx:idx + len(data)] = data

            log.info('Computing effects ...')
            progbar = ProgressBar(nb_sample, log.info)
            idx = 0
            for inputs in data_reader:
                if isinstance(inputs, dict):
                    inputs = list(inputs.values())
                batch_size = len(inputs[0])
                progbar.update(batch_size)

                # Compute gradients.
                grads = grad_fun(inputs)

                # Slice window at center.
                if opts.dna_wlen:
                    delta = opts.dna_wlen // 2
                    for i, grad in enumerate(grads):
                        ctr = grad.shape[1] // 2
                        # Clamp the start: a window wider than the input
                        # would give a negative index, which slices from the
                        # end instead of from the beginning.
                        start = max(0, ctr - delta)
                        grads[i] = grad[:, start:(ctr + delta + 1)]

                # Aggregate effects in window
                if opts.agg_effects:
                    for i, grad in enumerate(grads):
                        if opts.agg_effects == 'mean':
                            grad = grad.mean(axis=1)
                        elif opts.agg_effects == 'wmean':
                            pos_weights = linear_weights(grad.shape[1])
                            grad = np.average(grad, axis=1,
                                              weights=pos_weights)
                        elif opts.agg_effects == 'max':
                            grad = grad.max(axis=1)
                        else:
                            tmp = 'Invalid function "%s"!' % (opts.agg_effects)
                            raise ValueError(tmp)
                        grads[i] = grad

                # Write computed effects
                for target_name, grad in zip(opts.targets, grads):
                    h5_dump(target_name, grad, idx)

                # Store inputs
                if opts.store_inputs:
                    for input_name, value in zip(model.input_names, inputs):
                        h5_dump(input_name, value, idx)

                # Store positions
                for key, value in next(meta_reader).items():
                    h5_dump(key, value, idx)

                idx += batch_size
            progbar.close()
        finally:
            out_file.close()

        log.info('Done!')

        return 0
예제 #5
0
    def main(self, name, opts):
        """Compute gradients of aggregated model outputs w.r.t. the DNA layer.

        Loads the model from ``opts.model_files`` and concatenates all model
        outputs into one matrix with one column per output. For every entry
        of ``opts.targets`` (``mean`` or ``var`` across these columns) the
        gradient with respect to the output of the layer named ``dna`` is
        evaluated batch-wise on ``opts.data_files``. The gradients are
        optionally restricted to a centred window (``opts.dna_wlen``) and
        aggregated along axis 1 (``opts.agg_effects``), then written to the
        HDF5 file ``opts.out_file`` under the target name. The items yielded
        by the metadata reader (requested keys ``chromo`` and ``pos``) are
        always stored, and the model inputs are stored under their input
        names if ``opts.store_inputs`` is set.

        Parameters
        ----------
        name: str
            Name of the logger to use.
        opts:
            Parsed options. Attributes read here: ``log_file``, ``verbose``,
            ``seed``, ``model_files``, ``targets``, ``data_files``,
            ``nb_sample``, ``replicate_names``, ``nb_replicate``,
            ``batch_size``, ``out_file``, ``dna_wlen``, ``agg_effects`` and
            ``store_inputs``.

        Returns
        -------
        int
            Always 0.

        Raises
        ------
        ValueError
            If no model files are given, the model has no layer named
            ``dna``, a target is neither ``mean`` nor ``var``, or
            ``opts.agg_effects`` is not one of ``mean``, ``wmean`` or
            ``max``.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        # Must follow both branches: nested under ``else`` the message would
        # always be filtered out because the level is INFO there.
        log.debug(opts)

        if opts.seed is not None:
            np.random.seed(opts.seed)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        K.set_learning_phase(0)
        model = mod.load_model(opts.model_files)

        # Get DNA layer.
        dna_layer = None
        for layer in model.layers:
            if layer.name == 'dna':
                dna_layer = layer
                break
        # Explicit identity test; does not depend on the truthiness of the
        # layer object.
        if dna_layer is None:
            raise ValueError('The provided model is not a DNA model!')

        # Create output vector.
        outputs = []
        for output in model.outputs:
            outputs.append(K.reshape(output, (-1, 1)))
        outputs = K.concatenate(outputs, axis=1)

        # Compute gradient of outputs wrt. DNA layer.
        grads = []
        for target_name in opts.targets:
            if target_name == 'mean':
                target = K.mean(outputs, axis=1)
            elif target_name == 'var':
                target = K.var(outputs, axis=1)
            else:
                raise ValueError('Invalid effect size "%s"!' % target_name)
            grad = K.gradients(target, dna_layer.output)
            grads.extend(grad)
        grad_fun = K.function(model.inputs, grads)

        log.info('Reading data ...')
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        replicate_names = dat.get_replicate_names(
            opts.data_files[0],
            regex=opts.replicate_names,
            nb_key=opts.nb_replicate)
        data_reader = mod.data_reader_from_model(
            model, outputs=False, replicate_names=replicate_names)
        data_reader = data_reader(opts.data_files,
                                  nb_sample=nb_sample,
                                  batch_size=opts.batch_size,
                                  loop=False,
                                  shuffle=False)

        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 nb_sample=nb_sample,
                                 batch_size=opts.batch_size,
                                 loop=False,
                                 shuffle=False)

        out_file = h5.File(opts.out_file, 'w')
        # Close the output file even if a batch fails so that no open handle
        # is left behind.
        try:
            out_group = out_file

            def h5_dump(path, data, idx, dtype=None, compression='gzip'):
                # Create the dataset lazily with room for all samples, then
                # write the batch at its offset.
                if path not in out_group:
                    if dtype is None:
                        dtype = data.dtype
                    out_group.create_dataset(
                        name=path,
                        shape=[nb_sample] + list(data.shape[1:]),
                        dtype=dtype,
                        compression=compression
                    )
                out_group[path][idx:idx+len(data)] = data

            log.info('Computing effects ...')
            progbar = ProgressBar(nb_sample, log.info)
            idx = 0
            for inputs in data_reader:
                if isinstance(inputs, dict):
                    inputs = list(inputs.values())
                batch_size = len(inputs[0])
                progbar.update(batch_size)

                # Compute gradients.
                grads = grad_fun(inputs)

                # Slice window at center.
                if opts.dna_wlen:
                    delta = opts.dna_wlen // 2
                    for i, grad in enumerate(grads):
                        ctr = grad.shape[1] // 2
                        # Clamp the start: a window wider than the input
                        # would give a negative index, which slices from the
                        # end instead of from the beginning.
                        start = max(0, ctr-delta)
                        grads[i] = grad[:, start:(ctr+delta+1)]

                # Aggregate effects in window
                if opts.agg_effects:
                    for i, grad in enumerate(grads):
                        if opts.agg_effects == 'mean':
                            grad = grad.mean(axis=1)
                        elif opts.agg_effects == 'wmean':
                            pos_weights = linear_weights(grad.shape[1])
                            grad = np.average(grad, axis=1,
                                              weights=pos_weights)
                        elif opts.agg_effects == 'max':
                            grad = grad.max(axis=1)
                        else:
                            tmp = 'Invalid function "%s"!' % (opts.agg_effects)
                            raise ValueError(tmp)
                        grads[i] = grad

                # Write computed effects
                for target_name, grad in zip(opts.targets, grads):
                    h5_dump(target_name, grad, idx)

                # Store inputs
                if opts.store_inputs:
                    for input_name, value in zip(model.input_names, inputs):
                        h5_dump(input_name, value, idx)

                # Store positions
                for key, value in next(meta_reader).items():
                    h5_dump(key, value, idx)

                idx += batch_size
            progbar.close()
        finally:
            out_file.close()

        log.info('Done!')

        return 0