Example No. 1
def add_percentiles_to_graph(x, percentile_list, fig_name):
    for p in percentile_list:
        # create percentile tensor
        p_tensor = percentile(x, p)

        # add the percentile as a scalar to the figure identified by fig_name
        figures.add_scalar(tensor=p_tensor, legend=str(p), fig_name=fig_name)
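The snippet assumes a percentile helper and a project-specific figures module, neither of which is shown. A minimal sketch of the same idea using only public TF 1.x APIs (tf.contrib.distributions.percentile and tf.summary.scalar), with add_percentiles_to_summary as a hypothetical name:

import tensorflow as tf

def add_percentiles_to_summary(x, percentile_list, fig_name):
    # One scalar summary per requested percentile of x, grouped under fig_name.
    for p in percentile_list:
        p_tensor = tf.contrib.distributions.percentile(x, p)
        tf.summary.scalar("%s/percentile_%s" % (fig_name, p), p_tensor)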
Example No. 2
def normalized_sobel_edges(img,
                           subtract_median=True,
                           same_number_of_channels=True):
    """Applies the sobel filter to images and normalizes the result.

  Args:
    img: tensor of shape [B, H, W, C].
    subtract_median: bool; if True it subtracts the median from every channel.
      This makes constant backgrounds black.
    same_number_of_channels: bool; returnd tensor has the same number of
      channels as the input tensor if True.

  Returns:
    Tensor of shape [B, H, W, C] if same_number_of_channels
    else [B, H, W, 2C].
  """

    sobel_img = tf.image.sobel_edges(img)

    if same_number_of_channels:
        sobel_img = tf.reduce_sum(sobel_img, -1)
    else:
        n_channels = int(img.shape[-1])
        sobel_img = tf.reshape(
            sobel_img, sobel_img.shape[:-2].concatenate(2 * n_channels))

    if subtract_median:
        sobel_img = abs(sobel_img - contrib_distributions.percentile(
            sobel_img, 50.0, axis=(1, 2), keep_dims=True))

    smax = tf.reduce_max(sobel_img, (1, 2), keepdims=True)
    smin = tf.reduce_min(sobel_img, (1, 2), keepdims=True)
    sobel_img = (sobel_img - smin) / (smax - smin + 1e-8)
    return sobel_img
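A quick smoke test on random data, assuming contrib_distributions in the snippet above was imported from tf.contrib.distributions (TF 1.x):

import numpy as np
import tensorflow as tf

# Hypothetical smoke test: a batch of 2 random grayscale 32x32 images.
img = tf.constant(np.random.rand(2, 32, 32, 1).astype("float32"))
edges = normalized_sobel_edges(img)  # defaults keep the input channel count
with tf.Session() as sess:
    print(sess.run(edges).shape)     # expected: (2, 32, 32, 1)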
Example No. 3
def decode_and_resize(image_str_tensor):
    """Decodes an image string, resizes it, and performs custom equalization."""
    image = tf.image.decode_image(image_str_tensor, channels=IMG_SIZE[2])
    image = tf.reshape(image, IMG_SIZE)

    # Shift bottom of color range to 0
    image = image - tf.reduce_min(image, axis=(0, 1))

    # Divide pixel intensity by some portion of max value
    image = tf.cast(image, dtype=tf.float32)
    image = tf.divide(image, percentile(image, PCT_CUTOFF))

    return image
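decode_and_resize is typically mapped over a batch of encoded image strings, e.g. in a TF 1.x serving input function. IMG_SIZE, PCT_CUTOFF and percentile are not shown above, so the values below are placeholders; a hedged sketch of the wiring:

import tensorflow as tf

# Hypothetical values; the real IMG_SIZE and PCT_CUTOFF are not shown above.
IMG_SIZE = (128, 128, 3)
PCT_CUTOFF = 99.0

image_strings = tf.placeholder(tf.string, shape=[None], name="image_bytes")
images = tf.map_fn(decode_and_resize, image_strings, dtype=tf.float32)
# images has shape [batch, 128, 128, 3], scaled by the percentile cutoff.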
Example No. 4
def stat_tensor(tensors):
    out1 = K.mean(tensors, axis=(1, 2))
    out2 = K.std(tensors, axis=(1, 2))
    out3_5 = percentile(tensors, q=5., axis=(1, 2))
    out3_15 = percentile(tensors, q=15., axis=(1, 2))
    out3_35 = percentile(tensors, q=35., axis=(1, 2))
    out3_50 = percentile(tensors, q=50., axis=(1, 2))
    out3_65 = percentile(tensors, q=65., axis=(1, 2))
    out3_85 = percentile(tensors, q=85., axis=(1, 2))
    out3_95 = percentile(tensors, q=95., axis=(1, 2))
    return K.concatenate(
        [out1, out2,
         out3_5, out3_15, out3_35, out3_50, out3_65, out3_85, out3_95],
        axis=1
    )
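stat_tensor reduces a [batch, H, W, channels] feature map to nine statistics per channel (mean, standard deviation and seven percentiles), concatenated along the feature axis. A sketch of wrapping it in a Keras Lambda layer, assuming K is keras.backend and percentile is tf.contrib.distributions.percentile:

from keras.layers import Input, Lambda
from keras.models import Model

# Hypothetical feature map with 8 channels -> 9 statistics each = 72 features.
feature_map = Input(shape=(32, 32, 8))
stats = Lambda(stat_tensor)(feature_map)   # shape (batch, 72)
model = Model(feature_map, stats)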
Example No. 5
def sample_percentiles(predictor,
                       per=[10, 90],
                       interpolation='nearest',
                       name=None):
    """
    Get the percentiles of the samples of a predictor.

    Parameters
    ----------
    predictor : Tensor
        A tensor of samples, where the first dimension indexes the samples.
    per : list
        A list of the percentiles to calculate from the samples. These must be
        in [0, 100].
    interpolation : string
        The type of interpolation method to use, see
        tf.contrib.distributions.percentile for details.
    name : str
        Name to give this operation.

    Returns
    -------
    percen: Tensor
        A tensor whose first dimension indexes the percentiles, computed along
        the first axis of the input.

    """
    for p in per:
        assert 0 <= p <= 100

    pers = [
        percentile(predictor, p, interpolation=interpolation, axis=0)
        for p in per
    ]

    percen = tf.stack(pers, name=name)
    return percen
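A minimal usage sketch, assuming percentile is tf.contrib.distributions.percentile and the first axis of predictor indexes Monte Carlo samples:

import tensorflow as tf

# Hypothetical: 1000 samples of a length-5 prediction vector.
samples = tf.random_normal([1000, 5])
bounds = sample_percentiles(samples, per=[10, 50, 90], name="pred_bounds")
# bounds has shape [3, 5]: one row per requested percentile.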
Example No. 6
def peak_filter(input):
    N, H, W, C = K.int_shape(input)
    threshold = percentile(input, q=50, axis=(1,2))
    threshold = K.reshape(threshold, [tf.shape(input)[0], 1, 1, C])
    return threshold
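peak_filter returns a per-image, per-channel median that broadcasts against the input, so it can be used to suppress sub-median activations. A sketch under the same assumption that percentile is tf.contrib.distributions.percentile:

from keras import backend as K
from keras.layers import Input, Lambda

# Hypothetical: keep only activations above their per-channel median.
feature_map = Input(shape=(16, 16, 4))
threshold = Lambda(peak_filter)(feature_map)              # shape (batch, 1, 1, 4)
peaks = Lambda(lambda t: t[0] * K.cast(t[0] > t[1], "float32"))(
    [feature_map, threshold])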
Example No. 7
def _percentile(x, interpolation):
    return percentile(x, 50.0, interpolation=interpolation)
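The interpolation argument controls how the median is picked when it falls between two samples; a small comparison, assuming percentile is tf.contrib.distributions.percentile:

import tensorflow as tf

x = tf.constant([1., 2., 3., 4.])
lo = _percentile(x, "lower")         # sample just below the exact median position
hi = _percentile(x, "higher")        # sample just above it
nearest = _percentile(x, "nearest")  # whichever of the two is closer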
Example No. 8
def do_clipped_factorization(
    counts_df,
    rank=3,
    clip_percentile=99.9,
    learning_rate=1.0,
    minibatch_size=1024 * 32,
    patience=5,
    max_epochs=1000,
    normalize_to_reads_per_million=True,
    log_every_seconds=10,
):
    """
    Attempt to detect and correct for clone and sample batch effects by
    subtracting off a learned low-rank reconstruction of the counts matrix.

    The return value is the clones x samples matrix of residuals after
    correcting for batch effects, with a few additional rows and columns giving
    the learned background effects.

    Implements the factorization:

    X = AB
        where X is (clones x samples), A is (clones x rank), and B is
        (rank x samples)

    by minimizing the "clipped" loss:

        ||minimum(X - AB, percentile(X - AB, clip_percentile))||_2 + unclipped

    The minimum is taken elementwise, and ||...||_2 is the Frobenius norm.
    clip_percentile is a parameter giving the percentile to clip at. The
    clipping makes the factorization robust to outliers, some of which are
    likely phip-seq hits.

    If the above is optimized without an `unclipped` term, a few phage clones
    may have all of their residuals above the truncation threshold. Once this
    happens they will likely stay stuck there since they do not contribute to
    the gradient. The unclipped term fixes this by providing a small nudge
    toward smaller errors without truncation.

    Note that "beads-only" samples are not treated in any special way here.

    The optimization is performed using stochastic gradient descent (SGD) on
    tensorflow.

    Parameters
    ----------
    counts_df : pandas.DataFrame
        Matrix of read counts (clones x samples)
    rank : int
        Rank of low-dimensional background effect matrices A and B
    clip_percentile : float
        Elements with reconstruction errors above this percentile do not
        contribute to the gradient. Aim for a lower-bound on the fraction
        of entries you expect NOT to be hits.
    learning_rate : float
        SGD optimizer learning rate
    minibatch_size : int
        Number of rows per SGD minibatch
    patience : int
        Number of epochs without improvement in training loss to tolerate before
        stopping
    max_epochs : int
        Maximum number of epochs
    normalize_to_reads_per_million : boolean
        Before computing factorization, first divide each column by the total
        number of reads for that sample and multiply by 1 million.
    log_every_seconds : float
        Seconds to wait before printing another optimization status update

    Returns
    -------
    pandas.DataFrame : residuals after correcting for batch effects

    In addition to the clones x samples residuals, rows and columns named
    "_background_0", "_background_1", ... giving the learned background vectors
    are also included.
    """

    # Non-tf setup
    if normalize_to_reads_per_million:
        observed = (counts_df * 1e6 / counts_df.sum(0)).astype("float32")
    else:
        observed = counts_df.astype("float32")
    (n, s) = observed.shape
    if len(counts_df) < minibatch_size:
        minibatch_size = len(counts_df)

    # Placeholders
    target = tf.placeholder(name="target", dtype="float32", shape=[None, s])
    minibatch_indices = tf.placeholder(name="minibatch_indices", dtype="int32")

    # Variables
    a = tf.Variable(np.random.rand(n, rank), name="A", dtype="float32")
    b = tf.Variable(np.random.rand(rank, s), name="B", dtype="float32")
    clip_threshold = tf.Variable(observed.max().max())

    # Derived quantities
    reconstruction = tf.matmul(tf.gather(a, minibatch_indices), b)
    differences = target - reconstruction

    # unclipped_term is based only on the minimum unclipped error for each
    # clone. The intuition is that we know for every clone at least one sample
    # must be a non-hit (e.g. a beads only sample), and so should be well modeled
    # by the background process.
    unclipped_term = tf.reduce_min(tf.pow(differences, 2), axis=1)
    loss = (
        tf.reduce_mean(tf.pow(tf.minimum(differences, clip_threshold), 2)) +
        tf.reduce_mean(unclipped_term) / s)

    update_clip_value = clip_threshold.assign(
        percentile(differences, clip_percentile))

    # Training
    train_step = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(loss)

    init = tf.global_variables_initializer()

    best_cost_value = None
    last_log_at = 0
    with tf.Session() as session:
        session.run(init)
        all_indices = np.arange(observed.shape[0], dtype=int)

        for i in range(max_epochs):
            indices = np.array(list(range(observed.shape[0])))
            np.random.shuffle(indices)
            for minibatch_indices_value in np.array_split(
                    indices, int(len(indices) / minibatch_size)):
                minibatch_indices_value = minibatch_indices_value[:
                                                                  minibatch_size]
                if len(minibatch_indices_value) == minibatch_size:
                    feed_dict = {
                        target: observed.values[minibatch_indices_value],
                        minibatch_indices: minibatch_indices_value,
                    }
                    session.run(train_step, feed_dict=feed_dict)

            feed_dict = {target: observed, minibatch_indices: all_indices}
            (clip_threshold_value,
             cost_value) = session.run([update_clip_value, loss],
                                       feed_dict=feed_dict)

            # Update best epoch
            if best_cost_value is None or cost_value < best_cost_value:
                best_cost_value = cost_value
                best_epoch = i
                (best_a, best_b) = session.run([a, b], feed_dict=feed_dict)

            # Log
            if log_every_seconds and time.time(
            ) - last_log_at > log_every_seconds:
                print("[Epoch %5d] %f, truncating at %f%s" % (
                    i,
                    cost_value,
                    clip_threshold_value,
                    " [new best]" if i == best_epoch else "",
                ))

            # Stop criterion
            if i - best_epoch > patience:
                print("Early stopping at epoch %d." % i)
                break

    background_names = ["_background_%d" % i for i in range(rank)]
    best_a = pd.DataFrame(best_a,
                          index=observed.index,
                          columns=background_names)
    best_b = pd.DataFrame(best_b,
                          index=background_names,
                          columns=observed.columns)

    results = observed - np.matmul(best_a, best_b)
    for name in background_names:
        results[name] = best_a[name]
        results.loc[name] = best_b.loc[name]

    return results
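A hedged smoke test on synthetic counts; the labels, rank and epoch settings below are placeholders rather than values from the original project:

import numpy as np
import pandas as pd

# Hypothetical toy matrix: 200 clones x 6 samples of Poisson read counts.
counts = pd.DataFrame(
    np.random.poisson(100, size=(200, 6)),
    index=["clone_%d" % i for i in range(200)],
    columns=["sample_%d" % j for j in range(6)])
residuals = do_clipped_factorization(
    counts, rank=2, minibatch_size=64, max_epochs=50)
# residuals keeps the clones x samples layout plus "_background_*" rows/columns.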
Example No. 9
def _distribution(self, input_tensor):
    """Reduces a process * batch * actions tensor to batch * actions."""
    return tcd.percentile(input_tensor, self.flags.percentile, axis=0)
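Here tcd is presumably tf.contrib.distributions and self.flags.percentile a configured scalar; a stand-alone equivalent under those assumptions:

import tensorflow as tf
from tensorflow.contrib import distributions as tcd

# Hypothetical: 8 processes x batch of 32 x 4 actions, aggregated at the median.
values = tf.random_normal([8, 32, 4])
aggregated = tcd.percentile(values, 50.0, axis=0)   # shape: [32, 4]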
Example No. 10
def train(with_gan=True, load_x=True, with_y=True, match_mask=False):
    """Train ring_net for a number of steps."""
    with tf.Graph().as_default():
        x_all = tf.placeholder(tf.float32, [None, FLAGS.seq_length, 512, 1])
        if match_mask: with_gan = False
        # possible dropout inside
        keep_prob = tf.placeholder("float")
        #x_dropout = tf.nn.dropout(x, keep_prob)

        x_in = x_all[:, :FLAGS.seq_start, :, :]
        # conv network
        encoder_state = None
        past_state = None
        future_state = None
        x_1, y_1, encoder_state, past_state, future_state = network_template(
            x_in, encoder_state, past_state, future_state)
        if not match_mask:
            y = x_all[:, FLAGS.seq_start:, :, :]
            x = x_all[:, :FLAGS.seq_start, :, :]
            past_loss_l2 = tf.nn.l2_loss(x - x_1)
            future_loss_l2 = tf.nn.l2_loss(y - y_1)
        else:
            x_mask = x_all > percentile(x_all, q=95.)
            x_mask = tf.one_hot(tf.cast(x_mask, tf.int32), depth=2, axis=-1)
            x_logit = tf.stack([x_1, 1. / x_1], axis=-1)
            y_logit = tf.stack([y_1, 1. / y_1], axis=-1)
            x_1 = tf.nn.softmax(logits=x_logit)
            y_1 = tf.nn.softmax(logits=y_logit)
            y = x_mask[:, FLAGS.seq_start:, :, :]
            x = x_mask[:, :FLAGS.seq_start, :, :]
            past_loss_l2 = tf.reduce_sum(
                tf.nn.softmax_cross_entropy_with_logits(logits=x_logit,
                                                        labels=x))
            future_loss_l2 = tf.reduce_sum(
                tf.nn.softmax_cross_entropy_with_logits(logits=y_logit,
                                                        labels=y))
            #import IPython; IPython.embed()
        if with_gan:
            img = x_all[:, FLAGS.seq_start:, :, :]
            img_ = y_1
            #import IPython; IPython.embed()
            D, D_logits, D3 = discriminator(img, reuse=False)
            #import IPython; IPython.embed()
            D_, D_logits_, D3_ = discriminator(
                y_1, reuse=True, fc_shape=D3.get_shape().as_list())
            d_loss_real = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=D_logits, labels=tf.ones_like(D)))
            d_loss_fake = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=D_logits_, labels=tf.zeros_like(D_)))
            d_loss = d_loss_real + d_loss_fake
            g_loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=D_logits_, labels=tf.ones_like(D_)))
            D3_loss = tf.nn.l2_loss(D3 - D3_)
            t_vars = tf.trainable_variables()
            d_vars = [var for var in t_vars if 'd_' in var.name]
            g_vars = [var for var in t_vars if 'd_' not in var.name]
            tf.summary.scalar('loss_g', g_loss)
            tf.summary.scalar('loss_d', d_loss)
            tf.summary.scalar('loss_feature', D3_loss)
            loss = 0.05 * (past_loss_l2 +
                           future_loss_l2) + g_loss + D3_loss * 1.e-4
            tf.summary.scalar('past_loss_l2', past_loss_l2)
            tf.summary.scalar('future_loss_l2', future_loss_l2)
            d_optim = tf.train.AdamOptimizer(FLAGS.lr).minimize(
                d_loss, var_list=d_vars)
            g_optim = tf.train.AdamOptimizer(FLAGS.lr).minimize(
                loss, var_list=g_vars)
            #import IPython; IPython.embed()
            train_op = tf.group(d_optim, g_optim)

        else:
            loss = past_loss_l2 + future_loss_l2
            tf.summary.scalar('loss', loss)

            # training
            optimizer = tf.train.AdamOptimizer(FLAGS.lr)
            gvs = optimizer.compute_gradients(loss)
            # gradient clipping
            capped_gvs = [(tf.clip_by_value(grad, -3., 3.), var)
                          for grad, var in gvs]
            train_op = optimizer.apply_gradients(capped_gvs)

        # List of all Variables
        variables = tf.global_variables()

        # Build a saver
        saver = tf.train.Saver(tf.global_variables())

        # Summary op
        summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph.
        sess = tf.Session()

        # init if this is the very first time training

        sess.run(init)
        if FLAGS.resume:
            latest = tf.train.latest_checkpoint(FLAGS.train_dir)
            if not latest:
                print("No checkpoint to continue from in", FLAGS.train_dir)
                sys.exit(1)
            print("resume", latest)
            saver.restore(sess, latest)
        else:
            print("init network from scratch")

        # Summary op
        graph_def = sess.graph.as_graph_def(add_shapes=True)
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph_def=graph_def)
        if not with_y:
            files = find_files(FLAGS.train_data_index)
        else:
            files = find_pairs(FLAGS.train_data_index)
        sample_dir = FLAGS.train_dir + '/samples/'
        if not os.path.exists(sample_dir):
            os.makedirs(sample_dir)
        for step in range(FLAGS.max_step):
            dat = load_batch(FLAGS.batch_size,
                             files,
                             step,
                             with_y=with_y,
                             normalize=FLAGS.norm_input)
            dat = random_flip(dat)
            t = time.time()
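            # NOTE: the loop below assumes the GAN branch (with_gan=True); g_loss,
            # d_loss, g_optim and d_optim are only defined in that case.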
            errG, errD = sess.run([g_loss, d_loss],
                                  feed_dict={
                                      x_all: dat,
                                      keep_prob: FLAGS.keep_prob
                                  })
            if errG > 0.6 and errD > 0.6:
                _, loss_r = sess.run([train_op, loss],
                                     feed_dict={
                                         x_all: dat,
                                         keep_prob: FLAGS.keep_prob
                                     })
            else:
                i = 0
                while errG > 0.6:

                    _ = sess.run(g_optim,
                                 feed_dict={
                                     x_all: dat,
                                     keep_prob: FLAGS.keep_prob
                                 })
                    i += 1
                    if i > 2: break
                    else:
                        errG = sess.run(g_loss,
                                        feed_dict={
                                            x_all: dat,
                                            keep_prob: FLAGS.keep_prob
                                        })
                print('G', i, errG)

                i = 0
                while errD > 0.6:
                    # keep updating the discriminator while its loss is above the 0.6 bound (at most 3 extra steps)
                    _ = sess.run(d_optim,
                                 feed_dict={
                                     x_all: dat,
                                     keep_prob: FLAGS.keep_prob
                                 })
                    i += 1
                    if i > 2: break
                    else:
                        errD = sess.run(d_loss,
                                        feed_dict={
                                            x_all: dat,
                                            keep_prob: FLAGS.keep_prob
                                        })
                print('D', i, errD)
                loss_r = sess.run(loss,
                                  feed_dict={
                                      x_all: dat,
                                      keep_prob: FLAGS.keep_prob
                                  })
            #_, loss_r = sess.run([train_op, loss],feed_dict={x:dat, keep_prob:FLAGS.keep_prob})
            elapsed = time.time() - t

            if step % 1000 == 0 and step != 0:
                summary_str = sess.run(summary_op,
                                       feed_dict={
                                           x_all: dat,
                                           keep_prob: FLAGS.keep_prob
                                       })
                summary_writer.add_summary(summary_str, step)
                print("time per batch is " + str(elapsed))
                print(step)
                print(loss_r)

            assert not np.isnan(loss_r), 'Model diverged with loss = NaN'

            if step % 4000 == 0:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
                print("saved to " + FLAGS.train_dir)

                print("now saving sample!")
                im_x, im_y = sess.run([x_1, y_1],
                                      feed_dict={
                                          x_all: dat,
                                          keep_prob: FLAGS.keep_prob
                                      })
                if match_mask:
                    im_x = im_x[..., 1]
                    im_y = im_y[..., 1]
                _plot_samples(dat[:, :FLAGS.seq_start, :, :].squeeze(),
                              sample_dir + 'step_{}_past_t.png'.format(step))
                _plot_samples(im_x.squeeze(),
                              sample_dir + 'step_{}_past.png'.format(step))
                _plot_samples(dat[:, FLAGS.seq_start:, :, :].squeeze(),
                              sample_dir + 'step_{}_future_t.png'.format(step))
                _plot_samples(im_y.squeeze(),
                              sample_dir + 'step_{}_future.png'.format(step))
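The match_mask branch of the example above binarizes the signal at the 95th percentile; in isolation (assuming percentile is tf.contrib.distributions.percentile) that masking step looks like:

import tensorflow as tf

# Hypothetical mini-batch standing in for x_all.
x_demo = tf.random_uniform([2, 10, 512, 1])
mask_bool = x_demo > percentile(x_demo, q=95.)                   # top ~5% of values
mask_onehot = tf.one_hot(tf.cast(mask_bool, tf.int32), depth=2, axis=-1)
# mask_onehot has shape [2, 10, 512, 1, 2]: channel 0 = background, 1 = peak.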
Example No. 11
def summary_percentiles(x, percents):

    for p in percents:
        name = "percentile_" + str(p)
        tf.summary.scalar(name, percentile(x, p))
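A usage sketch in a TF 1.x graph, assuming percentile is tf.contrib.distributions.percentile and summaries are collected with tf.summary.merge_all:

import tensorflow as tf

activations = tf.random_normal([128, 64])
summary_percentiles(activations, percents=[5, 50, 95])
merged = tf.summary.merge_all()   # includes percentile_5, percentile_50, percentile_95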