Example #1
    def test_tensorboard(self):
        """Test creating an Estimator from a TensorGraph that logs information to TensorBoard."""
        n_samples = 10
        n_features = 3
        n_tasks = 2

        # Create a dataset and an input function for processing it.

        np.random.seed(123)
        X = np.random.rand(n_samples, n_features)
        y = np.zeros((n_samples, n_tasks))
        dataset = dc.data.NumpyDataset(X, y)

        def input_fn(epochs):
            x, y, weights = dataset.make_iterator(batch_size=n_samples,
                                                  epochs=epochs).get_next()
            return {'x': x, 'weights': weights}, y

        # Create a TensorGraph model.

        model = dc.models.TensorGraph()
        features = layers.Feature(shape=(None, n_features))
        dense = layers.Dense(out_channels=n_tasks, in_layers=features)
        dense.set_summary('histogram')
        model.add_output(dense)
        labels = layers.Label(shape=(None, n_tasks))
        loss = layers.ReduceMean(layers.L2Loss(in_layers=[labels, dense]))
        model.set_loss(loss)

        # Create an estimator from it.

        x_col = tf.feature_column.numeric_column('x', shape=(n_features, ))
        estimator = model.make_estimator(feature_columns=[x_col])

        # Train the model.

        estimator.train(input_fn=lambda: input_fn(100))
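
The estimator returned by make_estimator() is a standard tf.estimator.Estimator, so the same input function can drive evaluation. A minimal sketch of a follow-up check (the single-epoch evaluation pass is an arbitrary choice, not part of the original test):

        # Evaluate on one pass over the data. evaluate() reports the average
        # loss even when no extra metrics are configured.
        results = estimator.evaluate(input_fn=lambda: input_fn(1))
        print(results['loss'])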
Example #2
  def __init__(self,
               input_tokens,
               output_tokens,
               max_output_length,
               encoder_layers=4,
               decoder_layers=4,
               embedding_dimension=512,
               dropout=0.0,
               reverse_input=True,
               variational=False,
               annealing_start_step=5000,
               annealing_final_step=10000,
               **kwargs):
    """Construct a SeqToSeq model.

    In addition to the following arguments, this class also accepts all the keyword arguments
    from TensorGraph.

    Parameters
    ----------
    input_tokens: list
      a list of all tokens that may appear in input sequences
    output_tokens: list
      a list of all tokens that may appear in output sequences
    max_output_length: int
      the maximum length of output sequence that may be generated
    encoder_layers: int
      the number of recurrent layers in the encoder
    decoder_layers: int
      the number of recurrent layers in the decoder
    embedding_dimension: int
      the width of the embedding vector.  This also is the width of all
      recurrent layers.
    dropout: float
      the dropout probability to use during training
    reverse_input: bool
      if True, reverse the order of input sequences before sending them into
      the encoder.  This can improve performance when working with long sequences.
    variational: bool
      if True, train the model as a variational autoencoder.  This adds random
      noise to the encoder, and also constrains the embedding to follow a unit
      Gaussian distribution.
    annealing_start_step: int
      the step (that is, batch) at which to begin turning on the constraint term
      for KL cost annealing
    annealing_final_step: int
      the step (that is, batch) at which to finish turning on the constraint term
      for KL cost annealing
    """
    super(SeqToSeq, self).__init__(
        use_queue=False, **kwargs)  # TODO can we make it work with the queue?
    if SeqToSeq.sequence_end not in input_tokens:
      input_tokens = input_tokens + [SeqToSeq.sequence_end]
    if SeqToSeq.sequence_end not in output_tokens:
      output_tokens = output_tokens + [SeqToSeq.sequence_end]
    self._input_tokens = input_tokens
    self._output_tokens = output_tokens
    self._input_dict = dict((x, i) for i, x in enumerate(input_tokens))
    self._output_dict = dict((x, i) for i, x in enumerate(output_tokens))
    self._max_output_length = max_output_length
    self._embedding_dimension = embedding_dimension
    self._annealing_final_step = annealing_final_step
    self._annealing_start_step = annealing_start_step
    self._features = self._create_features()
    self._labels = layers.Label(shape=(None, None, len(output_tokens)))
    self._gather_indices = layers.Feature(
        shape=(self.batch_size, 2), dtype=tf.int32)
    self._reverse_input = reverse_input
    self._variational = variational
    self.embedding = self._create_encoder(encoder_layers, dropout)
    self.output = self._create_decoder(decoder_layers, dropout)
    self.set_loss(self._create_loss())
    self.add_output(self.output)
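
As a usage illustration (not from the original source; it assumes the class is exposed as dc.models.SeqToSeq and that the fit_sequences / predict_from_sequences helpers exist, as they do in DeepChem), an autoencoder over short character sequences might be set up like this:

import deepchem as dc

tokens = list('ACGT')
model = dc.models.SeqToSeq(input_tokens=tokens,
                           output_tokens=tokens,
                           max_output_length=20,
                           encoder_layers=2,
                           decoder_layers=2,
                           embedding_dimension=128,
                           variational=True)

# fit_sequences() consumes (input_sequence, output_sequence) pairs;
# an autoencoder simply maps each sequence to itself.
sequences = ['ACGT', 'TTGA', 'CATG']
model.fit_sequences((s, s) for s in sequences)
print(model.predict_from_sequences(sequences))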
Example #3
  def _create_features(self):
    return layers.Feature(
        shape=(self.batch_size, self._max_output_length,
               len(self._input_tokens)))
Example #4
  def _create_features(self):
    return layers.Feature(shape=(None, None, len(self._input_tokens)))
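
The two variants differ only in shape: the first pins the batch size and sequence length (useful when downstream ops, such as the Gather indices in __init__ above, need static dimensions), while the second leaves both dynamic so any batch size and sequence length are accepted. For instance (sizes arbitrary):

# Fixed: every fed batch must be exactly (100, 50, 4).
fixed = layers.Feature(shape=(100, 50, 4))
# Dynamic: only the token-vector width (4) is constrained.
dynamic = layers.Feature(shape=(None, None, 4))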
Example #5
# Train a model to predict how well sequences will work for RNA interference.

import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
import tensorflow as tf
import matplotlib.pyplot as plot

# Build the model.

model = dc.models.TensorGraph(model_dir='rnai')
features = layers.Feature(shape=(None, 21, 4))
labels = layers.Label(shape=(None, 1))
prev = features
for i in range(2):
    prev = layers.Conv1D(filters=10,
                         kernel_size=10,
                         activation=tf.nn.relu,
                         padding='same',
                         in_layers=prev)
    prev = layers.Dropout(dropout_prob=0.3, in_layers=prev)
output = layers.Dense(out_channels=1,
                      activation_fn=tf.sigmoid,
                      in_layers=layers.Flatten(prev))
model.add_output(output)
loss = layers.ReduceMean(layers.L2Loss(in_layers=[labels, output]))
model.set_loss(loss)

# Load the data.

train = dc.data.DiskDataset('train_siRNA')
valid = dc.data.DiskDataset('valid_siRNA')
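
With the datasets loaded, training and evaluation would typically follow. A sketch (the epoch count and metric choice are placeholders, not from the original script):

# Train, then check how well predictions correlate with the labels.
model.fit(train, nb_epoch=20)
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print('train:', model.evaluate(train, [metric]))
print('valid:', model.evaluate(valid, [metric]))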
Example #6
# (The snippet begins mid-script: image_dir, label_dir, rows, blurs, files,
# and labels are assumed to be defined earlier; this loop header is a
# reconstruction.)
for f in os.listdir(label_dir):
    if f.endswith('.TIF'):
        for row, blur in zip(rows, blurs):
            fname = f.replace('_F1', '_F%d' % blur).replace('_A', '_%s' % row)
            files.append(os.path.join(image_dir, fname))
            labels.append(os.path.join(label_dir, f))
loader = dc.data.ImageLoader()
dataset = loader.featurize(files, labels)
splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset, seed=123)

# Create the model.
learning_rate = dc.models.optimizers.ExponentialDecay(0.01, 0.9, 250)
model = dc.models.TensorGraph(learning_rate=learning_rate,
                              model_dir='models/segmentation')
features = layers.Feature(shape=(None, 520, 696, 1)) / 255.0
labels = layers.Label(shape=(None, 520, 696, 1)) / 255.0
# Downsample three times.
conv1 = layers.Conv2D(16, kernel_size=5, stride=2, in_layers=features)
conv2 = layers.Conv2D(32, kernel_size=5, stride=2, in_layers=conv1)
conv3 = layers.Conv2D(64, kernel_size=5, stride=2, in_layers=conv2)
# Do a 1x1 convolution.
conv4 = layers.Conv2D(64, kernel_size=1, stride=1, in_layers=conv3)
# Upsample three times.
concat1 = layers.Concat(in_layers=[conv3, conv4], axis=3)
deconv1 = layers.Conv2DTranspose(32,
                                 kernel_size=5,
                                 stride=2,
                                 in_layers=concat1)
concat2 = layers.Concat(in_layers=[conv2, deconv1], axis=3)
deconv2 = layers.Conv2DTranspose(16,
                                 kernel_size=5,
                                 stride=2,
                                 in_layers=concat2)
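
The snippet ends mid-model. By symmetry with the downsampling path, a plausible completion (final upsampling step, per-pixel sigmoid output, and a cross-entropy loss) might look like the sketch below; these layers are an assumption, not the original code:

# Assumed continuation -- a sketch, not the original script.
concat3 = layers.Concat(in_layers=[conv1, deconv2], axis=3)
deconv3 = layers.Conv2DTranspose(1, kernel_size=5, stride=2, in_layers=concat3)
concat4 = layers.Concat(in_layers=[features, deconv3], axis=3)
# Compute per-pixel logits, then squash them to probabilities.
logits = layers.Conv2D(1, kernel_size=5, stride=1,
                       activation_fn=None, in_layers=concat4)
output = layers.Sigmoid(logits)
model.add_output(output)
loss = layers.ReduceMean(layers.SigmoidCrossEntropy(in_layers=[labels, logits]))
model.set_loss(loss)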
Example #7
  def __init__(self, **kwargs):
    """Construct a GAN.

    This class accepts all the keyword arguments from TensorGraph.
    """
    super(GAN, self).__init__(use_queue=False, **kwargs)

    # Create the inputs.

    self.noise_input = layers.Feature(shape=self.get_noise_input_shape())
    self.data_inputs = []
    for shape in self.get_data_input_shapes():
      self.data_inputs.append(layers.Feature(shape=shape))
    self.conditional_inputs = []
    for shape in self.get_conditional_input_shapes():
      self.conditional_inputs.append(layers.Feature(shape=shape))

    # Create the generator.

    self.generator = self.create_generator(self.noise_input,
                                           self.conditional_inputs)
    if not isinstance(self.generator, Sequence):
      raise ValueError('create_generator() must return a list of Layers')
    if len(self.generator) != len(self.data_inputs):
      raise ValueError(
          'The number of generator outputs must match the number of data inputs'
      )
    for g, d in zip(self.generator, self.data_inputs):
      if g.shape != d.shape:
        raise ValueError(
            'The shapes of the generator outputs must match the shapes of the data inputs'
        )
    for g in self.generator:
      self.add_output(g)

    # Create the discriminator.

    self.discrim_train = self.create_discriminator(self.data_inputs,
                                                   self.conditional_inputs)

    # Make a copy of the discriminator that takes the generator's output as
    # its input.

    replacements = {}
    for g, d in zip(self.generator, self.data_inputs):
      replacements[d] = g
    for c in self.conditional_inputs:
      replacements[c] = c
    self.discrim_gen = self.discrim_train.copy(replacements, shared=True)

    # Make a list of all layers in the generator and discriminator.

    def add_layers_to_set(layer, layers):
      if layer not in layers:
        layers.add(layer)
        for i in layer.in_layers:
          add_layers_to_set(i, layers)

    gen_layers = set()
    for layer in self.generator:
      add_layers_to_set(layer, gen_layers)
    discrim_layers = set()
    add_layers_to_set(self.discrim_train, discrim_layers)
    discrim_layers -= gen_layers

    # Create submodels for training the generator and discriminator.

    gen_loss = self.create_generator_loss(self.discrim_gen)
    discrim_loss = self.create_discriminator_loss(self.discrim_train,
                                                  self.discrim_gen)
    self.generator_submodel = self.create_submodel(
        layers=gen_layers, loss=gen_loss)
    self.discriminator_submodel = self.create_submodel(
        layers=discrim_layers, loss=discrim_loss)
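
As a usage illustration, a minimal subclass might implement the abstract methods like this. Everything here (class name, shapes, layer sizes, module path) is hypothetical, and it assumes the base class supplies default generator/discriminator losses, as DeepChem's gan.py does; otherwise create_generator_loss and create_discriminator_loss would need overriding as well:

import tensorflow as tf
import deepchem.models.tensorgraph.layers as layers
from deepchem.models.tensorgraph.models.gan import GAN  # path assumed from DeepChem 2.x

class ExampleGAN(GAN):
  """A toy GAN over 2D points; all sizes here are made up for illustration."""

  def get_noise_input_shape(self):
    return (None, 10)

  def get_data_input_shapes(self):
    return [(None, 2)]

  def get_conditional_input_shapes(self):
    return []

  def create_generator(self, noise_input, conditional_inputs):
    # Must return one output layer per data input, with matching shapes.
    hidden = layers.Dense(32, activation_fn=tf.nn.relu, in_layers=noise_input)
    return [layers.Dense(2, in_layers=hidden)]

  def create_discriminator(self, data_inputs, conditional_inputs):
    # Probability that a sample came from the training data.
    hidden = layers.Dense(32, activation_fn=tf.nn.relu, in_layers=data_inputs)
    return layers.Dense(1, activation_fn=tf.sigmoid, in_layers=hidden)

Training then typically goes through the GAN-specific fit method (fit_gan in DeepChem), which alternates updates of the generator and discriminator submodels.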
Example #8
    def __init__(self, n_generators=1, n_discriminators=1, **kwargs):
        """Construct a GAN.

    In addition to the parameters listed below, this class accepts all the
    keyword arguments from TensorGraph.

    Parameters
    ----------
    n_generators: int
      the number of generators to include
    n_discriminators: int
      the number of discriminators to include
    """
        super(GAN, self).__init__(use_queue=False, **kwargs)
        self.n_generators = n_generators
        self.n_discriminators = n_discriminators

        # Create the inputs.

        self.noise_input = layers.Feature(shape=self.get_noise_input_shape())
        self.data_inputs = []
        for shape in self.get_data_input_shapes():
            self.data_inputs.append(layers.Feature(shape=shape))
        self.conditional_inputs = []
        for shape in self.get_conditional_input_shapes():
            self.conditional_inputs.append(layers.Feature(shape=shape))

        # Create the generators.

        self.generators = []
        for i in range(n_generators):
            generator = self.create_generator(self.noise_input,
                                              self.conditional_inputs)
            if not isinstance(generator, Sequence):
                raise ValueError(
                    'create_generator() must return a list of Layers')
            if len(generator) != len(self.data_inputs):
                raise ValueError(
                    'The number of generator outputs must match the number of data inputs'
                )
            for g, d in zip(generator, self.data_inputs):
                if g.shape != d.shape:
                    raise ValueError(
                        'The shapes of the generator outputs must match the shapes of the data inputs'
                    )
            for g in generator:
                self.add_output(g)
            self.generators.append(generator)

        # Create the discriminators.

        self.discrim_train = []
        self.discrim_gen = []
        for i in range(n_discriminators):
            discrim_train = self.create_discriminator(self.data_inputs,
                                                      self.conditional_inputs)
            self.discrim_train.append(discrim_train)

            # Make a copy of the discriminator that takes each generator's output as
            # its input.

            for generator in self.generators:
                replacements = {}
                for g, d in zip(generator, self.data_inputs):
                    replacements[d] = g
                for c in self.conditional_inputs:
                    replacements[c] = c
                discrim_gen = discrim_train.copy(replacements, shared=True)
                self.discrim_gen.append(discrim_gen)

        # Make a list of all layers in the generators and discriminators.

        def add_layers_to_set(layer, layers):
            if layer not in layers:
                layers.add(layer)
                for i in layer.in_layers:
                    add_layers_to_set(i, layers)

        gen_layers = set()
        for generator in self.generators:
            for layer in generator:
                add_layers_to_set(layer, gen_layers)
        discrim_layers = set()
        for discriminator in self.discrim_train:
            add_layers_to_set(discriminator, discrim_layers)
        discrim_layers -= gen_layers

        # Compute the loss functions.

        gen_losses = [self.create_generator_loss(d) for d in self.discrim_gen]
        discrim_losses = []
        for i in range(n_discriminators):
            for j in range(n_generators):
                discrim_losses.append(
                    self.create_discriminator_loss(
                        self.discrim_train[i],
                        self.discrim_gen[i * n_generators + j]))
        if n_generators == 1 and n_discriminators == 1:
            total_gen_loss = gen_losses[0]
            total_discrim_loss = discrim_losses[0]
        else:
            # Create learnable weights for the generators and discriminators.

            gen_alpha = layers.Variable(np.ones((1, n_generators)))
            gen_weights = layers.SoftMax(gen_alpha)
            discrim_alpha = layers.Variable(np.ones((1, n_discriminators)))
            discrim_weights = layers.SoftMax(discrim_alpha)

            # Compute the weighted errors

            weight_products = layers.Reshape(
                (n_generators * n_discriminators, ),
                in_layers=layers.Reshape(
                    (n_discriminators, 1), in_layers=discrim_weights) *
                layers.Reshape((1, n_generators), in_layers=gen_weights))
            total_gen_loss = layers.WeightedError(
                (layers.Stack(gen_losses, axis=0), weight_products))
            total_discrim_loss = layers.WeightedError(
                (layers.Stack(discrim_losses, axis=0), weight_products))
            gen_layers.add(gen_alpha)
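            # The discriminator loss also depends on gen_weights (through the
            # entropy term added below), so gen_alpha must be trainable in the
            # discriminator submodel too.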
            discrim_layers.add(gen_alpha)
            discrim_layers.add(discrim_alpha)

            # Add an entropy term to the loss.

            entropy = -(layers.ReduceSum(layers.Log(gen_weights)) /
                        n_generators + layers.ReduceSum(
                            layers.Log(discrim_weights)) / n_discriminators)
            total_discrim_loss += entropy

        # Create submodels for training the generators and discriminators.

        self.generator_submodel = self.create_submodel(layers=gen_layers,
                                                       loss=total_gen_loss)
        self.discriminator_submodel = self.create_submodel(
            layers=discrim_layers, loss=total_discrim_loss)
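
The Reshape arithmetic above forms the outer product of the two weight vectors and flattens it, so entry i * n_generators + j weights the loss pairing discriminator i with generator j, matching the order in which discrim_losses was built. The same computation in plain NumPy, for concreteness (sizes arbitrary):

import numpy as np

n_generators, n_discriminators = 2, 3
# With equal alphas, SoftMax yields uniform weights.
gen_weights = np.full((1, n_generators), 1.0 / n_generators)
discrim_weights = np.full((1, n_discriminators), 1.0 / n_discriminators)

# (n_d, 1) * (1, n_g) broadcasts to an outer product of shape (n_d, n_g);
# flattening row-major puts the (i, j) pair at index i * n_generators + j.
weight_products = (discrim_weights.reshape(n_discriminators, 1) *
                   gen_weights.reshape(1, n_generators)).reshape(-1)
print(weight_products)  # six entries, each 1/6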
Example #9
# Train a model to predict transcription factor binding, based on both
# sequence and chromatin accessibility.

import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
import tensorflow as tf
import numpy as np

# Build the model.

model = dc.models.TensorGraph(batch_size=1000, model_dir='chromatin')
features = layers.Feature(shape=(None, 101, 4))
accessibility = layers.Feature(shape=(None, 1))
labels = layers.Label(shape=(None, 1))
weights = layers.Weights(shape=(None, 1))
prev = features
for i in range(3):
    prev = layers.Conv1D(filters=15,
                         kernel_size=10,
                         activation=tf.nn.relu,
                         padding='same',
                         in_layers=prev)
    prev = layers.Dropout(dropout_prob=0.5, in_layers=prev)
prev = layers.Concat([layers.Flatten(prev), accessibility])
logits = layers.Dense(out_channels=1, in_layers=prev)
output = layers.Sigmoid(logits)
model.add_output(output)
loss = layers.SigmoidCrossEntropy(in_layers=[labels, logits])
weighted_loss = layers.WeightedError(in_layers=[loss, weights])
model.set_loss(weighted_loss)

# Load the data.

train = dc.data.DiskDataset('train_dataset')
valid = dc.data.DiskDataset('valid_dataset')
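
Training and evaluation would follow the usual TensorGraph pattern; a sketch (epoch count and metric are placeholders, not from the original script):

# Train, then measure ROC AUC on the validation set.
model.fit(train, nb_epoch=10)
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
print(model.evaluate(valid, [metric]))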
Example #10
import deepchem as dc
import tensorflow as tf
import deepchem.models.tensorgraph.layers as layers

from tensorflow.examples.tutorials.mnist import input_data

# Read dataset
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Transform the dataset into a format readable by DeepChem
train_dataset = dc.data.NumpyDataset(mnist.train.images, mnist.train.labels)
test_dataset = dc.data.NumpyDataset(mnist.test.images, mnist.test.labels)

model = dc.models.TensorGraph(model_dir='mnist')

# Images in MNIST are 28x28, flattened to vectors of length 784
# None means the first (batch) dimension can have any size
feature = layers.Feature(shape=(None, 784))
# 0..9 digits
label = layers.Label(shape=(None, 10))
# Reshape the flattened vectors back into 28x28 images so we can apply convolutions
make_image = layers.Reshape(shape=(None, 28, 28), in_layers=feature)

conv2d_1 = layers.Conv2D(num_outputs=32,
                         activation_fn=tf.nn.relu,
                         in_layers=make_image)
conv2d_2 = layers.Conv2D(num_outputs=64,
                         activation_fn=tf.nn.relu,
                         in_layers=conv2d_1)

flatten = layers.Flatten(in_layers=conv2d_2)
dense1 = layers.Dense(out_channels=1024,
                      activation_fn=tf.nn.relu,
                      in_layers=flatten)
Example #11
import os
import re
import numpy as np
import deepchem as dc
import deepchem.models.tensorgraph.layers as layers

image_dir = 'BBBC005_v1_images'
files = []
labels = []
for f in os.listdir(image_dir):
  if f.endswith('.TIF'):
    files.append(os.path.join(image_dir, f))
    labels.append(int(re.findall('_C(.*?)_', f)[0]))
loader = dc.data.ImageLoader()
dataset = loader.featurize(files, np.array(labels))
splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset, seed=123)

# Create the model.
learning_rate = dc.models.tensorgraph.optimizers.ExponentialDecay(0.001, 0.9, 250)
model = dc.models.TensorGraph(learning_rate=learning_rate, model_dir='models/model')
features = layers.Feature(shape=(None, 520, 696))
labels = layers.Label(shape=(None,))
prev_layer = features
for num_outputs in [16, 32, 64, 128, 256]:
  prev_layer = layers.Conv2D(num_outputs, kernel_size=5, stride=2, in_layers=prev_layer)
output = layers.Dense(1, in_layers=layers.Flatten(prev_layer))
model.add_output(output)
loss = layers.ReduceSum(layers.L2Loss(in_layers=(output, labels)))
model.set_loss(loss)

if not os.path.exists('./models'):
  os.mkdir('models')
if not os.path.exists('./models/model'):
  os.mkdir('models/model')

if not RETRAIN:
  # The snippet breaks off here; RETRAIN is assumed to be defined earlier in
  # the original script, and restoring previously saved weights (rather than
  # retraining) is an assumed continuation.
  model.restore()
Example #12
import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

def create_model():
    """
    Create our own MNIST model from scratch
    :return:
    :rtype:
    """
    mnist = input_data.read_data_sets("MNIST_DATA/", one_hot=True)

    # DeepChem's layers are the building blocks we will use to assemble our deep learning architecture

    # now we wrap our dataset into a NumpyDataset

    train_dataset = dc.data.NumpyDataset(mnist.train.images,
                                         mnist.train.labels)
    test_dataset = dc.data.NumpyDataset(mnist.test.images, mnist.test.labels)

    # we will create a model that takes an input and stacks multiple layers,
    # each taking its input from the previous one.

    model = dc.models.TensorGraph(model_dir='mnist')

    # 784 corresponds to a flattened image of size 28x28
    # 10 corresponds to the 10 possible digits (0-9)
    # None indicates that the batch dimension can have any size
    # the labels are one-hot encoded: a single element is set to 1 and the rest to 0
    feature = layers.Feature(shape=(None, 784))
    labels = layers.Label(shape=(None, 10))

    # to apply convolutional layers to the input, we reshape each flat vector of length 784 into a 28x28 image
    # in_layers means this layer takes our feature layer as input
    make_image = layers.Reshape(shape=(None, 28, 28), in_layers=feature)

    # now that we have reshaped the input, we pass to convolution layers

    conv2d_1 = layers.Conv2D(num_outputs=32,
                             activation_fn=tf.nn.relu,
                             in_layers=make_image)

    conv2d_2 = layers.Conv2D(num_outputs=64,
                             activation_fn=tf.nn.relu,
                             in_layers=conv2d_1)

    # we want to end by applying fully connected (Dense) layers to the outputs of our convolutional layer
    # but first, we must flatten the layer from a 2d matrix to a 1d vector

    flatten = layers.Flatten(in_layers=conv2d_2)
    dense1 = layers.Dense(out_channels=1024,
                          activation_fn=tf.nn.relu,
                          in_layers=flatten)

    # out_channels=10 gives one logit per digit class; no activation_fn here,
    # since the softmax and the loss are applied to these logits below
    dense2 = layers.Dense(out_channels=10,
                          activation_fn=None,
                          in_layers=dense1)

    # next we connect these logits to a loss function so the model can be trained

    # compute the loss for every sample, then average over all samples to get the final loss (ReduceMean)
    smce = layers.SoftMaxCrossEntropy(in_layers=[labels, dense2])
    loss = layers.ReduceMean(in_layers=smce)
    model.set_loss(loss)

    # for MNIST we want the probability that a given sample represents each of the 10 digits
    # a softmax over the logits turns them into those probabilities

    output = layers.SoftMax(in_layers=dense2)
    model.add_output(output)

    # if the model takes too long to train, reduce nb_epoch to 1
    model.fit(train_dataset, nb_epoch=10)

    # our metric is accuracy, the fraction of labels that are correctly predicted
    metric = dc.metrics.Metric(dc.metrics.accuracy_score)

    train_scores = model.evaluate(train_dataset, [metric])
    test_scores = model.evaluate(test_dataset, [metric])

    print('train_scores', train_scores)
    print('test_scores', test_scores)
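
Since the whole tutorial lives in a function, a standard main guard (an addition for convenience, not part of the original snippet) runs it end to end:

if __name__ == '__main__':
    create_model()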