def get_variable_initializer(hparams):
    """Return the TF variable initializer selected by `hparams.initializer`.

    Args:
      hparams: object exposing `initializer` (a name, or a falsy value) and
        `initializer_gain`.

    Returns:
      None when `hparams.initializer` is falsy, otherwise the matching
      TensorFlow initializer.

    Raises:
      ValueError: if the initializer name is not one of the supported ones.
    """
    kind = hparams.initializer
    if not kind:
        return None

    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_INITIALIZER_GAIN,
        value=hparams.initializer_gain,
        hparams=hparams)

    # Only log in graph mode.
    if not tf.contrib.eager.in_eager_mode():
        tf.logging.info("Using variable initializer: %s", hparams.initializer)

    if kind == "orthogonal":
        return tf.orthogonal_initializer(gain=hparams.initializer_gain)
    if kind == "uniform":
        bound = 0.1 * hparams.initializer_gain
        return tf.random_uniform_initializer(-bound, bound)
    if kind == "normal_unit_scaling":
        return tf.variance_scaling_initializer(
            hparams.initializer_gain, mode="fan_avg", distribution="normal")
    if kind == "uniform_unit_scaling":
        return tf.variance_scaling_initializer(
            hparams.initializer_gain, mode="fan_avg", distribution="uniform")
    if kind == "xavier":
        return tf.contrib.layers.xavier_initializer()
    raise ValueError("Unrecognized initializer: %s" % hparams.initializer)
def q_network(X_state, name):
    """Build a two-layer dense Q-network under variable scope `name`.

    Args:
      X_state: input tensor fed to the first dense layer.
      name: variable scope name for the network.

    Returns:
      A pair `(outputs, trainable_vars_by_name)`: the output tensor of the
      last dense layer (width taken from the module-level `n_outputs`) and a
      dict mapping each trainable variable's name, with the scope prefix
      stripped, to the variable.
    """
    with tf.variable_scope(name) as scope:
        hidden = tf.layers.dense(
            X_state, 100, tf.nn.relu,
            kernel_initializer=tf.variance_scaling_initializer())
        outputs = tf.layers.dense(
            hidden, n_outputs,
            kernel_initializer=tf.variance_scaling_initializer())

    scope_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope=scope.name)
    prefix_len = len(scope.name)
    trainable_vars_by_name = {}
    for variable in scope_vars:
        trainable_vars_by_name[variable.name[prefix_len:]] = variable
    return outputs, trainable_vars_by_name
def _get_variable_initializer(hparams): if hparams.initializer == "orthogonal": return tf.orthogonal_initializer(gain=hparams.initializer_gain) elif hparams.initializer == "uniform": max_val = 0.1 * hparams.initializer_gain return tf.random_uniform_initializer(-max_val, max_val) elif hparams.initializer == "normal_unit_scaling": return tf.variance_scaling_initializer( hparams.initializer_gain, mode="fan_avg", distribution="normal") elif hparams.initializer == "uniform_unit_scaling": return tf.variance_scaling_initializer( hparams.initializer_gain, mode="fan_avg", distribution="uniform") else: raise ValueError("Unrecognized initializer: %s" % hparams.initializer)
def backbone_scope(freeze):
    """Enter the argscopes used while building the backbone, then yield once.

    The function is a generator with a single `yield`; presumably it is
    wrapped by a context-manager decorator outside this block.

    Args:
        freeze (bool): whether to freeze all the variables under the scope
    """
    def nonlin(x):
        return tf.nn.relu(get_norm()(x))

    with ExitStack() as stack:
        # Entered in the same order (and therefore exited in the same reverse
        # order) as a chained `with a, b, ExitStack()` statement.
        stack.enter_context(
            argscope([Conv2D, MaxPooling, BatchNorm],
                     data_format='channels_first'))
        stack.enter_context(
            argscope(Conv2D, use_bias=False, activation=nonlin,
                     kernel_initializer=tf.variance_scaling_initializer(
                         scale=2.0, mode='fan_out')))

        if cfg.BACKBONE.NORM in ['FreezeBN', 'SyncBN']:
            if freeze or cfg.BACKBONE.NORM == 'FreezeBN':
                stack.enter_context(argscope(BatchNorm, training=False))
            else:
                sync_backend = 'nccl' if cfg.TRAINER == 'replicated' else 'horovod'
                stack.enter_context(
                    argscope(BatchNorm, sync_statistics=sync_backend))

        if freeze:
            stack.enter_context(
                freeze_variables(stop_gradient=False, skip_collection=True))
        elif cfg.BACKBONE.FREEZE_AFFINE:
            # The layers are not completely frozen, but the affine
            # parameters may still be frozen on request.
            stack.enter_context(custom_getter_scope(freeze_affine_getter))
        yield
def __init__(self,
             name: str,
             n_heads: int,
             keys_encoder: Attendable,
             values_encoder: Attendable = None,
             dropout_keep_prob: float = 1.0,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Set up a multi-head attention object.

    Args:
        name: Name passed to `BaseAttention.__init__`.
        n_heads: Number of attention heads; must be greater than zero.
        keys_encoder: Encoder providing the attention keys.
        values_encoder: Encoder providing the attention values. When None,
            `keys_encoder` is used for the values as well.
        dropout_keep_prob: Dropout keep probability; must lie in (0, 1].
        reuse: Passed to `BaseAttention.__init__`.
        save_checkpoint: Passed to `BaseAttention.__init__`.
        load_checkpoint: Passed to `BaseAttention.__init__`.
        initializers: Passed to `BaseAttention.__init__`.

    Raises:
        ValueError: If `n_heads` or `dropout_keep_prob` is out of range.
    """
    check_argument_types()
    BaseAttention.__init__(self, name, reuse, save_checkpoint,
                           load_checkpoint, initializers)

    self.n_heads = n_heads
    self.dropout_keep_prob = dropout_keep_prob
    self.keys_encoder = keys_encoder
    self.values_encoder = (
        values_encoder if values_encoder is not None else self.keys_encoder)

    # Validation runs after the base-class initialization.
    if self.n_heads <= 0:
        raise ValueError("Number of heads must be greater than zero.")

    keep_prob_too_low = self.dropout_keep_prob <= 0.0
    if keep_prob_too_low or self.dropout_keep_prob > 1.0:
        raise ValueError("Dropout keep prob must be inside (0,1].")

    default_init = tf.variance_scaling_initializer(
        mode="fan_avg", distribution="uniform")
    self._variable_scope.set_initializer(default_init)
def get_logits(self, image):
    """Build the convolutional classifier and return its 1000-way logits.

    Five convolutions (with LRN and max-pooling in between) are followed by
    two dropout-regularized 4096-unit fully connected layers and the final
    `fc8` layer.

    Args:
        image: input image tensor; layers are built with
            `data_format='channels_last'`.

    Returns:
        The output tensor of the `fc8` layer.
    """
    fc_init = tf.random_normal_initializer(stddev=0.01)
    with argscope(Conv2D,
                  kernel_initializer=tf.variance_scaling_initializer(scale=2.)), \
            argscope([Conv2D, FullyConnected], activation=tf.nn.relu), \
            argscope([Conv2D, MaxPooling], data_format='channels_last'):
        # necessary padding to get 55x55 after conv1
        padded = tf.pad(image, [[0, 0], [2, 2], [2, 2], [0, 0]])
        net = Conv2D('conv1', padded, filters=96, kernel_size=11, strides=4,
                     padding='VALID')  # size: 55
        visualize_conv1_weights(net.variables.W)
        net = tf.nn.lrn(net, 2, bias=1.0, alpha=2e-5, beta=0.75, name='norm1')
        net = MaxPooling('pool1', net, 3, strides=2, padding='VALID')  # 27

        net = Conv2D('conv2', net, filters=256, kernel_size=5, split=2)
        net = tf.nn.lrn(net, 2, bias=1.0, alpha=2e-5, beta=0.75, name='norm2')
        net = MaxPooling('pool2', net, 3, strides=2, padding='VALID')  # 13

        net = Conv2D('conv3', net, filters=384, kernel_size=3)
        net = Conv2D('conv4', net, filters=384, kernel_size=3, split=2)
        net = Conv2D('conv5', net, filters=256, kernel_size=3, split=2)
        net = MaxPooling('pool3', net, 3, strides=2, padding='VALID')

        net = FullyConnected('fc6', net, 4096,
                             kernel_initializer=fc_init,
                             bias_initializer=tf.ones_initializer())
        net = Dropout(net, rate=0.5)
        net = FullyConnected('fc7', net, 4096, kernel_initializer=fc_init)
        net = Dropout(net, rate=0.5)
        logits = FullyConnected('fc8', net, 1000, kernel_initializer=fc_init)
    return logits
def build_graph(self, image, label):
    """Build the pre-activation ResNet graph and return the total cost.

    Args:
        image: input image batch; preprocessed with `image_preprocess` and
            transposed with permutation [0, 3, 1, 2] before the network.
        label: labels passed to `compute_loss_and_error`.

    Returns:
        A tensor named ``cost``: the classification loss plus the L2
        weight-decay cost.
    """
    image = image_preprocess(image, bgr=True)
    image = tf.transpose(image, [0, 3, 1, 2])

    # depth -> (blocks per group, block function)
    depth_configs = {
        18: ([2, 2, 2, 2], preresnet_basicblock),
        34: ([3, 4, 6, 3], preresnet_basicblock),
    }
    defs, block_func = depth_configs[DEPTH]

    with argscope(Conv2D, use_bias=False,
                  kernel_initializer=tf.variance_scaling_initializer(
                      scale=2.0, mode='fan_out')), \
            argscope([Conv2D, MaxPooling, GlobalAvgPooling, BatchNorm],
                     data_format='channels_first'):
        convmaps = (
            LinearWrap(image)
            .Conv2D('conv0', 64, 7, strides=2, activation=BNReLU)
            .MaxPooling('pool0', 3, strides=2, padding='SAME')
            .apply2(preresnet_group, 'group0', block_func, 64, defs[0], 1)
            .apply2(preresnet_group, 'group1', block_func, 128, defs[1], 2)
            .apply2(preresnet_group, 'group2', block_func, 256, defs[2], 2)
            .apply2(preresnet_group, 'group3new', block_func, 512, defs[3], 1)()
        )
        print(convmaps)
        pooled = GlobalAvgPooling('gap', convmaps)
        logits = FullyConnected('linearnew', pooled, 1000)

    loss = compute_loss_and_error(logits, label)
    wd_cost = regularize_cost('.*/W', l2_regularizer(1e-4),
                              name='l2_regularize_loss')
    add_moving_summary(loss, wd_cost)
    return tf.add_n([loss, wd_cost], name='cost')
def additive_attention(a, b, a_lengths, b_lengths, max_seq_len, hidden_units=150,
                       scope='additive-attention', reuse=False):
    """
    Compute additive attention weights between sequences a and b.

    For each pair of positions, attn(i, j) = dot(v, tanh(W*a_i + W*b_j)), where
    v is a learnable vector and W is a learnable matrix shared by both inputs.
    The scores are exponentiated (after subtracting the row maximum), masked
    with `mask_attention_weights`, and normalized over the last axis.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        hidden_units: Number of hidden units. Integer.
        scope: Variable scope name.
        reuse: Whether to reuse variables of the outer scope.

    Returns:
        Attention weights. Tensor of shape [batch_size, max_seq_len, max_seq_len].
    """
    with tf.variable_scope(scope, reuse=reuse):
        # The same 'dense' projection is applied to both sequences.
        proj_a = time_distributed_dense_layer(
            a, hidden_units, bias=False, scope='dense', reuse=False)
        proj_b = time_distributed_dense_layer(
            b, hidden_units, bias=False, scope='dense', reuse=True)

        # Insert singleton axes so the sum broadcasts over all (i, j) pairs.
        proj_a = tf.expand_dims(proj_a, 2)
        proj_b = tf.expand_dims(proj_b, 1)

        v = tf.get_variable(
            name='dot_weights',
            initializer=tf.variance_scaling_initializer(),
            shape=[hidden_units]
        )
        scores = tf.einsum('ijkl,l->ijk', tf.nn.tanh(proj_a + proj_b), v)

        # Subtract the per-row maximum before exponentiating.
        row_max = tf.reduce_max(scores, axis=2)
        scores = scores - tf.expand_dims(row_max, 2)
        weights = tf.exp(scores)
        weights = mask_attention_weights(weights, a_lengths, b_lengths, max_seq_len)

        # The small constant guards against division by zero on fully masked rows.
        normalizer = tf.reduce_sum(weights, axis=2) + 1e-10
        return weights / tf.expand_dims(normalizer, 2)
def conv2d_fixed_padding(inputs, filters, kernel_size, strides,
                         data_format="channels_first"):
    """Strided 2-D convolution with explicit padding.

    For strides greater than 1 the input is padded with `fixed_padding`
    (which depends only on `kernel_size`) and the convolution uses "VALID"
    padding; for a stride of 1 the convolution uses "SAME" padding.

    Args:
      inputs: input `Tensor`, laid out according to `data_format`.
      filters: `int` number of filters in the convolution.
      kernel_size: `int` size of the kernel to be used in the convolution.
      strides: `int` strides of the convolution.
      data_format: `str` either "channels_first" for `[batch, channels,
        height, width]` or "channels_last" for `[batch, height, width,
        channels]`.

    Returns:
      The output `Tensor` of the bias-free convolution.
    """
    if strides > 1:
        inputs = fixed_padding(inputs, kernel_size, data_format=data_format)

    if strides == 1:
        padding = "SAME"
    else:
        padding = "VALID"

    return tf.layers.conv2d(
        inputs=inputs,
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding=padding,
        use_bias=False,
        kernel_initializer=tf.variance_scaling_initializer(),
        data_format=data_format)
def __call__(self, inputs, targets=None):
    """Calculate target logits or inferred target sequences.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, the logits for each word in the target
      sequence: a float tensor with shape
      [batch_size, target_length, vocab_size].
      If targets is None, the result of `self.predict`, which generates the
      output sequence one token at a time and returns a dictionary {
        output: [batch_size, decoded length]
        score: [batch_size, float]}
    """
    # Variance scaling is used here because it seems to work in many
    # problems. Other reasonable initializers may also work just as well.
    scope_initializer = tf.variance_scaling_initializer(
        self.params.initializer_gain, mode="fan_avg", distribution="uniform")

    with tf.variable_scope("Transformer", initializer=scope_initializer):
        # Attention bias for encoder self-attention and decoder
        # multi-headed attention layers.
        attention_bias = model_utils.get_padding_bias(inputs)

        # Map the symbol representations to continuous representations.
        encoder_outputs = self.encode(inputs, attention_bias)

        # No targets: generate the output sequence.
        if targets is None:
            return self.predict(encoder_outputs, attention_bias)

        # Known targets: return the logits.
        return self.decode(targets, encoder_outputs, attention_bias)
def _fully_connected(self, x, out_dim):
    """Apply a dense layer `x * DW + biases` with freshly created variables.

    Args:
      x: rank-2 input tensor; its second dimension sets the weight rows.
      out_dim: number of output units.

    Returns:
      The result of `tf.nn.xw_plus_b` on `x`, the weights and the biases.
    """
    in_dim = x.get_shape()[1]
    weights = tf.get_variable(
        'DW', [in_dim, out_dim],
        initializer=tf.variance_scaling_initializer(distribution='uniform'))
    biases = tf.get_variable(
        'biases', [out_dim],
        initializer=tf.constant_initializer())
    return tf.nn.xw_plus_b(x, weights, biases)
def output(self) -> tf.Tensor:
    """Concatenate max-pooled 1-D convolution features of the embedded inputs.

    For every `(filter_size, num_filters)` pair in `self.filters`, a
    convolution with ReLU is applied to `self.embedded_inputs` and the result
    is max-reduced over axis 1. The pooled features of all filters are
    concatenated along axis 1.
    """
    pooled_features = []
    for width, channels in self.filters:
        with tf.variable_scope("conv-maxpool-%s" % width):
            # Convolution kernel and bias for this filter width.
            kernel = get_variable(
                "conv_W", [width, self.embedding_size, channels],
                initializer=tf.variance_scaling_initializer(
                    mode="fan_avg", distribution="uniform"))
            bias = get_variable(
                "conv_bias", [channels],
                initializer=tf.zeros_initializer())

            convolved = tf.nn.conv1d(
                self.embedded_inputs, kernel,
                stride=1, padding="VALID", name="conv")

            # Nonlinearity, then max-pooling over axis 1.
            activated = tf.nn.relu(tf.nn.bias_add(convolved, bias))
            pooled_features.append(tf.reduce_max(activated, 1))

    # Combine all the pooled features.
    return tf.concat(pooled_features, axis=1)
def build_graph(self, image, label):
    """Build the pre-activation ResNet classifier and return the total cost.

    Args:
        image: input image batch; scaled by 1/255, normalized per channel and
            transposed with permutation [0, 3, 1, 2].
        label: label tensor used for softmax cross-entropy; `argmax` over
            axis 1 gives the class index.

    Returns:
        A tensor named ``cost``: cross-entropy plus weight decay.
    """
    assert tf.test.is_gpu_available()

    channel_mean = tf.constant([0.4914, 0.4822, 0.4465], dtype=tf.float32)
    channel_std = tf.constant([0.2023, 0.1994, 0.2010], dtype=tf.float32)
    image = ((image / 255.0) - channel_mean) / channel_std
    image = tf.transpose(image, [0, 3, 1, 2])

    pytorch_default_init = tf.variance_scaling_initializer(
        scale=1.0 / 3, mode='fan_in', distribution='uniform')

    with argscope([Conv2D, BatchNorm, GlobalAvgPooling],
                  data_format='channels_first'), \
            argscope(Conv2D, kernel_initializer=pytorch_default_init):
        net = Conv2D('conv0', image, 64, kernel_size=3, strides=1,
                     use_bias=False)
        for module_idx, num_blocks in enumerate(MODULE_SIZES):
            for block_idx in range(num_blocks):
                # Downsample at the first block of every module but the first.
                downsample = block_idx == 0 and module_idx > 0
                stride = 2 if downsample else 1
                with tf.variable_scope("res%d.%d" % (module_idx, block_idx)):
                    net = preactivation_block(
                        net, FILTER_SIZES[module_idx], stride)
        net = GlobalAvgPooling('gap', net)
        logits = FullyConnected(
            'linear', net, CLASS_NUM,
            kernel_initializer=tf.random_normal_initializer(stddev=1e-3))

    per_example_ce = tf.nn.softmax_cross_entropy_with_logits(
        labels=label, logits=logits)
    ce_cost = tf.reduce_mean(per_example_ce, name='cross_entropy_loss')

    single_label = tf.to_int32(tf.argmax(label, axis=1))
    in_top1 = tf.nn.in_top_k(logits, single_label, 1)
    wrong = tf.to_float(tf.logical_not(in_top1), name='wrong_vector')

    # monitor training error
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'), ce_cost)
    add_param_summary(('.*/W', ['histogram']))

    # Weight decay; the '.*' pattern is passed to `regularize_cost`.
    wd_cost = tf.multiply(WEIGHT_DECAY,
                          regularize_cost('.*', tf.nn.l2_loss),
                          name='wd_cost')
    return tf.add_n([ce_cost, wd_cost], name='cost')
def get_tf_initializer(name="glorot"):
    """Return a TensorFlow initializer for the given short name.

    Args:
        name: one of "const", "glorot" or "normal".

    Returns:
        The matching initializer, or None for any other name.
    """
    if name == "const":
        return tf.constant_initializer(0.3)
    if name == "glorot":
        return tf.variance_scaling_initializer(
            scale=1.0, mode="fan_avg", distribution="normal")
    if name == "normal":
        return tf.truncated_normal_initializer(dtype=tf.float32, stddev=0.36)
    # Unknown names fall through and yield None.
    return None
def q_network(state_tensor):
    """Build a convolutional Q-network on top of `state_tensor`.

    Three ReLU convolutions with 'same' padding are followed by a reshape to
    `[-1, n_hidden_in]`, a ReLU dense layer of `n_hidden` units and a linear
    output layer of `n_outputs` units (all three sizes are module-level
    names).

    Returns:
        The output tensor of the final dense layer.
    """
    # (filters, kernel_size, strides) for each convolution, in order.
    conv_specs = (
        (32, (8, 8), 4),
        (64, (4, 4), 2),
        (64, (3, 3), 1),
    )

    net = state_tensor
    for filters, kernel_size, strides in conv_specs:
        net = tf.layers.conv2d(
            net, filters=filters, kernel_size=kernel_size, strides=strides,
            padding='same', activation=tf.nn.relu,
            kernel_initializer=tf.variance_scaling_initializer())

    flattened = tf.reshape(net, shape=[-1, n_hidden_in])
    hidden = tf.layers.dense(
        flattened, n_hidden, activation=tf.nn.relu,
        kernel_initializer=tf.variance_scaling_initializer())
    return tf.layers.dense(
        hidden, n_outputs,
        kernel_initializer=tf.variance_scaling_initializer())
def Deconv2D(x, out_channel, kernel_shape, stride, padding='SAME',
             W_init=None, b_init=None, nl=tf.identity, use_bias=True,
             data_format='NHWC'):
    """
    2D deconvolution on 4D inputs.

    Args:
        x (tf.Tensor): a 4D tensor laid out according to ``data_format``.
            Must have a known number of channels, but can have other
            unknown dimensions.
        out_channel: the output number of channel.
        kernel_shape: (h, w) tuple or a int.
        stride: (h, w) tuple or a int.
        padding (str): 'valid' or 'same'. Case insensitive.
        W_init: initializer for W. Defaults to
            `tf.variance_scaling_initializer(scale=2.0)`.
        b_init: initializer for b. Defaults to zero.
        nl: a nonlinearity function.
        use_bias (bool): whether to use bias.
        data_format (str): 'NHWC' selects channels-last; any other value
            selects channels-first.

    Returns:
        tf.Tensor: a tensor named ``output`` with attribute `variables`.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    channels_last = data_format == 'NHWC'

    in_channel = x.get_shape().as_list()[3 if channels_last else 1]
    assert in_channel is not None, "[Deconv2D] Input cannot have unknown channel!"
    assert isinstance(out_channel, int), out_channel

    if W_init is None:
        W_init = tf.variance_scaling_initializer(scale=2.0)
    if b_init is None:
        b_init = tf.constant_initializer()

    # Expose the layer's 'kernel'/'bias' variables under the names W/b.
    with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
        layer = tf.layers.Conv2DTranspose(
            out_channel, kernel_shape,
            strides=stride, padding=padding,
            data_format='channels_last' if channels_last else 'channels_first',
            activation=lambda x: nl(x, name='output'),
            use_bias=use_bias,
            kernel_initializer=W_init,
            bias_initializer=b_init,
            trainable=True)
        ret = layer.apply(x, scope=tf.get_variable_scope())

    ret.variables = VariableHolder(W=layer.kernel)
    if use_bias:
        ret.variables.b = layer.bias
    return ret
def get_variable_initializer(hparams):
    """Return the TF variable initializer selected by `hparams.initializer`.

    Args:
      hparams: object exposing `initializer` (a name, or a falsy value) and
        `initializer_gain`.

    Returns:
      None when `hparams.initializer` is falsy, otherwise the matching
      TensorFlow initializer.

    Raises:
      ValueError: if the initializer name is not recognized.
    """
    kind = hparams.initializer
    if not kind:
        return None

    tf.logging.info("Using variable initializer: %s", hparams.initializer)

    if kind == "orthogonal":
        return tf.orthogonal_initializer(gain=hparams.initializer_gain)
    if kind == "uniform":
        bound = 0.1 * hparams.initializer_gain
        return tf.random_uniform_initializer(-bound, bound)
    if kind == "normal_unit_scaling":
        return tf.variance_scaling_initializer(
            hparams.initializer_gain, mode="fan_avg", distribution="normal")
    if kind == "uniform_unit_scaling":
        return tf.variance_scaling_initializer(
            hparams.initializer_gain, mode="fan_avg", distribution="uniform")
    raise ValueError("Unrecognized initializer: %s" % hparams.initializer)
def fastrcnn_Xconv1fc_head(feature, num_classes, num_convs):
    """
    Fast R-CNN head made of `num_convs` 3x3 convolutions and one FC layer.

    Args:
        feature (any shape):
        num_classes(int): num_category + 1
        num_convs (int): number of conv layers

    Returns:
        cls_logits (Nxnum_class), reg_logits (Nx num_class-1 x 4)
    """
    conv_init = tf.variance_scaling_initializer(
        scale=2.0, mode='fan_out', distribution='normal')
    net = feature
    with argscope(Conv2D, data_format='channels_first',
                  kernel_initializer=conv_init):
        for idx in range(num_convs):
            net = Conv2D('conv{}'.format(idx), net,
                         cfg.FPN.FRCNN_CONV_HEAD_DIM, 3,
                         activation=tf.nn.relu)
        net = FullyConnected('fc', net, cfg.FPN.FRCNN_FC_HEAD_DIM,
                             kernel_initializer=tf.variance_scaling_initializer(),
                             activation=tf.nn.relu)
    return fastrcnn_outputs('outputs', net, num_classes)
def embedded_inputs(self) -> tf.Tensor:
    """Look up `self.inputs` in a word-embedding matrix and apply dropout.

    The matrix has one row per vocabulary entry and `self.embedding_size`
    columns, and is created under the "input_projection" variable scope.
    """
    with tf.variable_scope("input_projection"):
        matrix_shape = [len(self.vocabulary), self.embedding_size]
        embedding_matrix = get_variable(
            "word_embeddings", matrix_shape,
            initializer=tf.variance_scaling_initializer(
                mode="fan_avg", distribution="uniform"))

        looked_up = tf.nn.embedding_lookup(embedding_matrix, self.inputs)
        return dropout(looked_up, self.dropout_keep_prob, self.train_mode)
def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
    """Strided 2-D convolution with explicit padding.

    The padding is consistent and is based only on `kernel_size`, not on the
    dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone):
    strided convolutions are padded with `fixed_padding` and then run with
    'VALID' padding, while stride-1 convolutions use 'SAME' padding.
    """
    is_strided = strides > 1
    if is_strided:
        inputs = fixed_padding(inputs, kernel_size, data_format)

    if strides == 1:
        padding = 'SAME'
    else:
        padding = 'VALID'

    return tf.layers.conv2d(
        inputs=inputs,
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding=padding,
        use_bias=False,
        kernel_initializer=tf.variance_scaling_initializer(),
        data_format=data_format)
def modality_matrix(self) -> tf.Tensor:
    """Create an embedding matrix for varying target modalities.

    Used to embed different target space modalities in the tensor2tensor
    models (e.g. during the zero-shot translation).

    The matrix has 32 rows; its width equals the last dimension of the input
    sequence's temporal states.
    """
    state_shape = self.input_sequence.temporal_states.shape.as_list()
    emb_size = state_shape[-1]

    matrix_init = tf.variance_scaling_initializer(
        mode="fan_avg", distribution="uniform")
    return get_variable(
        name="target_modality_embedding_matrix",
        shape=[32, emb_size],
        dtype=tf.float32,
        initializer=matrix_init)
def fastrcnn_2fc_head(feature):
    """
    Two ReLU fully-connected layers ('fc6', 'fc7') of width
    `cfg.FPN.FRCNN_FC_HEAD_DIM` that share one initializer object.

    Args:
        feature (any shape):

    Returns:
        2D head feature
    """
    head_dim = cfg.FPN.FRCNN_FC_HEAD_DIM
    fc_init = tf.variance_scaling_initializer()

    net = feature
    for layer_name in ('fc6', 'fc7'):
        net = FullyConnected(layer_name, net, head_dim,
                             kernel_initializer=fc_init,
                             activation=tf.nn.relu)
    return net
def resnet_backbone(image, num_blocks, group_func, block_func):
    """Build a ResNet backbone and return the 1000-way logits.

    Args:
        image: input image tensor.
        num_blocks: four block counts, one per residual group.
        group_func: callable that builds one residual group.
        block_func: block builder forwarded to `group_func`.

    Returns:
        The output of the final 'linear' fully-connected layer.
    """
    conv_init = tf.variance_scaling_initializer(scale=2.0, mode='fan_out')
    with argscope(Conv2D, nl=tf.identity, use_bias=False, W_init=conv_init):
        stem = (LinearWrap(image)
                .Conv2D('conv0', 64, 7, stride=2, nl=BNReLU)
                .MaxPooling('pool0', shape=3, stride=2, padding='SAME'))
        # Four residual groups; all but the first use stride 2.
        groups = (stem
                  .apply(group_func, 'group0', block_func, 64, num_blocks[0], 1)
                  .apply(group_func, 'group1', block_func, 128, num_blocks[1], 2)
                  .apply(group_func, 'group2', block_func, 256, num_blocks[2], 2)
                  .apply(group_func, 'group3', block_func, 512, num_blocks[3], 2))
        logits = (groups
                  .GlobalAvgPooling('gap')
                  .FullyConnected('linear', 1000, nl=tf.identity)())
    return logits
def get_instance(args):
    """
    Create an instance of the variance-scaling initializer.

    Args:
        args: mapping with optional keys ``'scale'`` (converted with
            ``float``, default 1.0), ``'mode'`` (``"fan_in"``, ``"fan_out"``
            or ``"fan_avg"``; default ``"fan_in"``) and ``'distribution'``
            (``"normal"`` or ``"uniform"``; default ``"normal"``).

    Returns:
        A ``tf.variance_scaling_initializer`` seeded with the module-level
        ``SEED``.

    Raises:
        ValueError: if ``mode`` or ``distribution`` is not one of the
            supported values.
    """
    scale = float(args.get('scale', 1.0))

    # Explicit checks instead of `assert`, which is stripped under `-O`.
    mode = args.get('mode', "fan_in")
    valid_modes = ("fan_in", "fan_out", "fan_avg")
    if mode not in valid_modes:
        raise ValueError(
            "Invalid mode %r; expected one of %s" % (mode, valid_modes))

    distribution = args.get('distribution', "normal")
    valid_distributions = ("normal", "uniform")
    if distribution not in valid_distributions:
        raise ValueError(
            "Invalid distribution %r; expected one of %s"
            % (distribution, valid_distributions))

    return tf.variance_scaling_initializer(scale, mode, distribution, seed=SEED)
def fastrcnn_Xconv1fc_head(feature, num_convs, norm=None):
    """
    Fast R-CNN head made of `num_convs` 3x3 convolutions (each optionally
    followed by GroupNorm) and one FC layer.

    Args:
        feature (NCHW):
        num_convs (int): number of conv layers
        norm (str or None): either None or 'GN'

    Returns:
        2D head feature
    """
    assert norm in [None, 'GN'], norm
    use_norm = norm is not None

    conv_init = tf.variance_scaling_initializer(
        scale=2.0, mode='fan_out', distribution='normal')
    net = feature
    with argscope(Conv2D, data_format='channels_first',
                  kernel_initializer=conv_init):
        for idx in range(num_convs):
            net = Conv2D('conv{}'.format(idx), net,
                         cfg.FPN.FRCNN_CONV_HEAD_DIM, 3,
                         activation=tf.nn.relu)
            if use_norm:
                net = GroupNorm('gn{}'.format(idx), net)
        net = FullyConnected('fc', net, cfg.FPN.FRCNN_FC_HEAD_DIM,
                             kernel_initializer=tf.variance_scaling_initializer(),
                             activation=tf.nn.relu)
    return net
def fastrcnn_2fc_head(feature, num_classes):
    """
    Two ReLU fully-connected layers ('fc6', 'fc7') followed by
    `fastrcnn_outputs`.

    Args:
        feature (any shape):
        num_classes(int): num_category + 1

    Returns:
        cls_logits (Nxnum_class), reg_logits (Nx num_class-1 x 4)
    """
    head_dim = cfg.FPN.FRCNN_FC_HEAD_DIM
    fc_init = tf.variance_scaling_initializer()

    net = feature
    for layer_name in ('fc6', 'fc7'):
        net = FullyConnected(layer_name, net, head_dim,
                             kernel_initializer=fc_init,
                             activation=tf.nn.relu)
    return fastrcnn_outputs('outputs', net, num_classes)
def resnet_backbone(image, num_blocks, group_func, block_func):
    """Build a ResNet backbone and return the 1000-way logits.

    Args:
        image: input image tensor.
        num_blocks: four block counts, one per residual group.
        group_func: callable that builds one residual group.
        block_func: block builder forwarded to `group_func`.

    Returns:
        The output of the final 'linear' fully-connected layer.
    """
    # (group name, channels, stride) for the four residual groups.
    group_specs = (
        ('group0', 64, 1),
        ('group1', 128, 2),
        ('group2', 256, 2),
        ('group3', 512, 2),
    )
    conv_init = tf.variance_scaling_initializer(scale=2.0, mode='fan_out')
    with argscope(Conv2D, use_bias=False, kernel_initializer=conv_init):
        # Note that this pads the image by [2, 3] instead of [3, 2].
        # Similar things happen in later stride=2 layers as well.
        net = Conv2D('conv0', image, 64, 7, strides=2, activation=BNReLU)
        net = MaxPooling('pool0', net, pool_size=3, strides=2, padding='SAME')
        for idx, (group_name, channels, stride) in enumerate(group_specs):
            net = group_func(group_name, net, block_func, channels,
                             num_blocks[idx], stride)
        net = GlobalAvgPooling('gap', net)
        logits = FullyConnected(
            'linear', net, 1000,
            kernel_initializer=tf.random_normal_initializer(stddev=0.01))
    return logits
def maskrcnn_head(feature, num_class):
    """
    Mask head: a stride-2 deconvolution with ReLU followed by a 1x1
    convolution that emits `num_class - 1` channels.

    Args:
        feature (NxCx7x7):
        num_class(int): num_category + 1

    Returns:
        mask_logits (N x num_category x 14 x 14):
    """
    head_init = tf.variance_scaling_initializer(
        scale=2.0, mode='fan_in', distribution='normal')
    with argscope([Conv2D, Deconv2D], data_format='NCHW', W_init=head_init):
        upsampled = Deconv2D('deconv', feature, 256, 2, stride=2,
                             nl=tf.nn.relu)
        mask_logits = Conv2D('conv', upsampled, num_class - 1, 1)
    return mask_logits
def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1,
              W_init=None, activation=tf.identity):
    """Depthwise convolution on an NCHW input.

    Args:
        x: input tensor; its channel count is read from dimension 1.
        out_channel: number of output channels; must be a multiple of the
            input channel count.
        kernel_shape: int, used for both kernel height and width.
        padding: padding mode passed to `tf.nn.depthwise_conv2d`.
        stride: int stride for both spatial dimensions.
        W_init: initializer for the filter ``W``. Defaults to
            `tf.variance_scaling_initializer(2.0)`.
        activation: callable applied to the result as
            `activation(conv, name='output')`.

    Returns:
        The activated convolution output.
    """
    in_channel = x.get_shape().as_list()[1]
    assert out_channel % in_channel == 0, (out_channel, in_channel)
    channel_mult = out_channel // in_channel

    if W_init is None:
        W_init = tf.variance_scaling_initializer(2.0)

    # Filter layout: [height, width, in_channels, channel_multiplier].
    filter_shape = [kernel_shape, kernel_shape, in_channel, channel_mult]
    W = tf.get_variable('W', filter_shape, initializer=W_init)

    strides = [1, 1, stride, stride]
    conv = tf.nn.depthwise_conv2d(x, W, strides, padding=padding,
                                  data_format='NCHW')
    return activation(conv, name='output')
def conv2d_fixed_padding(**kwargs):
    """conv2d with fixed_padding, based only on kernel_size.

    `kwargs` are forwarded to `tf.layers.conv2d` and must contain "strides";
    when strides > 1 they must also contain "inputs", "kernel_size" and
    "data_format". Defaults for "padding", "use_bias" and
    "kernel_initializer" are supplied but can be overridden by the caller.
    """
    strides = kwargs["strides"]
    if strides > 1:
        kwargs["inputs"] = fixed_padding(
            kwargs["inputs"], kwargs["kernel_size"], kwargs["data_format"])

    conv_args = dict(
        padding="SAME" if strides == 1 else "VALID",
        use_bias=False,
        kernel_initializer=tf.variance_scaling_initializer(),
    )
    # Caller-supplied values win over the defaults above.
    conv_args.update(kwargs)
    return tf.layers.conv2d(**conv_args)
from tensorflow.examples.tutorials.mnist import input_data import numpy as np import matplotlib.pyplot as plt mnist = input_data.read_data_sets('data_MNIST/', one_hot=True) input_node = 784 n_nodes_h1 = 621 n_nodes_h2 = 312 n_nodes_h3 = 128 n_nodes_h4 = 312 n_nodes_h5 = 621 n_classes = 784 batch_size = 128 scaler = tf.variance_scaling_initializer() X = tf.placeholder(tf.float32, [None, 784]) y = tf.placeholder(tf.float32) # noise matrix mean = 0.9 stddev = 0.7 noise_global = np.random.normal(mean, stddev, 784) def stacked_an(X): hidden_1_layer = { 'weights': tf.Variable(scaler([input_node, n_nodes_h1], dtype=tf.float32)), 'biases': tf.Variable(scaler([n_nodes_h1], dtype=tf.float32))
def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format,
                         training, dropout, dropout_prob):
    """Strided 2-D convolution with explicit padding and optional dropout.

    The padding is consistent and is based only on `kernel_size`, not on the
    dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).

    `dropout` selects what follows the convolution:
      * 'spiral': `spiral_conv2d` replaces the convolution and is followed by
        a DropBlock with block size 7.
      * 'dropout': regular `Dropout`.
      * 'spatial': `SpatialDropout2D`.
      * 'dropblock3' / 'dropblock5' / 'dropblock7': DropBlock with the given
        block size.
      * anything else: the plain convolution output.

    `dropout_prob` is used as the keep probability (the Dropout layers get
    `rate = 1 - dropout_prob`).
    """
    def drop_block(tensor, block_size):
        # Swap axes 1 and 3 around the DropBlock call, then swap them back.
        swapped = tf.transpose(tensor, [0, 3, 2, 1])
        swapped = DropBlock(keep_prob=dropout_prob, block_size=block_size)(
            swapped, training=training)
        return tf.transpose(swapped, [0, 3, 2, 1], name='dropblock_output')

    if strides > 1:
        inputs = fixed_padding(inputs, kernel_size, data_format)

    padding = 'SAME' if strides == 1 else 'VALID'

    output = None
    if dropout != 'spiral':
        output = tf.layers.conv2d(
            inputs=inputs, filters=filters, kernel_size=kernel_size,
            strides=strides, padding=padding, use_bias=False,
            kernel_initializer=tf.variance_scaling_initializer(),
            data_format=data_format)
    elif dropout == 'spiral':
        output = spiral_conv2d(
            inputs=inputs, filters=filters, kernel_size=kernel_size,
            strides=strides, padding=padding, use_bias=False,
            kernel_initializer=tf.variance_scaling_initializer(),
            data_format=data_format, training=training,
            spiral_prob=dropout_prob)
        return drop_block(output, 7)

    if dropout == 'dropout':
        print("Normal Dropout")
        return Dropout(rate=(1 - dropout_prob))(output, training=training)

    if dropout == 'spatial':
        print("spatial Dropout")
        return SpatialDropout2D(rate=(1 - dropout_prob),
                                data_format=data_format)(output,
                                                         training=training)

    dropblock_variants = (
        ('dropblock3', "DropBlock3", 3),
        ('dropblock5', "DropBlock5", 5),
        ('dropblock7', "DropBlock7", 7),
    )
    for variant, label, block_size in dropblock_variants:
        if dropout == variant:
            print(label)
            output = tf.identity(output, name='conv_output')
            return drop_block(output, block_size)

    return output
def __init__(self,
             observation_spec,
             conv_layer_params=None,
             input_fc_layer_params=(75, 40),
             lstm_size=(40,),
             output_fc_layer_params=(75, 40),
             activation_fn=tf.keras.activations.relu,
             name='ValueRnnNetwork'):
    """Creates an instance of `ValueRnnNetwork`.

    Network supports calls with shape outer_rank + observation_spec.shape.
    Note outer_rank must be at least 1.

    Args:
      observation_spec: A nest of `tensor_spec.TensorSpec` representing the
        observations.
      conv_layer_params: Optional list of convolution layers parameters,
        where each item is a length-three tuple indicating (filters,
        kernel_size, stride).
      input_fc_layer_params: Optional list of fully_connected parameters,
        where each item is the number of units in the layer. This is applied
        before the LSTM cell.
      lstm_size: An iterable of ints specifying the LSTM cell sizes to use.
      output_fc_layer_params: Optional list of fully_connected parameters,
        where each item is the number of units in the layer. This is applied
        after the LSTM cell.
      activation_fn: Activation function, e.g. tf.keras.activations.relu.
      name: A string representing name of the network.

    Raises:
      ValueError: If `observation_spec` contains more than one observation.
    """
    if len(nest.flatten(observation_spec)) > 1:
        raise ValueError(
            'Network only supports observation_specs with a single observation.')

    input_layers = utils.mlp_layers(
        conv_layer_params,
        input_fc_layer_params,
        activation_fn=activation_fn,
        kernel_initializer=tf.keras.initializers.glorot_uniform(),
        name='input_mlp')

    # Create RNN cell: a single LSTM cell, or a stack when several sizes
    # are given.
    if len(lstm_size) == 1:
        cell = tf.keras.layers.LSTMCell(lstm_size[0])
    else:
        cell = tf.keras.layers.StackedRNNCells(
            [tf.keras.layers.LSTMCell(size) for size in lstm_size])

    # One TensorSpec per entry of the cell's state size. This is computed
    # exactly once; it depends only on `cell`.
    state_spec = nest.map_structure(
        functools.partial(
            tensor_spec.TensorSpec, dtype=tf.float32,
            name='network_state_spec'), list(cell.state_size))

    output_layers = []
    if output_fc_layer_params:
        # NOTE(review): every output layer is given the same name
        # 'output/dense'; kept unchanged so existing variable names and
        # checkpoints are not affected.
        output_layers = [
            tf.keras.layers.Dense(
                num_units,
                activation=activation_fn,
                kernel_initializer=tf.variance_scaling_initializer(
                    scale=2.0, mode='fan_in',
                    distribution='truncated_normal'),
                name='output/dense') for num_units in output_fc_layer_params
        ]

    value_projection_layer = keras_layers.Dense(
        1,
        activation=None,
        kernel_initializer=tf.random_uniform_initializer(
            minval=-0.03, maxval=0.03),
    )

    super(ValueRnnNetwork, self).__init__(
        observation_spec=observation_spec,
        action_spec=None,
        state_spec=state_spec,
        name=name)

    self._conv_layer_params = conv_layer_params
    self._input_layers = input_layers
    self._cell = cell
    self._output_layers = output_layers
    self._value_projection_layer = value_projection_layer
def novel_fc(x, hidden_sizes, training=False, l=(1e-6, 1e-6, 1e-6), p=(0.5, 0.5, 0.5),
             n_cosmo_params=7, n_hod_params=4):
    """Build a three-branch fully connected regressor and return its prediction.

    The columns of `x` are split into a "cosmo" branch and a "hod" branch, each
    run through its own stack of dense -> dropout -> batch-norm -> ReLU layers.
    The two branch outputs are concatenated and passed through a third ("cap")
    stack, followed by a single-unit dense layer.

    Args:
        x: 2-D input tensor; the first `n_cosmo_params` columns feed the cosmo
            branch and the remaining columns feed the hod branch.
        hidden_sizes: triple `(cosmo_sizes, hod_sizes, cap_sizes)`, each an
            iterable of layer widths.
        training: forwarded to dropout and batch normalization.
        l: L1 regularization scale, either one scalar shared by all branches or
            a `(cosmo, hod, cap)` triple.
        p: dropout rate, either one scalar shared by all branches or a
            `(cosmo, hod, cap)` triple.
        n_cosmo_params: number of leading columns of `x` given to the cosmo
            branch.
        n_hod_params: used together with `n_cosmo_params` to locate the columns
            that are additionally appended to the cosmo branch input.

    Returns:
        A 1-D tensor: column 0 of the final single-unit dense layer.
    """
    def _per_branch(value):
        """Return `value` as a 3-tuple, repeating a scalar for every branch."""
        try:
            first, second, third = value
        except TypeError:
            # Not iterable (Python float/int, NumPy scalar, ...): share it.
            return value, value, value
        return first, second, third

    def _dense_stack(inputs, sizes, l1_scale, drop_rate):
        """Apply dense/dropout/batch-norm/ReLU per size; return (output, regularizer)."""
        regularizer = tf.contrib.layers.l1_regularizer(l1_scale)
        out = inputs
        for size in sizes:
            fc_output = tf.layers.dense(out, size,
                                        kernel_initializer=initializer,
                                        kernel_regularizer=regularizer)
            bd_out = tf.layers.dropout(fc_output, drop_rate, training=training)
            bn_out = tf.layers.batch_normalization(bd_out, axis=-1, training=training)
            out = tf.nn.relu(bn_out)  # tf.nn.leaky_relu(bn_out, alpha=0.01)
        return out, regularizer

    cosmo_sizes, hod_sizes, cap_sizes = hidden_sizes
    cosmo_l, hod_l, cap_l = _per_branch(l)
    cosmo_p, hod_p, cap_p = _per_branch(p)

    initializer = tf.variance_scaling_initializer(scale=2.0)

    # Only for duplicating r: everything from column `n_params - 1` onward is
    # appended to the cosmo input as well (presumably the trailing column is
    # r -- TODO confirm against the caller's feature layout).
    n_params = n_cosmo_params + n_hod_params
    cosmo_x = tf.slice(x, [0, 0], [-1, n_cosmo_params])
    cosmo_x = tf.concat(
        values=[cosmo_x, tf.slice(x, [0, n_params - 1], [-1, -1])], axis=1)
    hod_x = tf.slice(x, [0, n_cosmo_params], [-1, -1])

    # Branch order (cosmo, hod, cap) is kept so auto-generated layer names
    # stay the same as before.
    cosmo_out, _ = _dense_stack(cosmo_x, cosmo_sizes, cosmo_l, cosmo_p)
    hod_out, _ = _dense_stack(hod_x, hod_sizes, hod_l, hod_p)

    cap_in = tf.concat(values=[cosmo_out, hod_out], axis=1)
    cap_out, cap_regularizer = _dense_stack(cap_in, cap_sizes, cap_l, cap_p)

    pred = tf.layers.dense(cap_out, 1, kernel_initializer=initializer,
                           kernel_regularizer=cap_regularizer)[:, 0]
    return pred
def conv2d_fixed_padding(inputs,
                         filters,
                         kernel_size,
                         strides,
                         data_format="channels_first",
                         use_td=False,
                         targeting_rate=None,
                         keep_prob=None,
                         is_training=None):
    """Strided 2-D convolution with explicit padding.

    The padding is consistent and is based only on `kernel_size`, not on the
    dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).

    Args:
      inputs: `Tensor` of size `[batch, channels, height_in, width_in]`.
      filters: `int` number of filters in the convolution.
      kernel_size: `int` size of the kernel to be used in the convolution.
      strides: `int` strides of the convolution.
      data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last" for `[batch, height, width, channels]`.
      use_td: `str` one of "weight" or "unit". Set to False or "" to disable
        targeted dropout.
      targeting_rate: `float` proportion of weights to target with targeted
        dropout.
      keep_prob: `float` keep probability for targeted dropout.
      is_training: `bool` for whether the model is in training.

    Returns:
      A `Tensor` of shape `[batch, filters, height_out, width_out]`.

    Raises:
      ValueError: if use_td is not valid.
    """
    if strides > 1:
        inputs = fixed_padding(inputs, kernel_size, data_format=data_format)

    # With stride 1 the framework's "SAME" padding is used; otherwise the input
    # was already padded explicitly above, so the convolution itself is "VALID".
    padding = "SAME" if strides == 1 else "VALID"
    kernel_initializer = tf.variance_scaling_initializer()

    if not use_td:
        return tf.layers.conv2d(
            inputs=inputs,
            filters=filters,
            kernel_size=kernel_size,
            strides=strides,
            padding=padding,
            use_bias=False,
            kernel_initializer=kernel_initializer,
            data_format=data_format)

    inputs_shape = common_layers.shape_list(inputs)
    if use_td == "weight":
        # Weights per filter: kernel area times the number of input channels.
        channel_axis = -1 if data_format == "channels_last" else 1
        size = kernel_size * kernel_size * inputs_shape[channel_axis]
        targeting_count = targeting_rate * tf.to_float(size)
        targeting_fn = common_layers.weight_targeting
    elif use_td == "unit":
        targeting_count = targeting_rate * filters
        targeting_fn = common_layers.unit_targeting
    else:
        # ValueError is a subclass of Exception, so existing handlers still work.
        raise ValueError("Unrecognized targeted dropout type: %s" % use_td)

    return common_layers.td_conv(
        inputs,
        filters,
        kernel_size,
        targeting_count,
        targeting_fn,
        keep_prob,
        is_training,
        do_prune=True,
        strides=strides,
        padding=padding,
        data_format=data_format,
        use_bias=False,
        kernel_initializer=kernel_initializer)
# Mini-batch size; echoed below together with the epoch count
# (`n_epoch` is defined outside this fragment).
batch_size = 500
print('n_epoch = ', n_epoch)
print('batch_size = ', batch_size)
print()

#%% Graph
# Start from an empty default graph, then declare the input/target
# placeholders and a `training` switch that defaults to False.
tf.reset_default_graph()
X = tf.placeholder(tf.float32, shape=(None, n_input), name="X")
Y = tf.placeholder(tf.float32, shape=(None, n_output), name="Y")
training = tf.placeholder_with_default(False, shape=(), name='training')
batch_norm_momentum = 0.9
with tf.name_scope("dnn"):
    # NOTE(review): called with default arguments; He initialization is
    # usually written with scale=2.0 -- confirm the defaults are intended.
    he_init = tf.variance_scaling_initializer()
    # Pre-bind the arguments shared by every batch-norm / dense layer.
    my_batch_norm_layer = partial(tf.layers.batch_normalization,
                                  training=training,
                                  momentum=batch_norm_momentum)
    my_dense_layer = partial(tf.layers.dense,
                             kernel_initializer=he_init,
                             activation=tf.nn.relu)
    # NOTE(review): the dense layers already apply ReLU and ReLU is applied
    # again after batch norm, so each hidden block is
    # dense -> ReLU -> BN -> ReLU -- confirm this is intended.
    hidden1 = my_dense_layer(X, n_hidden[0], name="hidden1")
    bn1 = tf.nn.relu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden[1], name="hidden2")
    bn2 = tf.nn.relu(my_batch_norm_layer(hidden2))
    outputs = tf.layers.dense(bn2, n_output,
def _build_graph(self, inputs):
    """Build the two-branch (nuclei-pixel + distance-map) graph.

    `inputs` is a pair `(images, truemap_coded)`. Channel 0 of
    `truemap_coded` is read as the integer nuclei mask and the remaining
    channels as the distance regression target. Named output tensors
    (`predmap-prob-np`, `predmap-np`, `predmap-prob-dist`, `predmap-dist`,
    `predmap-coded`) are always created; in training mode the losses are
    added, `self.cost` is set and summaries are registered.
    """
    images, truemap_coded = inputs
    raw_images = images

    # ---- ground truth ----
    gt_np = tf.identity(tf.cast(truemap_coded[..., 0], tf.int32),
                        name='truemap-np')
    gt_np_onehot = tf.one_hot(gt_np, 2, axis=-1)
    gt_np = tf.expand_dims(gt_np, axis=-1)

    gt_dist = tf.identity(truemap_coded[..., 1:], name='truemap-dist')

    # ---- network ----
    he_init = tf.variance_scaling_initializer(scale=2.0, mode='fan_out')  # K.he initializer
    with argscope(Conv2D, activation=tf.identity, use_bias=False,
                  W_init=he_init), \
            argscope([Conv2D, BatchNorm], data_format=self.data_format):

        # NHWC -> NCHW, optionally rescaled by 1/255.
        net_in = tf.transpose(images, [0, 3, 1, 2])
        if self.input_norm:
            net_in = net_in / 255.0

        feats = encoder(net_in, self.freeze)
        feats[0] = crop_op(feats[0], (184, 184))
        feats[1] = crop_op(feats[1], (72, 72))

        np_head = BNReLU('preact_out_np', decoder('np', feats)[-1])
        dist_head = BNReLU('preact_out_dist', decoder('dst', feats)[-1])

        # Nuclei-pixel branch: 2-way softmax, transposed back to NHWC.
        np_logits = Conv2D('conv_out_np', np_head, 2, 1,
                           use_bias=True, activation=tf.identity)
        np_logits = tf.transpose(np_logits, [0, 2, 3, 1])
        np_softmax = tf.nn.softmax(np_logits, axis=-1)
        np_prob = tf.identity(np_softmax[..., 1], name='predmap-prob-np')
        np_prob = tf.expand_dims(np_prob, axis=-1)
        # Hard labels are exported by name; the expanded float copy is kept
        # so the graph contains the same ops as before.
        np_label = tf.argmax(np_softmax, axis=-1, name='predmap-np')
        np_label = tf.expand_dims(tf.cast(np_label, tf.float32), axis=-1)

        # Distance branch: single-channel linear output, transposed to NHWC.
        dist_logits = Conv2D('conv_out_dist', dist_head, 1, 1,
                             use_bias=True, activation=tf.identity)
        dist_logits = tf.transpose(dist_logits, [0, 2, 3, 1])
        dist_prob = tf.identity(dist_logits, name='predmap-prob-dist')
        dist_pred = tf.identity(dist_logits, name='predmap-dist')

        # encoded so that inference can extract all output at once
        predmap_coded = tf.concat([np_prob, dist_pred], axis=-1,
                                  name='predmap-coded')

    # Everything below is only needed when training.
    if not get_current_tower_context().is_training:
        return

    # ---- losses ----
    # Distance regression loss
    residual = dist_pred - gt_dist
    mse = tf.reduce_mean(residual * residual, name='loss-mse')
    add_moving_summary(mse)

    # Nuclei Blob classification loss
    bce = categorical_crossentropy(np_softmax, gt_np_onehot)
    bce = tf.reduce_mean(bce, name='loss-bce')
    add_moving_summary(bce)

    # combine the loss into single cost function
    self.cost = tf.identity(mse + bce, name='overall-loss')
    add_moving_summary(self.cost)

    add_param_summary(('.*/W', ['histogram']))  # monitor W

    # ---- visual logging ----
    raw_images = tf.cast(raw_images, tf.uint8)
    tf.summary.image('input', raw_images, max_outputs=1)

    raw_images = crop_op(raw_images, (190, 190), "NHWC")
    np_pred_viz = colorize(np_prob[..., 0], cmap='jet')
    np_true_viz = colorize(gt_np[..., 0], cmap='jet')
    dist_pred_viz = colorize(dist_prob[..., 0], cmap='jet')
    dist_true_viz = colorize(gt_dist[..., 0], cmap='jet')

    panels = [raw_images, np_true_viz, np_pred_viz,
              dist_true_viz, dist_pred_viz]
    viz = tf.concat(panels, 2)
    tf.summary.image('output', viz, max_outputs=1)
    return
def _build_graph(self, inputs):
    """Build the nuclei-pixel (NP) + horizontal/vertical (HV) graph.

    `inputs` is a pair `(images, truemap_coded)`. The last two channels of
    `truemap_coded` are the HV regression target. When type classification
    is enabled, channel 1 holds the integer type map and the binary nuclei
    mask is derived from it (`type > 0`); otherwise channel 0 is the binary
    nuclei mask.

    Named output tensors (`predmap-prob-np`, `predmap-prob-hv`,
    `predmap-hv`, `predmap-coded`) are always created. In training mode the
    weighted loss terms listed in `self.loss_term` are summed into
    `self.cost` and summaries are registered.
    """
    images, truemap_coded = inputs
    orig_imgs = images

    # Read the optional flag once so that every branch below agrees on it;
    # an instance without the attribute behaves as "type classification off".
    type_classification = bool(getattr(self, 'type_classification', False))

    if type_classification:
        true_type = truemap_coded[..., 1]
        true_type = tf.cast(true_type, tf.int32)
        true_type = tf.identity(true_type, name='truemap-type')
        one_type = tf.one_hot(true_type, self.nr_types, axis=-1)
        true_type = tf.expand_dims(true_type, axis=-1)

        true_np = tf.cast(true_type > 0, tf.int32)  # ? sanity this
        true_np = tf.identity(true_np, name='truemap-np')
        # NOTE(review): tf.squeeze without an axis drops every size-1
        # dimension, not just the trailing one -- confirm for batch size 1.
        one_np = tf.one_hot(tf.squeeze(true_np), 2, axis=-1)
    else:
        true_np = truemap_coded[..., 0]
        true_np = tf.cast(true_np, tf.int32)
        true_np = tf.identity(true_np, name='truemap-np')
        one_np = tf.one_hot(true_np, 2, axis=-1)
        true_np = tf.expand_dims(true_np, axis=-1)

    true_hv = truemap_coded[..., -2:]
    true_hv = tf.identity(true_hv, name='truemap-hv')

    ####
    with argscope(Conv2D, activation=tf.identity, use_bias=False,  # K.he initializer
                  W_init=tf.variance_scaling_initializer(scale=2.0, mode='fan_out')), \
            argscope([Conv2D, BatchNorm], data_format=self.data_format):

        # NHWC -> NCHW, optionally rescaled by 1/255.
        i = tf.transpose(images, [0, 3, 1, 2])
        i = i if not self.input_norm else i / 255.0

        ####
        d = encoder(i, self.freeze)
        d[0] = crop_op(d[0], (184, 184))
        d[1] = crop_op(d[1], (72, 72))

        ####
        np_feat = decoder('np', d)
        npx = BNReLU('preact_out_np', np_feat[-1])

        hv_feat = decoder('hv', d)
        hv = BNReLU('preact_out_hv', hv_feat[-1])

        if type_classification:
            tp_feat = decoder('tp', d)
            tp = BNReLU('preact_out_tp', tp_feat[-1])

            # Nuclei Type Pixels (TP)
            logi_class = Conv2D('conv_out_tp', tp, self.nr_types, 1,
                                use_bias=True, activation=tf.identity)
            logi_class = tf.transpose(logi_class, [0, 2, 3, 1])
            soft_class = tf.nn.softmax(logi_class, axis=-1)

        #### Nuclei Pixels (NP)
        logi_np = Conv2D('conv_out_np', npx, 2, 1,
                         use_bias=True, activation=tf.identity)
        logi_np = tf.transpose(logi_np, [0, 2, 3, 1])
        soft_np = tf.nn.softmax(logi_np, axis=-1)
        prob_np = tf.identity(soft_np[..., 1], name='predmap-prob-np')
        prob_np = tf.expand_dims(prob_np, axis=-1)

        #### Horizontal-Vertical (HV)
        logi_hv = Conv2D('conv_out_hv', hv, 2, 1,
                         use_bias=True, activation=tf.identity)
        logi_hv = tf.transpose(logi_hv, [0, 2, 3, 1])
        prob_hv = tf.identity(logi_hv, name='predmap-prob-hv')
        pred_hv = tf.identity(logi_hv, name='predmap-hv')

        # * channel ordering: type-map, segmentation map
        # encoded so that inference can extract all output at once
        if type_classification:
            predmap_coded = tf.concat([soft_class, prob_np, pred_hv],
                                      axis=-1, name='predmap-coded')
        else:
            predmap_coded = tf.concat([prob_np, pred_hv],
                                      axis=-1, name='predmap-coded')

    ####
    def get_gradient_hv(l, h_ch, v_ch):
        """Return horizontal/vertical derivatives of the selected channels.

        The horizontal derivative of channel `h_ch` and the vertical
        derivative of channel `v_ch` are approximated by convolving each
        channel with a 5x5 Sobel-like kernel. The boundary is zero-padded
        ('SAME' padding).

        Args:
            l (tensor): NHWC tensor holding the horizontal and vertical maps
                in its channel axis.
            h_ch (int): channel index of `l` for the horizontal map.
            v_ch (int): channel index of `l` for the vertical map.

        Returns:
            NHWC tensor with two channels: (dh, dv).
        """
        def get_sobel_kernel(size):
            assert size % 2 == 1, 'Must be odd, get size=%d' % size

            h_range = np.arange(-size // 2 + 1, size // 2 + 1, dtype=np.float32)
            v_range = np.arange(-size // 2 + 1, size // 2 + 1, dtype=np.float32)
            h, v = np.meshgrid(h_range, v_range)
            # The small epsilon avoids 0/0 at the kernel centre.
            kernel_h = h / (h * h + v * v + 1.0e-15)
            kernel_v = v / (h * h + v * v + 1.0e-15)
            return kernel_h, kernel_v

        mh, mv = get_sobel_kernel(5)
        mh = tf.constant(mh, dtype=tf.float32)
        mv = tf.constant(mv, dtype=tf.float32)

        mh = tf.reshape(mh, [5, 5, 1, 1])
        mv = tf.reshape(mv, [5, 5, 1, 1])

        # central difference to get gradient, ignore the boundary problem
        h = tf.expand_dims(l[..., h_ch], axis=-1)
        v = tf.expand_dims(l[..., v_ch], axis=-1)
        dh = tf.nn.conv2d(h, mh, strides=[1, 1, 1, 1], padding='SAME')
        dv = tf.nn.conv2d(v, mv, strides=[1, 1, 1, 1], padding='SAME')
        output = tf.concat([dh, dv], axis=-1)
        return output

    def loss_mse(true, pred, name=None):
        """Mean squared error between `pred` and `true`."""
        loss = pred - true
        loss = tf.reduce_mean(loss * loss, name=name)
        return loss

    def loss_msge(true, pred, focus, name=None):
        """Mean squared gradient error, averaged over the `focus` region."""
        focus = tf.stack([focus, focus], axis=-1)
        pred_grad = get_gradient_hv(pred, 1, 0)
        true_grad = get_gradient_hv(true, 1, 0)
        loss = pred_grad - true_grad
        loss = focus * (loss * loss)
        # artificial reduce_mean with focus region
        loss = tf.reduce_sum(loss) / (tf.reduce_sum(focus) + 1.0e-8)
        loss = tf.identity(loss, name=name)
        return loss

    ####
    if get_current_tower_context().is_training:
        #---- LOSS ----#
        loss = 0
        for term, weight in self.loss_term.items():
            if term == 'mse':
                term_loss = loss_mse(true_hv, pred_hv, name='loss-mse')
            elif term == 'msge':
                focus = truemap_coded[..., 0]
                term_loss = loss_msge(true_hv, pred_hv, focus, name='loss-msge')
            elif term == 'bce':
                term_loss = categorical_crossentropy(soft_np, one_np)
                term_loss = tf.reduce_mean(term_loss, name='loss-bce')
            elif term == 'dice':
                # Compare the current term, not dict membership: otherwise
                # any unknown term is treated as dice whenever 'dice' is
                # configured.
                term_loss = dice_loss(soft_np[..., 0], one_np[..., 0]) \
                    + dice_loss(soft_np[..., 1], one_np[..., 1])
                term_loss = tf.identity(term_loss, name='loss-dice')
            else:
                assert False, 'Not support loss term: %s' % term
            add_moving_summary(term_loss)
            loss += term_loss * weight

        if type_classification:
            term_loss = categorical_crossentropy(soft_class, one_type)
            term_loss = tf.reduce_mean(term_loss, name='loss-xentropy-class')
            add_moving_summary(term_loss)
            loss = loss + term_loss

            # Per-type dice loss summed over all types.
            term_loss = 0
            for type_id in range(self.nr_types):
                term_loss += dice_loss(soft_class[..., type_id],
                                       one_type[..., type_id])
            term_loss = tf.identity(term_loss, name='loss-dice-class')
            add_moving_summary(term_loss)
            loss = loss + term_loss

        ### combine the loss into single cost function
        self.cost = tf.identity(loss, name='overall-loss')
        add_moving_summary(self.cost)
        ####

        add_param_summary(('.*/W', ['histogram']))  # monitor W

        ### logging visual sthg
        orig_imgs = tf.cast(orig_imgs, tf.uint8)
        tf.summary.image('input', orig_imgs, max_outputs=1)

        orig_imgs = crop_op(orig_imgs, (190, 190), "NHWC")

        pred_np = colorize(prob_np[..., 0], cmap='jet')
        true_np = colorize(true_np[..., 0], cmap='jet')

        pred_h = colorize(prob_hv[..., 0], vmin=-1, vmax=1, cmap='jet')
        pred_v = colorize(prob_hv[..., 1], vmin=-1, vmax=1, cmap='jet')
        true_h = colorize(true_hv[..., 0], vmin=-1, vmax=1, cmap='jet')
        true_v = colorize(true_hv[..., 1], vmin=-1, vmax=1, cmap='jet')

        if not type_classification:
            viz = tf.concat([
                orig_imgs,
                pred_h, pred_v, pred_np,
                true_h, true_v, true_np,
            ], 2)
        else:
            pred_type = tf.transpose(soft_class, (0, 1, 3, 2))
            pred_type = tf.reshape(pred_type, [-1, 80, 80 * self.nr_types])
            # NOTE(review): normalises by `nr_classes` while everything else
            # here uses `nr_types` -- confirm which attribute is intended.
            true_type = tf.cast(true_type[..., 0] / self.nr_classes, tf.float32)
            true_type = colorize(true_type, vmin=0, vmax=1, cmap='jet')
            pred_type = colorize(pred_type, vmin=0, vmax=1, cmap='jet')

            viz = tf.concat([
                orig_imgs,
                pred_h, pred_v, pred_np, pred_type,
                true_h, true_v, true_np, true_type,
            ], 2)

        # Stack the first and last sample of the batch into one image.
        viz = tf.concat([viz[0], viz[-1]], axis=0)
        viz = tf.expand_dims(viz, axis=0)
        tf.summary.image('output', viz, max_outputs=1)

    return
def build_graph(self, A, B):
    """Build both generators/discriminators and their losses.

    `A` and `B` are image batches; they are scaled by 1/255 and transposed
    from NHWC to NCHW. The method creates the `gen` and `discrim` variable
    scopes (sub-scopes `A` and `B`), registers reconstruction image
    summaries, and finally sets `self.g_loss` / `self.d_loss` to the total
    generator / discriminator losses including weight decay.
    """
    A = tf.transpose(A / 255.0, [0, 3, 1, 2])
    B = tf.transpose(B / 255.0, [0, 3, 1, 2])

    def _domain_losses(scope, real, cycled, dis_real, dis_fake,
                       feats_real, feats_fake):
        """Return (reconstruction, generator, discriminator, feature-match) losses."""
        with tf.name_scope(scope):
            # reconstruction loss
            recon = tf.reduce_mean(tf.squared_difference(real, cycled),
                                   name='recon_loss')
            # gan loss: build_losses() publishes its results on self
            self.build_losses(dis_real, dis_fake)
            gen_loss = self.g_loss
            dis_loss = self.d_loss
            # feature matching loss
            fm_loss = self.get_feature_match_loss(feats_real, feats_fake)
        return recon, gen_loss, dis_loss, fm_loss

    # use the torch initializers
    torch_like_init = tf.variance_scaling_initializer(scale=0.333,
                                                      distribution='uniform')
    with argscope([Conv2D, Conv2DTranspose, FullyConnected],
                  kernel_initializer=torch_like_init,
                  use_bias=False), \
            argscope(BatchNorm, gamma_init=tf.random_uniform_initializer()), \
            argscope([Conv2D, Conv2DTranspose, BatchNorm], data_format='NCHW'):
        with tf.variable_scope('gen'):
            with tf.variable_scope('B'):
                AB = self.generator(A)
            with tf.variable_scope('A'):
                BA = self.generator(B)
                ABA = self.generator(AB)
            with tf.variable_scope('B'):
                BAB = self.generator(BA)

        # Side-by-side (original, translated, reconstructed) strips.
        strip_A = tf.concat([A, AB, ABA], axis=3, name='viz_A_recon')
        strip_B = tf.concat([B, BA, BAB], axis=3, name='viz_B_recon')
        tf.summary.image('Arecon', tf.transpose(strip_A, [0, 2, 3, 1]),
                         max_outputs=50)
        tf.summary.image('Brecon', tf.transpose(strip_B, [0, 2, 3, 1]),
                         max_outputs=50)

        with tf.variable_scope('discrim'):
            with tf.variable_scope('A'):
                A_dis_real, A_feats_real = self.discriminator(A)
                A_dis_fake, A_feats_fake = self.discriminator(BA)

            with tf.variable_scope('B'):
                B_dis_real, B_feats_real = self.discriminator(B)
                B_dis_fake, B_feats_fake = self.discriminator(AB)

    recon_loss_A, G_loss_A, D_loss_A, fm_loss_A = _domain_losses(
        'LossA', A, ABA, A_dis_real, A_dis_fake, A_feats_real, A_feats_fake)
    recon_loss_B, G_loss_B, D_loss_B, fm_loss_B = _domain_losses(
        'LossB', B, BAB, B_dis_real, B_dis_fake, B_feats_real, B_feats_fake)

    # Reconstruction weight: 0.01 up to step 10000, 0.5 afterwards.
    step = get_global_step_var()
    rate = tf.train.piecewise_constant(step, [np.int64(10000)], [0.01, 0.5])
    rate = tf.identity(rate, name='rate')  # TF issue#8594

    adversarial_part = ((G_loss_A + G_loss_B) * 0.1
                        + (fm_loss_A + fm_loss_B) * 0.9) * (1 - rate)
    recon_part = (recon_loss_A + recon_loss_B) * rate
    g_loss = tf.add_n([adversarial_part, recon_part], name='G_loss_total')
    d_loss = tf.add_n([D_loss_A, D_loss_B], name='D_loss_total')

    self.collect_variables('gen', 'discrim')

    # weight decay
    wd_g = regularize_cost('gen/.*/W', l2_regularizer(1e-5),
                           name='G_regularize')
    wd_d = regularize_cost('discrim/.*/W', l2_regularizer(1e-5),
                           name='D_regularize')

    self.g_loss = g_loss + wd_g
    self.d_loss = d_loss + wd_d

    add_moving_summary(recon_loss_A, recon_loss_B, rate,
                       g_loss, d_loss, wd_g, wd_d)
def net(observations, config):
    """Build the offensive and defensive policy/value heads.

    Three independent conv trunks are built on the same reshaped input: one
    for the offensive policy, one for the offensive value (policy and value
    are kept separate), and one for the defence, which feeds both the
    defensive policy and value heads.

    Args:
        observations: tensor whose first two dimensions are batch and episode
            length; dimensions from index 3 onward are flattened together.
        config: object providing `init_output_factor`, the scale of the
            initializer used for every output layer.

    Returns:
        Tuple `(logits, off_action_mean, off_value, def_action_mean,
        def_value)`.
    """
    # observation space = shape=(batch_size, episode_length, 10, 14, 2)
    # action space = shape=(batch, episode_length, 23)
    batch_size = tf.shape(observations)[0]
    episode_len = tf.shape(observations)[1]
    input_ = tf.reshape(
        observations,
        shape=[batch_size, episode_len, observations.shape.as_list()[2],
               functools.reduce(operator.mul,
                                observations.shape.as_list()[3:], 1)])

    init_xavier_weights = tf.variance_scaling_initializer(
        scale=1.0, mode='fan_avg', distribution='uniform')
    init_output_weights = tf.variance_scaling_initializer(
        scale=config.init_output_factor, mode='fan_in', distribution='normal')

    def _conv_trunk(filters, padding, units):
        """Two [1, 3] ReLU convolutions, flatten per step, then one ReLU dense layer."""
        conv1 = tf.layers.conv2d(
            inputs=input_,
            filters=filters,
            kernel_size=[1, 3],
            padding=padding,
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )
        conv2 = tf.layers.conv2d(
            inputs=conv1,
            filters=filters,
            kernel_size=[1, 3],
            padding=padding,
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )
        flatten = tf.reshape(
            conv2,
            shape=[batch_size, episode_len,
                   functools.reduce(operator.mul,
                                    conv2.shape.as_list()[2:], 1)])
        return tf.layers.dense(
            inputs=flatten,
            units=units,
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )

    # separate value and policy
    # (A former shared-trunk offensive network -- 'o_trunk' feeding
    # 'o_crown/policy' and 'o_crown/value' -- was superseded by the two
    # trunks below.)
    with tf.variable_scope('o_trunk_policy'):
        policy_trunk = _conv_trunk(filters=128, padding='valid', units=256)
    with tf.variable_scope('o_crown'):
        # offensive
        off_fc = tf.layers.dense(
            inputs=policy_trunk,
            units=128,
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )
        with tf.variable_scope('actions'):
            off_action_mean = tf.layers.dense(
                inputs=off_fc,
                units=12,
                activation=tf.tanh,  # NOTE tanh is not good?
                kernel_initializer=init_output_weights,
            )
        with tf.variable_scope('decision'):
            logits = tf.layers.dense(
                inputs=off_fc,
                units=3,
                activation=None,
                kernel_initializer=init_output_weights,
            )

    with tf.variable_scope('o_trunk_value'):
        value_trunk = _conv_trunk(filters=128, padding='valid', units=256)
    with tf.variable_scope('o_crown'):
        # offensive
        off_fc = tf.layers.dense(
            inputs=value_trunk,
            units=128,
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )
        off_value = tf.layers.dense(
            inputs=off_fc,
            units=1,
            activation=None,
            kernel_initializer=init_output_weights,
        )
        off_value = tf.reshape(off_value, shape=[batch_size, episode_len])
        off_value = tf.check_numerics(off_value, 'off_value')

    with tf.variable_scope('d_trunk'):
        defence_trunk = _conv_trunk(filters=64, padding='same', units=128)
    with tf.variable_scope('d_crown'):
        # defensive
        def_fc = tf.layers.dense(
            inputs=defence_trunk,
            units=64,
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )
        with tf.variable_scope('policy'):
            with tf.variable_scope('actions'):
                def_action_mean = tf.layers.dense(
                    inputs=def_fc,
                    units=10,
                    activation=tf.tanh,  # NOTE tanh is not good?
                    kernel_initializer=init_output_weights,
                )
        with tf.variable_scope('value'):
            def_value = tf.layers.dense(
                inputs=def_fc,
                units=1,
                activation=None,
                kernel_initializer=init_output_weights,
            )
            def_value = tf.reshape(def_value, shape=[batch_size, episode_len])
            def_value = tf.check_numerics(def_value, 'def_value')

    return logits, off_action_mean, off_value, def_action_mean, def_value
def _initializer(self, params):
    """Return the variance-scaling initializer used for this object's variables.

    The initializer uses the "fan_avg" mode with a uniform distribution and
    the dtype stored in `self.dtype`. `params` is accepted but not used.
    """
    init_kwargs = {
        "mode": "fan_avg",
        "distribution": "uniform",
        "dtype": self.dtype,
    }
    return tf.variance_scaling_initializer(**init_kwargs)
def train_my_lstm():
    """Train and periodically evaluate the 3D-LSTM -> Dense(1) toy model.

    Builds the graph with a fixed batch size, trains for 20 epochs with Adam
    and, every 10 batches, prints accuracy and loss for the current training
    batch and for the test set. The session is closed when training ends.
    """
    # Data
    train_size = 10000
    test_size = 100
    batch_size = 100
    time_steps = 5
    num_features = 1

    # To switch to the regression task, use:
    # problem_type = 'regression'
    # X_train, y_train = sin_data(train_size, time_steps=time_steps)
    # X_test, y_test = sin_data(test_size, time_steps=time_steps)
    problem_type = 'binary_classification'
    X_train, y_train = binary_data(train_size, time_steps=time_steps)
    X_test, y_test = binary_data(test_size, time_steps=time_steps)

    # Place holders. Do NOT write None for batch_size
    inputs = tf.placeholder(tf.float32,
                            shape=[batch_size, time_steps, num_features])
    truth = tf.placeholder(tf.float32, shape=[batch_size, 1])
    initializer = tf.variance_scaling_initializer(scale=2.0)

    # Network structure: 3D LSTM - Dense(1)
    X = my_3d_lstm(inputs)
    X = tf.layers.flatten(X)
    outputs = tf.layers.dense(X, 1, kernel_initializer=initializer)

    if problem_type == 'binary_classification':
        loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=truth,
                                                       logits=outputs)
        loss = tf.reduce_mean(loss)
    elif problem_type == 'regression':
        loss = tf.nn.l2_loss(truth - outputs) / batch_size
    else:
        # Fail here rather than with an undefined `loss` below.
        raise ValueError('Unknown problem_type: %s' % problem_type)

    # Keep the optimizer and the op that applies one update step separate.
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    train_op = optimizer.minimize(loss)

    # Initialize and run the graph; the context manager closes the session.
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch_index in range(20):
            for batch_index in range(train_size // batch_size):
                start = batch_index * batch_size
                stop = (batch_index + 1) * batch_size
                X_train_batch = X_train[start:stop]
                y_train_batch = y_train[start:stop]
                train_feed = {inputs: X_train_batch, truth: y_train_batch}

                sess.run(train_op, feed_dict=train_feed)

                if batch_index % 10 == 0:
                    results_train_batch = sess.run([outputs, loss],
                                                   feed_dict=train_feed)
                    # The test set is fed as one batch, so test_size must
                    # equal batch_size because the placeholders are fixed.
                    results_test = sess.run([outputs, loss],
                                            feed_dict={
                                                inputs: X_test,
                                                truth: y_test
                                            })
                    print('------------------------------')
                    print('Epoch %d. Batch %d.' % (epoch_index, batch_index))
                    # A positive logit is counted as predicting class 1.
                    print('Train accuracy: %f. Loss: %f.' %
                          (np.mean((results_train_batch[0] > 0) == y_train_batch),
                           results_train_batch[1]))
                    # For regression, report the mean absolute error instead:
                    # print('Train accuracy: %f. Loss: %f.' % (np.mean(np.abs(results_train_batch[0]-y_train_batch)), results_train_batch[1]))
                    print('Test accuracy: %f. Loss: %f.' %
                          (np.mean((results_test[0] > 0) == y_test),
                           results_test[1]))
def __build(self):
    # Build the whole model inside `self._graph`: placeholders, a CNN front
    # end, a stacked bidirectional GRU, a dense layer, two linear mask heads
    # (real and imaginary parts), the loss, the Adam training op, and a
    # session with all variables initialised.
    with self._graph.as_default():
        with tf.variable_scope("inputs"):
            # input arguments
            self._features = tf.placeholder(tf.float32, shape=[None, self.t_bins, self.f_bins, 2*self.n_mic])
            self._targets = tf.placeholder(tf.float32, shape=[None, self.t_bins, self.f_bins, 2*self.n_src])
            # NOTE(review): only referenced by the disabled learning-rate
            # schedule further down; it is not passed to minimize().
            global_step = tf.Variable(1, trainable=False)
            # preprocessing: the last axis is split into a leading half and a
            # trailing half (presumably magnitudes then phases, going by the
            # variable names -- TODO confirm against the data pipeline).
            feature_magnitude = self._features[..., :self.n_mic]
            target_magnitude = self._targets[..., :self.n_src]
            # NOTE(review): feature_phase is not used below.
            feature_phase = self._features[..., self.n_mic:]
            target_phase = self._targets[..., self.n_src:]
            # Target phase relative to the single feature channel at index n_mic.
            phase_difference = target_phase - self._features[..., self.n_mic:self.n_mic+1]
            target_real = target_magnitude * tf.cos(phase_difference)
            target_image = target_magnitude * tf.sin(phase_difference)
        with tf.variable_scope("cnn", initializer=tf.keras.initializers.Orthogonal(gain=1.0),
                               regularizer=tf.contrib.layers.l2_regularizer(scale=1e-6)):
            # Three 3x3 conv + max-pool stages; pooling acts only on the
            # second spatial axis (factors 4, 2, 2).
            conv = tf.layers.conv2d(feature_magnitude, 64, (3, 3), (1, 1), "same", activation=tf.nn.relu)
            conv = tf.layers.max_pooling2d(conv, [1, 4], [1, 4], "valid")
            conv = tf.layers.conv2d(conv, 64, (3, 3), (1, 1), "same", activation=tf.nn.relu)
            conv = tf.layers.max_pooling2d(conv, [1, 2], [1, 2], "valid")
            conv = tf.layers.conv2d(conv, 64, (3, 3), (1, 1), "same", activation=tf.nn.relu)
            conv = tf.layers.max_pooling2d(conv, [1, 2], [1, 2], "valid")
            # NOTE(review): 64*32 hard-codes 64 channels times 32 remaining
            # bins, which assumes f_bins // 16 == 32 -- TODO confirm.
            conv = tf.reshape(conv, (-1, self.t_bins, 64*32))
            # One tensor per time step, as required by the static RNN below.
            conv = tf.unstack(conv, axis=1)
        with tf.variable_scope("rnn", initializer=tf.variance_scaling_initializer(),
                               regularizer=tf.contrib.layers.l2_regularizer(scale=1e-6)):
            # cells formation: three GRU layers per direction
            cells_forward = []
            cells_backward = []
            for i in range(3):
                cell = tf.nn.rnn_cell.GRUCell(num_units=1024)
                cells_forward.append(cell)
                cell = tf.nn.rnn_cell.GRUCell(num_units=1024)
                cells_backward.append(cell)
            # rnn formation
            rnn_forward = tf.nn.rnn_cell.MultiRNNCell(cells=cells_forward)
            rnn_backward = tf.nn.rnn_cell.MultiRNNCell(cells=cells_backward)
            rnn, _, _ = tf.nn.static_bidirectional_rnn(rnn_forward, rnn_backward, conv, dtype=tf.float32)
            rnn = tf.stack(rnn, axis=1)
            # 2048 = 2 directions x 1024 units.
            rnn = tf.reshape(rnn, [-1, self.t_bins, 2048])
        with tf.variable_scope("fnn", initializer=tf.variance_scaling_initializer(),
                               regularizer=tf.contrib.layers.l2_regularizer(scale=1e-6)):
            fnn = tf.layers.dense(rnn, units=self.f_bins*self.n_src)
            fnn = tf.nn.relu(fnn)
        with tf.variable_scope("mask", initializer=tf.keras.initializers.Orthogonal(gain=1.0),
                               regularizer=tf.contrib.layers.l2_regularizer(1e-6)):
            # mask for real part
            mask_real = tf.layers.dense(fnn, units=self.f_bins*self.n_src)
            mask_real = tf.reshape(mask_real, [-1, self.t_bins, self.f_bins, self.n_src])
            # mask_rv = 1 - tf.reduce_sum(mask_re, axis=-1, keepdims=True)
            # self._mask_re = tf.concat([mask_re, mask_rv], axis=-1)
            # mask for imag part
            mask_image = tf.layers.dense(fnn, units=self.f_bins*self.n_src)
            mask_image = tf.reshape(mask_image, [-1, self.t_bins, self.f_bins, self.n_src])
            # mask_iv = 1 - tf.reduce_sum(mask_im, axis=-1, keepdims=True)
            # self._mask_im = tf.concat([mask_im, mask_iv], axis=-1)
        with tf.variable_scope("outputs"):
            # logits layer: both masks scale the first feature channel only
            logits_real = mask_real * feature_magnitude[..., :1]
            logits_image = mask_image * feature_magnitude[..., :1]
            self._logits = tf.concat((logits_real, logits_image), axis=-1)
            # logit_re = tf.concat((logits_real, mask_rv * ftr_mgt[..., :1]), axis=-1)
            # logit_im = tf.concat((logit_im, mask_iv * ftr_mgt[..., :1]), axis=-1)
            # regression: MSE & L2-regularization & permutational loss
            # (description inherited from the original comment; DTLoss is
            # defined elsewhere -- verify it matches.)
            self._loss = DTLoss(target_real, logits_real, self.n_src) + \
                DTLoss(target_image, logits_image, self.n_src) + \
                tf.losses.get_regularization_loss()
        # backward
        # The string literal below is a disabled learning-rate schedule; it
        # is a no-op expression statement.
        ''' lr = tf.train.exponential_decay( learning_rate=1e-3, global_step=global_step, decay_steps=self.decay_step, decay_rate=0.1, staircase=True ) '''
        lr = 1e-3
        optimizer = tf.train.AdamOptimizer(lr)
        self.minimize = optimizer.minimize(loss=self._loss)
        # operation: create the session and initialise every variable
        self._session = tf.Session()
        self._session.run(tf.global_variables_initializer())
def cnn(self):
    """Build the text-CNN graph: embedding, conv + max-pool, dense classifier.

    Reads ``self.content`` (token ids), ``self.label`` and ``self.config``.
    Sets ``self.embeddings_inputs_expanded``, ``self.logtis`` (sic, the
    attribute name is kept for callers), ``self.loss``, ``self.optimizer``,
    ``self.predict_label`` and ``self.accuracy``.
    """
    with tf.name_scope('embedding_layer'):
        embeddings = self.random_embedding(self.config.vocab_size,
                                           self.config.embedding_dim)
        embedding = tf.Variable(embeddings,
                                dtype=tf.float32,
                                trainable=True,
                                name='word_embedding')
        embeddings_inputs = tf.nn.embedding_lookup(embedding, self.content)
        # embeddings_inputs: [batch_size, sequence_length, embedding_dim]
        # conv2d needs a 4-D input whose last axis is the depth, so add one:
        # [batch_size, sequence_length, embedding_dim, input_depth=1]
        self.embeddings_inputs_expanded = tf.expand_dims(
            embeddings_inputs, -1)

    with tf.name_scope('cnn_layer'):
        # shape = [kernel_height, kernel_width, input_depth, output_depth]
        filter_weights = tf.get_variable(
            name='weights',
            shape=[
                self.config.kernel_size, self.config.embedding_dim, 1,
                self.config.num_filters
            ],
            initializer=tf.variance_scaling_initializer(),
            dtype=tf.float32)
        biases = tf.get_variable(name='biases',
                                 shape=[self.config.num_filters],
                                 initializer=tf.zeros_initializer(),
                                 dtype=tf.float32)

        # strides = [batch, height, width, depth]; the first and last entries
        # must be 1 because the stride only applies to height and width.
        # padding: VALID = no padding, SAME = zero padding.  Output sizes:
        #   SAME : out = ceil(in / stride)
        #   VALID: out = ceil((in - filter + 1) / stride)
        conv = tf.nn.conv2d(self.embeddings_inputs_expanded,
                            filter_weights,
                            strides=[1, 1, 1, 1],
                            padding='VALID')
        conv = tf.nn.relu(tf.nn.bias_add(conv, biases), name='conv')

        # ksize = [batch, height, width, depth]; first and last are 1.
        # The pooling window spans the whole conv output height, so after
        # pooling both the height and the width axes have size 1.
        pool = tf.nn.max_pool(conv,
                              ksize=[
                                  1, self.config.seq_length -
                                  self.config.kernel_size + 1, 1, 1
                              ],
                              strides=[1, 1, 1, 1],
                              padding='VALID',
                              name='pool')
        # Drop the two singleton axes -> [batch_size, num_filters]
        h = tf.reshape(pool, [-1, self.config.num_filters])

        w = tf.get_variable(
            name='w',
            shape=[self.config.num_filters, self.config.num_classes],
            initializer=tf.contrib.layers.xavier_initializer(),
            dtype=tf.float32)
        b = tf.get_variable(name='b',
                            shape=[self.config.num_classes],
                            initializer=tf.zeros_initializer(),
                            dtype=tf.float32)
        fc = tf.matmul(h, w) + b
        # NOTE(review): dropout is applied to the logits themselves and is
        # active whenever the graph runs (no training switch here); the
        # second positional argument of TF1 tf.nn.dropout is keep_prob --
        # confirm config.dropout_prob really holds a keep probability.
        self.logtis = tf.nn.dropout(fc, self.config.dropout_prob)

    with tf.name_scope('optimize_layer'):
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logtis,
                                                       labels=self.label))
        self.optimizer = tf.train.AdamOptimizer(
            self.config.learning_rate).minimize(self.loss)

    with tf.name_scope('score'):
        self.predict_label = tf.argmax(self.logtis, 1)
        # reuse the prediction op instead of building a second argmax
        correct_pred = tf.equal(self.predict_label,
                                tf.argmax(self.label, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
pool5 = tf.layers.dropout(pool5, rate=pooldropout_rate, seed=115, training=training) # Flatten output with tf.name_scope('flatten') as scope: flat_output = tf.contrib.layers.flatten(pool5) # dropout at fc rate flat_output = tf.layers.dropout(flat_output, rate=fcdropout_rate, seed=116, training=training) # Fully connected layer 1 with tf.name_scope('fc1') as scope: fc1 = tf.layers.dense( flat_output, 2048, activation=None, kernel_initializer=tf.variance_scaling_initializer(scale=2, seed=117), bias_initializer=tf.zeros_initializer(), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=lamF), name="fc1" ) bn_fc1 = tf.layers.batch_normalization( fc1, axis=-1, momentum=0.9, epsilon=epsilon, center=True, scale=True, beta_initializer=tf.zeros_initializer(), gamma_initializer=tf.ones_initializer(), moving_mean_initializer=tf.zeros_initializer(),
def build_predict(self,
                  inputs,
                  reverse_preds=None,
                  embed_penultimate=False,
                  target_subset=None,
                  save_reprs=False):
    """Construct per-location real-valued predictions.

    Args:
      inputs: input sequence tensor; must not be None.  When
        ``hp.augment_mutation > 0`` random point mutations are applied while
        ``self.is_training`` is true.
      reverse_preds: optional boolean tensor used as a ``tf.cond`` predicate;
        when true the final representation is reversed along axis 1.
      embed_penultimate: if true, return the representation that feeds the
        final dense layer and skip the link function.
      target_subset: optional indices of targets to keep.  NOTE: this
        overwrites ``self.hp.sum_targets`` with ``len(target_subset)``.
      save_reprs: if true, store every layer representation in
        ``self.layer_reprs``.

    Returns:
      The predictions tensor.

    Exits the process with status 1 on an unknown nonlinearity or link.
    """
    assert inputs is not None
    print('Targets pooled by %d to length %d' %
          (self.hp.target_pool, self.hp.seq_length // self.hp.target_pool))

    if self.hp.augment_mutation > 0:
        # sample mutation binary mask across sequences
        mut_mask_probs = self.hp.augment_mutation * np.ones(
            (self.hp.seq_length, 1))
        mut_mask_dist = tfp.distributions.Bernoulli(probs=mut_mask_probs,
                                                    dtype=tf.float32)
        mut_mask = mut_mask_dist.sample(tf.shape(inputs)[0])

        # sample random nucleotide for mutations
        mut_1hot_probs = 0.25 * np.ones((self.hp.seq_length, 4))
        mut_1hot_dist = tfp.distributions.OneHotCategorical(
            probs=mut_1hot_probs, dtype=tf.float32)
        mut_1hot = mut_1hot_dist.sample(tf.shape(inputs)[0])

        # modify sequence: masked positions are replaced by the random base
        inputs_mut = inputs - mut_mask * inputs + mut_mask * mut_1hot
        inputs = tf.cond(self.is_training, lambda: inputs_mut,
                         lambda: inputs)

    ###################################################
    # convolution layers
    ###################################################
    layer_reprs = [inputs]

    seqs_repr = inputs
    for layer_index in range(self.hp.cnn_layers):
        with tf.variable_scope('cnn%d' % layer_index, reuse=tf.AUTO_REUSE):
            # convolution block
            args_for_block = self._make_conv_block_args(
                layer_index, layer_reprs)
            seqs_repr = layers.conv_block(seqs_repr=seqs_repr,
                                          **args_for_block)

            # save representation
            layer_reprs.append(seqs_repr)

    if save_reprs:
        self.layer_reprs = layer_reprs

    # final nonlinearity
    if self.hp.nonlinearity == 'relu':
        seqs_repr = tf.nn.relu(seqs_repr)
    elif self.hp.nonlinearity == 'gelu':
        # sigmoid approximation of GELU
        seqs_repr = tf.nn.sigmoid(1.702 * seqs_repr) * seqs_repr
    else:
        print('Unrecognized nonlinearity "%s"' % self.hp.nonlinearity,
              file=sys.stderr)
        sys.exit(1)

    ###################################################
    # slice out side buffer
    ###################################################

    # update batch buffer to reflect pooling
    seq_length = seqs_repr.shape[1].value
    pool_preds = self.hp.seq_length // seq_length
    assert self.hp.batch_buffer % pool_preds == 0, (
        'batch_buffer %d not divisible'
        ' by the CNN pooling %d') % (self.hp.batch_buffer, pool_preds)
    batch_buffer_pool = self.hp.batch_buffer // pool_preds

    # slice out buffer
    seq_length = seqs_repr.shape[1]
    seqs_repr = seqs_repr[:, batch_buffer_pool:seq_length -
                          batch_buffer_pool, :]
    seq_length = seqs_repr.shape[1]

    ###################################################
    # final layer
    ###################################################
    if embed_penultimate:
        final_repr = seqs_repr
    else:
        with tf.variable_scope('final', reuse=tf.AUTO_REUSE):
            final_filters = self.hp.sum_targets * self.hp.target_classes
            final_repr = tf.layers.dense(
                inputs=seqs_repr,
                units=final_filters,
                activation=None,
                kernel_initializer=tf.variance_scaling_initializer(
                    scale=2.0, mode='fan_in'),
                kernel_regularizer=tf.contrib.layers.l1_regularizer(
                    self.hp.final_l1_scale))
            print('Convolution w/ %d %dx1 filters to final targets' %
                  (final_filters, seqs_repr.shape[2]))

            if target_subset is not None:
                # get convolution parameters
                filters_full = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES, 'final/dense/kernel')[0]
                bias_full = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES, 'final/dense/bias')[0]

                # subset to specific targets
                filters_subset = tf.gather(filters_full,
                                           target_subset,
                                           axis=1)
                bias_subset = tf.gather(bias_full, target_subset, axis=0)

                # substitute a new limited convolution
                final_repr = tf.tensordot(seqs_repr, filters_subset, 1)
                final_repr = tf.nn.bias_add(final_repr, bias_subset)

                # update # targets
                self.hp.sum_targets = len(target_subset)

            # expand length back out
            if self.hp.target_classes > 1:
                final_repr = tf.reshape(
                    final_repr, (-1, seq_length, self.hp.sum_targets,
                                 self.hp.target_classes))

    # transform for reverse complement
    if reverse_preds is not None:
        final_repr = tf.cond(reverse_preds,
                             lambda: tf.reverse(final_repr, axis=[1]),
                             lambda: final_repr)

    ###################################################
    # link function
    ###################################################
    if embed_penultimate:
        predictions = final_repr
    else:
        # float 32 exponential clip max
        exp_max = 50

        # choose link
        if self.hp.link in ['identity', 'linear']:
            predictions = tf.identity(final_repr, name='preds')

        elif self.hp.link == 'relu':
            # tf.relu does not exist; the op lives in tf.nn
            predictions = tf.nn.relu(final_repr, name='preds')

        elif self.hp.link == 'exp':
            final_repr_clip = tf.clip_by_value(final_repr, -exp_max,
                                               exp_max)
            predictions = tf.exp(final_repr_clip, name='preds')

        elif self.hp.link == 'exp_linear':
            predictions = tf.where(
                final_repr > 0,
                final_repr + 1,
                tf.exp(tf.clip_by_value(final_repr, -exp_max, exp_max)),
                name='preds')

        elif self.hp.link == 'softplus':
            final_repr_clip = tf.clip_by_value(final_repr, -exp_max, 10000)
            predictions = tf.nn.softplus(final_repr_clip, name='preds')

        else:
            print('Unknown link function %s' % self.hp.link,
                  file=sys.stderr)
            sys.exit(1)

        # clip
        if self.hp.target_clip is not None:
            predictions = tf.clip_by_value(predictions, 0,
                                           self.hp.target_clip)

        # sqrt
        if self.hp.target_sqrt:
            predictions = tf.sqrt(predictions)

    return predictions
def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
    """Bias-free 2-D convolution with explicit padding for strided convs.

    When ``strides > 1`` the input is first padded by ``fixed_padding``.
    The convolution uses 'SAME' padding only when ``strides == 1`` and
    'VALID' otherwise.

    Returns:
      The output tensor of ``tf.layers.conv2d``.
    """
    needs_explicit_padding = strides > 1
    if needs_explicit_padding:
        inputs = fixed_padding(inputs, kernel_size, data_format)

    if strides == 1:
        padding_mode = 'SAME'
    else:
        padding_mode = "VALID"

    # With its defaults, variance_scaling_initializer draws from a
    # (truncated) normal whose stddev is about sqrt(1 / n); for a kernel of
    # shape (k, k, f_in, f_out), n = k * k * f_in.
    kernel_init = tf.variance_scaling_initializer()
    return tf.layers.conv2d(inputs=inputs,
                            filters=filters,
                            kernel_size=kernel_size,
                            strides=strides,
                            padding=padding_mode,
                            use_bias=False,
                            kernel_initializer=kernel_init,
                            data_format=data_format)
def _get_network(self, inputs, use_histograms=False):
    """Build the shared trunk and one output stream per action component.

    Args:
      inputs: dict of input tensors; passed to ``preprocess_state_input``
        and indexed with ``'available_actions'`` to mask illegal functions.
      use_histograms: forwarded to ``network_utils.get_layers``.

    Returns:
      ``(action_q_vals, value, component_streams)`` where the first and last
      are dicts keyed by component name and ``value`` is the dueling value
      head, or None when ``config['dueling_network']`` is false.
    """
    net_cfg = self._config['network_structure']
    default_activation = net_cfg['default_activation']
    spatial_components = pysc2_common_net_funcs.spatial_components

    def _scale_gradient(tensor, scale):
        # Forward value is unchanged; only `scale` of the gradient flows back.
        return (1 - scale) * tf.stop_gradient(tensor) + scale * tensor

    screen = pysc2_common_net_funcs.preprocess_state_input(
        inputs, self._config)

    with tf.variable_scope('shared_spatial_network'):
        shared_spatial_net = network_utils.get_layers(
            screen,
            net_cfg['shared_spatial_network'],
            default_activation,
            self._training,
            use_histograms=use_histograms)

    if net_cfg['scale_gradients_at_shared_spatial_split']:
        with tf.variable_scope('spatial_gradient_scale'):
            # scale because multiple action component streams are meeting here
            # (always one more branch than number of spatial components)
            spatial_count = 1
            for component_name, using in self._action_components.items():
                if using and component_name in spatial_components:
                    spatial_count += 1
            shared_spatial_net = _scale_gradient(shared_spatial_net,
                                                 1 / spatial_count)

    if self._config['dueling_network']:
        with tf.variable_scope('dueling_gradient_scale'):
            # scale the gradients entering last shared layer, as in original
            # Dueling DQN paper
            shared_spatial_net = _scale_gradient(shared_spatial_net,
                                                 1 / math.sqrt(2))

    # for dueling net, split here
    if self._config['dueling_network']:
        with tf.variable_scope('value_network'):
            fc_value = network_utils.get_layers(
                shared_spatial_net,
                net_cfg['value_network'],
                default_activation,
                self._training,
                use_histograms=use_histograms)
            value = tf.layers.dense(
                fc_value,
                1,
                activation=None,
                kernel_initializer=tf.variance_scaling_initializer(
                    scale=2.0),
                name='value')
    else:
        # returned for debugging purposes, so it must exist even when the
        # dueling net is not used
        value = None

    with tf.variable_scope('shared_non_spatial_network'):
        shared_non_spatial = network_utils.get_layers(
            shared_spatial_net,
            net_cfg['shared_non_spatial_network'],
            default_activation,
            self._training,
            use_histograms=use_histograms)

    if net_cfg['scale_gradients_at_shared_non_spatial_split']:
        with tf.variable_scope('non_spatial_gradient_scale'):
            # scale because multiple action component streams are meeting here
            non_spatial_count = 0
            for component_name, using in self._action_components.items():
                if using and component_name not in spatial_components:
                    non_spatial_count += 1
            # With no non-spatial stream in use there is nothing to balance;
            # skipping also avoids a ZeroDivisionError.
            if non_spatial_count > 0:
                shared_non_spatial = _scale_gradient(shared_non_spatial,
                                                     1 / non_spatial_count)

    num_options = pysc2_common_net_funcs.get_num_options_per_function(
        self._config)

    # create each component stream
    component_streams = {}
    # final q vals with value added
    action_q_vals = {}
    # outputs of earlier streams, for streams that depend on them
    component_one_hots_or_embeddings = {}
    for c in pysc2_common_net_funcs.component_order:
        # are we using this component?
        if not self._action_components[c]:
            continue

        with tf.variable_scope(c + '_stream'):
            stream_input = shared_non_spatial
            if c in spatial_components:
                stream_input = shared_spatial_net

            # optionally one stream of fully connected layers per component
            spec = net_cfg['component_stream_default']
            if c in net_cfg['component_stream_specs']:
                spec = net_cfg['component_stream_specs'][c]

            # optionally feed one hot OR embedded versions of earlier stream
            # outputs to this stream
            dependencies = None
            if net_cfg['use_stream_outputs_as_inputs_to_other_streams']:
                if c in net_cfg['stream_dependencies']:
                    dependencies = [
                        component_one_hots_or_embeddings[d]
                        for d in net_cfg['stream_dependencies'][c]
                    ]

            component_stream = network_utils.get_layers(
                stream_input,
                spec,
                default_activation,
                self._training,
                dependencies,
                use_histograms=use_histograms)

            if (c not in spatial_components or net_cfg[
                    'end_spatial_streams_with_dense_instead_of_flatten']):
                # make a dense layer with width equal to number of possible
                # actions
                dense = tf.layers.Dense(num_options[c], name=c)
                component_streams[c] = dense(component_stream)
                # NOTE(review): this reads self._use_histograms while the
                # rest of the method uses the `use_histograms` argument --
                # confirm which one is intended.
                if self._use_histograms:
                    summary_prefix = 'final_dense_' + c + '_'
                    tf.summary.histogram(summary_prefix + 'weights',
                                         dense.kernel)
                    tf.summary.histogram(summary_prefix + 'bias', dense.bias)
            else:
                # flatten a conv output
                component_streams[c] = tf.reshape(component_stream,
                                                  [-1, num_options[c]],
                                                  name=c)

            if self._use_histograms:
                tf.summary.histogram('advantage_' + c, component_streams[c])

            if self._config['dueling_network']:
                # action_q_vals is A(s,a), value is V(s)
                # Q(s,a) = V(s) + A(s,a) - 1/|A| * SUM_a(A(s,a))
                with tf.variable_scope('q_vals'):
                    advantage = component_streams[c]
                    # The op used to be named with whatever `name` happened
                    # to hold (a stale loop variable, or nothing at all ->
                    # NameError); name it after the component instead.
                    action_q_vals[c] = tf.add(
                        value,
                        (advantage -
                         tf.reduce_mean(advantage, axis=1, keepdims=True)),
                        name=c)
            else:
                action_q_vals[c] = component_streams[c]

            # filter out actions ('function') that are illegal for this state
            if c == 'function':
                with tf.variable_scope('available_actions_mask'):
                    # available actions mask; avoids using negative infinity,
                    # and is the right size
                    action_neg_inf_q_vals = action_q_vals[
                        'function'] * 0 - 1000000
                    action_q_vals['function'] = tf.where(
                        inputs['available_actions'],
                        action_q_vals['function'], action_neg_inf_q_vals)

            if net_cfg['use_stream_outputs_as_inputs_to_other_streams']:
                with tf.variable_scope('stream_action_one_hot'):
                    # does any active stream consume this stream's output?
                    found_dependency = any(
                        self._action_components[stream] and c in deps
                        for stream, deps in
                        net_cfg['stream_dependencies'].items())
                    if found_dependency:
                        action_index = tf.math.argmax(action_q_vals[c],
                                                      axis=-1)
                        if c == 'screen':
                            # special handling for screen->screen2 only:
                            # one-hot reshaped back into a screen-sized image
                            action_one_hot = tf.one_hot(
                                action_index, num_options[c])
                            action_one_hot = tf.reshape(
                                action_one_hot, [
                                    -1, self._config['env']['screen_size'],
                                    self._config['env']['screen_size'], 1
                                ])
                            component_one_hots_or_embeddings[
                                c] = tf.stop_gradient(action_one_hot)
                        # `elif`, not `if`: otherwise the screen image above
                        # is always overwritten by one of the branches below.
                        elif num_options[c] <= 10:
                            action_one_hot = tf.one_hot(
                                action_index, num_options[c])
                            # argmax should be non-differentiable; the
                            # stop_gradient makes that explicit
                            component_one_hots_or_embeddings[
                                c] = tf.stop_gradient(action_one_hot)
                        else:
                            component_one_hots_or_embeddings[
                                c] = tf.keras.layers.Embedding(
                                    input_dim=num_options[c],
                                    output_dim=math.ceil(num_options[c]**(
                                        1 / 4.0)))(action_index)

    return action_q_vals, value, component_streams
def __call__(self, inputs, training):
    """Run the network on ``inputs`` and return the class logits.

    An initial 3x3 convolution is followed by three stages of ``self.N``
    layers each (bottleneck layers when ``self.bottleneck`` is set), with a
    transition after the first two stages, then batch norm, ReLU, global
    average pooling and a dense layer with ``self.num_classes`` units.

    Args:
      inputs: input image tensor.
      training: forwarded to the layer helpers and batch norm.

    Returns:
      The logits tensor (named ``final_dense``).
    """
    inputs = tf.identity(inputs, 'model_inputs')
    print('===================== model inputs', inputs)

    with self._model_variable_scope():
        # init conv
        init_channel_num = self.k * 2 if self.bottleneck else 16
        inputs = tf.layers.conv2d(
            inputs=inputs,
            filters=init_channel_num,
            kernel_size=3,
            strides=1,
            padding='SAME',
            use_bias=False,
            kernel_initializer=tf.variance_scaling_initializer(),
            name='init_conv')

        # Both variants share the same stage layout; only the layer builder
        # differs.
        build_layer = _add_bottleneck_layer if self.bottleneck else _add_layer
        last_stage = 3
        for stage in range(1, last_stage + 1):
            # NOTE(review): the transition is assumed to live inside the
            # stage scope -- confirm against existing checkpoints.
            with tf.variable_scope('stage%d' % stage):
                for block_num in range(self.N):
                    inputs = build_layer(self, inputs, block_num, training)
                if stage != last_stage:
                    inputs = _add_transition(self, inputs,
                                             'transition%d' % stage, training)

        inputs = batch_norm(inputs, training, name='bnlast')
        inputs = tf.nn.relu(inputs)

        # global avg pooling
        inputs = tf.reduce_mean(inputs, [1, 2],
                                keepdims=True,
                                name='final_reduce_mean')
        inputs = tf.squeeze(inputs, [1, 2])

        inputs = tf.layers.dense(inputs=inputs, units=self.num_classes)
        inputs = tf.identity(inputs, 'final_dense')
        print('===================== model outputs', inputs)
        return inputs
def conv(self,
         num_out_channels,
         k_height,
         k_width,
         d_height=1,
         d_width=1,
         mode='SAME',
         input_layer=None,
         num_channels_in=None,
         use_batch_norm=None,
         stddev=None,
         activation='relu',
         bias=0.0,
         kernel_initializer=None):
    """Construct a conv2d layer on top of cnn.

    Args:
      num_out_channels: number of output channels.
      k_height, k_width: kernel size.
      d_height, d_width: strides.
      mode: padding mode passed to ``_conv2d_impl``, or 'SAME_RESNET' for
        explicit symmetric padding followed by a 'VALID' convolution when
        the stride is not 1.
      input_layer: input tensor; defaults to ``self.top_layer``.
      num_channels_in: input channel count; defaults to ``self.top_size``.
      use_batch_norm: overrides ``self.use_batch_norm`` when not None.
      stddev: if given (and no ``kernel_initializer``), use a truncated
        normal initializer with this stddev.
      activation: 'relu', 'tanh', 'linear' or None.
      bias: constant bias initial value, or None for no bias.  Only used
        when batch norm is off.
      kernel_initializer: explicit initializer; defaults to variance scaling.

    Returns:
      The activated output tensor.  Also stored in ``self.top_layer``, with
      ``self.top_size`` set to ``num_out_channels``.

    Raises:
      KeyError: on an unknown ``activation``.
    """
    if input_layer is None:
        input_layer = self.top_layer
    if num_channels_in is None:
        num_channels_in = self.top_size
    if stddev is not None and kernel_initializer is None:
        kernel_initializer = tf.truncated_normal_initializer(stddev=stddev)
    if kernel_initializer is None:
        kernel_initializer = tf.variance_scaling_initializer()

    name = 'conv' + str(self.counts['conv'])
    self.counts['conv'] += 1
    with tf.variable_scope(name):
        # Resolve the effective input and padding once, then issue a single
        # convolution call.
        conv_input = input_layer
        padding_mode = mode
        if mode == 'SAME_RESNET':
            # Special padding mode for ResNet models
            if d_height == 1 and d_width == 1:
                padding_mode = 'SAME'
            else:
                # Pad explicitly so the result does not depend on the input
                # size, then convolve without implicit padding.
                pad_h_beg = (k_height - 1) // 2
                pad_h_end = k_height - 1 - pad_h_beg
                pad_w_beg = (k_width - 1) // 2
                pad_w_end = k_width - 1 - pad_w_beg
                padding = [[0, 0], [pad_h_beg, pad_h_end],
                           [pad_w_beg, pad_w_end], [0, 0]]
                if self.data_format == 'NCHW':
                    padding = [
                        padding[0], padding[3], padding[1], padding[2]
                    ]
                conv_input = tf.pad(input_layer, padding)
                padding_mode = 'VALID'

        conv = self._conv2d_impl(conv_input,
                                 num_channels_in,
                                 num_out_channels,
                                 kernel_size=[k_height, k_width],
                                 strides=[d_height, d_width],
                                 padding=padding_mode,
                                 kernel_initializer=kernel_initializer)

        if use_batch_norm is None:
            use_batch_norm = self.use_batch_norm
        # The unpadded input is what gets logged.
        mlperf.logger.log_conv2d(input_tensor=input_layer,
                                 output_tensor=conv,
                                 stride_height=d_height,
                                 stride_width=d_width,
                                 filters=num_out_channels,
                                 initializer=kernel_initializer,
                                 use_bias=not use_batch_norm
                                 and bias is not None)

        if not use_batch_norm:
            if bias is not None:
                biases = self.get_variable(
                    'biases', [num_out_channels],
                    self.variable_dtype,
                    self.dtype,
                    initializer=tf.constant_initializer(bias))
                biased = tf.reshape(
                    tf.nn.bias_add(conv,
                                   biases,
                                   data_format=self.data_format),
                    conv.get_shape())
            else:
                biased = conv
        else:
            # batch_norm() reads its input from self.top_layer / top_size
            self.top_layer = conv
            self.top_size = num_out_channels
            biased = self.batch_norm(**self.batch_norm_config)

        if activation == 'relu':
            mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU)
            conv1 = self.relu(biased)
        elif activation == 'linear' or activation is None:
            conv1 = biased
        elif activation == 'tanh':
            conv1 = tf.nn.tanh(biased)
        else:
            raise KeyError('Invalid activation type \'%s\'' % activation)

        self.top_layer = conv1
        self.top_size = num_out_channels
        return conv1
def q_network(state_tensor):
    """Map a state tensor to Q-value outputs with a small dense network.

    Two ReLU hidden layers of 30 units are followed by a linear output layer
    with ``n_outputs`` units (a module-level name).  Every kernel uses its
    own variance-scaling initializer.
    """
    hidden = state_tensor
    for _ in range(2):
        hidden = tf.layers.dense(
            inputs=hidden,
            units=30,
            activation=tf.nn.relu,
            kernel_initializer=tf.variance_scaling_initializer())

    return tf.layers.dense(
        inputs=hidden,
        units=n_outputs,
        kernel_initializer=tf.variance_scaling_initializer())
def build_network(self, inputs, is_training):
    """Builds the forward pass of the model.

    Args:
      inputs: the list of inputs, excluding labels
      is_training: if in the phase of training (not used by this graph).

    Returns:
      The logits of the model.
    """

    def inception_v1(net, k, l, m, n, p, q):
        # Four parallel columns: 1x1 | 1x1->3x3 | 1x1->5x5 | maxpool->1x1
        cols = [[('conv', k, 1, 1)],
                [('conv', l, 1, 1), ('conv', m, 3, 3)],
                [('conv', n, 1, 1), ('conv', p, 5, 5)],
                [('mpool', 3, 3, 1, 1, 'SAME'), ('conv', q, 1, 1)]]
        return inception_module(net, cols, self.channel_pos)

    def conv_relu(net, filters, kernel_size, strides):
        # 'same'-padded conv + ReLU with the stem's shared initializers
        return tf.layers.conv2d(
            inputs=net,
            filters=filters,
            kernel_size=kernel_size,
            strides=strides,
            padding='same',
            data_format=self.channel_pos,
            activation=tf.nn.relu,
            kernel_initializer=tf.variance_scaling_initializer(),
            bias_initializer=tf.constant_initializer(0.0))

    def max_pool(net):
        # 3x3, stride-2, 'same'-padded max pooling
        return tf.layers.max_pooling2d(inputs=net,
                                       pool_size=3,
                                       strides=2,
                                       padding='same',
                                       data_format=self.channel_pos)

    if self.data_format == 'NCHW':
        inputs = tf.transpose(inputs, [0, 3, 1, 2])

    # Stem
    net = conv_relu(inputs, 64, 7, 2)
    net = max_pool(net)
    net = conv_relu(net, 64, 1, 1)
    net = conv_relu(net, 192, 3, 1)
    net = max_pool(net)

    # Inception stages; a max pool separates consecutive groups.
    inception_groups = [
        [(64, 96, 128, 16, 32, 32),
         (128, 128, 192, 32, 96, 64)],
        [(192, 96, 208, 16, 48, 64),
         (160, 112, 224, 24, 64, 64),
         (128, 128, 256, 24, 64, 64),
         (112, 144, 288, 32, 64, 64),
         (256, 160, 320, 32, 128, 128)],
        [(256, 160, 320, 32, 128, 128),
         (384, 192, 384, 48, 128, 128)],
    ]
    for group_index, group in enumerate(inception_groups):
        if group_index > 0:
            net = max_pool(net)
        for widths in group:
            net = inception_v1(net, *widths)

    net = tf.layers.average_pooling2d(inputs=net,
                                      pool_size=7,
                                      strides=1,
                                      padding='valid',
                                      data_format=self.channel_pos)
    net = tf.reshape(net, [-1, 1024])

    stddev = np.sqrt(1.0 / self.num_classes)
    logits = tf.contrib.layers.fully_connected(
        inputs=net,
        num_outputs=self.num_classes,
        activation_fn=None,
        # Must be passed by keyword: the first positional parameter of
        # truncated_normal_initializer is `mean`, not `stddev`.
        weights_initializer=tf.truncated_normal_initializer(stddev=stddev),
        biases_initializer=tf.constant_initializer(0.0))
    return logits
def __init__(self, name, input_shape, output_dim, h_size=512, logdir=None):
    """A3C Network tensors and operations are defined here

    Args:
        name (str): The name of scope
        input_shape (list): The shape of input image [H, W, C]
        output_dim (int): Number of actions
        h_size (int, optional): Number of filters of the last convolution;
            its output is split in two halves (policy / value).
        logdir (str, optional): directory to save summaries

    Notes:
        You should be familiar with Policy Gradients.
        The only difference between vanilla PG and A3C is that
        there is an operation to apply gradients manually
    """
    self.h_size = h_size

    with tf.variable_scope(name):
        # The network receives a game frame and processes it through four
        # convolutional layers.
        self.stateInput = tf.placeholder(tf.float32,
                                         shape=[None, *input_shape],
                                         name='state')
        net = self.stateInput

        init = tf.variance_scaling_initializer(scale=2)  # He initialization
        # (filters, kernel_size, strides) of each 'valid'-padded ReLU conv
        conv_specs = [(32, 8, 4), (64, 4, 2), (64, 3, 1),
                      (self.h_size, 7, 1)]
        for filters, kernel_size, strides in conv_specs:
            net = tf.layers.conv2d(net,
                                   filters=filters,
                                   kernel_size=kernel_size,
                                   strides=strides,
                                   padding='valid',
                                   kernel_initializer=init,
                                   activation=tf.nn.relu)

        # Split the channels of the last conv output into a policy (actor)
        # stream and a value (critic) stream.
        self.streamAC, self.streamVC = tf.split(net, 2, 3)

        self.streamAC = tf.layers.flatten(self.streamAC)
        # Probabilities are clipped away from 0 so tf.log below stays finite.
        self.Policy = tf.clip_by_value(
            tf.layers.dense(self.streamAC,
                            output_dim,
                            use_bias=True,
                            activation=tf.nn.softmax,
                            kernel_initializer=init), 1e-10, 1.)
        self.predict = tf.argmax(self.Policy, 1)

        self.streamVC = tf.layers.flatten(self.streamVC)
        self.Value = tf.layers.dense(self.streamVC,
                                     1,
                                     use_bias=False,
                                     activation=None,
                                     kernel_initializer=init)
        self.Value = tf.squeeze(self.Value)

        # Training inputs: taken actions, their advantages and the rewards
        # the value head is regressed towards.
        self.action = tf.placeholder(shape=[None],
                                     dtype=tf.int32,
                                     name='action_input')
        self.actions_onehot = tf.one_hot(self.action,
                                         output_dim,
                                         dtype=tf.float32,
                                         name='action_onehot')
        self.advantage = tf.placeholder(tf.float32,
                                        shape=[None],
                                        name="advantage_input")
        self.reward = tf.placeholder(tf.float32,
                                     shape=[None],
                                     name="reward_input")

        # Probability of the action actually taken, per sample.
        policy_gain = tf.boolean_mask(self.Policy, self.actions_onehot)
        policy_gain = tf.log(policy_gain) * self.advantage
        policy_gain = tf.reduce_mean(policy_gain, name="policy_gain")

        entropy = -tf.reduce_sum(self.Policy * tf.log(self.Policy), 1)
        entropy = tf.reduce_mean(entropy)

        value_loss = tf.losses.mean_squared_error(self.Value,
                                                  self.reward,
                                                  scope="value_loss")

        # Be careful with the negative signs: we can only minimize, but we
        # want to maximize policy gain and entropy (for exploration).
        self.total_loss = -policy_gain + 0.1 * value_loss - entropy * 0.02
        self.optimizer = tf.train.AdamOptimizer(learning_rate=0.00025)

        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=name)
        self.gradients = self.optimizer.compute_gradients(
            self.total_loss, var_list)

        # One placeholder per variable; what is stored (and applied) is the
        # norm-clipped tensor derived from it.
        self.gradients_placeholders = []
        for grad, var in self.gradients:
            placeholder = tf.placeholder(var.dtype, shape=var.get_shape())
            placeholder = tf.clip_by_norm(placeholder, 40)
            self.gradients_placeholders.append((placeholder, var))
        self.apply_gradients = self.optimizer.apply_gradients(
            self.gradients_placeholders)

        if logdir:
            loss_summary = tf.summary.scalar("total_loss", self.total_loss)
            # self.Value is the value head; no `self.values` attribute is
            # ever defined.
            value_summary = tf.summary.histogram("values", self.Value)

            self.summary_op = tf.summary.merge(
                [loss_summary, value_summary])
            self.summary_writer = tf.summary.FileWriter(logdir)
def __init__(
        self,
        observation_spec,
        conv_layer_params=None,
        input_fc_layer_params=(75, 40),
        lstm_size=(40, ),
        output_fc_layer_params=(75, 40),
        activation_fn=tf.keras.activations.relu,
        name='LSTMEncodingNetwork',
):
    """Creates an instance of `LSTMEncodingNetwork`.

    The network is an input encoder (optional conv layers plus dense
    layers), an LSTM cell (stacked when several sizes are given) and a list
    of dense output layers.

    Args:
      observation_spec: A nest of `tensor_spec.TensorSpec` representing the
        observations.
      conv_layer_params: Optional list of convolution layer parameters,
        where each item is a length-three tuple indicating (filters,
        kernel_size, stride).
      input_fc_layer_params: Optional list with the number of units of each
        fully connected layer that feeds the recurrent layer.
      lstm_size: An iterable of ints specifying the LSTM cell sizes to use.
      output_fc_layer_params: Optional list with the number of units of each
        fully connected layer applied on top of the recurrent layer.
      activation_fn: Activation function, e.g. tf.keras.activations.relu.
      name: A string representing name of the network.
    """
    # Shared by the input encoder and the output dense layers.
    kernel_initializer = tf.variance_scaling_initializer(
        scale=2.0, mode='fan_in', distribution='truncated_normal')

    input_encoder = encoding_network.EncodingNetwork(
        observation_spec,
        conv_layer_params=conv_layer_params,
        fc_layer_params=input_fc_layer_params,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer)

    # Create RNN cell: a plain LSTM cell for one size, a stack otherwise.
    if len(lstm_size) != 1:
        cell = tf.keras.layers.StackedRNNCells(
            [tf.keras.layers.LSTMCell(size) for size in lstm_size])
    else:
        cell = tf.keras.layers.LSTMCell(lstm_size[0])

    dense_name = '/'.join([name, 'dense'])
    output_encoder = []
    for num_units in output_fc_layer_params:
        output_encoder.append(
            tf.keras.layers.Dense(num_units,
                                  activation=activation_fn,
                                  kernel_initializer=kernel_initializer,
                                  name=dense_name))

    # One float32 spec per entry of the cell's state structure.
    make_state_spec = functools.partial(tensor_spec.TensorSpec,
                                        dtype=tf.float32,
                                        name='network_state_spec')
    state_spec = nest.map_structure(make_state_spec, cell.state_size)

    super(LSTMEncodingNetwork,
          self).__init__(observation_spec=observation_spec,
                         action_spec=None,
                         state_spec=state_spec,
                         name=name)

    self._conv_layer_params = conv_layer_params
    self._input_encoder = input_encoder
    self._dynamic_unroll = dynamic_unroll_layer.DynamicUnroll(cell)
    self._output_encoder = output_encoder
def inference(self, inputs, trainable=True):
    """Build the Q-network and return the ``q_values`` tensor.

    The input is cast to float32 and divided by 255, then passed through
    three leaky-ReLU convolutions, a 512-unit dense layer and a linear
    output layer with ``self.config.action_size`` units.  The four slices of
    the last input axis are also written as image summaries.

    Args:
      inputs: frame tensor whose last axis has size 4 (it is split 1/1/1/1).
      trainable: whether the created variables are trainable.
    """
    with tf.name_scope('norm'):
        inputs = tf.div(tf.cast(inputs, tf.float32), 255.0)

    # NOTE(review): the summaries are assumed to sit outside the 'norm'
    # scope -- confirm against existing summary tags.
    for i, image in enumerate(tf.split(inputs, [1, 1, 1, 1], axis=3)):
        tf.summary.image('input_images_%d' % i, image)

    def act(inputs):
        return tf.nn.leaky_relu(inputs, alpha=0.01)

    initializer = tf.variance_scaling_initializer()

    # (num_outputs, stride, kernel_size) for conv1..conv3
    conv_specs = ((32, 4, 8), (64, 2, 4), (64, 1, 3))
    conv = inputs
    for layer_no, (num_outputs, stride, kernel_size) in enumerate(conv_specs, 1):
        with tf.name_scope('conv%d' % layer_no):
            conv = tf.contrib.layers.conv2d(conv,
                                            num_outputs,
                                            stride=stride,
                                            kernel_size=kernel_size,
                                            activation_fn=act,
                                            trainable=trainable,
                                            padding='SAME',
                                            weights_initializer=initializer)

    with tf.name_scope('fully_connected'):
        flatten = tf.contrib.layers.flatten(conv)
        fc = tf.contrib.layers.fully_connected(
            flatten,
            512,
            trainable=trainable,
            activation_fn=act,
            weights_initializer=initializer)

    with tf.name_scope('output'):
        w = tf.get_variable('ow',
                            shape=[512, self.config.action_size],
                            trainable=trainable,
                            initializer=initializer)
        b = tf.get_variable('ob',
                            shape=[self.config.action_size],
                            trainable=trainable,
                            initializer=tf.zeros_initializer())
        outputs = tf.add(tf.matmul(fc, w), b, name='q_values')

    return outputs
def init_hidden_uniform():
    """Return a fan-in, uniform variance-scaling initializer with scale 1/3."""
    hidden_init = tf.variance_scaling_initializer(
        scale=1.0 / 3.0,
        mode="fan_in",
        distribution="uniform",
    )
    return hidden_init
def fpn_model(features):
    """
    Build the feature pyramid on top of the backbone features.

    Args:
        features ([tf.Tensor]): ResNet features c2-c5

    Returns:
        [tf.Tensor]: FPN features p2-p6
    """
    assert len(features) == 4, features
    num_channel = cfg.FPN.NUM_CHANNEL
    use_gn = cfg.FPN.NORM == "GN"

    def upsample2x(name, x):
        # Fixed unpooling is used because tf.image.resize is, again, not
        # aligned.  The resize-based alternative was:
        #   with tf.name_scope(name):
        #       shape2d = tf.shape(x)[2:]
        #       x = tf.transpose(x, [0, 2, 3, 1])
        #       x = tf.image.resize_nearest_neighbor(x, shape2d * 2, align_corners=True)
        #       x = tf.transpose(x, [0, 3, 1, 2])
        #       return x
        return FixedUnPooling(
            name, x, 2, unpool_mat=np.ones((2, 2), dtype="float32"), data_format="channels_first"
        )

    with argscope(
        Conv2D,
        data_format="channels_first",
        activation=tf.identity,
        use_bias=True,
        kernel_initializer=tf.variance_scaling_initializer(scale=1.0),
    ):
        # 1x1 lateral convs on c2..c5
        laterals = []
        for level, feature in enumerate(features, start=2):
            laterals.append(Conv2D("lateral_1x1_c{}".format(level), feature, num_channel, 1))
        if use_gn:
            laterals = [
                GroupNorm("gn_c{}".format(level), lateral)
                for level, lateral in enumerate(laterals, start=2)
            ]

        # Top-down pathway, coarsest level first: every level below the top
        # adds the upsampled sum of the level above it.
        top_down = []
        for idx, lateral in enumerate(reversed(laterals)):
            if top_down:
                lateral = lateral + upsample2x("upsample_lat{}".format(6 - idx), top_down[-1])
            top_down.append(lateral)

        # 3x3 output convs, back in fine-to-coarse order (p2..p5)
        pyramid = [
            Conv2D("posthoc_3x3_p{}".format(level), merged, num_channel, 3)
            for level, merged in enumerate(reversed(top_down), start=2)
        ]
        if use_gn:
            pyramid = [
                GroupNorm("gn_p{}".format(level), out)
                for level, out in enumerate(pyramid, start=2)
            ]

        # p6 is a stride-2 subsampling of p5
        p6 = MaxPooling(
            "maxpool_p6",
            pyramid[-1],
            pool_size=1,
            strides=2,
            data_format="channels_first",
            padding="VALID",
        )
        return pyramid + [p6]
def __call__(self, inputs, inputs_unpadded_length, former_encoder_input,
             multi_attention_bias, targets=None):
    """Calculate target logits or inferred target sequences.

    Args:
      inputs: int tensor with shape [batch_size, input_length, hidden_size].
      inputs_unpadded_length: int tensor with shape [batch_size, ]. Indicate
        the actual length of each input.
      former_encoder_input: forwarded unchanged to `self.encode`.
        NOTE(review): shape and meaning are not visible in this method --
        confirm against `encode`.
      multi_attention_bias: forwarded unchanged to `self.encode`.
        NOTE(review): shape and meaning are not visible in this method --
        confirm against `encode`.
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, the result of `self.decode`: logits for each
      word in the target sequence, a float tensor with shape
      [batch_size, target_length, vocab_size].
      If targets is None, the result of `self.predict`: the output sequence
      generated one token at a time, returned as a dictionary {
        output: [batch_size, decoded length]
        score: [batch_size, float]}
    """
    # Variance scaling is used here because it seems to work in many
    # problems. Other reasonable initializers may also work just as well.
    scope_initializer = tf.variance_scaling_initializer(
        self.params["initializer_gain"],
        mode="fan_avg",
        distribution="uniform")

    with tf.variable_scope(self.scope, initializer=scope_initializer):
        # Padding bias shared by the encoder self-attention and the decoder
        # multi-headed attention layers.
        padding_bias = model_utils.get_padding_bias(
            inputs, inputs_unpadded_length)

        # Map the symbol representations to continuous representations.
        encoded = self.encode(
            inputs,
            padding_bias,
            inputs_unpadded_length,
            former_encoder_input,
            multi_attention_bias)

        # With a known target sequence return its logits; otherwise
        # generate the output sequence.
        if targets is not None:
            return self.decode(targets, encoded, padding_bias)
        return self.predict(encoded, padding_bias)
# Network dimensions: n_stocks input features, then progressively narrower
# hidden layers. n_neurons_4 and n_target are defined here but are not used
# by the statements in this section.
n_stocks = 500
n_neurons_1, n_neurons_2, n_neurons_3, n_neurons_4 = 1024, 512, 256, 128
n_target = 1

# Graph inputs: X has n_stocks columns, Y is one value per row; the leading
# dimension is left open.
X = tf.placeholder(shape=[None, n_stocks], dtype=tf.float32)
Y = tf.placeholder(shape=[None], dtype=tf.float32)

# Weights use uniform variance scaling averaged over fan-in and fan-out;
# biases start at zero.
sigma = 1
weight_initializer = tf.variance_scaling_initializer(
    scale=sigma,
    mode="fan_avg",
    distribution="uniform",
)
bias_initializer = tf.zeros_initializer()

# Hidden layer 1: n_stocks -> n_neurons_1
W_hidden_1 = tf.Variable(weight_initializer([n_stocks, n_neurons_1]))
bias_hidden_1 = tf.Variable(bias_initializer([n_neurons_1]))

# Hidden layer 2: n_neurons_1 -> n_neurons_2
W_hidden_2 = tf.Variable(weight_initializer([n_neurons_1, n_neurons_2]))
bias_hidden_2 = tf.Variable(bias_initializer([n_neurons_2]))

# Hidden layer 3: n_neurons_2 -> n_neurons_3
W_hidden_3 = tf.Variable(weight_initializer([n_neurons_2, n_neurons_3]))
bias_hidden_3 = tf.Variable(bias_initializer([n_neurons_3]))
def fpn_model(features):
    """
    Build the FPN lateral and top-down pathway over four backbone features.

    Args:
        features ([tf.Tensor]): ResNet features c2-c5

    Returns:
        [tf.Tensor]: FPN features p2-p6
    """
    assert len(features) == 4, features
    num_channel = cfg.FPN.NUM_CHANNEL
    use_gn = cfg.FPN.NORM == 'GN'

    def upsample2x(name, x):
        # Guard only the attribute lookup. Wrapping the whole resize path in
        # the try would also swallow an AttributeError raised while building
        # the ops below and silently switch to the unpooling fallback.
        try:
            resize = tf.compat.v2.image.resize_images
        except AttributeError:
            # This TensorFlow build does not expose the v2 resize function.
            return FixedUnPooling(
                name, x, 2,
                unpool_mat=np.ones((2, 2), dtype='float32'),
                data_format='channels_first')

        with tf.name_scope(name):
            # x is channels_first, so dims 2 and 3 are the spatial size.
            shp2d = tf.shape(x)[2:]
            # The resize is applied on a channels-last layout; transpose in
            # and back out around it.
            x = tf.transpose(x, [0, 2, 3, 1])
            x = resize(x, shp2d * 2, 'nearest')
            x = tf.transpose(x, [0, 3, 1, 2])
            return x

    with argscope(
            Conv2D,
            data_format='channels_first',
            activation=tf.identity,
            use_bias=True,
            kernel_initializer=tf.variance_scaling_initializer(scale=1.)):
        # 1x1 lateral convolutions, one per backbone level (c2..c5).
        lat_2345 = [
            Conv2D('lateral_1x1_c{}'.format(i + 2), c, num_channel, 1)
            for i, c in enumerate(features)
        ]
        if use_gn:
            lat_2345 = [
                GroupNorm('gn_c{}'.format(i + 2), c)
                for i, c in enumerate(lat_2345)
            ]

        # Top-down pass from c5 to c2: every level after the first adds the
        # 2x-upsampled running sum from the level above it.
        lat_sum_5432 = []
        for idx, lat in enumerate(lat_2345[::-1]):
            if idx == 0:
                lat_sum_5432.append(lat)
            else:
                lat = lat + upsample2x('upsample_lat{}'.format(6 - idx),
                                       lat_sum_5432[-1])
                lat_sum_5432.append(lat)

        # 3x3 convolutions on the merged maps, back in p2..p5 order.
        p2345 = [
            Conv2D('posthoc_3x3_p{}'.format(i + 2), c, num_channel, 3)
            for i, c in enumerate(lat_sum_5432[::-1])
        ]
        if use_gn:
            p2345 = [
                GroupNorm('gn_p{}'.format(i + 2), c)
                for i, c in enumerate(p2345)
            ]

        # p6 is a stride-2 subsample of p5 (pool size 1).
        p6 = MaxPooling('maxpool_p6', p2345[-1], pool_size=1, strides=2,
                        data_format='channels_first', padding='VALID')
        return p2345 + [p6]