def define_self_prediction_rew(self, convfeat, rep_size, enlargement, scope):
    # RND.
    # Random target network.
    for ph in self.ph_ob.values():
        if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
            logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
            xr = ph[:, 1:]
            xr = tf.cast(xr, tf.float32)
            xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
            xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0)
            xr = tf.nn.leaky_relu(conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4,
                                       init_scale=np.sqrt(2)))
            xr = tf.nn.leaky_relu(conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2,
                                       init_scale=np.sqrt(2)))
            xr = tf.nn.leaky_relu(conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1,
                                       init_scale=np.sqrt(2)))
            rgbr = [to2d(xr)]
            X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

    # Define expert-agent (demonstration) observation random features.
    yes_gpu = any(get_available_gpus())
    with tf.variable_scope(tf.get_variable_scope(), reuse=True), \
            tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
        X_im = np.load(os.getcwd() + '/policies/obs.npy')
        Xr_im = tf.cast(X_im, tf.float32) / 255.
        Xr_im = tf.reshape(Xr_im, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
        Xr_im = tf.clip_by_value(
            (Xr_im - tf.reduce_mean(Xr_im)) / (tf.math.reduce_std(Xr_im)**0.5), -5.0, 5.0)
        Xr_im = tf.nn.leaky_relu(conv(Xr_im, 'c1r', nf=convfeat * 1, rf=8, stride=4,
                                      init_scale=np.sqrt(2)))
        Xr_im = tf.nn.leaky_relu(conv(Xr_im, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2,
                                      init_scale=np.sqrt(2)))
        Xr_im = tf.nn.leaky_relu(conv(Xr_im, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1,
                                      init_scale=np.sqrt(2)))
        Xr_im = [to2d(Xr_im)[::self.demonstration_stride]]
        Xr_im = fc(Xr_im[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))
        Xr_im = tf.stop_gradient(Xr_im)

    # Predictor network.
    for ph in self.ph_ob.values():
        if len(ph.shape.as_list()) == 5:  # B,T,H,W,C (batch, time, height, width, channels)
            logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
            xrp = ph[:, 1:]
            xrp = tf.cast(xrp, tf.float32)
            xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
            xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0, 5.0)
            xrp = tf.nn.leaky_relu(conv(xrp, 'c1rp_pred', nf=convfeat, rf=8, stride=4,
                                        init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(conv(xrp, 'c2rp_pred', nf=convfeat * 2, rf=4, stride=2,
                                        init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(conv(xrp, 'c3rp_pred', nf=convfeat * 2, rf=3, stride=1,
                                        init_scale=np.sqrt(2)))
            rgbrp = to2d(xrp)
            X_r_hat = tf.nn.relu(fc(rgbrp, 'fc1r_hat1_pred', nh=256 * enlargement,
                                    init_scale=np.sqrt(2)))
            X_r_hat = tf.nn.relu(fc(X_r_hat, 'fc1r_hat2_pred', nh=256 * enlargement,
                                    init_scale=np.sqrt(2)))
            X_r_hat = fc(X_r_hat, 'fc1r_hat3_pred', nh=rep_size, init_scale=np.sqrt(2))

    self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
    self.max_feat = tf.reduce_max(tf.abs(X_r))
    self.int_rew = tf.reduce_mean(tf.square(tf.stop_gradient(X_r) - X_r_hat),
                                  axis=-1, keep_dims=True)
    self.int_rew = tf.reshape(self.int_rew, (self.sy_nenvs, self.sy_nsteps - 1))

    # Demonstration-similarity bonus: dot-product similarity between rollout
    # features and demonstration features scales the RND prediction error.
    # self.im_rew = tf.math.maximum(1 - tf.divide(tf.reduce_mean(tf.square(self.Xr_im[:(X_r).shape[0]] - X_r), axis=-1, keep_dims=True), tf.add(tf.reduce_mean(tf.square(self.Xr_im[:X_r.shape[0]]), axis=-1, keep_dims=True), tf.reduce_mean(tf.square(X_r), axis=-1, keep_dims=True))), tf.constant(0.5))
    im_rew = tf.reduce_mean(tf.tensordot(tf.stop_gradient(X_r), Xr_im, axes=[[1], [1]]),
                            axis=1)
    im_rew = tf.reshape(im_rew, (self.sy_nenvs, self.sy_nsteps - 1))
    # self.int_rew = tf.math.maximum(self.im_rew, self.int_rew)
    self.int_rew = self.int_rew * (1 + tf.math.tanh(im_rew / 100))

    noisy_targets = tf.stop_gradient(X_r)
    self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
    mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0., maxval=1.,
                             dtype=tf.float32)
    mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32)
    self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(tf.reduce_sum(mask), 1.)
def build(self, input_shape): if self.data_format == 'channels_last': channel_axis = -1 input_row, input_col = input_shape[1:-1] else: channel_axis = 1 input_row, input_col = input_shape[2:] if input_shape[channel_axis] is None: raise ValueError('The channel dimension of the inputs ' 'should be defined. Found `None`.') input_filter = int(input_shape[channel_axis]) if self.data_format == 'channels_last': input_row, input_col = input_shape[1:-1] input_filter = input_shape[3] else: input_row, input_col = input_shape[2:] input_filter = input_shape[1] if (((input_row is None) and ((self.share_row_combining_weights, self.share_col_combining_weights) in [(True, False), (False, False)])) or ((input_col is None) and ((self.share_row_combining_weights, self.share_col_combining_weights) in [(False, True), (False, False)]))): raise ValueError('The spatial dimensions of the inputs to ' ' a LowRankLocallyConnected2D layer ' 'should be fully-defined, but layer received ' 'the inputs shape ' + str(input_shape)) # Compute output shapes. # Compute using the first filter since output will be same across filters. kernel_size = self.kernel_size[0] if isinstance( self.kernel_size, list) else self.kernel_size dilations = self.dilations[0] if isinstance(self.dilations, list) else self.dilations output_row = conv_utils.conv_output_length(input_row, kernel_size[0], self.padding, self.strides[0], dilation=dilations) output_col = conv_utils.conv_output_length(input_col, kernel_size[1], self.padding, self.strides[1], dilation=dilations) if isinstance(self.kernel_size, list): # Different filters. self.kernel_bases = [] for i, kernel_size in enumerate(self.kernel_size): kernel_bases_shape = (kernel_size[0], kernel_size[1], input_filter, self.filters) self.kernel_bases.append( self.add_weight(shape=kernel_bases_shape, initializer=self.kernel_initializer, name='kernel_bases%d' % i, regularizer=self.kernel_regularizer, constraint=self.kernel_constraint)) else: self.kernel_bases_shape = (self.kernel_size[0], self.kernel_size[1], input_filter, self.spatial_rank * self.filters) self.kernel_shape = (output_row, output_col, self.kernel_size[0], self.kernel_size[1], input_filter, self.filters) self.kernel_bases = self.add_weight( shape=self.kernel_bases_shape, initializer=self.kernel_initializer, name='kernel_bases', regularizer=self.kernel_regularizer, constraint=self.kernel_constraint) self.output_row = output_row self.output_col = output_col if not (self.share_row_combining_weights or self.share_col_combining_weights): if self.input_dependent: self.combining_weights = None else: self.combining_weights_shape = (output_row, output_col, self.spatial_rank) initializer = ( tf.constant_initializer(1. / np.sqrt(self.spatial_rank)) if self.combining_weights_initializer == 'conv_init' else self.combining_weights_initializer) self.wts = self.add_weight( shape=self.combining_weights_shape, initializer=initializer, name='combining_weights', regularizer=self.combining_weights_regularizer, constraint=self.combining_weights_constraint) # If self.wts is overwritten it is removed from layer.weights. # Thus, below assignment is necessary. self.combining_weights = self.wts else: c = 1. / (float(self.share_row_combining_weights) + float( self.share_col_combining_weights)) # Scale for init. 
initializer = (tf.constant_initializer(c / np.sqrt(self.spatial_rank)) if self.combining_weights_initializer == 'conv_init' else self.combining_weights_initializer) combining_weights_shape_row = (output_row, self.spatial_rank) combining_weights_shape_col = (output_col, self.spatial_rank) self.wts_row = tf.constant([[0.]]) self.wts_col = tf.constant([[0.]]) if self.share_row_combining_weights: self.wts_row = self.add_weight( shape=combining_weights_shape_row, initializer=initializer, name='combining_weights_row', regularizer=self.combining_weights_regularizer, constraint=self.combining_weights_constraint) if self.share_col_combining_weights: self.wts_col = self.add_weight( shape=combining_weights_shape_col, initializer=tf.constant_initializer( c / np.sqrt(self.spatial_rank)) if self.combining_weights_initializer == 'conv_init' else self.combining_weights_initializer, name='combining_weights_col', regularizer=self.combining_weights_regularizer, constraint=self.combining_weights_constraint) if self.share_row_combining_weights and self.share_col_combining_weights: self.combining_weights = tf.math.add(self.wts_col[tf.newaxis], self.wts_row[:, tf.newaxis], name='combining_weights') self.combining_weights_shape = (output_row, output_col, self.spatial_rank) elif self.share_row_combining_weights: self.combining_weights = tf.identity(self.wts_row, name='combining_weights') self.combining_weights_shape = combining_weights_shape_row elif self.share_col_combining_weights: self.combining_weights = tf.identity(self.wts_col, name='combining_weights') self.combining_weights_shape = combining_weights_shape_col if not self.input_dependent: if self.normalize_weights == 'softmax': # Normalize the weights to sum to 1. self.combining_weights = tf.nn.softmax( self.combining_weights, axis=-1, name='normalized_combining_weights') elif self.normalize_weights == 'norm': # Normalize the weights to sum to preserve kernel var. self.combining_weights = tf.math.l2_normalize( self.combining_weights, axis=-1, epsilon=1e-12, name='normalized_combining_weights') if (self.input_dependent or isinstance(self.kernel_size, list) or ((self.share_row_combining_weights, self.share_col_combining_weights) in [(True, False), (False, True)])): # Different kernel bases can not be combined. # Shape may not be defined for one of axes in one dimension separate wts. self.kernel = None else: self.kernel = tf.tensordot( self.combining_weights, tf.reshape(self.kernel_bases, (self.kernel_size[0], self.kernel_size[1], input_filter, self.spatial_rank, self.filters)), [[-1], [-2]], name='kernel') self.bias_spatial = 0. self.bias_channels = 0. if self.use_spatial_bias: if not (self.share_row_combining_weights or self.share_col_combining_weights): self.bias_spatial = self.add_weight( shape=(output_row, output_col, 1), initializer=self.bias_initializer, name='spatial_bias', regularizer=self.bias_regularizer, constraint=self.bias_constraint) else: self.bias_row = 0. self.bias_col = 0. 
if self.share_row_combining_weights: self.bias_row = self.add_weight( shape=(output_row, 1, 1), initializer=self.bias_initializer, name='bias_row', regularizer=self.bias_regularizer, constraint=self.bias_constraint) if self.share_col_combining_weights: self.bias_col = self.add_weight( shape=(1, output_col, 1), initializer=self.bias_initializer, name='bias_col', regularizer=self.bias_regularizer, constraint=self.bias_constraint) self.bias_spatial = tf.math.add(self.bias_row, self.bias_col, name='spatial_bias') if self.use_bias: self.bias_channels = self.add_weight( shape=(1, 1, self.filters), initializer=self.bias_initializer, name='bias_channels', regularizer=self.bias_regularizer, constraint=self.bias_constraint) self.bias = tf.math.add(self.bias_spatial, self.bias_channels, name='bias') if self.data_format == 'channels_last': self.input_spec = InputSpec(ndim=4, axes={-1: input_filter}) else: self.input_spec = InputSpec(ndim=4, axes={1: input_filter}) self.built = True
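
# A small standalone sketch (hypothetical sizes) of the low-rank locally
# connected kernel construction in build() above: per-location combining
# weights [out_rows, out_cols, spatial_rank] are contracted with kernel bases
# [kh, kw, in_ch, spatial_rank, filters], giving one kernel per output
# location. Explicit positive axes are used here; they match the
# [[-1], [-2]] contraction in the layer.
import numpy as np

out_rows, out_cols, spatial_rank = 5, 5, 2
kh, kw, in_ch, filters = 3, 3, 3, 16
combining_weights = np.random.randn(out_rows, out_cols, spatial_rank)
kernel_bases = np.random.randn(kh, kw, in_ch, spatial_rank, filters)

kernel = np.tensordot(combining_weights, kernel_bases, axes=[[2], [3]])
print(kernel.shape)  # (5, 5, 3, 3, 3, 16): [out_rows, out_cols, kh, kw, in_ch, filters]
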
def _scale_expression(expr, w):
    """Scale a linear expression by w."""
    b = tf.matmul(expr.b, w)
    w = tf.tensordot(expr.w, w, axes=1)
    return LinearExpression(w=w, b=b, lower=expr.lower, upper=expr.upper)
def compute_mel_filterbank_features(waveforms, sample_rate=16000, dither=1.0 / np.iinfo(np.int16).max, preemphasis=0.97, frame_length=25, frame_step=10, fft_length=None, window_fn=functools.partial( tf.signal.hann_window, periodic=True), lower_edge_hertz=80.0, upper_edge_hertz=7600.0, num_mel_bins=80, log_noise_floor=1e-3, apply_mask=True): """Implement mel-filterbank extraction using tf ops. Args: waveforms: float32 tensor with shape [batch_size, max_len] sample_rate: sampling rate of the waveform dither: stddev of Gaussian noise added to waveform to prevent quantization artefacts preemphasis: waveform high-pass filtering constant frame_length: frame length in ms frame_step: frame_Step in ms fft_length: number of fft bins window_fn: windowing function lower_edge_hertz: lowest frequency of the filterbank upper_edge_hertz: highest frequency of the filterbank num_mel_bins: filterbank size log_noise_floor: clip small values to prevent numeric overflow in log apply_mask: When working on a batch of samples, set padding frames to zero Returns: filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1] """ # `stfts` is a complex64 Tensor representing the short-time Fourier # Transform of each signal in `signals`. Its shape is # [batch_size, ?, fft_unique_bins] # where fft_unique_bins = fft_length // 2 + 1 # Find the wave length: the largest index for which the value is !=0 # note that waveforms samples that are exactly 0.0 are quite common, so # simply doing sum(waveforms != 0, axis=-1) will not work correctly. wav_lens = tf.reduce_max( tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0) * tf.to_int32(tf.not_equal(waveforms, 0.0)), axis=-1) + 1 if dither > 0: waveforms += tf.random_normal(tf.shape(waveforms), stddev=dither) if preemphasis > 0: waveforms = waveforms[:, 1:] - preemphasis * waveforms[:, :-1] wav_lens -= 1 frame_length = int(frame_length * sample_rate / 1e3) frame_step = int(frame_step * sample_rate / 1e3) if fft_length is None: fft_length = int(2**(np.ceil(np.log2(frame_length)))) stfts = tf.contrib.signal.stft(waveforms, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length, window_fn=window_fn, pad_end=True) stft_lens = (wav_lens + (frame_step - 1)) // frame_step masks = tf.to_float( tf.less_equal(tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0), tf.expand_dims(stft_lens, 1))) # An energy spectrogram is the magnitude of the complex-valued STFT. # A float32 Tensor of shape [batch_size, ?, 257]. magnitude_spectrograms = tf.abs(stfts) # Warp the linear-scale, magnitude spectrograms into the mel-scale. num_spectrogram_bins = magnitude_spectrograms.shape[-1].value linear_to_mel_weight_matrix = ( tf.contrib.signal.linear_to_mel_weight_matrix(num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz, upper_edge_hertz)) mel_spectrograms = tf.tensordot(magnitude_spectrograms, linear_to_mel_weight_matrix, 1) # Note: Shape inference for tensordot does not currently handle this case. mel_spectrograms.set_shape(magnitude_spectrograms.shape[:-1].concatenate( linear_to_mel_weight_matrix.shape[-1:])) log_mel_sgram = tf.log(tf.maximum(log_noise_floor, mel_spectrograms)) if apply_mask: log_mel_sgram *= tf.expand_dims(tf.to_float(masks), -1) return tf.expand_dims(log_mel_sgram, -1, name="mel_sgrams")
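
# A minimal usage sketch for compute_mel_filterbank_features above (TF1 graph
# mode; the placeholder name and sizes are hypothetical): a batch of two
# zero-padded 1-second waveforms at 16 kHz goes in, and a
# [batch, frames, num_mel_bins, 1] log-mel tensor comes out. frame_length and
# frame_step are in milliseconds, per the docstring.
waveforms_ph = tf.placeholder(tf.float32, [2, 16000], name="waveforms")
log_mel = compute_mel_filterbank_features(waveforms_ph,
                                          sample_rate=16000,
                                          frame_length=25,
                                          frame_step=10,
                                          num_mel_bins=80)
# log_mel has shape [2, num_frames, 80, 1]; evaluate it in a tf.Session by
# feeding real audio into waveforms_ph.
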
def lagrangian_optimizer_fmeasure( train_set, epsilon, learning_rate, learning_rate_constraint, loops): """Implements surrogate-based Lagrangian optimizer (Algorithm 3). Specifically solves: max F-measure s.t. F-measure(group1) >= F-measure(group0) - epsilon. Args: train_set: (features, labels, groups) epsilon: float, constraint slack. learning_rate: float, learning rate for model parameters. learning_rate_constraint: float, learning rate for Lagrange multipliers. loops: int, number of iterations. Returns: stochastic_model containing list of models and probabilities, deterministic_model. """ x_train, y_train, z_train = train_set dimension = x_train.shape[-1] tf.reset_default_graph() # Data tensors. features_tensor = tf.constant(x_train.astype("float32"), name="features") labels_tensor = tf.constant(y_train.astype("float32"), name="labels") # Linear model. weights = tf.Variable(tf.zeros(dimension, dtype=tf.float32), name="weights") threshold = tf.Variable(0, name="threshold", dtype=tf.float32) predictions_tensor = (tf.tensordot(features_tensor, weights, axes=(1, 0)) + threshold) # Contexts. context = tfco.rate_context(predictions_tensor, labels_tensor) context0 = context.subset(z_train < 1) context1 = context.subset(z_train > 0) # F-measure rates. fm_overall = tfco.f_score(context) fm1 = tfco.f_score(context1) fm0 = tfco.f_score(context0) # Rate minimization problem. problem = tfco.RateMinimizationProblem(-fm_overall, [fm0 <= fm1 + epsilon]) # Optimizer. optimizer = tfco.LagrangianOptimizerV1( tf.train.AdamOptimizer(learning_rate=learning_rate), constraint_optimizer=tf.train.AdamOptimizer( learning_rate=learning_rate_constraint)) train_op = optimizer.minimize(problem) # Start TF session and initialize variables. session = tf.Session() session.run(tf.global_variables_initializer()) # We maintain a list of objectives and model weights during training. objectives = [] violations = [] models = [] # Perform full gradient updates. for ii in range(loops): # Gradient updates. session.run(train_op) # Checkpoint once in 10 iterations. if ii % 10 == 0: # Model weights. model = [session.run(weights), session.run(threshold)] models.append(model) # Objective. objective = -evaluation.expected_fmeasure( x_train, y_train, [model], [1.0]) objectives.append(objective) # Violation. fmeasure0, fmeasure1 = evaluation.expected_group_fmeasures( x_train, y_train, z_train, [model], [1.0]) violations.append([fmeasure0 - fmeasure1 - epsilon]) # Use the recorded objectives and constraints to find the best iterate. best_iterate = tfco.find_best_candidate_index( np.array(objectives), np.array(violations)) deterministic_model = models[best_iterate] # Use shrinking to find a sparse distribution over iterates. probabilities = tfco.find_best_candidate_distribution( np.array(objectives), np.array(violations)) models_pruned = [models[i] for i in range(len(models)) if probabilities[i] > 0.0] probabilities_pruned = probabilities[probabilities > 0.0] return (models_pruned, probabilities_pruned), deterministic_model
def create_model(
    bert_config,
    is_training,
    input_ids,
    input_mask,
    segment_ids,
    labels,
    num_labels,
    use_one_hot_embeddings,
    num_segments,
    aggregation_method,
    pretrained_model='bert',
    from_distilled_student=False,
):
    """Creates a classification model."""
    scope = ""
    if from_distilled_student:
        scope = "student"
    parade_model = Parade(bert_config=bert_config,
                          is_training=is_training,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          num_segments=num_segments,
                          pretrained_model=pretrained_model,
                          use_one_hot_embeddings=use_one_hot_embeddings,
                          scope=scope)
    output_layer = None
    if aggregation_method == 'cls_attn':
        output_layer = parade_model.reduced_by_attn()
    elif aggregation_method == 'cls_avg':
        output_layer = parade_model.reduced_by_avg()
    elif aggregation_method == 'cls_max':
        output_layer = parade_model.reduced_by_max()
    elif aggregation_method == 'cls_transformer':
        output_layer = parade_model.reduced_by_transformer(
            is_training, num_transformer_layers=2)
    else:
        raise ValueError(
            "Un-supported model type: {}".format(aggregation_method))

    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            "output_weights", [num_labels, parade_model.hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        output_bias = tf.get_variable("output_bias", [num_labels],
                                      initializer=tf.zeros_initializer())

        with tf.variable_scope("loss"):
            if is_training:
                # I.e., 0.1 dropout
                output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

            logits = tf.tensordot(output_layer, output_weights, axes=[-1, -1])
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
            loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, log_probs)
def multihead_graph_attention(query_antecedent, memory_antecedent, bias, total_key_depth, total_value_depth, output_depth, num_heads, dropout_rate, image_shapes=None, attention_type="edge_vector", name="multihead_graph_attention", save_weights_to=None, make_image_summary=True, dropout_broadcast_dims=None, adjacency_matrix=None, num_edge_types=5, vars_3d=False, **kwargs): """Multihead scaled-dot-product attention with input/output transformations. Args: query_antecedent: a Tensor with shape [batch, length_q, channels] memory_antecedent: a Tensor with shape [batch, length_m, channels] or None bias: bias Tensor (see attention_bias()) total_key_depth: an integer total_value_depth: an integer output_depth: an integer num_heads: an integer dividing total_key_depth and total_value_depth dropout_rate: a floating point number image_shapes: optional tuple of integer scalars. see comments for attention_image_summary() attention_type: a string, either "dot_product", "dot_product_relative", "local_mask_right", "local_unmasked", "masked_dilated_1d", "unmasked_dilated_1d", graph, or any attention function with the signature (query, key, value, **kwargs) name: an optional string. save_weights_to: an optional dictionary to capture attention weights for vizualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. dropout_broadcast_dims: an optional list of integers less than 4 specifying in which dimensions to broadcast the dropout decisions. saves memory. adjacency_matrix: an optional tensor of shape [batch, len_q, len_q] containing edge vectors for attention num_edge_types: number of edge types, an int vars_3d: use 3-dimensional variables for input/output transformations **kwargs (dict): Parameters for the attention function Returns: The result of the attention transformation. The output shape is [batch_size, length_q, output_depth] Raises: ValueError: if the key depth or value depth are not divisible by the number of attention heads. """ if total_key_depth % num_heads != 0: raise ValueError("Key depth (%d) must be divisible by the number of " "attention heads (%d)." % (total_key_depth, num_heads)) if total_value_depth % num_heads != 0: raise ValueError("Value depth (%d) must be divisible by the number of " "attention heads (%d)." % (total_value_depth, num_heads)) vars_3d_num_heads = num_heads if vars_3d else None with tf.variable_scope(name, default_name="multihead_attention", values=[query_antecedent, memory_antecedent]): q, k, v = common_attention.compute_qkv( query_antecedent, memory_antecedent, total_key_depth, total_value_depth, vars_3d_num_heads=vars_3d_num_heads) q = common_attention.split_heads(q, num_heads) k = common_attention.split_heads(k, num_heads) v = common_attention.split_heads(v, num_heads) key_depth_per_head = total_key_depth // num_heads if not vars_3d: q *= key_depth_per_head**-0.5 additional_returned_value = None if callable( attention_type): # Generic way to extend multihead_attention x = attention_type(q, k, v, **kwargs) if isinstance(x, tuple): x, additional_returned_value = x # Unpack elif attention_type == "edge_vector": x = graph_attention(q, k, v, bias, dropout_rate, image_shapes, save_weights_to=save_weights_to, make_image_summary=make_image_summary, dropout_broadcast_dims=dropout_broadcast_dims, adjacency_matrix=adjacency_matrix, num_edge_types=num_edge_types) x = common_attention.combine_heads(x) # Set last dim specifically. 
x.set_shape(x.shape.as_list()[:-1] + [total_value_depth]) if vars_3d: o_var = tf.get_variable( "o", [num_heads, total_value_depth // num_heads, output_depth]) o_var = tf.reshape(o_var, [total_value_depth, output_depth]) x = tf.tensordot(x, o_var, axes=1) else: x = common_layers.dense(x, output_depth, use_bias=False, name="output_transform") if additional_returned_value is not None: return x, additional_returned_value return x
def color_transform(masks):
    with tf.name_scope("color_transform"):
        n_components = masks.shape.as_list()[-1]
        colors = tf.constant(get_mask_plot_colors(n_components),
                             name="mask_colors")
        return tf.tensordot(masks, colors, axes=1)
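
# Shape-only sketch of what color_transform computes (random stand-in inputs):
# with a [K, 3] color table, tensordot over the last mask axis turns [H, W, K]
# soft masks into an [H, W, 3] RGB visualization.
import numpy as np

H, W, K = 4, 4, 5
masks = np.random.rand(H, W, K)
colors = np.random.rand(K, 3)  # stand-in for get_mask_plot_colors(K)
rgb = np.tensordot(masks, colors, axes=1)
print(rgb.shape)  # (4, 4, 3)
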
def compute_attention_component(antecedent, total_depth, filter_width=1, padding="VALID", name="c", vars_3d_num_heads=0, sparsity_technique=None, threshold=3.0, training=True, clip_alpha=None, initial_sparsity=None, split_heads=False, num_heads=None): """Computes attention compoenent (query, key or value). Args: antecedent: a Tensor with shape [batch, length, channels] total_depth: an integer filter_width: An integer specifying how wide you want the attention component to be. padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding. name: a string specifying scope name. vars_3d_num_heads: an optional integer (if we want to use 3d variables) sparsity_technique: technique used for sparsifying weights. threshold: log alpha threshold used for evaluation with variational dropout. training: whether model is being trained or not. clip_alpha: alpha clipping threshold for variational dropout. initial_sparsity: initial sparsity level for lottery ticket & scratch experiments. split_heads: Whether to prune each head separately. num_heads: The number of heads in the attention module. Returns: c : [batch, length, depth] tensor """ # We don't support 3d attention variables or filter_width > 1 with sparsity # techniques assert not sparsity_technique or (not vars_3d_num_heads and filter_width == 1) if vars_3d_num_heads > 0: assert filter_width == 1 input_depth = antecedent.get_shape().as_list()[-1] depth_per_head = total_depth // vars_3d_num_heads initializer_stddev = input_depth**-0.5 if "q" in name: initializer_stddev *= depth_per_head**-0.5 var = tf.get_variable( name, [input_depth, vars_3d_num_heads, total_depth // vars_3d_num_heads], initializer=tf.random_normal_initializer( stddev=initializer_stddev)) var = tf.cast(var, antecedent.dtype) var = tf.reshape(var, [input_depth, total_depth]) return tf.tensordot(antecedent, var, axes=1) if filter_width == 1: if sparsity_technique: if split_heads: # Prune each heads weights separately so that they are free # to have different weight magnitude distributions. if num_heads is None: raise ValueError( "`num_heads` must be set for split head pruning.") if total_depth % num_heads != 0: raise ValueError( "`total_depth` must be divisible by `num_heads`.") input_depth = antecedent.get_shape().as_list()[-1] depth_per_head = int(total_depth / num_heads) masked_head_weights = [] for head_id in range(num_heads): head_name = name + "_shard_{}".format(head_id) with tf.variable_scope(head_name) as vs: head_weights = tf.get_variable( "kernel", [input_depth, depth_per_head]) masked_head_weights.append( pruning.apply_mask(head_weights, vs)) component_weights = tf.concat(masked_head_weights, axis=1) # compute the full component result return tf.tensordot(antecedent, component_weights, axes=1) else: return common_sparse.dense( antecedent, total_depth, use_bias=False, sparsity_technique=sparsity_technique, threshold=threshold, training=training, clip_alpha=clip_alpha, name=name, initial_sparsity=initial_sparsity) else: return common_layers.dense(antecedent, total_depth, use_bias=False, name=name) else: return common_layers.conv1d(antecedent, total_depth, filter_width, padding=padding, name=name)
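
# Sketch (hypothetical sizes) of the 3d-variable projection used in
# compute_attention_component when vars_3d_num_heads > 0: the
# [input_depth, num_heads, depth_per_head] variable is flattened to
# [input_depth, total_depth], and tensordot with axes=1 applies it as a dense
# projection over the last axis of a [batch, length, input_depth] input.
import numpy as np

batch, length, input_depth = 2, 7, 64
num_heads, depth_per_head = 4, 16  # total_depth = 64
x = np.random.randn(batch, length, input_depth)
var = np.random.randn(input_depth, num_heads, depth_per_head)
var_2d = var.reshape(input_depth, num_heads * depth_per_head)
out = np.tensordot(x, var_2d, axes=1)
print(out.shape)  # (2, 7, 64): [batch, length, total_depth]
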
def test_correct_output(self, normalize_weights, input_dependent, data_format, combining_weights_initializer): spatial_rank = 2 kernel_size = 3 filters = 16 input_chs = 3 if data_format == 'channels_last': input_shape = (1, 32, 32, input_chs) if data_format == 'channels_first': input_shape = (1, input_chs, 32, 32) images = tf.constant(np.random.randn(*input_shape), dtype=tf.float32) layer1 = tf.keras.layers.LocallyConnected2D(filters=filters, kernel_size=(kernel_size, kernel_size), strides=(1, 1), padding='valid', data_format=data_format) layer2 = layers.LowRankLocallyConnected2D( filters=filters, kernel_size=(kernel_size, kernel_size), strides=(1, 1), padding='valid', spatial_rank=spatial_rank, normalize_weights=normalize_weights, combining_weights_initializer=combining_weights_initializer, share_row_combining_weights=False, share_col_combining_weights=False, data_format=data_format, input_dependent=input_dependent) output1 = layer1(images) output2 = layer2(images) assign_ops = [] # Kernel from locally connected network. kernel1 = layer1.kernel combining_weights = layer2.combining_weights if input_dependent: combining_weights = tf.reduce_mean(combining_weights, axis=0) # Kernel from low rank locally connected network. kernel2 = tf.tensordot( combining_weights, tf.reshape(layer2.kernel_bases, (layer2.kernel_size[0], layer2.kernel_size[1], input_chs, layer2.spatial_rank, layer2.filters)), [[-1], [-2]], name='kernel') kernel2 = kernel_low_rank_lc_to_lc(kernel2, data_format) assign_ops.append(tf.assign(kernel1, kernel2)) # Test results consistent with keras locallyconnected2d layer. self.evaluate(tf.global_variables_initializer()) for op in assign_ops: self.evaluate(op) max_error = np.max(np.abs(self.evaluate(output1 - output2))) self.assertLess(max_error, 1e-5)
def build_model(self, args): # auto-encoder with tf.variable_scope('encoder_decoder'): # word embedding embedding = tf.get_variable('embedding', initializer=self.word_init) # embedding = tf.get_variable('embedding', [self.vocab_size, self.dim_emb]) enc_inputs = tf.nn.embedding_lookup(embedding, self.enc_inputs) dec_inputs = tf.nn.embedding_lookup(embedding, self.dec_inputs) with tf.variable_scope('projection'): # style information projection = {} projection['W'] = tf.get_variable( 'W', [self.dim_h, self.vocab_size]) projection['b'] = tf.get_variable('b', [self.vocab_size]) encoder = self.create_cell(self.dim_h, args.n_layers, self.dropout, 'encoder') decoder = self.create_cell(self.dim_h, args.n_layers, self.dropout, 'decoder') self.loss_rec, origin_info, transfer_info = self.reconstruction( encoder, enc_inputs, self.labels, decoder, dec_inputs, self.targets, self.dec_mask, projection) _, soft_tsf_ids, self.rec_ids, self.tsf_ids = self.run_decoder( decoder, dec_inputs, embedding, projection, origin_info, transfer_info) # make the real sents and fake sents the same length if args.trim_padding: fake_probs = fake_probs[:, :1 + self.batch_len, :] # discriminator with tf.variable_scope('discriminator'): classifier_embedding = tf.get_variable('embedding', initializer=self.word_init) # classifier_embedding = tf.get_variable('embedding', [self.vocab_size, self.dim_emb]) # remove bos, use dec_inputs to avoid noises adding into enc_inputs real_sents = tf.nn.embedding_lookup(classifier_embedding, self.dec_inputs[:, 1:]) fake_sents = tf.tensordot(soft_tsf_ids, classifier_embedding, [[2], [0]]) fake_sents = fake_sents[:, : -1, :] # make the dimension the same as real sents # mask the sequences mask = tf.sequence_mask(self.enc_lens, self.max_len - 1, dtype=tf.float32) mask = tf.expand_dims(mask, -1) real_sents *= mask fake_sents *= mask self.loss_d, self.loss_g = self.run_discriminator( real_sents, fake_sents, self.labels, args) ##### optimizer ##### self.loss = self.loss_rec + self.rho * self.loss_g theta_eg = retrive_var(['encoder_decoder']) theta_d = retrive_var(['discriminator']) opt = tf.train.AdamOptimizer(self.learning_rate, beta1=0.5) grad, _ = zip(*opt.compute_gradients(self.loss, theta_eg)) grad, _ = tf.clip_by_global_norm(grad, 30.0) self.optimize_tot = opt.apply_gradients(zip(grad, theta_eg)) self.optimize_rec = opt.minimize(self.loss_rec, var_list=theta_eg) self.optimize_d = opt.minimize(self.loss_d, var_list=theta_d) self.saver = tf.train.Saver(max_to_keep=5)
def encoder(features, mode, vocab, hps): """Model function. Atttention seq2seq model, augmented with an encoder over the targets of the nearest neighbors. Args: features: Dictionary of input Tensors. mode: train or eval. Keys from tf.estimator.ModeKeys. vocab: A list of strings of words in the vocabulary. hps: Hyperparams. Returns: Encoder outputs. """ # [batch_size, src_len] src_inputs = features["src_inputs"] src_len = features["src_len"] with tf.variable_scope("embeddings"): scale = (3.0 / hps.emb_dim)**0.5 embeddings = tf.get_variable("embeddings", [vocab.size(), hps.emb_dim], dtype=tf.float32, initializer=tf.random_uniform_initializer( minval=-scale, maxval=scale)) # [batch_size, src_len, emb_dim] src_input_emb = tf.nn.embedding_lookup(embeddings, src_inputs) if mode == tf_estimator.ModeKeys.TRAIN and hps.emb_drop > 0.: src_input_emb = tf.nn.dropout(src_input_emb, keep_prob=1.0 - hps.emb_drop) src_att_context, neighbor_att_context = None, None src_copy_context, neighbor_copy_context = None, None with tf.variable_scope("src_encoder"): # 2 * [batch_size, src_len, encoder_dim] src_encoder_outputs, src_encoder_states = tf.nn.bidirectional_dynamic_rnn( cell_fw=get_rnn_cell(mode=mode, hps=hps, input_dim=hps.emb_dim, num_units=hps.encoder_dim, num_layers=hps.num_encoder_layers, dropout=hps.encoder_drop, cell_type="lstm"), cell_bw=get_rnn_cell(mode=mode, hps=hps, input_dim=hps.emb_dim, num_units=hps.encoder_dim, num_layers=hps.num_encoder_layers, dropout=hps.encoder_drop, cell_type="lstm"), inputs=src_input_emb, dtype=tf.float32, sequence_length=src_len) # [batch_size, src_len, 2*encoder_dim] src_encoder_outputs = tf.concat(src_encoder_outputs, 2) with tf.variable_scope("src_att_context"): src_att_context = _build_context( hps=hps, encoder_outputs=src_encoder_outputs) if hps.use_copy: with tf.variable_scope("src_copy_context"): src_copy_context = _build_context( hps=hps, encoder_outputs=src_encoder_outputs) if hps.encode_neighbor or hps.att_neighbor or hps.sum_neighbor: # [batch_size, neighbor_len] neighbor_inputs = features["neighbor_inputs"] neighbor_len = features["neighbor_len"] # [batch_size, neighbor_len, emb_dim] neighbor_input_emb = tf.nn.embedding_lookup(embeddings, neighbor_inputs) if mode == tf_estimator.ModeKeys.TRAIN and hps.emb_drop > 0.: neighbor_input_emb = tf.nn.dropout(neighbor_input_emb, keep_prob=1.0 - hps.emb_drop) if hps.binary_neighbor: neighbor_binary_input = features["neighbor_binary"] if hps.binary_dim == 1: neighbor_binary_emb = tf.to_float(neighbor_binary_input) neighbor_binary_emb = tf.expand_dims(neighbor_binary_emb, axis=-1) else: with tf.variable_scope("binary_emb"): scale = (3.0 / hps.binary_dim)**0.5 binary_embeddings = tf.get_variable( "binary_emb", [2, hps.binary_dim], dtype=tf.float32, initializer=tf.random_uniform_initializer( minval=-scale, maxval=scale)) neighbor_binary_emb = tf.nn.embedding_lookup( binary_embeddings, neighbor_binary_input) neighbor_input_emb = tf.concat( [neighbor_input_emb, neighbor_binary_emb], axis=2) with tf.variable_scope("neighbor_encoder"): # 2 * [batch_size, neighbor_len, encoder_dim] input_dim = hps.emb_dim if hps.binary_neighbor: input_dim += hps.binary_dim neighbor_encoder_outputs, neighbor_encoder_states = \ tf.nn.bidirectional_dynamic_rnn( cell_fw=get_rnn_cell( mode=mode, hps=hps, input_dim=input_dim, num_units=hps.neighbor_dim, num_layers=1, dropout=hps.encoder_drop, cell_type="lstm"), cell_bw=get_rnn_cell( mode=mode, hps=hps, input_dim=input_dim, num_units=hps.neighbor_dim, num_layers=1, dropout=hps.encoder_drop, 
cell_type="lstm"), inputs=neighbor_input_emb, dtype=tf.float32, sequence_length=neighbor_len) # [batch_size, neighbor_len, 2*encoder_dim] neighbor_encoder_outputs = tf.concat(neighbor_encoder_outputs, 2) if hps.att_neighbor: with tf.variable_scope("neighbor_att_context"): neighbor_att_context = _build_context( hps=hps, encoder_outputs=neighbor_encoder_outputs) if hps.use_copy: with tf.variable_scope("neighbor_copy_context"): neighbor_copy_context = _build_context( hps=hps, encoder_outputs=neighbor_encoder_outputs) att_context, copy_context = None, None if hps.att_neighbor: att_context = tf.concat([src_att_context, neighbor_att_context], 1) if hps.use_copy: copy_context = tf.concat([src_copy_context, neighbor_copy_context], 1) else: att_context = src_att_context if hps.use_copy: copy_context = src_copy_context if hps.encode_neighbor: neighbor_fw_states, neighbor_bw_states = neighbor_encoder_states neighbor_h = tf.concat( [neighbor_fw_states[-1].h, neighbor_bw_states[-1].h], axis=1) if mode == tf_estimator.ModeKeys.TRAIN and hps.drop > 0.: neighbor_h = tf.nn.dropout(neighbor_h, keep_prob=1.0 - hps.drop) mem_input = tf.layers.dense( neighbor_h, units=hps.decoder_dim, activation=tf.nn.tanh, use_bias=True, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="mem_input") if mode == tf_estimator.ModeKeys.TRAIN and hps.drop > 0.: mem_input = tf.nn.dropout(mem_input, keep_prob=1.0 - hps.drop) elif hps.sum_neighbor: src_fw_states, src_bw_states = src_encoder_states src_h = tf.concat([src_fw_states[-1].h, src_bw_states[-1].h], axis=1) if mode == tf_estimator.ModeKeys.TRAIN and hps.drop > 0.: src_h = tf.nn.dropout(src_h, keep_prob=1.0 - hps.drop) src_h = tf.layers.dense( src_h, units=hps.decoder_dim, activation=tf.nn.tanh, use_bias=True, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="proj_src_h") neighbor_encoder_outputs = tf.layers.dense( neighbor_encoder_outputs, units=hps.decoder_dim, activation=tf.nn.tanh, use_bias=True, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="proj_neighbor_out") alpha = tf.tensordot(neighbor_encoder_outputs, src_h, axes=1) alpha = tf.einsum("bij,bj->bi", neighbor_encoder_outputs, src_h) alpha = tf.nn.softmax(alpha) mem_input = tf.reduce_sum( neighbor_encoder_outputs * tf.expand_dims(alpha, -1), 1) # mem_input = tf.reduce_mean(mem_input, axis=1) if mode == tf_estimator.ModeKeys.TRAIN and hps.drop > 0.: mem_input = tf.nn.dropout(mem_input, keep_prob=1.0 - hps.drop) else: assert hps.rnn_cell != "hyper_lstm" assert hps.att_type != "hyper" mem_input = None if hps.use_bridge: with tf.variable_scope("bridge"): out_dim = hps.num_decoder_layers * hps.decoder_dim fw_states, bw_states = src_encoder_states c_states, h_states = [], [] for (fw, bw) in zip(fw_states, bw_states): c_states.append(tf.concat((fw.c, bw.c), 1)) h_states.append(tf.concat((fw.h, bw.h), 1)) cs, hs = c_states[-1], h_states[-1] if mode == tf_estimator.ModeKeys.TRAIN and hps.drop > 0.: hs = tf.nn.dropout(hs, keep_prob=1.0 - hps.drop) cs = tf.nn.dropout(cs, keep_prob=1.0 - hps.drop) h_state = tf.layers.dense( hs, units=out_dim, activation=tf.nn.tanh, use_bias=True, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="h_layer") c_state = tf.layers.dense( cs, units=out_dim, activation=tf.nn.tanh, use_bias=True, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="c_layer") else: h_state, c_state = None, None # print (att_context) # dsadsa return EncoderOutputs(embeddings=embeddings, mem_input=mem_input, att_context=att_context, 
copy_context=copy_context, states=(h_state, c_state))
def lagrangian_optimizer_kld( train_set, additive_slack, learning_rate, learning_rate_constraint, loops): """Implements surrogate-based Lagrangian optimizer (Algorithm 2). Specifically solves: min_{theta} sum_{G = 0, 1} KLD(p, pprG(theta)) s.t. error_rate <= additive_slack, where p is the overall proportion of positives and pprG is the positive prediction rate for group G. We frame this as a constrained optimization problem: min_{theta, xi_pos0, xi_pos1, xi_neg0, xi_neg1} { -p log(xi_pos0) - (1-p) log(xi_neg0) - p log(xi_pos1) -(1-p) log(xi_neg1)} s.t. error_rate <= additive_slack, xi_pos0 <= ppr0(theta), xi_neg0 <= npr0(theta), xi_pos1 <= ppr1(theta), xi_neg1 <= npr1(theta), and formulate the Lagrangian: max_{lambda's >= 0} min_{xi's} { -p log(xi_pos0) - (1-p) log(xi_neg0) - p log(xi_pos1) -(1-p) log(xi_neg1) + lambda_pos0 (xi_pos0 - ppr0(theta)) + lambda_neg0 (xi_neg0 - npr0(theta)) + lambda_pos1 (xi_pos1 - ppr1(theta)) + lambda_neg1 (xi_neg1 - npr1(theta))} s.t. error_rate <= additive_slack. We do best response for the slack variables xi: BR for xi_pos0 = p / lambda_pos0 BR for xi_neg0 = (1 - p) / lambda_neg0 BR for xi_pos1 = p / lambda_pos1 BR for xi_neg1 = (1 - p) / lambda_neg1 We do gradient ascent on the lambda's, where Gradient w.r.t. lambda_pos0 = BR for xi_pos0 - ppr0(theta) = p / lambda_pos0 - ppr0(theta) = Gradient w.r.t. lambda_pos0 of (p log(lambda_pos0) - lambda_pos0 ppr0(theta)) Gradient w.r.t. lambda_neg0 = Gradient w.r.t. lambda_neg0 of ((1 - p) log(lambda_neg0) - lambda_neg0 npr0(theta)) Gradient w.r.t. lambda_pos1 = Gradient w.r.t. lambda_pos1 of (p log(lambda_pos1) - lambda_pos1 ppr1(theta)) Gradient w.r.t. lambda_neg1 = Gradient w.r.t. lambda_neg1 of ((1 - p) log(lambda_neg1) - lambda_neg1 npr1(theta)). We do gradient descent on thetas's, with ppr's and npr's replaced with hinge surrogates. We use concave lower bounds on ppr's and npr's, so that when they get negated in the updates, we get convex upper bounds. See Appendix D.1 in the paper for more details. Args: train_set: (features, labels, groups) additive_slack: float, additive slack on error rate constraint learning_rate: float, learning rate for model parameters learning_rate_constraint: float, learning rate for Lagrange multipliers loops: int, number of iterations Returns: stochastic_model containing list of models and probabilities, deterministic_model. """ x_train, y_train, z_train = train_set dimension = x_train.shape[-1] tf.reset_default_graph() # Data tensors. features_tensor = tf.constant(x_train.astype("float32"), name="features") labels_tensor = tf.constant(y_train.astype("float32"), name="labels") # Linear model. weights = tf.Variable(tf.zeros(dimension, dtype=tf.float32), name="weights") threshold = tf.Variable(0, name="threshold", dtype=tf.float32) predictions_tensor = (tf.tensordot(features_tensor, weights, axes=(1, 0)) + threshold) # Group-specific predictions. predictions_group0 = tf.boolean_mask(predictions_tensor, mask=(z_train < 1)) num_examples0 = np.sum(z_train < 1) predictions_group1 = tf.boolean_mask(predictions_tensor, mask=(z_train > 0)) num_examples1 = np.sum(z_train > 0) # We use the TF Constrained Optimization (TFCO) library to set up the # constrained optimization problem. The library doesn't currently support best # responses for slack variables. So we maintain explicit Lagrange multipliers # for the slack variables, and let the library deal with the Lagrange # multipliers for the error rate constraint. 
# Since we need to perform a gradient descent update on the model parameters, # and an ascent update on the Lagrange multipliers on the slack variables, we # create a single "minimization" objective using stop gradients, where a # descent gradient update has the effect of minimizing over the model # parameters and maximizing over the Lagrange multipliers for the slack # variables. As noted above, the ascent update on the Lagrange multipliers for # the error rate constraint is done by the library internally. # Placeholders for Lagrange multipliers for the four slack variables. lambda_pos0 = tf.Variable(0.5, dtype=tf.float32, name="lambda_pos0") lambda_neg0 = tf.Variable(0.5, dtype=tf.float32, name="lambda_neg0") lambda_pos1 = tf.Variable(0.5, dtype=tf.float32, name="lambda_pos1") lambda_neg1 = tf.Variable(0.5, dtype=tf.float32, name="lambda_neg1") # Set up prediction rates and surrogate relaxations on them. p = np.mean(y_train) # Proportion of positives. # Positive and negative prediction rates for group 0 and group 1. ppr_group0 = tf.reduce_sum(tf.cast( tf.greater(predictions_group0, tf.zeros(num_examples0, dtype="float32")), "float32")) / num_examples0 npr_group0 = 1 - ppr_group0 ppr_group1 = tf.reduce_sum(tf.cast( tf.greater(predictions_group1, tf.zeros(num_examples1, dtype="float32")), "float32")) / num_examples1 npr_group1 = 1 - ppr_group1 # Hinge concave lower bounds on the positive and negative prediction rates. # In the gradient updates, these get negated and become convex upper bounds. # For group 0: ppr_hinge_group0 = tf.reduce_sum( 1 - tf.nn.relu(1 - predictions_group0)) * 1.0 / num_examples0 npr_hinge_group0 = tf.reduce_sum( 1 - tf.nn.relu(1 + predictions_group0)) * 1.0 / num_examples0 # For group 1: ppr_hinge_group1 = tf.reduce_sum( 1 - tf.nn.relu(1 - predictions_group1)) * 1.0 / num_examples1 npr_hinge_group1 = tf.reduce_sum( 1 - tf.nn.relu(1 + predictions_group1)) * 1.0 / num_examples1 # Set up KL-divergence objective for constrained optimization. # We use stop gradients to ensure that a single descent gradient update on the # objective has the effect of minimizing over the model parameters and # maximizing over the Lagrange multipliers for the slack variables. # KL-divergence for group 0. kld_hinge_pos_group0 = ( - tf.stop_gradient(lambda_pos0) * ppr_hinge_group0 - p * tf.log(lambda_pos0) + lambda_pos0 * tf.stop_gradient(ppr_group0)) kld_hinge_neg_group0 = ( - tf.stop_gradient(lambda_neg0) * npr_hinge_group0 - (1 - p) * tf.log(lambda_neg0) + lambda_neg0 * tf.stop_gradient(npr_group0)) kld_hinge_group0 = kld_hinge_pos_group0 + kld_hinge_neg_group0 # KL-divergence for group 1. kld_hinge_pos_group1 = ( - tf.stop_gradient(lambda_pos1) * ppr_hinge_group1 - p * tf.log(lambda_pos1) + lambda_pos1 * tf.stop_gradient(ppr_group1)) kld_hinge_neg_group1 = ( - tf.stop_gradient(lambda_neg1) * npr_hinge_group1 - (1 - p) * tf.log(lambda_neg1) + lambda_neg1 * tf.stop_gradient(npr_group1)) kld_hinge_group1 = kld_hinge_pos_group1 + kld_hinge_neg_group1 # Wrap the objective into a rate object. objective = tfco.wrap_rate(kld_hinge_group0 + kld_hinge_group1) # Set up error rate constraint for constrained optimization. context = tfco.rate_context(predictions_tensor, labels_tensor) error = tfco.error_rate(context) constraints = [error <= additive_slack] # Cretae rate minimization problem object. problem = tfco.RateMinimizationProblem(objective, constraints) # Set up optimizer. 
optimizer = tfco.LagrangianOptimizerV1( tf.train.AdamOptimizer(learning_rate=learning_rate), constraint_optimizer=tf.train.AdamOptimizer( learning_rate=learning_rate_constraint)) train_op = optimizer.minimize(problem) # Start TF session and initialize variables. session = tf.Session() session.run(tf.global_variables_initializer()) # We maintain a list of objectives and model weights during training. objectives = [] violations = [] models = [] # Perform full gradient updates. for ii in range(loops): # Gradient updates. session.run(train_op) # Checkpoint once in 10 iterations. if ii % 10 == 0: # Model weights. model = [session.run(weights), session.run(threshold)] models.append(model) # Objective. klds = evaluation.expected_group_klds( x_train, y_train, z_train, [model], [1.0]) objectives.append(sum(klds)) # Violation. error = evaluation.expected_error_rate( x_train, y_train, [model], [1.0]) violations.append([error - additive_slack]) # Use the recorded objectives and constraints to find the best iterate. best_iterate = tfco.find_best_candidate_index( np.array(objectives), np.array(violations)) deterministic_model = models[best_iterate] # Use shrinking to find a sparse distribution over iterates. probabilities = tfco.find_best_candidate_distribution( np.array(objectives), np.array(violations)) models_pruned = [models[i] for i in range(len(models)) if probabilities[i] > 0.0] probabilities_pruned = probabilities[probabilities > 0.0] return (models_pruned, probabilities_pruned), deterministic_model
def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_placeholder_: How much the clip is shifted.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [],
                                                    name='filename')
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name='foreground_volume')
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_placeholder_ = tf.placeholder(tf.int32, name='timeshift')
    # TODO(see--): Write test with np.roll
    shifted_foreground = tf_roll(scaled_foreground,
                                 self.time_shift_placeholder_)
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(
        tf.float32, [desired_samples, 1], name='background_data')
    self.background_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name='background_volume')
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, shifted_foreground)
    # Removed clipping: tf.clip_by_value(background_add, -1.0, 1.0)
    self.background_clamp_ = background_add
    self.background_clamp_ = tf.reshape(
        self.background_clamp_, (1, model_settings['desired_samples']))
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    # stfts = tf.contrib.signal.stft(
    stfts = tf.signal.stft(self.background_clamp_,
                           frame_length=model_settings['window_size_samples'],
                           frame_step=model_settings['window_stride_samples'],
                           fft_length=None)
    self.spectrogram_ = tf.abs(stfts)
    num_spectrogram_bins = self.spectrogram_.shape[-1].value
    lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        model_settings['dct_coefficient_count'], num_spectrogram_bins,
        model_settings['sample_rate'], lower_edge_hertz, upper_edge_hertz)
    mel_spectrograms = tf.tensordot(self.spectrogram_,
                                    linear_to_mel_weight_matrix, 1)
    mel_spectrograms.set_shape(self.spectrogram_.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))
    log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
    self.mfcc_ = tf.signal.mfccs_from_log_mel_spectrograms(
        log_mel_spectrograms)[:, :, :model_settings['num_log_mel_features']]  # :13
def multihead_attention(query_antecedent, memory_antecedent, bias, total_key_depth, total_value_depth, output_depth, num_heads, dropout_rate, shared_rel=False, max_relative_position=None, image_shapes=None, attention_type="dot_product", block_length=128, block_width=128, q_filter_width=1, kv_filter_width=1, q_padding="VALID", kv_padding="VALID", cache=None, gap_size=0, num_memory_blocks=2, name="multihead_attention", save_weights_to=None, make_image_summary=True, dropout_broadcast_dims=None, max_length=None, vars_3d=False, scale_dotproduct=True, **kwargs): """Multihead scaled-dot-product attention with input/output transformations. Args: query_antecedent: a Tensor with shape [batch, length_q, channels] memory_antecedent: a Tensor with shape [batch, length_m, channels] or None bias: bias Tensor (see attention_bias()) total_key_depth: an integer total_value_depth: an integer output_depth: an integer num_heads: an integer dividing total_key_depth and total_value_depth dropout_rate: a floating point number shared_rel: boolean to share relative embeddings max_relative_position: Maximum distance between inputs to generate unique relation embeddings for. Only relevant when using "dot_product_relative" attention. image_shapes: optional tuple of integer scalars. see comments for attention_image_summary() attention_type: a string, either "dot_product", "dot_product_relative", "local_mask_right", "local_unmasked", "masked_dilated_1d", "unmasked_dilated_1d", graph, or any attention function with the signature (query, key, value, **kwargs) block_length: an integer - relevant for "local_mask_right" block_width: an integer - relevant for "local_unmasked" q_filter_width: An integer specifying how wide you want the query to be. kv_filter_width: An integer specifying how wide you want the keys and values to be. q_padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding. kv_padding: One of "VALID", "SAME" or "LEFT". Default is "VALID": no padding. cache: dict containing Tensors which are the results of previous attentions, used for fast decoding. Expects the dict to contrain two keys ('k' and 'v'), for the initial call the values for these keys should be empty Tensors of the appropriate shape. 'k' [batch_size, 0, key_channels] 'v' [batch_size, 0, value_channels] gap_size: Integer option for dilated attention to indicate spacing between memory blocks. num_memory_blocks: Integer option to indicate how many memory blocks to look at. name: an optional string. save_weights_to: an optional dictionary to capture attention weights for vizualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. dropout_broadcast_dims: an optional list of integers less than 4 specifying in which dimensions to broadcast the dropout decisions. saves memory. max_length: an integer - needed by relative attention vars_3d: use 3-dimensional variables for input/output transformations scale_dotproduct: whether to normalize the attention product. **kwargs (dict): Parameters for the attention function Caching: WARNING: For decoder self-attention, i.e. when memory_antecedent == None, the caching assumes that the bias contains future masking. The caching works by saving all the previous key and value values so that you are able to send just the last query location to this attention function. I.e. 
if the cache dict is provided it assumes the query is of the shape [batch_size, 1, hidden_dim] rather than the full memory. Returns: The result of the attention transformation. The output shape is [batch_size, length_q, hidden_dim] unless the cache dict is provided in which case only the last memory position is calculated and the output shape is [batch_size, 1, hidden_dim] Optionally returns an additional loss parameters (ex: load balance loss for the experts) returned by the attention_type function. Raises: ValueError: if the key depth or value depth are not divisible by the number of attention heads. """ if total_key_depth % num_heads != 0: raise ValueError("Key depth (%d) must be divisible by the number of " "attention heads (%d)." % (total_key_depth, num_heads)) if total_value_depth % num_heads != 0: raise ValueError("Value depth (%d) must be divisible by the number of " "attention heads (%d)." % (total_value_depth, num_heads)) vars_3d_num_heads = num_heads if vars_3d else 0 with tf.variable_scope(name, default_name="multihead_attention", values=[query_antecedent, memory_antecedent]): if cache is None or memory_antecedent is None: q, k, v = common_attention.compute_qkv( query_antecedent, memory_antecedent, total_key_depth, total_value_depth, q_filter_width, kv_filter_width, q_padding, kv_padding, vars_3d_num_heads=vars_3d_num_heads) if cache is not None: if attention_type != "dot_product": # TODO(petershaw): Support caching when using relative position # representations, i.e. "dot_product_relative" attention. raise NotImplementedError( "Caching is not guaranteed to work with attention types other than" " dot_product.") if bias is None: raise ValueError( "Bias required for caching. See function docstring " "for details.") if memory_antecedent is not None: # Encoder-Decoder Attention Cache q = common_attention.compute_attention_component( query_antecedent, total_key_depth, q_filter_width, q_padding, "q", vars_3d_num_heads=vars_3d_num_heads) k = cache["k_encdec"] v = cache["v_encdec"] else: k = common_attention.split_heads(k, num_heads) v = common_attention.split_heads(v, num_heads) decode_loop_step = kwargs.get("decode_loop_step") if decode_loop_step is None: k = cache["k"] = tf.concat([cache["k"], k], axis=2) v = cache["v"] = tf.concat([cache["v"], v], axis=2) else: # Inplace update is required for inference on TPU. # Inplace_ops only supports inplace_update on the first dimension. 
# The performance of current implementation is better than updating # the tensor by adding the result of matmul(one_hot, # update_in_current_step) tmp_k = tf.transpose(cache["k"], perm=[2, 0, 1, 3]) tmp_k = inplace_ops.alias_inplace_update( tmp_k, decode_loop_step, tf.squeeze(k, axis=2)) k = cache["k"] = tf.transpose(tmp_k, perm=[1, 2, 0, 3]) tmp_v = tf.transpose(cache["v"], perm=[2, 0, 1, 3]) tmp_v = inplace_ops.alias_inplace_update( tmp_v, decode_loop_step, tf.squeeze(v, axis=2)) v = cache["v"] = tf.transpose(tmp_v, perm=[1, 2, 0, 3]) q = common_attention.split_heads(q, num_heads) if cache is None: k = common_attention.split_heads(k, num_heads) v = common_attention.split_heads(v, num_heads) key_depth_per_head = total_key_depth // num_heads if not vars_3d: if scale_dotproduct: q *= key_depth_per_head**-0.5 additional_returned_value = None if callable( attention_type): # Generic way to extend multihead_attention x = attention_type(q, k, v, **kwargs) if isinstance(x, tuple): x, additional_returned_value = x # Unpack elif attention_type == "dot_product": x = common_attention.dot_product_attention( q, k, v, bias, dropout_rate, image_shapes, save_weights_to=save_weights_to, make_image_summary=make_image_summary, dropout_broadcast_dims=dropout_broadcast_dims) elif attention_type == "dot_product_relative": x = common_attention.dot_product_attention_relative( q, k, v, bias, max_relative_position, dropout_rate, image_shapes, make_image_summary=make_image_summary) elif attention_type == "dot_product_relative_v2": x = common_attention.dot_product_self_attention_relative_v2( q, k, v, bias, max_length, dropout_rate, image_shapes, make_image_summary=make_image_summary, dropout_broadcast_dims=dropout_broadcast_dims) elif attention_type == "local_within_block_mask_right": x = common_attention.masked_within_block_local_attention_1d( q, k, v, block_length=block_length) elif attention_type == "rel_local_mask_right": x = common_attention.masked_rel_local_attention_1d( q, k, v, block_length=block_length, make_image_summary=make_image_summary, dropout_rate=dropout_rate, share_rel_embed=shared_rel) elif attention_type == "local_mask_right": x = common_attention.masked_local_attention_1d( q, k, v, block_length=block_length, make_image_summary=make_image_summary) elif attention_type == "local_unmasked": x = common_attention.local_attention_1d(q, k, v, block_length=block_length, filter_width=block_width) elif attention_type == "masked_dilated_1d": x = common_attention.masked_dilated_self_attention_1d( q, k, v, block_length, block_width, gap_size, num_memory_blocks) else: assert attention_type == "unmasked_dilated_1d" x = common_attention.dilated_self_attention_1d( q, k, v, block_length, block_width, gap_size, num_memory_blocks) x = common_attention.combine_heads(x) # Set last dim specifically. x.set_shape(x.shape.as_list()[:-1] + [total_value_depth]) if vars_3d: o_var = tf.get_variable( "o", [num_heads, total_value_depth // num_heads, output_depth]) o_var = tf.cast(o_var, x.dtype) o_var = tf.reshape(o_var, [total_value_depth, output_depth]) x = tf.tensordot(x, o_var, axes=1) else: x = common_layers.dense(x, output_depth, use_bias=False, name="output_transform") if additional_returned_value is not None: return x, additional_returned_value return x
def lagrangian_optimizer(train_set,
                         epsilon=epsilon,
                         learning_rate=0.01,
                         learning_rate_constraint=0.01,
                         loops=2000):
  tf.reset_default_graph()

  x_train, y_train, z_train = train_set
  num_examples = x_train.shape[0]
  dimension = x_train.shape[-1]

  # Data tensors.
  features_tensor = tf.constant(x_train.astype("float32"), name="features")
  labels_tensor = tf.constant(y_train.astype("float32"), name="labels")

  # Linear model.
  weights = tf.Variable(tf.zeros(dimension, dtype=tf.float32), name="weights")
  threshold = tf.Variable(0, name="threshold", dtype=tf.float32)
  predictions_tensor = (
      tf.tensordot(features_tensor, weights, axes=(1, 0)) + threshold)
  predictions_group0 = tf.boolean_mask(predictions_tensor, mask=(z_train < 1))
  num0 = np.sum(z_train < 1)
  predictions_group1 = tf.boolean_mask(predictions_tensor, mask=(z_train > 0))
  num1 = np.sum(z_train > 0)

  # Set up rates.
  context = tfco.rate_context(predictions_tensor, labels_tensor)
  true_positive_rate = tfco.true_positive_rate(context)
  true_negative_rate = tfco.true_negative_rate(context)
  context0 = context.subset(z_train < 1)
  true_positive_rate0 = tfco.true_positive_rate(context0)
  context1 = context.subset(z_train > 0)
  true_positive_rate1 = tfco.true_positive_rate(context1)

  # Set up slack variables.
  slack_tpr = tf.Variable(0.5, dtype=tf.float32)
  slack_tnr = tf.Variable(0.5, dtype=tf.float32)

  # Projection ops for slacks.
  projection_ops = []
  projection_ops.append(
      tf.assign(slack_tpr, tf.clip_by_value(slack_tpr, 0.001, 0.999)))
  projection_ops.append(
      tf.assign(slack_tnr, tf.clip_by_value(slack_tnr, 0.001, 0.999)))

  # Set up 1 - G-mean objective.
  objective = tfco.wrap_rate(1.0 - tf.sqrt(slack_tpr * slack_tnr))

  # Set up slack constraints.
  constraints = []
  constraints.append(tfco.wrap_rate(slack_tpr) <= true_positive_rate)
  constraints.append(tfco.wrap_rate(slack_tnr) <= true_negative_rate)

  # Set up fairness equal-opportunity constraints.
  constraints.append(true_positive_rate0 <= true_positive_rate1 + epsilon)
  constraints.append(true_positive_rate1 <= true_positive_rate0 + epsilon)

  # Set up constraint optimization problem.
  problem = tfco.RateMinimizationProblem(objective, constraints)

  # Set up solver.
  optimizer = tf.train.AdamOptimizer(learning_rate)
  constraint_optimizer = tf.train.AdamOptimizer(learning_rate_constraint)
  lagrangian_optimizer = tfco.ProxyLagrangianOptimizerV1(
      optimizer=optimizer, constraint_optimizer=constraint_optimizer)
  train_op = lagrangian_optimizer.minimize(problem)

  # Start TF session and initialize variables.
  session = tf.Session()
  tf.set_random_seed(654321)  # Set random seed for reproducibility.
  session.run(tf.global_variables_initializer())

  # We maintain a list of objectives and model weights during training.
  objectives = []
  violations = []
  models = []

  # Perform full gradient updates.
  for ii in xrange(loops):
    # Gradient update.
    session.run(train_op)
    # Projection.
    session.run(projection_ops)

    # Checkpoint once in 100 iterations.
    if ii % 100 == 0:
      # Model weights.
      model = [session.run(weights), session.run(threshold)]
      models.append(model)

      # Snapshot performance.
      error, tpr0, tpr1 = evaluate_expected_results(
          train_set, [model], [1.0])
      objectives.append(error)
      violations.append([tpr0 - tpr1 - epsilon, tpr1 - tpr0 - epsilon])

  # Use the recorded objectives and constraints to find the best iterate.
  best_iterate = tfco.find_best_candidate_index(np.array(objectives),
                                                np.array(violations))
  best_model = models[best_iterate]

  # Stochastic model over a subset of classifiers.
  probabilities = tfco.find_best_candidate_distribution(
      np.array(objectives), np.array(violations))
  models_pruned = [
      models[i] for i in range(len(models)) if probabilities[i] > 0.0
  ]
  probabilities_pruned = probabilities[probabilities > 0.0]

  # Stochastic model over all classifiers.
  probabilities_all = probabilities * 0.0 + 1.0 / len(probabilities)

  # Return the pruned models, the averaged models, and the best model.
  results = {
      'stochastic': (models, probabilities_all),
      'pruned': (models_pruned, probabilities_pruned),
      'best': best_model,
      'objectives': objectives,
      'violations': violations
  }
  return results
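# Hypothetical usage sketch for the optimizer above. `x_train`, `y_train`,
# `z_train` and `evaluate_expected_results` are assumed to be the same objects
# used inside `lagrangian_optimizer`; the epsilon value is illustrative only.
results = lagrangian_optimizer((x_train, y_train, z_train), epsilon=0.05)

# Deterministic "best iterate" classifier.
best_error, best_tpr0, best_tpr1 = evaluate_expected_results(
    (x_train, y_train, z_train), [results['best']], [1.0])

# Stochastic classifier supported on the pruned set of iterates.
pruned_models, pruned_probs = results['pruned']
sto_error, sto_tpr0, sto_tpr1 = evaluate_expected_results(
    (x_train, y_train, z_train), pruned_models, pruned_probs)

print('best:       error %.3f, TPR gap %.3f'
      % (best_error, abs(best_tpr0 - best_tpr1)))
print('stochastic: error %.3f, TPR gap %.3f'
      % (sto_error, abs(sto_tpr0 - sto_tpr1)))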
def smpl_model_batched(model_path, betas, pose, trans, simplify=False):
  """
  Construct a compute graph that takes in parameters and outputs a tensor as
  model vertices. Face indices are also returned as a numpy ndarray.

  Parameters:
  ---------
  pose: Also known as 'theta', a [24,3] tensor indicating child joint rotation
  relative to parent joint. For the root joint it's the global orientation.
  Represented in an axis-angle format.

  betas: Parameter for model shape. A tensor of shape [10] as coefficients of
  PCA components. Only 10 components were released by the SMPL authors.

  trans: Global translation tensor of shape [3].

  Return:
  ------
  A tensor for vertices, and a numpy ndarray as face indices.
  """
  # For detailed comments see smpl_np.py
  with open(model_path, 'rb') as f:
    params = pickle.load(f)

  J_regressor = tf.constant(
      np.array(params['J_regressor'].todense(), dtype=np.float64))
  weights = tf.constant(params['weights'], dtype=np.float64)
  posedirs = tf.constant(params['posedirs'], dtype=np.float64)
  v_template = tf.constant(params['v_template'], dtype=np.float64)
  shapedirs = tf.constant(params['shapedirs'], dtype=np.float64)
  f = params['f']

  kintree_table = params['kintree_table']
  id_to_col = {kintree_table[1, i]: i for i in range(kintree_table.shape[1])}
  parent = {
      i: id_to_col[kintree_table[0, i]]
      for i in range(1, kintree_table.shape[1])
  }

  v_shaped = tf.tensordot(betas, shapedirs, axes=[[1], [2]]) + v_template
  J = tf.matmul(J_regressor, v_shaped)
  pose_cube = tf.reshape(pose, (-1, 1, 3))
  R_cube_big = rodrigues(pose_cube)

  if simplify:
    v_posed = v_shaped
  else:
    R_cube = R_cube_big[1:]
    I_cube = tf.expand_dims(tf.eye(3, dtype=tf.float64), axis=0) + \
        tf.zeros((R_cube.get_shape()[0], 3, 3), dtype=tf.float64)
    lrotmin = tf.squeeze(tf.reshape((R_cube - I_cube), (-1, 1)))
    v_posed = v_shaped + tf.tensordot(lrotmin, posedirs, axes=[[1], [2]])

  results = []
  results.append(
      with_zeros(
          tf.concat((R_cube_big[0], tf.reshape(J[0, :], (3, 1))), axis=1)))
  for i in range(1, kintree_table.shape[1]):
    results.append(
        tf.matmul(
            results[parent[i]],
            with_zeros(
                tf.concat((R_cube_big[i],
                           tf.reshape(J[i, :] - J[parent[i], :], (3, 1))),
                          axis=1))))

  stacked = tf.stack(results, axis=0)
  results = stacked - \
      pack(
          tf.matmul(
              stacked,
              tf.reshape(
                  tf.concat((J, tf.zeros((24, 1), dtype=tf.float64)), axis=1),
                  (24, 4, 1)
              )
          )
      )
  T = tf.tensordot(weights, results, axes=((1), (0)))
  rest_shape_h = tf.concat(
      (v_posed, tf.ones((v_posed.get_shape().as_list()[0], 1),
                        dtype=tf.float64)),
      axis=1)
  v = tf.matmul(T, tf.reshape(rest_shape_h, (-1, 4, 1)))
  v = tf.reshape(v, (-1, 4))[:, :3]
  result = v + tf.reshape(trans, (1, 3))
  return result
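# Toy illustration (shapes assumed, not taken from the SMPL release) of the
# blend-shape step used above: `shapedirs` maps the shape coefficients onto
# per-vertex offsets by contracting the coefficient axis with tensordot.
num_toy_verts = 5
betas_toy = tf.constant(np.random.randn(1, 10))
shapedirs_toy = tf.constant(np.random.randn(num_toy_verts, 3, 10))
offsets_toy = tf.tensordot(betas_toy, shapedirs_toy, axes=[[1], [2]])  # [1, 5, 3]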
def step_state(state):
  return state + tf.reduce_sum(
      input_tensor=tf.tensordot(data, state, ([1], [1])))
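# Minimal sketch of the contraction in `step_state` (shapes illustrative):
# axis 1 of `data` is contracted with axis 1 of `state`, the resulting matrix
# is summed to a scalar, and that scalar is broadcast-added to `state`.
data_toy = tf.ones([4, 3])
state_toy = tf.ones([2, 3])
step_toy = state_toy + tf.reduce_sum(
    tf.tensordot(data_toy, state_toy, ([1], [1])))   # shape [2, 3]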
def runMoreRnn(path=None, epochs=10, saveResult=True): trainData, validData, testData, wordId = loadWordIdsFromFiles() trainData = np.array(trainData, np.float32) # validData = np.array(validData, np.float32) testData = np.array(testData, np.float32) vocabSz = len(wordId) info = loadInfo('rnn', path) learnRate = info['learning rate'] batchSz = info['batch size'] embedSz = info['embed size'] rnnSz = info['rnn size'] winSz = info['win size'] numWin = (trainData.shape[0] - 1) // (batchSz * winSz) # each batch has winSz * numWin words batchLen = winSz * numWin testNumWin = (testData.shape[0] - 1) // (batchSz * winSz) testBatchLen = winSz * testNumWin inp = tf.placeholder(tf.int32, shape=[batchSz, winSz]) # ans = tf.placeholder(tf.int32, shape=[batchSz * winSz]) ans = tf.placeholder(tf.int32, shape=[batchSz, winSz]) E = tf.Variable(tf.random_normal([vocabSz, embedSz], stddev=0.1)) embed = tf.nn.embedding_lookup(E, inp) rnn = BasicRNNCell(rnnSz, activation='relu') initialState = rnn.zero_state(batchSz, tf.float32) output, nextState = tf.nn.dynamic_rnn(rnn, embed, initial_state=initialState) # output = tf.reshape(output, [batchSz * winSz, rnnSz]) W = tf.Variable(tf.random_normal([rnnSz, vocabSz], stddev=.1)) B = tf.Variable(tf.random_normal([vocabSz], stddev=.1)) # logits = tf.matmul(output, W) + B logits = tf.tensordot(output, W, [[2], [0]]) + B ents = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ans) loss = tf.reduce_sum(ents) train = tf.train.GradientDescentOptimizer(learnRate).minimize(loss) trainPerp = np.zeros(epochs + 1, dtype=np.float32) trainPerp[0] = info['train perplexity'] testPerp = np.zeros(epochs + 1, dtype=np.float32) testPerp[0] = info['test perplexity'] with tf.Session() as sess: loadSession(sess, 'rnn', path) startTime = time.time() epoch = 0 print('epoch:', end=' ') while epoch < epochs: epoch += 1 win = 0 state = sess.run(initialState) testState = sess.run(initialState) # print(state, testState) winStart, winEnd = 0, winSz while win < numWin: inInp = np.array([trainData[i * batchLen + winStart:i * batchLen + winEnd] for i in range(batchSz)]) # inAns = np.reshape(np.array([trainData[i * batchLen + winStart + 1: i * batchLen + winEnd + 1] for i in range(batchSz)]), batchSz * winSz) inAns = np.array([trainData[i * batchLen + winStart + 1: i * batchLen + winEnd + 1] for i in range(batchSz)]) _, state, outLoss = sess.run([train, nextState, loss], {inp: inInp, ans: inAns, nextState: state}) trainPerp[epoch] += outLoss if win < testNumWin: inInp = np.array([testData[i * testBatchLen + winStart:i * testBatchLen + winEnd] for i in range(batchSz)]) # inAns = np.reshape(np.array([testData[i * testBatchLen + winStart + 1: i * testBatchLen + winEnd + 1] for i in range(batchSz)]), batchSz * winSz) inAns = np.array([testData[i * testBatchLen + winStart + 1: i * testBatchLen + winEnd + 1] for i in range(batchSz)]) testState, testOutLoss = sess.run([nextState, loss], {inp: inInp, ans: inAns, nextState: testState}) testPerp[epoch] += testOutLoss winStart, winEnd = winEnd, winEnd + winSz win += 1 print(epoch + info['epochs'], end=' ') trainPerp[1:] = np.exp(trainPerp[1:] / (trainData.shape[0] // (batchSz * batchLen) * (batchSz * batchLen))) testPerp[1:] = np.exp(testPerp[1:] / (testData.shape[0] // (batchSz * testBatchLen) * (batchSz * testBatchLen))) print(f'\nelapsed: {time.time() - startTime}') print('train perplexity:', trainPerp[-1]) print('test perplexity:', testPerp[-1]) info['epochs'] += epochs info['train perplexity'] = trainPerp[-1] info['test perplexity'] = 
testPerp[-1] if saveResult: save(sess, info) drawPerplexity(trainPerp, testPerp, info['epochs'] - epochs)
for l in range(num_hidden_layers):
    current_layer = tf.layers.dense(previous_layer, num_hidden_neurons[l],
                                    activation=tf.nn.sigmoid)
    previous_layer = current_layer

dnn_output = tf.layers.dense(previous_layer, matrix_size)

with tf.name_scope('loss'):
    print("dnn_output = ", dnn_output)
    x_trial = tf.transpose(dnn_output)
    print("x_trial = ", x_trial)

    temp1 = (tf.tensordot(tf.transpose(x_trial), x_trial, axes=1) * A)
    temp2 = (1 - tf.tensordot(tf.transpose(x_trial),
                              tf.tensordot(A, x_trial, axes=1),
                              axes=1)) * np.eye(matrix_size)
    func = tf.tensordot((temp1 - temp2), x_trial, axes=1)
    print(temp1)
    print(temp2)
    print(func)

    func = tf.transpose(func)
    x_trial = tf.transpose(x_trial)
    loss = tf.losses.mean_squared_error(func, x_trial)

learning_rate = 0.001
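# A minimal training sketch for the loss defined above, assuming `A`,
# `matrix_size`, `num_hidden_layers`, `num_hidden_neurons` and the initial
# `previous_layer` come from the surrounding script; `num_iter` is an
# illustrative choice.
with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

num_iter = 10000
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(num_iter):
        sess.run(training_op)
        if i % 1000 == 0:
            print('iteration %d, loss %g' % (i, sess.run(loss)))
    x_estimate = sess.run(x_trial)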
def option_values(values, policy):
  return tf.tensordot(
      values[:, policy, Ellipsis], self._policy_weights[policy],
      axes=[1, 0])
def testPCgradBasic(self, denylist, allowlist, pcgrad_var_idx): tf.disable_eager_execution() for dtype in [tf.dtypes.float32, tf.dtypes.float64]: with self.session(graph=tf.Graph()): var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) const0_np = np.array([1., 0.], dtype=dtype.as_numpy_dtype) const1_np = np.array([-1., -1.], dtype=dtype.as_numpy_dtype) const2_np = np.array([-1., 1.], dtype=dtype.as_numpy_dtype) var0 = tf.Variable(var0_np, dtype=dtype, name='first_var/var0') var1 = tf.Variable(var1_np, dtype=dtype, name='second_var/var1') const0 = tf.constant(const0_np) const1 = tf.constant(const1_np) const2 = tf.constant(const2_np) loss0 = tf.tensordot(var0, const0, 1) + tf.tensordot(var1, const2, 1) loss1 = tf.tensordot(var0, const1, 1) + tf.tensordot(var1, const0, 1) learning_rate = lambda: 0.001 opt = tf.train.GradientDescentOptimizer(learning_rate) losses = loss0 + loss1 opt_grads = opt.compute_gradients(losses, var_list=[var0, var1]) pcgrad_opt = pcgrad.PCGrad( tf.train.GradientDescentOptimizer(learning_rate), denylist=denylist, allowlist=allowlist) pcgrad_col_opt = pcgrad.PCGrad( tf.train.GradientDescentOptimizer(learning_rate), use_collection_losses=True, denylist=denylist, allowlist=allowlist) losses = [loss0, loss1] pcgrad_grads = pcgrad_opt.compute_gradients( losses, var_list=[var0, var1]) tf.add_to_collection(pcgrad.PCGRAD_LOSSES_COLLECTION, loss0) tf.add_to_collection(pcgrad.PCGRAD_LOSSES_COLLECTION, loss1) pcgrad_grads_collection = pcgrad_col_opt.compute_gradients( None, var_list=[var0, var1]) with tf.Graph().as_default(): # Shouldn't return non-slot variables from other graphs. self.assertEmpty(opt.variables()) self.evaluate(tf.global_variables_initializer()) grad_vec, pcgrad_vec, pcgrad_col_vec = self.evaluate( [opt_grads, pcgrad_grads, pcgrad_grads_collection]) # Make sure that both methods take grads of the same vars. self.assertAllCloseAccordingToType(pcgrad_vec, pcgrad_col_vec) results = [{ 'var': var0, 'pcgrad_vec': [0.5, -1.5], 'result': [0.9995, 2.0015] }, { 'var': var1, 'pcgrad_vec': [0.5, 1.5], 'result': [2.9995, 3.9985] }] grad_var_idx = {0, 1}.difference(pcgrad_var_idx) self.assertAllCloseAccordingToType( grad_vec[0][0], [0.0, -1.0], atol=1e-5) self.assertAllCloseAccordingToType( grad_vec[1][0], [0.0, 1.0], atol=1e-5) pcgrad_vec_idx = 0 for var_idx in pcgrad_var_idx: self.assertAllCloseAccordingToType( pcgrad_vec[pcgrad_vec_idx][0], results[var_idx]['pcgrad_vec'], atol=1e-5) pcgrad_vec_idx += 1 for var_idx in grad_var_idx: self.assertAllCloseAccordingToType( pcgrad_vec[pcgrad_vec_idx][0], grad_vec[var_idx][0], atol=1e-5) pcgrad_vec_idx += 1 self.evaluate(opt.apply_gradients(pcgrad_grads)) self.assertAllCloseAccordingToType( self.evaluate([results[idx]['var'] for idx in pcgrad_var_idx]), [results[idx]['result'] for idx in pcgrad_var_idx])
def apply_ccm(image, ccm):
  """Applies a color correction matrix."""
  shape = tf.shape(image)
  image = tf.reshape(image, [-1, 3])
  image = tf.tensordot(image, ccm, axes=[[-1], [-1]])
  return tf.reshape(image, shape)
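# Hypothetical usage: apply a simple diagonal CCM to a random image. Because
# the contraction is axes=[[-1], [-1]], each output pixel is ccm @ pixel, so
# rows of `ccm` index output channels.
image = tf.random_uniform([8, 8, 3])
ccm = tf.constant([[1.0, 0.0, 0.0],
                   [0.0, 0.9, 0.0],
                   [0.0, 0.0, 1.1]])
corrected = apply_ccm(image, ccm)  # same shape as `image`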
def broadcast_matmul_train(x,
                           variational_params,
                           clip_alpha=None,
                           eps=common.EPSILON):
  R"""Training computation for VD matrix multiplication with N input matrices.

  Multiplies a 3D tensor `x` with a set of 2D parameters. Each 2D matrix
  `x[i, :, :]` in the input tensor is multiplied independently with the
  parameters, resulting in a 3D output tensor with shape
  `x.shape[:2] + weight_parameters[0].shape[1]`.

  Args:
    x: 3D Tensor representing the input batch.
    variational_params: 2-tuple of Tensors, where the first tensor is the
      unscaled weight values and the second is the log of the alpha values
      for the hard concrete distribution.
    clip_alpha: Int or None. If integer, we clip the log \alpha values to
      [-clip_alpha, clip_alpha]. If None, don't clip the values.
    eps: Small constant value to use in log and sqrt operations to avoid NaNs.

  Returns:
    Output Tensor of the batched matmul operation.

  Raises:
    RuntimeError: If the variational_params argument is not a 2-tuple.
  """
  theta, log_sigma2 = _verify_variational_params(variational_params)
  theta.get_shape().assert_has_rank(2)
  log_sigma2.get_shape().assert_has_rank(2)

  # The input data must be rank 2 or greater.
  assert x.get_shape().ndims >= 2
  input_rank = x.get_shape().ndims

  if clip_alpha is not None:
    # Compute the log_alphas and then compute the log_sigma2 again so that
    # we can clip on the log alpha magnitudes.
    log_alpha = common.compute_log_alpha(log_sigma2, theta, eps, clip_alpha)
    log_sigma2 = common.compute_log_sigma2(log_alpha, theta, eps)

  # Compute the mean and standard deviation of the distributions over the
  # activations.
  mu_activation = tf.tensordot(x, theta, [[input_rank - 1], [0]])
  var_activation = tf.tensordot(tf.square(x), tf.exp(log_sigma2),
                                [[input_rank - 1], [0]])
  std_activation = tf.sqrt(var_activation + eps)

  # Reshape the output back to the rank of the input.
  input_shape = x.get_shape().as_list()
  weight_shape = theta.get_shape().as_list()
  output_shape = input_shape[:-1] + [weight_shape[1]]
  mu_activation.set_shape(output_shape)
  std_activation.set_shape(output_shape)

  # NOTE: We sample noise for each weight in theta, which will be shared by
  # each matrix product that was done. This is equivalent to sampling the same
  # set of weights for all matrix products done by this op in an iteration.
  # The element-wise multiply below broadcasts.
  num_pad_dims = len(output_shape) - 2
  padding = [tf.constant(1, dtype=tf.int32) for _ in range(num_pad_dims)]

  # NOTE: On GPU, the first dim may not be defined w/ the Transformer. Create
  # a tf.Tensor from the list shape and TF should match the first dim
  # appropriately.
  batch_size = tf.shape(x)[0]
  data_dim = tf.shape(theta)[-1]
  noise_shape = tf.stack([batch_size] + padding + [data_dim], axis=0)

  output = mu_activation + std_activation * tf.random_normal(noise_shape)
  return output
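# Toy illustration (shapes assumed; relies on the module-level helpers
# referenced above, e.g. `common` and `_verify_variational_params`): the same
# weight noise is shared across all of the independent matrix products, which
# is exactly what the broadcasted `noise_shape` achieves.
x_toy = tf.random_normal([4, 7, 16])        # [batch, length, d_in]
theta_toy = tf.random_normal([16, 32])      # weight means
log_sigma2_toy = tf.fill([16, 32], -10.0)   # near-deterministic weights
y_toy = broadcast_matmul_train(x_toy, (theta_toy, log_sigma2_toy))
# y_toy has shape [4, 7, 32]; as log_sigma2 -> -inf it approaches
# tf.tensordot(x_toy, theta_toy, [[2], [0]]).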
def _static_subsample(self, indicator, batch_size, labels):
  """Returns subsampled minibatch.

  Args:
    indicator: boolean tensor of shape [N] whose True entries can be sampled.
      N should be a compile-time constant.
    batch_size: desired batch size. This scalar cannot be None.
    labels: boolean tensor of shape [N] denoting positive(=True) and negative
      (=False) examples. N should be a compile-time constant.

  Returns:
    sampled_idx_indicator: boolean tensor of shape [N], True for entries which
      are sampled. It ensures the length of output of the subsample is always
      batch_size, even when number of examples set to True in indicator is
      less than batch_size.

  Raises:
    ValueError: if labels and indicator are not 1D boolean tensors.
  """
  # Check if indicator and labels have a static size.
  if not indicator.shape.is_fully_defined():
    raise ValueError('indicator must be static in shape when is_static is '
                     'True')
  if not labels.shape.is_fully_defined():
    raise ValueError('labels must be static in shape when is_static is '
                     'True')
  if not isinstance(batch_size, int):
    raise ValueError('batch_size has to be an integer when is_static is '
                     'True.')

  input_length = tf.shape(indicator)[0]

  # Set the number of examples set True in indicator to be at least
  # batch_size.
  num_true_sampled = tf.reduce_sum(tf.cast(indicator, tf.float32))
  additional_false_sample = tf.less_equal(
      tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)),
      batch_size - num_true_sampled)
  indicator = tf.logical_or(indicator, additional_false_sample)

  # Shuffle indicator and label. Need to store the permutation to restore the
  # order post sampling.
  permutation = tf.random_shuffle(tf.range(input_length))
  indicator = ops.matmul_gather_on_zeroth_axis(
      tf.cast(indicator, tf.float32), permutation)
  labels = ops.matmul_gather_on_zeroth_axis(
      tf.cast(labels, tf.float32), permutation)

  # index (starting from 1) when indicator is True, 0 when False
  indicator_idx = tf.where(
      tf.cast(indicator, tf.bool), tf.range(1, input_length + 1),
      tf.zeros(input_length, tf.int32))

  # Replace -1 for negative, +1 for positive labels
  signed_label = tf.where(
      tf.cast(labels, tf.bool), tf.ones(input_length, tf.int32),
      tf.scalar_mul(-1, tf.ones(input_length, tf.int32)))

  # negative of index for negative label, positive index for positive label,
  # 0 when indicator is False.
  signed_indicator_idx = tf.multiply(indicator_idx, signed_label)
  sorted_signed_indicator_idx = tf.nn.top_k(
      signed_indicator_idx, input_length, sorted=True).values

  [num_positive_samples,
   num_negative_samples] = self._get_num_pos_neg_samples(
       sorted_signed_indicator_idx, batch_size)

  sampled_idx = self._get_values_from_start_and_end(
      sorted_signed_indicator_idx, num_positive_samples,
      num_negative_samples, batch_size)

  # Shift the indices to start from 0 and remove any samples that are set as
  # False.
  sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32)
  sampled_idx = tf.multiply(
      tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32),
      sampled_idx)

  sampled_idx_indicator = tf.cast(
      tf.reduce_sum(tf.one_hot(sampled_idx, depth=input_length), axis=0),
      tf.bool)

  # project back the order based on stored permutations
  reprojections = tf.one_hot(permutation, depth=input_length,
                             dtype=tf.float32)
  return tf.cast(
      tf.tensordot(tf.cast(sampled_idx_indicator, tf.float32),
                   reprojections, axes=[0, 0]), tf.bool)
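# Standalone sketch of the permutation-inversion trick used above: a shuffled
# vector is mapped back to its original order by contracting it with the
# one-hot matrix of the permutation (all values here are illustrative).
values = tf.constant([1., 0., 1., 0.])
permutation = tf.constant([2, 0, 3, 1])
shuffled = tf.gather(values, permutation)
reprojection = tf.one_hot(permutation, depth=4, dtype=tf.float32)
restored = tf.tensordot(shuffled, reprojection, axes=[0, 0])  # == values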
def attention(ratelayer, inputs, tag, attention_size=32):
    ratelayer.attention_size = attention_size
    ratelayer.tag = tag
    if isinstance(inputs, tuple):
        print("Attention layer - inputs is tuple, concat")
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if ratelayer.time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.transpose(inputs, [1, 0, 2])

    hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer
    print("hidden_size in attention layer", hidden_size)
    print("Att input shape", inputs.shape)

    # Trainable parameters
    with tf.variable_scope('v_' + ratelayer.tag):
        w_omega = tf.get_variable(
            initializer=tf.random_normal(
                [hidden_size + FLAGS.latent_dim, ratelayer.attention_size],
                stddev=0.1),
            name='w_omega')
        ratelayer.vars['w_omega'] = w_omega
        # b_omega = tf.get_variable(initializer=tf.random_normal(
        #     [ratelayer.attention_size], stddev=0.1), name='b_omega')
        # ratelayer.vars['b_omega'] = b_omega
        u_omega = tf.get_variable(
            initializer=tf.random_normal([ratelayer.attention_size], stddev=0.1),
            name='u_omega')
        ratelayer.vars['u_omega'] = u_omega
        b_v = tf.get_variable(
            initializer=tf.random_normal([1], stddev=0.1), name='b_v')
        ratelayer.vars['b_v'] = b_v

        # init for projection vars
        ratelayer.vars['project_' + ratelayer.tag] = tf.get_variable(
            initializer=tf.random_normal(
                [FLAGS.latent_dim, FLAGS.latent_dim], stddev=0.1),
            name='project_' + ratelayer.tag + '_matrix')
        ratelayer.vars['project_bias_' + ratelayer.tag] = tf.get_variable(
            initializer=tf.random_normal([FLAGS.latent_dim], stddev=0.1),
            name='b_projection_' + ratelayer.tag)

        # transform and tile
        ratelayer.vars['projected_' + ratelayer.tag + '_latent'] = \
            dot(ratelayer.vars[ratelayer.tag + '_latent'],
                ratelayer.vars['project_' + ratelayer.tag]) \
            + ratelayer.vars['project_bias_' + ratelayer.tag]
        ratelayer.vars['projected_' + ratelayer.tag + '_latent'] = \
            tf.nn.sigmoid(ratelayer.vars['projected_' + ratelayer.tag + '_latent'])
        projected_latent = tf.tile(
            tf.expand_dims(
                ratelayer.vars['projected_' + ratelayer.tag + '_latent'], axis=0),
            [inputs.shape[0], 1, 1])

    # Concatenate and apply non-linear additive attention, as in
    # https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html
    v1 = tf.concat([inputs, projected_latent], axis=2)
    v = tf.tanh(tf.tensordot(v1, w_omega, axes=1))
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')
    # For each of the timestamps its vector of size A from `v` is reduced with
    # the `u` vector.
    print("vu shape", vu.shape)  # vu shape (4, 2005)
    alphas = tf.nn.softmax(vu, name='alphas', axis=0)  # (B,T) shape

    # Output of (Bi-)RNN is reduced with attention vector; the result has
    # (B,D) shape.
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 0)

    return output, alphas
def body(self, features): hparams = self._hparams ps_devices = self._ps_devices single_device = (len(ps_devices) == 1) assert hparams.num_model_shards % len(ps_devices) == 0 shards_per_device = hparams.num_model_shards // len(ps_devices) model_devices = [ ps_devices[i // shards_per_device] for i in range(hparams.num_model_shards) ] print("model_devices = %s" % model_devices) mp = expert_utils.Parallelism(model_devices, reuse=False) targets_vocab_size = self._problem_hparams.vocabulary[ "targets"].vocab_size # squeeze out channels, heights targets = tf.squeeze(features["targets_raw"], [2, 3]) targets_embedding_var = mp( tf.get_variable, "embedding", [[targets_vocab_size, hparams.model_d]] * mp.n, initializer=tf.random_normal_initializer(0.0, hparams.model_d**-0.5)) shifted_targets = common_layers.shift_right_2d(targets) # Bypass the symbol modality and use a different embedding on each shard. if single_device: targets_embedding_var_combined = tf.concat(targets_embedding_var, 1) decoder_input_combined = common_layers.embedding( shifted_targets, targets_vocab_size, hparams.model_d * mp.n, multiplier=hparams.model_d**0.5, embedding_var=targets_embedding_var_combined, ) decoder_input = tf.split(decoder_input_combined, mp.n, axis=2) else: targets_embedding_var_combined = None decoder_input = mp( common_layers.embedding, shifted_targets, targets_vocab_size, hparams.model_d, multiplier=hparams.model_d**0.5, embedding_var=targets_embedding_var, ) decoder_self_attention_bias = mp( common_attention.attention_bias_lower_triangle, tf.shape(targets)[1]) if "targets_segmentation" in features: # "Packed" dataset - keep the examples from seeing each other. targets_segmentation = features["targets_segmentation"] targets_position = features["targets_position"] decoder_self_attention_bias = mp( tf.add, decoder_self_attention_bias, mp(common_attention.attention_bias_same_segment, targets_segmentation, targets_segmentation)) decoder_input = mp( common_attention.add_timing_signal_1d_given_position, decoder_input, targets_position) else: targets_position = None decoder_self_attention_bias = mp( common_attention.attention_bias_lower_triangle, tf.shape(targets)[1]) decoder_input = mp(common_attention.add_timing_signal_1d, decoder_input) if self.has_input: inputs = tf.squeeze(features["inputs_raw"], [2, 3]) inputs_vocab_size = self._problem_hparams.vocabulary[ "inputs"].vocab_size # share everything for now share_inputs_and_targets_embedding = True if share_inputs_and_targets_embedding: assert inputs_vocab_size == targets_vocab_size inputs_embedding_var = targets_embedding_var inputs_embedding_var_combined = targets_embedding_var_combined if single_device: encoder_input_combined = common_layers.embedding( inputs, inputs_vocab_size, hparams.model_d * mp.n, multiplier=hparams.model_d**0.5, embedding_var=inputs_embedding_var_combined, ) encoder_input = tf.split(encoder_input_combined, mp.n, axis=2) else: encoder_input = mp( common_layers.embedding, inputs, inputs_vocab_size, hparams.model_d, multiplier=hparams.model_d**0.5, embedding_var=inputs_embedding_var, ) if "inputs_segmentation" in features: # "Packed" dataset - keep the examples from seeing each other. 
inputs_segmentation = features["inputs_segmentation"] inputs_position = features["inputs_position"] encoder_self_attention_bias = mp( common_attention.attention_bias_same_segment, inputs_segmentation, inputs_segmentation) encoder_decoder_attention_bias = mp( common_attention.attention_bias_same_segment, targets_segmentation, inputs_segmentation) encoder_input = mp( common_attention.add_timing_signal_1d_given_position, encoder_input, inputs_position) else: encoder_padding = tf.to_float(tf.equal(inputs, 0)) ignore_padding = common_attention.attention_bias_ignore_padding( encoder_padding) encoder_self_attention_bias = ignore_padding encoder_decoder_attention_bias = ignore_padding inputs_position = None encoder_input = mp(common_attention.add_timing_signal_1d, encoder_input) # encoder stack here with tf.variable_scope("encoder"): encoder_input = mp(tf.nn.dropout, encoder_input, 1.0 - hparams.layer_prepostprocess_dropout) encoder_output = _layer_stack(mp, encoder_input, encoder_self_attention_bias, hparams.encoder_layers, hparams) else: encoder_decoder_attention_bias = None encoder_output = None with tf.variable_scope("decoder"): decoder_input = mp(tf.nn.dropout, decoder_input, 1.0 - hparams.layer_prepostprocess_dropout) decoder_output = _layer_stack( mp, decoder_input, decoder_self_attention_bias, layers=hparams.decoder_layers, hparams=hparams, encoder_output=encoder_output, encoder_decoder_attention_bias=encoder_decoder_attention_bias) # Bypass the symbol modality and compute logits directly. # We compute a different set of logits on each shard, and sum them. # Share the weights with the target embedding. output_var = targets_embedding_var output_var_combined = targets_embedding_var_combined if single_device: decoder_output = tf.concat(decoder_output, 2) logits = tf.tensordot(decoder_output, output_var_combined, [[2], [1]]) num, denom = common_layers.padded_cross_entropy( logits, targets, hparams.label_smoothing) training_loss = num / denom else: logits = mp(tf.tensordot, decoder_output, output_var, [[[2], [1]]] * mp.n) logits = expert_utils.all_reduce_ring(logits, mp) # On each device, we compute the loss for a part of the batch. # This is faster than computing the whole loss on one shard. mp, logits = expert_utils.reduce_by_device(mp, logits, lambda l: l[0]) def _loss_for_shard(logits, targets, shard): logits = common_layers.approximate_split(logits, mp.n, 0)[shard] targets = common_layers.approximate_split(targets, mp.n, 0)[shard] return common_layers.padded_cross_entropy( logits, targets, hparams.label_smoothing) num, denom = mp(_loss_for_shard, logits, targets, range(mp.n)) training_loss = tf.add_n(num) / tf.add_n(denom) logits = logits[0] logits = tf.expand_dims(tf.expand_dims(logits, 2), 3) # override training loss so that it is not computed externally. losses = {"training": training_loss} return logits, losses
def graph_fn(tensora, tensorb):
  return tf.tensordot(tensora, tensorb, axes=1)
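# Minimal usage example: with axes=1 the last axis of `tensora` is contracted
# with the first axis of `tensorb`, i.e. ordinary matrix-vector multiplication
# for 2-D/1-D inputs.
a = tf.constant([[1., 2.], [3., 4.]])
b = tf.constant([10., 100.])
result = graph_fn(a, b)  # -> [210., 430.]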
def boolean_mask(boxlist, indicator, fields=None, scope=None,
                 use_static_shapes=False, indicator_sum=None):
  """Select boxes from BoxList according to indicator and return new BoxList.

  `boolean_mask` returns the subset of boxes that are marked as "True" by the
  indicator tensor. By default, `boolean_mask` returns boxes corresponding to
  the input index list, as well as all additional fields stored in the boxlist
  (indexing into the first dimension). However one can optionally only draw
  from a subset of fields.

  Args:
    boxlist: BoxList holding N boxes
    indicator: a rank-1 boolean tensor
    fields: (optional) list of fields to also gather from. If None (default),
      all fields are gathered from. Pass an empty fields list to only gather
      the box coordinates.
    scope: name scope.
    use_static_shapes: Whether to use an implementation with static shape
      guarantees.
    indicator_sum: An integer containing the sum of `indicator` vector. Only
      required if `use_static_shapes` is True.

  Returns:
    subboxlist: a BoxList corresponding to the subset of the input BoxList
      specified by indicator

  Raises:
    ValueError: if `indicator` is not a rank-1 boolean tensor.
  """
  with tf.name_scope(scope, 'BooleanMask'):
    if indicator.shape.ndims != 1:
      raise ValueError('indicator should have rank 1')
    if indicator.dtype != tf.bool:
      raise ValueError('indicator should be a boolean tensor')
    if use_static_shapes:
      if not (indicator_sum and isinstance(indicator_sum, int)):
        raise ValueError('`indicator_sum` must be of type int')
      selected_positions = tf.cast(indicator, dtype=tf.float32)
      indexed_positions = tf.cast(
          tf.multiply(tf.cumsum(selected_positions), selected_positions),
          dtype=tf.int32)
      one_hot_selector = tf.one_hot(
          indexed_positions - 1, indicator_sum, dtype=tf.float32)
      sampled_indices = tf.cast(
          tf.tensordot(
              tf.cast(tf.range(tf.shape(indicator)[0]), dtype=tf.float32),
              one_hot_selector,
              axes=[0, 0]),
          dtype=tf.int32)
      return gather(boxlist, sampled_indices, use_static_shapes=True)
    else:
      subboxlist = box_list.BoxList(tf.boolean_mask(boxlist.get(), indicator))
      if fields is None:
        fields = boxlist.get_extra_fields()
      for field in fields:
        if not boxlist.has_field(field):
          raise ValueError('boxlist must contain all specified fields')
        subfieldlist = tf.boolean_mask(boxlist.get_field(field), indicator)
        subboxlist.add_field(field, subfieldlist)
      return subboxlist
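# Standalone sketch of the static-shape index extraction in the
# `use_static_shapes` branch above (values are illustrative): the indices of
# the True entries are recovered with a fixed output shape via one-hot +
# tensordot.
indicator = tf.constant([True, False, True, True, False])
indicator_sum = 3
selected = tf.cast(indicator, tf.float32)
indexed = tf.cast(tf.cumsum(selected) * selected, tf.int32)
one_hot_selector = tf.one_hot(indexed - 1, indicator_sum, dtype=tf.float32)
sampled_indices = tf.cast(
    tf.tensordot(tf.cast(tf.range(tf.shape(indicator)[0]), tf.float32),
                 one_hot_selector, axes=[0, 0]),
    tf.int32)  # -> [0, 2, 3]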
def startLstm(epochs=10, saveResult=True): trainData, validData, testData, wordId = loadWordIdsFromFiles() trainData = np.array(trainData, np.float32) # validData = np.array(validData, np.float32) testData = np.array(testData, np.float32) vocabSz = len(wordId) learnRate = 0.001 embedSz = 128 rnnSz, batchSz, winSz = 512, 10, 5 numWin = (trainData.shape[0] - 1) // (batchSz * winSz) # each batch has winSz * numWin words batchLen = winSz * numWin testNumWin = (testData.shape[0] - 1) // (batchSz * winSz) testBatchLen = winSz * testNumWin inp = tf.placeholder(tf.int32, shape=[batchSz, winSz]) # ans = tf.placeholder(tf.int32, shape=[batchSz * winSz]) ans = tf.placeholder(tf.int32, shape=[batchSz, winSz]) E = tf.Variable(tf.random_normal([vocabSz, embedSz], stddev=0.1)) embed = tf.nn.embedding_lookup(E, inp) rnn = LSTMCell(rnnSz) initialState = rnn.zero_state(batchSz, tf.float32) output, nextState = tf.nn.dynamic_rnn(rnn, embed, initial_state=initialState) # output = tf.reshape(output, [batchSz * winSz, rnnSz]) W = tf.Variable(tf.random_normal([rnnSz, vocabSz], stddev=.1)) B = tf.Variable(tf.random_normal([vocabSz], stddev=.1)) # logits = tf.matmul(output, W) + B logits = tf.tensordot(output, W, [[2], [0]]) + B ents = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ans) loss = tf.reduce_sum(ents) train = tf.train.GradientDescentOptimizer(learnRate).minimize(loss) trainPerp = np.zeros(epochs, dtype=np.float32) testPerp = np.zeros(epochs, dtype=np.float32) with tf.Session() as sess: startTime = time.time() sess.run(tf.global_variables_initializer()) epoch = 0 print('epoch:', end=' ') while epoch < epochs: win = 0 inState = sess.run(initialState) testState = sess.run(initialState) # print(inState, testState) winStart, winEnd = 0, winSz while win < numWin: inInp = np.array([trainData[i * batchLen + winStart:i * batchLen + winEnd] for i in range(batchSz)]) # inAns = np.reshape(np.array([trainData[i * batchLen + winStart + 1: i * batchLen + winEnd + 1] for i in range(batchSz)]), batchSz * winSz) inAns = np.array([trainData[i * batchLen + winStart + 1: i * batchLen + winEnd + 1] for i in range(batchSz)]) _, inState, outLoss = sess.run([train, nextState, loss], {inp: inInp, ans: inAns, nextState: inState}) trainPerp[epoch] += outLoss if win < testNumWin: inInp = np.array([testData[i * testBatchLen + winStart:i * testBatchLen + winEnd] for i in range(batchSz)]) # inAns = np.reshape(np.array([testData[i * testBatchLen + winStart + 1: i * testBatchLen + winEnd + 1] for i in range(batchSz)]), batchSz * winSz) inAns = np.array([testData[i * testBatchLen + winStart + 1: i * testBatchLen + winEnd + 1] for i in range(batchSz)]) testState, testOutLoss = sess.run([nextState, loss], {inp: inInp, ans: inAns, nextState: testState}) testPerp[epoch] += testOutLoss winStart, winEnd = winEnd, winEnd + winSz win += 1 epoch += 1 print(epoch, end=' ') trainPerp = np.exp(trainPerp / (trainData.shape[0] // (batchSz * batchLen) * (batchSz * batchLen))) testPerp = np.exp(testPerp / (testData.shape[0] // (batchSz * testBatchLen) * (batchSz * testBatchLen))) print(f'\nelapsed: {time.time() - startTime}') print('train perplexity:', trainPerp[-1]) print('test perplexity:', testPerp[-1]) info = {'style': 'lstm', 'batch size': batchSz, 'embed size': embedSz, 'rnn size': rnnSz, 'win size': winSz, 'learning rate': learnRate, 'epochs': epochs, 'train perplexity': trainPerp[-1], 'test perplexity': testPerp[-1]} if saveResult: save(sess, info) drawPerplexity(trainPerp, testPerp)