def cnn(self, name_scope, char_embedded):
    """Encode character embeddings with two stacked convolutions per filter size.

    For every size in ``self.config.filter_sizes`` the input goes through
    conv -> bias -> ReLU twice (variable scopes ``<name_scope>_conv1_<size>``
    and ``<name_scope>_conv2_<size>``) and is then max-pooled.  The pooled
    maps of all sizes are concatenated on the channel axis and reshaped to
    ``[batch_size, -1, n_filter * len(filter_sizes)]``.

    Args:
        name_scope: Prefix used to build the variable-scope names.
        char_embedded: Character embedding tensor; a trailing channel axis
            is appended before the convolutions.
            # assumes axis 1 has length config.char_max_len and axis 2 has
            # length config.char_embedding_dim - TODO confirm with callers

    Returns:
        The reshaped concatenation of all pooled feature maps.
    """
    cfg = self.config
    # conv2d expects a channel axis, so add one of size 1 at the end.
    conv_input = tf.expand_dims(char_embedded, -1)

    branch_outputs = []
    for width in cfg.filter_sizes:
        # First convolution spans the whole embedding dimension.
        with tf.variable_scope(f"{name_scope}_conv1_{width}"):
            kernel = weight_variable(
                shape=[width, cfg.char_embedding_dim, 1, cfg.n_filter],
                name='w_filter')
            bias = bias_variable(shape=[cfg.n_filter], name='beta_filter')
            conv_out = tf.nn.conv2d(conv_input,
                                    kernel,
                                    strides=[1, 1, 1, 1],
                                    padding="VALID",
                                    name="conv")
            hidden = tf.nn.relu(tf.nn.bias_add(conv_out, bias), name="relu")

        # Second convolution works on the n_filter channels of the first.
        with tf.variable_scope(f"{name_scope}_conv2_{width}"):
            kernel = weight_variable(
                shape=[width, 1, cfg.n_filter, cfg.n_filter],
                name='w_filter')
            bias = bias_variable(shape=[cfg.n_filter], name='beta_filter')
            conv_out = tf.nn.conv2d(hidden,
                                    kernel,
                                    strides=[1, 1, 1, 1],
                                    padding="VALID",
                                    name="conv")
            hidden = tf.nn.relu(tf.nn.bias_add(conv_out, bias), name="relu")

        # Two stride-1 VALID convolutions of height `width` each shorten
        # axis 1 by (width - 1); the pooling window below is
        # char_max_len - 2 * width + 2 tall.
        pool_height = cfg.char_max_len - width * 2 + 2
        branch_outputs.append(
            tf.nn.max_pool(hidden,
                           ksize=[1, pool_height, 1, 1],
                           strides=[1, 1, 1, 1],
                           padding='VALID',
                           name="pool"))

    # Join the per-size feature maps on the channel axis.
    merged = tf.concat(branch_outputs, 3)
    feature_dim = cfg.n_filter * len(cfg.filter_sizes)
    return tf.reshape(merged, [cfg.batch_size, -1, feature_dim])
def textcnn(self, X_inputs, n_step):
    """TextCNN encoder: parallel convolutions followed by max-pooling.

    One conv -> ReLU -> max-pool branch is built for every entry of
    ``self.settings.filter_sizes`` (variable scope ``conv-maxpool-<size>``).
    The branch outputs are concatenated on the channel axis and flattened.

    Args:
        X_inputs: Input tensor; a trailing channel axis is appended before
            the convolutions.
            # assumes shape [batch, n_step, hidden_size * 2 + embedding_dim]
            # - TODO confirm with callers
        n_step: Sequence length used to size the pooling window
            (``n_step - filter_size + 1``).

    Returns:
        Tensor reshaped to ``[-1, self.n_filter_total]``.
    """
    cfg = self.settings
    conv_input = tf.expand_dims(X_inputs, -1)

    branch_outputs = []
    for width in cfg.filter_sizes:
        with tf.variable_scope("conv-maxpool-%s" % width):
            # Convolution layer: the kernel spans the full feature width.
            kernel_shape = [
                width,
                cfg.hidden_size * 2 + cfg.embedding_dim,
                1,
                cfg.n_filter,
            ]
            kernel = weight_variable(shape=kernel_shape, name='W_filter')
            feature_map = tf.nn.relu(
                tf.nn.conv2d(conv_input,
                             kernel,
                             strides=[1, 1, 1, 1],
                             padding="VALID",
                             name="conv"),
                name="relu")
            # Max-pool over the convolution outputs.
            branch_outputs.append(
                tf.nn.max_pool(feature_map,
                               ksize=[1, n_step - width + 1, 1, 1],
                               strides=[1, 1, 1, 1],
                               padding='VALID',
                               name="pool"))

    merged = tf.concat(branch_outputs, 3)
    # shape = [batch_size, self.n_filter_total]
    return tf.reshape(merged, [-1, self.n_filter_total])
def __init__(self, inputNode, dictionary, alpha, lr):
    """Build per-class sparse-coding variables, losses and update ops.

    Args:
        inputNode: Rank-2 tensor with a static shape ``[batch, features]``.
        dictionary: Rank-3 tensor/variable with a static shape
            ``[num_class, dict_size, features]``.
        alpha: Threshold used by the soft-threshold activation, weight of
            the L1 term in ``self.loss`` and scale of the random potential
            initialiser (upper bound ``1.05 * alpha``).
        lr: Learning rate of the Adam optimizer that updates the potential.

    Attributes created (all graph nodes):
        potential_init, potential, activation, recon, recon_error,
        l1_sparsity, nnz, loss, calc_activation, reset_potential,
        train_step.
    """
    # The unpacking also enforces rank 2 / rank 3 on the two inputs.
    batch, _ = inputNode.get_shape().as_list()
    num_class, dict_size, _ = dictionary.get_shape().as_list()
    act_shape = [num_class, batch, dict_size]

    self.potential_init = tf.random_uniform(act_shape,
                                            0,
                                            1.05 * alpha,
                                            dtype=tf.float32)
    self.potential = utils.weight_variable(act_shape, "potential", 1e-3)
    self.activation = utils.weight_variable(act_shape, "activation", 1e-3)

    # Reconstruct the input from each class dictionary; the input gets a
    # leading axis of size 1 so it broadcasts over the class axis.
    self.recon = tf.matmul(self.activation, dictionary)
    residual = tf.expand_dims(inputNode, 0) - self.recon

    # Per-class quantities: reduce over the batch and feature/dict axes.
    self.recon_error = 0.5 * tf.reduce_sum(residual**2, axis=[1, 2])
    self.l1_sparsity = tf.reduce_sum(tf.abs(self.activation), axis=[1, 2])
    nonzero_count = tf.count_nonzero(self.activation, axis=[1, 2])
    self.nnz = nonzero_count / (batch * dict_size)
    self.loss = self.recon_error + alpha * self.l1_sparsity

    # Soft threshold: sign(u) * relu(|u| - alpha).
    sign = tf.sign(self.potential)
    magnitude = tf.nn.relu(tf.abs(self.potential) - alpha)
    self.calc_activation = self.activation.assign(sign * magnitude)
    self.reset_potential = self.potential.assign(self.potential_init)

    optimizer = tf.train.AdamOptimizer(lr)
    # Gradient of the reconstruction error w.r.t. the activation ...
    grad_wrt_act, _ = optimizer.compute_gradients(self.recon_error,
                                                  [self.activation])[0]
    # ... is applied, together with the shrinkage term, to the potential.
    shrinkage = self.potential - self.activation
    potential_update = [(grad_wrt_act + shrinkage, self.potential)]
    self.train_step = optimizer.apply_gradients(potential_update)
def __init__(self,
             inputNode,
             l1_weight,
             dict_size,
             sc_lr,
             dict_lr,
             layer_type=None,
             patch_size=None,
             stride=None,
             mask=None):
    """Build a single LCA sparse-coding layer and the ops that drive it.

    Args:
        inputNode: Tensor with a fully defined static shape, either
            ``[batch, features]`` or ``[batch, size, features]``.
        l1_weight: Threshold subtracted from the potential before the ReLU,
            weight of the sparsity term in the loss, and scale of the
            random potential initialiser (``[0.8, 1.1) * l1_weight``).
        dict_size: Number of dictionary elements.
        sc_lr: Learning rate of the Adam optimizer updating the potential.
        dict_lr: Learning rate of the Adam optimizer updating the dictionary.
        layer_type: ``"sc_fc"`` or ``"sc_conv"``; required.
        patch_size: Dictionary patch length (only read for non-"sc_fc").
        stride: Stride (only read for non-"sc_fc"); must divide the input
            size.
        mask: Not used by this method.

    Attributes created:
        model: dict of the layer's tensors and variables.
        calc_activation, reset_potential, train_step, reset_opt,
        update_D, normalize_D: graph ops.
    """
    layer_in = inputNode
    #Model variables and outputs
    self.model = {}
    assert (layer_type is not None)
    is_fc = ("sc_fc" == layer_type)

    with tf.name_scope("lca_layer"):
        in_shape = layer_in.get_shape().as_list()
        if (len(in_shape) == 3):
            batch, input_size, input_features = in_shape
        else:
            batch, input_features = in_shape

        if (is_fc):
            # Fully connected: flatten everything except the batch axis.
            layer_in = tf.reshape(layer_in, [batch, -1])
            input_features = layer_in.get_shape().as_list()[1]
            weights_shape = [input_features, dict_size]
            act_shape = [batch, dict_size]
            reduce_axis = [1]
        else:
            weights_shape = [patch_size, input_features, dict_size]
            assert (input_size % stride == 0)
            act_shape = [batch, input_size // stride, dict_size]
            reduce_axis = [1, 2]

        dictionary = utils.l2_weight_variable(weights_shape, "dictionary")
        potential = utils.weight_variable(act_shape, "potential", std=1e-3)
        activation = utils.weight_variable(act_shape,
                                           "activation",
                                           std=1e-3)

        # Reconstruction of the input from the current activation.
        if (is_fc):
            recon = tf.matmul(activation, dictionary, transpose_b=True)
        elif ("sc_conv" == layer_type):
            recon = tf.contrib.nn.conv1d_transpose(
                activation,
                dictionary, [batch, input_size, input_features],
                stride,
                padding='SAME')
        else:
            assert (0)

        error = layer_in - recon
        # Sum over the per-sample axes, then average over the batch.
        squared_error = tf.reduce_sum(error**2, axis=reduce_axis)
        recon_error = 0.5 * tf.reduce_mean(squared_error)
        abs_activation = tf.reduce_sum(tf.abs(activation), axis=reduce_axis)
        l1_sparsity = tf.reduce_mean(abs_activation)
        loss = recon_error + 0.5 * l1_weight * l1_sparsity

        self.model.update(error=error,
                          recon_error=recon_error,
                          potential=potential,
                          activation=activation,
                          recon=recon,
                          l1_sparsity=l1_sparsity,
                          loss=loss)

        #Ops
        # Activation is the potential thresholded at l1_weight.
        thresholded = tf.nn.relu(potential - l1_weight)
        self.calc_activation = activation.assign(thresholded)

        init_low = .8 * l1_weight
        init_high = 1.1 * l1_weight
        potential_init = tf.random_uniform(act_shape,
                                           init_low,
                                           init_high,
                                           dtype=tf.float32)
        self.reset_potential = potential.assign(potential_init)

        #Save all variables
        self.model.update(dictionary=dictionary,
                          output=activation,
                          input=layer_in)

    with tf.name_scope("stats"):
        #Calculate stats
        num_total_act = 1
        for dim in act_shape:
            num_total_act *= dim
        nnz = tf.count_nonzero(activation) / num_total_act

        #Calculate means/std of activations
        #Do this across batches
        #Normalize each feature/dictionary element individually
        if (len(act_shape) == 3):
            stat_axes = [0, 1]
        elif (len(act_shape) == 2):
            stat_axes = 0
        else:
            assert (0)

        act_norm = tf.norm(activation, axis=stat_axes, keepdims=True)
        act_mean, act_var = tf.nn.moments(activation,
                                          axes=stat_axes,
                                          keep_dims=True)
        act_std = tf.sqrt(act_var)
        act_max = tf.reduce_max(activation)

        pot_norm = tf.norm(potential, axis=stat_axes, keepdims=True)
        pot_mean, pot_var = tf.nn.moments(potential,
                                          axes=stat_axes,
                                          keep_dims=True)
        pot_std = tf.sqrt(pot_var)

        input_norm = tf.norm(layer_in, axis=stat_axes)
        output_norm = tf.norm(activation, axis=stat_axes)

        self.model.update(nnz=nnz,
                          act_norm=act_norm,
                          act_mean=act_mean,
                          act_std=act_std,
                          act_max=act_max,
                          pot_norm=pot_norm,
                          pot_mean=pot_mean,
                          pot_std=pot_std,
                          input_norm=input_norm,
                          output_norm=output_norm)

    with tf.name_scope("optimizer"):
        #Define optimizer
        #TODO different learning rates?
        opt = tf.train.AdamOptimizer(sc_lr)
        #Calculate recon gradient wrt activation
        recon_grad = opt.compute_gradients(self.model["recon_error"],
                                           self.model["activation"])
        #Apply gradient (plus shrinkage) to potential
        #Needs to be a list of (gradient, wrt) tuples, one per gradient
        grad, _ = recon_grad[0]
        shrink_term = (1 / batch) * (potential - activation)
        self.train_step = opt.apply_gradients([(grad + shrink_term,
                                                potential)])
        #Reset must be defined after apply_gradients, which creates the
        #optimizer's own variables
        self.reset_opt = tf.group([v.initializer for v in opt.variables()])

        #Dictionary update variables
        opt_D = tf.train.AdamOptimizer(dict_lr)
        self.update_D = opt_D.minimize(self.model["recon_error"],
                                       var_list=[self.model["dictionary"]])

        #Normalize D
        dict_rank = len(dictionary.get_shape().as_list())
        if (dict_rank == 3):
            dict_norm = tf.norm(dictionary, axis=(0, 1))
        elif (dict_rank == 2):
            dict_norm = tf.norm(dictionary, axis=0)
        else:
            assert (0)
        self.normalize_D = dictionary.assign(dictionary / dict_norm)
def __init__(self):
    """Build the 'bigru' graph: inputs, embedding, two encoders, FC, output.

    Title and detail token ids share one embedding matrix.  Each is encoded
    by ``self.stack_bi_gru_layer`` followed by ``attention_layer``; the two
    results are concatenated, passed through a bias-free FC + ReLU layer and
    an affine output layer.  Loss and train op come from ``add_loss`` and
    ``add_train_op``.
    """
    self.model_name = 'bigru'
    self.settings = BiGRUSetting()
    self.max_f1 = 0.0
    self.is_training = True
    cfg = self.settings

    with tf.name_scope('Inputs'):
        self.title_input = tf.placeholder(tf.int64, [None, cfg.title_len],
                                          name='title_inputs')
        self.detail_input = tf.placeholder(tf.int64,
                                           [None, cfg.detail_len],
                                           name='detail_inputs')
        self.class_input = tf.placeholder(tf.float32,
                                          [None, cfg.class_num],
                                          name='class_input')
        self.title_length = tf.placeholder(tf.int64, [None],
                                           name='title_length')
        self.detail_length = tf.placeholder(tf.int64, [None],
                                            name='detail_length')
        self.keep_prob = tf.placeholder(tf.float32, [])

    # Build the embedding layer.
    with tf.variable_scope('embedding'):
        self.embedding = tf.get_variable(
            name='embedding',
            shape=[cfg.voc_size, cfg.embedding_dim],
            initializer=tf.contrib.layers.xavier_initializer())

    # Build the stacked bi-GRU + attention layers.
    with tf.variable_scope('bi_gru_title'):
        title_vectors = tf.nn.embedding_lookup(self.embedding,
                                               self.title_input)
        title_states = self.stack_bi_gru_layer(title_vectors,
                                               self.title_length)
        title_summary = attention_layer(title_states,
                                        cfg.bi_gru_hidden_dim * 2)
    with tf.variable_scope('bi_gru_detail'):
        detail_vectors = tf.nn.embedding_lookup(self.embedding,
                                                self.detail_input)
        detail_states = self.stack_bi_gru_layer(detail_vectors,
                                                self.detail_length)
        detail_summary = attention_layer(detail_states,
                                         cfg.bi_gru_hidden_dim * 2)

    # Build the fully connected layer.
    with tf.variable_scope('fc'):
        joined = tf.concat([title_summary, detail_summary], axis=1)
        fc_weights = weight_variable(
            [cfg.bi_gru_hidden_dim * 4, cfg.fc_hidden_dim],
            name='Weight_fc')
        fc_linear = tf.matmul(joined, fc_weights, name='h_fc')
        fc_hidden = tf.nn.relu(fc_linear, name="relu")

    # Build the output layer.
    with tf.variable_scope('output'):
        out_weights = weight_variable([cfg.fc_hidden_dim, cfg.class_num],
                                      name='Weight_out')
        out_bias = bias_variable([cfg.class_num], name='bias_out')
        self.y_pred = tf.nn.xw_plus_b(fc_hidden,
                                      out_weights,
                                      out_bias,
                                      name='y_pred')
        self.sigmoid_y_pred = tf.nn.sigmoid(self.y_pred)

    # Loss.
    with tf.variable_scope('loss'):
        self.loss = add_loss(self.y_pred, self.class_input)

    # Training op.
    with tf.variable_scope('training_ops'):
        self.train_op = add_train_op(lr=cfg.lr, loss=self.loss)

    self.saver = tf.train.Saver(max_to_keep=1, name=self.model_name)
    print(f'{self.model_name} init finish')
def __init__(self):
    """Build the 'rcnn' graph: inputs, embedding, two RCNN encoders, FC, output.

    Title and detail token ids are each encoded by ``self.rcnn_layer``; the
    two encodings are concatenated, passed through a bias-free FC + ReLU +
    dropout layer and an affine output layer.  The loss is the mean sigmoid
    cross-entropy between the logits and ``class_input``.

    # NOTE(review): self.model_name and self.global_step are read below but
    # not assigned here - presumably set by the parent __init__; confirm.
    """
    super().__init__('rcnn')
    self.settings = RCNNSetting()
    cfg = self.settings
    self.n_filter_total = cfg.n_filter * len(cfg.filter_sizes)

    with tf.name_scope('Inputs'):
        self.title_input = tf.placeholder(tf.int64, [None, cfg.title_len],
                                          name='title_inputs')
        self.detail_input = tf.placeholder(tf.int64,
                                           [None, cfg.detail_len],
                                           name='detail_inputs')
        self.class_input = tf.placeholder(tf.float32,
                                          [None, cfg.class_num],
                                          name='class_input')
        self.title_length = tf.placeholder(tf.int64, [None],
                                           name='title_length')
        self.detail_length = tf.placeholder(tf.int64, [None],
                                            name='detail_length')
        self.keep_prob = tf.placeholder(tf.float32, [])

    # Build the embedding layer.
    with tf.variable_scope('embedding'):
        self.embedding = tf.get_variable(
            name='embedding',
            shape=[cfg.voc_size, cfg.embedding_dim],
            initializer=tf.contrib.layers.xavier_initializer())

    # Build the RCNN layers.
    with tf.variable_scope('rcnn_text'):
        title_encoding = self.rcnn_layer(self.title_input, cfg.title_len,
                                         self.title_length)
    with tf.variable_scope('rcnn_content'):
        detail_encoding = self.rcnn_layer(self.detail_input,
                                          cfg.detail_len,
                                          self.detail_length)
    joined = tf.concat([title_encoding, detail_encoding], axis=1)

    # Build the fully connected layer.
    with tf.variable_scope('fc_bn'):
        fc_weights = weight_variable(
            [self.n_filter_total * 2, cfg.fc_hidden_dim],
            name='Weight_fc')
        fc_linear = tf.matmul(joined, fc_weights, name='h_fc')
        fc_hidden = tf.nn.relu(fc_linear, name="relu")
        fc_dropped = tf.nn.dropout(fc_hidden, self.keep_prob)

    # Build the output layer.
    with tf.variable_scope('output'):
        out_weights = weight_variable([cfg.fc_hidden_dim, cfg.class_num],
                                      name='Weight_out')
        out_bias = bias_variable([cfg.class_num], name='bias_out')
        self.y_pred = tf.nn.xw_plus_b(fc_dropped,
                                      out_weights,
                                      out_bias,
                                      name='y_pred')
        self.sigmoid_y_pred = tf.nn.sigmoid(self.y_pred)

    # Loss.
    with tf.variable_scope('loss'):
        per_label_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=self.y_pred, labels=self.class_input)
        self.loss = tf.reduce_mean(per_label_loss)

    # Training op.
    with tf.variable_scope('training_ops'):
        self.train_op = add_train_op(lr=cfg.lr,
                                     loss=self.loss,
                                     global_step=self.global_step)

    self.saver = tf.train.Saver(max_to_keep=1, name=self.model_name)
    print(f'{self.model_name} init finish')
def __init__(self,
             inputNode,
             num_layers,
             l1_weight,
             dict_size,
             sc_lr,
             dict_lr,
             layer_type=None,
             patch_size=None,
             stride=None,
             mask=None,
             err_weight=None,
             act_weight=None,
             top_down_weight=None,
             normalize_act=None,
             inject_act_bool=None,
             inject_act=None):
    """Build a stack of LCA sparse-coding layers and the ops that drive them.

    Each layer reconstructs its input from an activation through its own
    dictionary.  The input of layer ``l + 1`` is the (optionally
    standardised) potential of layer ``l`` scaled by ``act_weight[l]``,
    wrapped in ``tf.stop_gradient``.

    Args:
        inputNode: Tensor with a fully defined static shape, either
            ``[batch, features]`` or ``[batch, size, features]``.
        num_layers: Number of layers to build.
        l1_weight: Per-layer sequence; threshold subtracted from the
            potential before the ReLU and weight of the sparsity term.
        dict_size: Per-layer sequence of dictionary sizes.
        sc_lr: Learning rate of the Adam optimizer updating the potentials.
        dict_lr: Learning rate of the Adam optimizer updating the
            dictionaries.
        layer_type: Per-layer sequence of ``"sc_fc"`` / ``"sc_conv"``;
            required.  A non-"sc_fc" layer may not follow an "sc_fc" layer.
        patch_size: Per-layer sequence; only used by convolutional layers.
            May be omitted when every layer is "sc_fc".
        stride: Per-layer sequence; only used by convolutional layers.
            May be omitted when every layer is "sc_fc".
        mask: Not used by this method.
        err_weight: Per-layer scale of reconstruction error, sparsity and
            shrinkage term.  Defaults to all ones.
        act_weight: Per-layer scale applied to a layer's output before it is
            fed to the next layer.  Defaults to all ones.
        top_down_weight: Per-layer scale of the next layer's error in the
            potential update.  Defaults to all ones.
        normalize_act: Per-layer flag; when true the potential is
            standardised with its mean/std before being passed up.
            Defaults to all False.
        inject_act_bool: If not None, the last layer's activation is set to
            ``tf.where(inject_act_bool, inject_act, computed_activation)``.
        inject_act: Replacement activation used with ``inject_act_bool``.

    Attributes created:
        model: dict mapping names to per-layer lists of tensors/variables,
            plus "total_recon_error" and "layer_weights".
        calc_activation, reset_potential, train_step, reset_opt,
        update_D, normalize_D: graph ops.
    """
    curr_input = inputNode

    #Model variables and outputs: one list per key, indexed by layer
    self.model = {}
    for key in ("dictionary", "potential", "activation", "recon", "input",
                "output", "error", "recon_error", "l1_sparsity", "nnz",
                "loss", "act_norm", "act_mean", "act_std", "act_max",
                "pot_norm", "pot_mean", "pot_std", "input_norm",
                "output_norm"):
        self.model[key] = []

    assert (layer_type is not None)

    #Model operations
    self.calc_activation = []
    self.reset_potential = []

    switch_fc = False

    if (err_weight is None):
        err_weight = [1] * num_layers
    if (act_weight is None):
        act_weight = [1] * num_layers
    if (top_down_weight is None):
        top_down_weight = [1] * num_layers
    # These three are indexed for every layer below, so their None defaults
    # must be expanded; otherwise the documented defaults raise TypeError
    # even for all-"sc_fc" stacks that never use stride or patch size.
    if (normalize_act is None):
        normalize_act = [False] * num_layers
    if (stride is None):
        stride = [None] * num_layers
    if (patch_size is None):
        patch_size = [None] * num_layers

    for l in range(num_layers):
        with tf.name_scope("lca_layer_" + str(l)):
            curr_layer_type = layer_type[l]
            curr_dict_size = dict_size[l]
            curr_stride = stride[l]
            curr_patch_size = patch_size[l]
            curr_l1_weight = l1_weight[l]
            curr_normalize = normalize_act[l]

            input_shape = curr_input.get_shape().as_list()
            if (len(input_shape) == 3):
                [batch, input_size, input_features] = input_shape
            else:
                [batch, input_features] = input_shape

            if ("sc_fc" == curr_layer_type):
                switch_fc = True
                # Fully connected: flatten everything but the batch axis
                curr_input = tf.reshape(curr_input, [batch, -1])
                input_features = curr_input.get_shape().as_list()[1]
                D_shape = [input_features, curr_dict_size]
                act_shape = [batch, curr_dict_size]
                reduce_axis = [1]
            else:
                # Convolutional layers may not follow a fully connected one
                assert (not switch_fc)
                D_shape = [curr_patch_size, input_features, curr_dict_size]
                assert (input_size % curr_stride == 0)
                act_shape = [
                    batch, input_size // curr_stride, curr_dict_size
                ]
                reduce_axis = [1, 2]

            curr_dict = utils.l2_weight_variable(D_shape,
                                                 "dictionary" + str(l))
            curr_potential = utils.weight_variable(act_shape,
                                                   "potential" + str(l),
                                                   std=1e-3)
            curr_activation = utils.weight_variable(act_shape,
                                                    "activation" + str(l),
                                                    std=1e-3)

            if ("sc_fc" == curr_layer_type):
                curr_recon = tf.matmul(curr_activation,
                                       curr_dict,
                                       transpose_b=True)
            elif ("sc_conv" == curr_layer_type):
                curr_recon = tf.contrib.nn.conv1d_transpose(
                    curr_activation,
                    curr_dict, [batch, input_size, input_features],
                    curr_stride,
                    padding='SAME')
            else:
                assert (0)

            curr_error = curr_input - curr_recon
            # Sum over the per-sample axes, then average over the batch
            curr_recon_error = err_weight[l] * 0.5 * tf.reduce_mean(
                tf.reduce_sum(curr_error**2, axis=reduce_axis))
            curr_l1_sparsity = err_weight[l] * tf.reduce_mean(
                tf.reduce_sum(tf.abs(curr_activation), axis=reduce_axis))
            curr_loss = curr_recon_error + 0.5 * curr_l1_weight * curr_l1_sparsity

            self.model["error"].append(curr_error)
            self.model["recon_error"].append(curr_recon_error)
            self.model["potential"].append(curr_potential)
            self.model["activation"].append(curr_activation)
            self.model["recon"].append(curr_recon)
            self.model["l1_sparsity"].append(curr_l1_sparsity)
            self.model["loss"].append(curr_loss)

            #Ops
            #Use inject act if last layer for semi-supervised learning
            calc_act = tf.nn.relu(curr_potential - curr_l1_weight)
            if (l == num_layers - 1 and inject_act_bool is not None):
                set_act = tf.where(inject_act_bool, inject_act, calc_act)
                self.calc_activation.append(
                    curr_activation.assign(set_act))
            else:
                self.calc_activation.append(
                    curr_activation.assign(calc_act))

            # Initialise the potential around the threshold; with a zero
            # threshold fall back to a small symmetric range
            if (curr_l1_weight == 0):
                low_init_val = -.1
                high_init_val = .1
            else:
                low_init_val = .8 * curr_l1_weight
                high_init_val = 1.1 * curr_l1_weight
            potential_init = tf.random_uniform(act_shape,
                                               low_init_val,
                                               high_init_val,
                                               dtype=tf.float32)
            self.reset_potential.append(
                curr_potential.assign(potential_init))

            # Fraction of non-zero activations
            num_total_act = 1
            for s in act_shape:
                num_total_act *= s
            curr_nnz = tf.count_nonzero(curr_activation) / num_total_act

            #Save all variables
            self.model["dictionary"].append(curr_dict)
            self.model["output"].append(curr_activation)
            self.model["input"].append(curr_input)
            self.model["nnz"].append(curr_nnz)

            #Calculate means/std of activations
            #Do this across batches
            #Normalize each feature/dictionary element individually
            if (len(act_shape) == 3):
                moment_reduce_axis = [0, 1]
            elif (len(act_shape) == 2):
                moment_reduce_axis = 0
            else:
                assert (0)

            act_norm = tf.norm(curr_activation,
                               axis=moment_reduce_axis,
                               keepdims=True)
            act_mean, act_var = tf.nn.moments(curr_activation,
                                              axes=moment_reduce_axis,
                                              keep_dims=True)
            act_std = tf.sqrt(act_var)
            act_max = tf.reduce_max(curr_activation)

            pot_norm = tf.norm(curr_potential,
                               axis=moment_reduce_axis,
                               keepdims=True)
            pot_mean, pot_var = tf.nn.moments(curr_potential,
                                              axes=moment_reduce_axis,
                                              keep_dims=True)
            pot_std = tf.sqrt(pot_var)

            self.model["act_norm"].append(act_norm)
            self.model["act_mean"].append(act_mean)
            self.model["act_std"].append(act_std)
            self.model["act_max"].append(act_max)
            self.model["pot_norm"].append(pot_norm)
            self.model["pot_mean"].append(pot_mean)
            self.model["pot_std"].append(pot_std)

            input_norm = tf.norm(curr_input, axis=moment_reduce_axis)
            self.model["input_norm"].append(input_norm)

            # The next layer is driven by the potential (not the
            # activation), optionally standardised
            if (curr_normalize):
                curr_input = ((curr_potential - pot_mean) /
                              (pot_std + 1e-8)) * act_weight[l]
            else:
                curr_input = curr_potential * act_weight[l]

            output_norm = tf.norm(curr_input, axis=moment_reduce_axis)
            self.model["output_norm"].append(output_norm)

            #Stop gradient, as we explicitly compute top down feedback
            curr_input = tf.stop_gradient(curr_input)

    with tf.name_scope("optimizer"):
        #Group ops
        self.calc_activation = tf.group(*self.calc_activation)
        self.reset_potential = tf.group(*self.reset_potential)

        #Define optimizer
        #TODO different learning rates?
        opt = tf.train.AdamOptimizer(sc_lr)

        total_recon_error = tf.reduce_sum(self.model["recon_error"])
        self.model["total_recon_error"] = total_recon_error

        #Calculate recon gradient wrt activation
        recon_grad = opt.compute_gradients(total_recon_error,
                                           self.model["activation"])

        #Apply gradient (plus shrinkage) to potential
        #Needs to be a list of (gradient, wrt) tuples, one per gradient
        d_potential = []
        for i, (grad, _) in enumerate(recon_grad):
            layer_potential = self.model["potential"][i]
            shrink_term = err_weight[i] * (1 / batch) * (
                layer_potential - self.model["activation"][i])
            #The top down term doesn't exist with the recon loss as written,
            #since potential isn't connected to the total recon loss
            if (i < (num_layers - 1)):
                top_down_error = self.model["error"][i + 1]
                # A fully connected layer flattens its input, so above a
                # convolutional layer the error is [batch, size * features]
                # while this potential is [batch, size, features]; restore
                # the potential's shape so the terms can be combined.
                potential_shape = layer_potential.get_shape().as_list()
                if (top_down_error.get_shape().as_list() !=
                        potential_shape):
                    top_down_error = tf.reshape(top_down_error,
                                                potential_shape)
                top_down_term = top_down_weight[i] * top_down_error
            else:
                top_down_term = 0
            d_potential.append((grad + shrink_term - top_down_term,
                                layer_potential))

        self.train_step = opt.apply_gradients(d_potential)
        #Reset must be defined after apply_gradients, which creates the
        #optimizer's own variables
        self.reset_opt = tf.group([v.initializer for v in opt.variables()])

        #Dictionary update variables
        opt_D = tf.train.AdamOptimizer(dict_lr)
        self.update_D = opt_D.minimize(total_recon_error,
                                       var_list=[self.model["dictionary"]])

        #Normalize D
        self.normalize_D = []
        for l in range(num_layers):
            curr_dict = self.model["dictionary"][l]
            dict_shape = curr_dict.get_shape().as_list()
            if (len(dict_shape) == 3):
                curr_norm = tf.norm(curr_dict, axis=(0, 1))
            else:
                curr_norm = tf.norm(curr_dict, axis=0)
            self.normalize_D.append(curr_dict.assign(curr_dict / curr_norm))
        self.normalize_D = tf.group(*self.normalize_D)

    with tf.name_scope("weight_recon"):
        #Allows calculating reconstruction from each layer: an identity
        #activation (one row per dictionary element of layer l) is pushed
        #down through layers l, l-1, ..., 0
        layer_weights = []
        for l in range(num_layers):
            recon_l_fc = ("sc_fc" == layer_type[l])
            recon_l_num_dict = dict_size[l]
            recon_act = tf.eye(recon_l_num_dict)
            if (not recon_l_fc):
                recon_act = recon_act[:, tf.newaxis, :]

            switch_conv = not recon_l_fc

            curr_act = recon_act
            curr_pot = None
            for ll in reversed(range(l + 1)):
                curr_dict = self.model["dictionary"][ll]
                curr_layer_type = layer_type[ll]
                curr_stride = stride[ll]
                curr_patch_size = patch_size[ll]
                curr_l1_weight = l1_weight[ll]
                curr_normalize = normalize_act[ll]

                #Find activity given potential
                #Don't normalize layer we're visualizing
                if (ll != l):
                    #Undo the normalization and calculate next activity
                    # NOTE(review): when layer ll is convolutional and the
                    # layer above is fully connected, curr_pot is still
                    # rank 2 here while pot_std/pot_mean are rank 3, so
                    # this looks shape-incompatible with normalize_act[ll]
                    # set - confirm.
                    if (curr_normalize):
                        curr_pot = (curr_pot / act_weight[ll]) * (
                            self.model["pot_std"][ll] +
                            1e-8) + self.model["pot_mean"][ll]
                    else:
                        curr_pot = curr_pot / act_weight[ll]
                    curr_act = tf.nn.relu(curr_pot - curr_l1_weight)

                #Reshape if needed (fc -> conv layer)
                if ("sc_conv" == curr_layer_type):
                    input_shape = curr_act.get_shape().as_list()
                    if (not switch_conv):
                        switch_conv = True
                        input_shape = self.model["output"][ll].get_shape(
                        ).as_list()
                        input_shape[0] = recon_l_num_dict
                        curr_act = tf.reshape(curr_act, input_shape)

                #Reconstruct given the activity
                if ("sc_fc" == curr_layer_type):
                    curr_pot = tf.matmul(curr_act,
                                         curr_dict,
                                         transpose_b=True)
                else:
                    if (recon_l_fc):
                        # Starting from an fc layer: reconstruct onto the
                        # full extent of this layer's input
                        if (ll == 0):
                            output_shape = inputNode.get_shape().as_list()
                        else:
                            output_shape = self.model["output"][
                                ll - 1].get_shape().as_list()
                        output_shape[0] = recon_l_num_dict
                    else:
                        num_x = input_shape[1]
                        num_out_x = curr_patch_size + (
                            (num_x - 1) * curr_stride)
                        if (ll == 0):
                            output_features = inputNode.get_shape(
                            ).as_list()[-1]
                        else:
                            output_features = self.model["output"][
                                ll - 1].get_shape().as_list()[-1]
                        output_shape = [
                            recon_l_num_dict, num_out_x, output_features
                        ]

                    if (recon_l_fc):
                        padding = 'SAME'
                    else:
                        padding = 'VALID'

                    curr_pot = tf.contrib.nn.conv1d_transpose(
                        curr_act,
                        curr_dict,
                        output_shape,
                        curr_stride,
                        padding=padding)
            layer_weights.append(curr_pot)
        self.model["layer_weights"] = layer_weights
def __init__(self,
             inputNode,
             num_layers,
             l1_weight,
             dict_size,
             sc_lr,
             dict_lr,
             layer_type=None,
             patch_size=None,
             stride=None,
             mask=None,
             err_weight=None,
             act_weight=None,
             top_down_weight=None,
             normalize_act=None,
             inject_act_bool=None,
             inject_act=None):
    """Build a stack of LCA sparse-coding layers and the ops that drive them.

    Each layer reconstructs its input from an activation through its own
    dictionary.  The input of layer ``l + 1`` is the (optionally
    standardised) potential of layer ``l`` scaled by ``act_weight[l]``,
    wrapped in ``tf.stop_gradient``.

    Args:
        inputNode: Tensor with a fully defined static shape, either
            ``[batch, features]`` or ``[batch, size, features]``.
        num_layers: Number of layers to build.
        l1_weight: Per-layer sequence; threshold subtracted from the
            potential before the ReLU and weight of the sparsity term.
        dict_size: Per-layer sequence of dictionary sizes.
        sc_lr: Learning rate of the Adam optimizer updating the potentials.
        dict_lr: Learning rate of the Adam optimizer updating the
            dictionaries.
        layer_type: Per-layer sequence of ``"sc_fc"`` / ``"sc_conv"``;
            required.  A non-"sc_fc" layer may not follow an "sc_fc" layer.
        patch_size: Per-layer sequence; only used by convolutional layers.
            May be omitted when every layer is "sc_fc".
        stride: Per-layer sequence; only used by convolutional layers.
            May be omitted when every layer is "sc_fc".
        mask: Not used by this method.
        err_weight: Per-layer scale of reconstruction error, sparsity and
            shrinkage term.  Defaults to all ones.
        act_weight: Per-layer scale applied to a layer's output before it is
            fed to the next layer.  Defaults to all ones.
        top_down_weight: Per-layer scale of the next layer's error in the
            potential update.  Defaults to all ones.
        normalize_act: Per-layer flag; when true the potential is
            standardised with its mean/std before being passed up.
            Defaults to all False.
        inject_act_bool: If not None, the last layer's activation is set to
            ``tf.where(inject_act_bool, inject_act, computed_activation)``.
        inject_act: Replacement activation used with ``inject_act_bool``.

    Attributes created:
        model: dict mapping names to per-layer lists of tensors/variables,
            plus "total_recon_error" and "layer_weights".
        calc_activation, reset_potential, train_step, reset_opt,
        update_D, normalize_D: graph ops.
    """
    curr_input = inputNode

    #Model variables and outputs: one list per key, indexed by layer
    self.model = {}
    for key in ("dictionary", "potential", "activation", "recon", "input",
                "output", "error", "recon_error", "l1_sparsity", "nnz",
                "loss", "act_norm", "act_mean", "act_std", "act_max",
                "pot_norm", "pot_mean", "pot_std", "input_norm",
                "output_norm"):
        self.model[key] = []

    assert (layer_type is not None)

    #Model operations
    self.calc_activation = []
    self.reset_potential = []

    switch_fc = False

    if (err_weight is None):
        err_weight = [1] * num_layers
    if (act_weight is None):
        act_weight = [1] * num_layers
    if (top_down_weight is None):
        top_down_weight = [1] * num_layers
    # These three are indexed for every layer below, so their None defaults
    # must be expanded; otherwise the documented defaults raise TypeError
    # even for all-"sc_fc" stacks that never use stride or patch size.
    if (normalize_act is None):
        normalize_act = [False] * num_layers
    if (stride is None):
        stride = [None] * num_layers
    if (patch_size is None):
        patch_size = [None] * num_layers

    for l in range(num_layers):
        with tf.name_scope("lca_layer_" + str(l)):
            curr_layer_type = layer_type[l]
            curr_dict_size = dict_size[l]
            curr_stride = stride[l]
            curr_patch_size = patch_size[l]
            curr_l1_weight = l1_weight[l]
            curr_normalize = normalize_act[l]

            input_shape = curr_input.get_shape().as_list()
            if (len(input_shape) == 3):
                [batch, input_size, input_features] = input_shape
            else:
                [batch, input_features] = input_shape

            if ("sc_fc" == curr_layer_type):
                switch_fc = True
                # Fully connected: flatten everything but the batch axis
                curr_input = tf.reshape(curr_input, [batch, -1])
                input_features = curr_input.get_shape().as_list()[1]
                D_shape = [input_features, curr_dict_size]
                act_shape = [batch, curr_dict_size]
                reduce_axis = [1]
            else:
                # Convolutional layers may not follow a fully connected one
                assert (not switch_fc)
                D_shape = [curr_patch_size, input_features, curr_dict_size]
                assert (input_size % curr_stride == 0)
                act_shape = [
                    batch, input_size // curr_stride, curr_dict_size
                ]
                reduce_axis = [1, 2]

            curr_dict = utils.l2_weight_variable(D_shape,
                                                 "dictionary" + str(l))
            curr_potential = utils.weight_variable(act_shape,
                                                   "potential" + str(l),
                                                   std=1e-3)
            curr_activation = utils.weight_variable(act_shape,
                                                    "activation" + str(l),
                                                    std=1e-3)

            if ("sc_fc" == curr_layer_type):
                curr_recon = tf.matmul(curr_activation,
                                       curr_dict,
                                       transpose_b=True)
            elif ("sc_conv" == curr_layer_type):
                curr_recon = tf.contrib.nn.conv1d_transpose(
                    curr_activation,
                    curr_dict, [batch, input_size, input_features],
                    curr_stride,
                    padding='SAME')
            else:
                assert (0)

            curr_error = curr_input - curr_recon
            # Sum over the per-sample axes, then average over the batch
            curr_recon_error = err_weight[l] * 0.5 * tf.reduce_mean(
                tf.reduce_sum(curr_error**2, axis=reduce_axis))
            curr_l1_sparsity = err_weight[l] * tf.reduce_mean(
                tf.reduce_sum(tf.abs(curr_activation), axis=reduce_axis))
            curr_loss = curr_recon_error + 0.5 * curr_l1_weight * curr_l1_sparsity

            self.model["error"].append(curr_error)
            self.model["recon_error"].append(curr_recon_error)
            self.model["potential"].append(curr_potential)
            self.model["activation"].append(curr_activation)
            self.model["recon"].append(curr_recon)
            self.model["l1_sparsity"].append(curr_l1_sparsity)
            self.model["loss"].append(curr_loss)

            #Ops
            #Use inject act if last layer for semi-supervised learning
            calc_act = tf.nn.relu(curr_potential - curr_l1_weight)
            if (l == num_layers - 1 and inject_act_bool is not None):
                set_act = tf.where(inject_act_bool, inject_act, calc_act)
                self.calc_activation.append(
                    curr_activation.assign(set_act))
            else:
                self.calc_activation.append(
                    curr_activation.assign(calc_act))

            # Initialise the potential around the threshold; with a zero
            # threshold fall back to a small symmetric range
            if (curr_l1_weight == 0):
                low_init_val = -.1
                high_init_val = .1
            else:
                low_init_val = .8 * curr_l1_weight
                high_init_val = 1.1 * curr_l1_weight
            potential_init = tf.random_uniform(act_shape,
                                               low_init_val,
                                               high_init_val,
                                               dtype=tf.float32)
            self.reset_potential.append(
                curr_potential.assign(potential_init))

            # Fraction of non-zero activations
            num_total_act = 1
            for s in act_shape:
                num_total_act *= s
            curr_nnz = tf.count_nonzero(curr_activation) / num_total_act

            #Save all variables
            self.model["dictionary"].append(curr_dict)
            self.model["output"].append(curr_activation)
            self.model["input"].append(curr_input)
            self.model["nnz"].append(curr_nnz)

            #Calculate means/std of activations
            #Do this across batches
            #Normalize each feature/dictionary element individually
            if (len(act_shape) == 3):
                moment_reduce_axis = [0, 1]
            elif (len(act_shape) == 2):
                moment_reduce_axis = 0
            else:
                assert (0)

            act_norm = tf.norm(curr_activation,
                               axis=moment_reduce_axis,
                               keepdims=True)
            act_mean, act_var = tf.nn.moments(curr_activation,
                                              axes=moment_reduce_axis,
                                              keep_dims=True)
            act_std = tf.sqrt(act_var)
            act_max = tf.reduce_max(curr_activation)

            pot_norm = tf.norm(curr_potential,
                               axis=moment_reduce_axis,
                               keepdims=True)
            pot_mean, pot_var = tf.nn.moments(curr_potential,
                                              axes=moment_reduce_axis,
                                              keep_dims=True)
            pot_std = tf.sqrt(pot_var)

            self.model["act_norm"].append(act_norm)
            self.model["act_mean"].append(act_mean)
            self.model["act_std"].append(act_std)
            self.model["act_max"].append(act_max)
            self.model["pot_norm"].append(pot_norm)
            self.model["pot_mean"].append(pot_mean)
            self.model["pot_std"].append(pot_std)

            input_norm = tf.norm(curr_input, axis=moment_reduce_axis)
            self.model["input_norm"].append(input_norm)

            # The next layer is driven by the potential (not the
            # activation), optionally standardised
            if (curr_normalize):
                curr_input = ((curr_potential - pot_mean) /
                              (pot_std + 1e-8)) * act_weight[l]
            else:
                curr_input = curr_potential * act_weight[l]

            output_norm = tf.norm(curr_input, axis=moment_reduce_axis)
            self.model["output_norm"].append(output_norm)

            #Stop gradient, as we explicitly compute top down feedback
            curr_input = tf.stop_gradient(curr_input)

    with tf.name_scope("optimizer"):
        #Group ops
        self.calc_activation = tf.group(*self.calc_activation)
        self.reset_potential = tf.group(*self.reset_potential)

        #Define optimizer
        #TODO different learning rates?
        opt = tf.train.AdamOptimizer(sc_lr)

        total_recon_error = tf.reduce_sum(self.model["recon_error"])
        self.model["total_recon_error"] = total_recon_error

        #Calculate recon gradient wrt activation
        recon_grad = opt.compute_gradients(total_recon_error,
                                           self.model["activation"])

        #Apply gradient (plus shrinkage) to potential
        #Needs to be a list of (gradient, wrt) tuples, one per gradient
        d_potential = []
        for i, (grad, _) in enumerate(recon_grad):
            layer_potential = self.model["potential"][i]
            shrink_term = err_weight[i] * (1 / batch) * (
                layer_potential - self.model["activation"][i])
            #The top down term doesn't exist with the recon loss as written,
            #since potential isn't connected to the total recon loss
            if (i < (num_layers - 1)):
                top_down_error = self.model["error"][i + 1]
                # A fully connected layer flattens its input, so above a
                # convolutional layer the error is [batch, size * features]
                # while this potential is [batch, size, features]; restore
                # the potential's shape so the terms can be combined.
                potential_shape = layer_potential.get_shape().as_list()
                if (top_down_error.get_shape().as_list() !=
                        potential_shape):
                    top_down_error = tf.reshape(top_down_error,
                                                potential_shape)
                top_down_term = top_down_weight[i] * top_down_error
            else:
                top_down_term = 0
            d_potential.append((grad + shrink_term - top_down_term,
                                layer_potential))

        self.train_step = opt.apply_gradients(d_potential)
        #Reset must be defined after apply_gradients, which creates the
        #optimizer's own variables
        self.reset_opt = tf.group([v.initializer for v in opt.variables()])

        #Dictionary update variables
        opt_D = tf.train.AdamOptimizer(dict_lr)
        self.update_D = opt_D.minimize(total_recon_error,
                                       var_list=[self.model["dictionary"]])

        #Normalize D
        self.normalize_D = []
        for l in range(num_layers):
            curr_dict = self.model["dictionary"][l]
            dict_shape = curr_dict.get_shape().as_list()
            if (len(dict_shape) == 3):
                curr_norm = tf.norm(curr_dict, axis=(0, 1))
            else:
                curr_norm = tf.norm(curr_dict, axis=0)
            self.normalize_D.append(curr_dict.assign(curr_dict / curr_norm))
        self.normalize_D = tf.group(*self.normalize_D)

    with tf.name_scope("weight_recon"):
        #Allows calculating reconstruction from each layer: an identity
        #activation (one row per dictionary element of layer l) is pushed
        #down through layers l, l-1, ..., 0
        layer_weights = []
        for l in range(num_layers):
            recon_l_fc = ("sc_fc" == layer_type[l])
            recon_l_num_dict = dict_size[l]
            recon_act = tf.eye(recon_l_num_dict)
            if (not recon_l_fc):
                recon_act = recon_act[:, tf.newaxis, :]

            switch_conv = not recon_l_fc

            curr_act = recon_act
            curr_pot = None
            for ll in reversed(range(l + 1)):
                curr_dict = self.model["dictionary"][ll]
                curr_layer_type = layer_type[ll]
                curr_stride = stride[ll]
                curr_patch_size = patch_size[ll]
                curr_l1_weight = l1_weight[ll]
                curr_normalize = normalize_act[ll]

                #Find activity given potential
                #Don't normalize layer we're visualizing
                if (ll != l):
                    #Undo the normalization and calculate next activity
                    # NOTE(review): when layer ll is convolutional and the
                    # layer above is fully connected, curr_pot is still
                    # rank 2 here while pot_std/pot_mean are rank 3, so
                    # this looks shape-incompatible with normalize_act[ll]
                    # set - confirm.
                    if (curr_normalize):
                        curr_pot = (curr_pot / act_weight[ll]) * (
                            self.model["pot_std"][ll] +
                            1e-8) + self.model["pot_mean"][ll]
                    else:
                        curr_pot = curr_pot / act_weight[ll]
                    curr_act = tf.nn.relu(curr_pot - curr_l1_weight)

                #Reshape if needed (fc -> conv layer)
                if ("sc_conv" == curr_layer_type):
                    input_shape = curr_act.get_shape().as_list()
                    if (not switch_conv):
                        switch_conv = True
                        input_shape = self.model["output"][ll].get_shape(
                        ).as_list()
                        input_shape[0] = recon_l_num_dict
                        curr_act = tf.reshape(curr_act, input_shape)

                #Reconstruct given the activity
                if ("sc_fc" == curr_layer_type):
                    curr_pot = tf.matmul(curr_act,
                                         curr_dict,
                                         transpose_b=True)
                else:
                    if (recon_l_fc):
                        # Starting from an fc layer: reconstruct onto the
                        # full extent of this layer's input
                        if (ll == 0):
                            output_shape = inputNode.get_shape().as_list()
                        else:
                            output_shape = self.model["output"][
                                ll - 1].get_shape().as_list()
                        output_shape[0] = recon_l_num_dict
                    else:
                        num_x = input_shape[1]
                        num_out_x = curr_patch_size + (
                            (num_x - 1) * curr_stride)
                        if (ll == 0):
                            output_features = inputNode.get_shape(
                            ).as_list()[-1]
                        else:
                            output_features = self.model["output"][
                                ll - 1].get_shape().as_list()[-1]
                        output_shape = [
                            recon_l_num_dict, num_out_x, output_features
                        ]

                    if (recon_l_fc):
                        padding = 'SAME'
                    else:
                        padding = 'VALID'

                    curr_pot = tf.contrib.nn.conv1d_transpose(
                        curr_act,
                        curr_dict,
                        output_shape,
                        curr_stride,
                        padding=padding)
            layer_weights.append(curr_pot)
        self.model["layer_weights"] = layer_weights
def buildModel(self):
    """Build the TensorFlow graph for the per-class sparse-coding classifier.

    Everything is created under ``self.params.device``. The method defines:

    * ``self.D`` (dictionary) and ``self.W`` (class weights), both of shape
      ``[num_classes, dict_size, num_features]``;
    * the ``self.input`` / ``self.labels`` placeholders and the normalized
      input ``self.norm_input``;
    * the sparse-coding object ``self.scObj`` (an ``lcaSC`` instance);
    * feed-forward class scores, ``self.est_labels`` and an accuracy scalar
      that can be overridden through ``self.injectBool`` / ``self.injectAcc``;
    * the supervised logistic loss and the hand-written update ops
      ``self.update_W``, ``self.update_D``, ``self.normalize_D`` and the
      grouped ``self.update_step``.

    Tensors of interest are also registered in ``self.varDict``,
    ``self.scalarDict`` and ``self.imageDict``. The method returns nothing.
    """
    with tf.device(self.params.device):
        with tf.name_scope("Variables"):
            #Dictionary elements
            D_shape = [
                self.params.num_classes, self.params.dict_size,
                self.params.num_features
            ]
            if (self.params.init_weights is None):
                self.D = utils.l2_weight_variable(D_shape, "dictionary")
            else:
                #A 2D initial dictionary is shared by every class: add a
                #leading class axis and tile it num_classes times
                if (len(self.params.init_weights.shape) == 2):
                    init_weights = self.params.init_weights[np.newaxis, ...]
                    init_weights = np.tile(init_weights,
                                           [self.params.num_classes, 1, 1])
                else:
                    init_weights = self.params.init_weights
                self.D = tf.Variable(init_weights.astype(np.float32),
                                     name="dictionary")

            #Binary classification
            W_shape = [
                self.params.num_classes, self.params.dict_size,
                self.params.num_features
            ]
            self.W = utils.weight_variable(W_shape, "class_weights")

            self.input = tf.placeholder(
                tf.float32,
                shape=[self.params.batch_size, self.params.num_features],
                name="input")
            self.labels = tf.placeholder(tf.int64,
                                         shape=[self.params.batch_size],
                                         name="labels")

            #Subtract the per-example mean, then divide by the per-example
            #l2 norm of the raw (un-centered) input
            self.norm_input = (self.input - tf.reduce_mean(
                self.input, axis=1, keepdims=True)) / tf.norm(
                    self.input, axis=1, keepdims=True)

            #Set binary labels for each class
            #(transposed to [num_classes, batch_size])
            onehot_labels = tf.transpose(
                tf.one_hot(self.labels, self.params.num_classes), [1, 0])
            #go from [0, 1] to [-1, 1]
            onehot_labels = onehot_labels * 2 - 1

            #Add to tensorboard
            self.varDict["D"] = self.D
            self.varDict["W"] = self.W
            self.varDict["labels"] = self.labels
            self.varDict["onehot_labels"] = onehot_labels

            if (self.params.image_shape is not None):
                reshape_image = tf.reshape(self.norm_input,
                                           (self.params.batch_size, ) +
                                           self.params.image_shape)
                self.imageDict["norm_image"] = reshape_image
            else:
                self.varDict["norm_input"] = self.norm_input

        with tf.name_scope("SC"):
            self.scObj = lcaSC(self.norm_input, self.D,
                               self.params.l1_weight, self.params.sc_lr)
            #NOTE(review): sc_activation is multiplied elementwise with the
            #[num_classes, batch_size, dict_size] feed-forward tensor below,
            #so it presumably has that shape -- confirm against lcaSC
            sc_activation = self.scObj.activation
            self.varDict["sc_activation"] = sc_activation
            self.scalarDict["sc_recon_err"] = tf.reduce_mean(
                self.scObj.recon_error)
            self.scalarDict["sc_l1_sparsity"] = tf.reduce_mean(
                self.scObj.l1_sparsity)
            self.scalarDict["sc_loss"] = tf.reduce_mean(self.scObj.loss)
            self.scalarDict["sc_nnz"] = tf.reduce_mean(self.scObj.nnz)

            if (self.params.image_shape is not None):
                reshape_recon = tf.reshape(self.scObj.recon, (
                    self.params.num_classes,
                    self.params.batch_size,
                ) + self.params.image_shape)
                for i in range(self.params.num_classes):
                    self.imageDict["recon_class_" +
                                   str(i)] = reshape_recon[i, ...]
            else:
                self.varDict["recon"] = self.scObj.recon

        with tf.name_scope("feedforward"):
            #Replicate the input once per class so it can be batch-multiplied
            #with the per-class weights
            tile_input = tf.tile(self.norm_input[tf.newaxis, :, :],
                                 [self.params.num_classes, 1, 1])
            feed_forward = tf.matmul(tile_input, self.W, transpose_b=True)
            #Taking inner product of feed_forward with sc_activation
            #i.e., diag(matmul(feed_forward, sc_activation))
            feed_forward = tf.reduce_sum(feed_forward * sc_activation, axis=2)
            #Predicted class is the one with the largest score (class axis 0)
            self.est_labels = tf.argmax(feed_forward, axis=0)
            self.varDict["feed_forward"] = feed_forward
            self.varDict["est_labels"] = self.est_labels

        with tf.variable_scope('accuracy'):
            #Calculate accuracy
            #When injectBool is fed True, the fed injectAcc value is reported
            #instead of the accuracy computed in the graph
            self.injectBool = tf.placeholder_with_default(
                False, shape=(), name="injectBool")
            self.injectAcc = tf.placeholder_with_default(0.0,
                                                         shape=None,
                                                         name="injectAcc")
            calc_accuracy = tf.reduce_mean(
                tf.cast(tf.equal(self.est_labels, self.labels), tf.float32))
            accuracy = tf.cond(self.injectBool, lambda: self.injectAcc,
                               lambda: calc_accuracy)
            self.scalarDict["accuracy"] = accuracy

        with tf.name_scope("loss"):
            #Logistic loss log(1 + exp(-y * f)), summed over the batch axis.
            #softplus(x) == log(1 + exp(x)) but does not overflow to inf (and
            #produce NaN gradients) when -y * f is large, which the explicit
            #tf.log(1 + tf.exp(...)) form does
            supervised_loss = tf.reduce_sum(
                tf.nn.softplus(-onehot_labels * feed_forward),
                axis=1) + (self.params.weight_decay / 2) * tf.norm(
                    self.W, axis=[1, 2])
            self.scalarDict["supervised_loss"] = tf.reduce_mean(
                supervised_loss)

        with tf.name_scope("opt"):
            #Per-class D * D^T plus l2_weight on the diagonal
            D_covar = tf.matmul(
                self.D, self.D,
                transpose_b=True) + self.params.l2_weight * tf.eye(
                    self.params.dict_size,
                    batch_shape=[self.params.num_classes])

            #Calculate supervised gradients
            [sup_grad_wrt_a,
             sup_grad_wrt_W] = tf.gradients(supervised_loss,
                                            [sc_activation, self.W])

            #D_covar^-1 * gradient
            sup_grad_wrt_a = tf.transpose(sup_grad_wrt_a, [0, 2, 1])
            beta = tf.matrix_solve(D_covar, sup_grad_wrt_a)

            #compute learning rate
            train_step = tf.Variable(0, name='train_step', dtype=tf.int64)
            #Updates the tf train_step with the global timestep of the object
            update_timestep = tf.assign_add(train_step, 1)
            #start_lr until train_step exceeds decay_time, then decays as
            #start_lr * decay_time / train_step. At train_step == 0 the
            #ratio is inf and tf.minimum selects start_lr
            lr = tf.minimum(
                self.params.start_lr, self.params.start_lr *
                (self.params.decay_time / tf.cast(train_step, tf.float32)))

            #Update W
            #Note that the paper adds weight decay on W, but this is encompassed into the gradient wrt W
            self.update_W = tf.assign_add(self.W, -lr * sup_grad_wrt_W)

            D_grad_term_1 = tf.matmul(sc_activation,
                                      tf.matmul(beta,
                                                -self.D,
                                                transpose_a=True),
                                      transpose_a=True)
            D_grad_term_2 = tf.matmul(beta, (tile_input - self.scObj.recon))
            self.update_D = tf.assign_add(
                self.D, -lr * (D_grad_term_1 + D_grad_term_2))

            #Normalize D
            norm_D = tf.norm(self.D, axis=2, keepdims=True)
            #Only normalize if norm > 1, i.e., l2 dict element always <= 1
            norm_D = tf.maximum(tf.ones(D_shape), norm_D)
            #Normalize after update
            #NOTE: normalize_D has no control dependency on update_D, so the
            #caller has to run it as a separate step after the update
            #with tf.control_dependencies([self.update_D]):
            self.normalize_D = self.D.assign(self.D / norm_D)

            #Group all update ops
            #Always make sure the tf timestep is in sync with global timestep
            with tf.control_dependencies([update_timestep]):
                self.update_step = tf.group(self.update_W, self.update_D)

            self.scalarDict["learning_rate"] = lr
def cnn_layer(self, X_inputs, n_step):
    """Two-layer TextCNN encoder.

    For every size in ``self.settings.filter_sizes`` the embedded input goes
    through two stacked VALID convolutions (each followed by ReLU) and a
    max-pool over the remaining time axis; the pooled branches are
    concatenated on the channel axis and flattened.

    Args:
        X_inputs: tensor of ids looked up in ``self.embedding``; per the
            original docstring its shape is (batch_size, n_step).
        n_step: sequence length, used to size the max-pool window.

    Returns:
        Tensor reshaped to ``[-1, self.n_filter_total]``; per the original
        docstring this is (batch_size, self.n_filter_total).
    """
    cfg = self.settings
    embedded = tf.expand_dims(
        tf.nn.embedding_lookup(self.embedding, X_inputs), -1)

    branches = []
    for filter_size in cfg.filter_sizes:
        with tf.variable_scope("conv1%s" % filter_size):
            # First convolution spans the whole embedding dimension.
            kernel_1 = weight_variable(
                shape=[filter_size, cfg.embedding_dim, 1, cfg.n_filter],
                name='W_filter')
            # NOTE(review): this bias is created but never added to the
            # convolution output (it was meant as the offset of a batch-norm
            # layer that is no longer used). It is still created so the set
            # of graph variables stays the same.
            bias_variable(shape=[cfg.n_filter], name='beta_filter')
            conv_1 = tf.nn.conv2d(embedded,
                                  kernel_1,
                                  strides=[1, 1, 1, 1],
                                  padding="VALID",
                                  name="conv")
            activated = tf.nn.relu(conv_1, name="relu")

        with tf.variable_scope("conv2%s" % filter_size):
            # Second convolution works on the n_filter channels of the first.
            kernel_2 = weight_variable(
                shape=[filter_size, 1, cfg.n_filter, cfg.n_filter],
                name='W_filter')
            # NOTE(review): unused bias, same situation as in the first layer.
            bias_variable(shape=[cfg.n_filter], name='beta_filter')
            conv_2 = tf.nn.conv2d(activated,
                                  kernel_2,
                                  strides=[1, 1, 1, 1],
                                  padding="VALID",
                                  name="conv")
            activated = tf.nn.relu(conv_2, name="relu")

            # Two VALID convolutions each shorten the time axis by
            # (filter_size - 1), leaving n_step - 2 * filter_size + 2 steps;
            # pool over all of them.
            pool_window = [1, n_step - filter_size * 2 + 2, 1, 1]
            branches.append(
                tf.nn.max_pool(activated,
                               ksize=pool_window,
                               strides=[1, 1, 1, 1],
                               padding='VALID',
                               name="pool"))

    # Concatenate the per-filter-size branches on the channel axis, then
    # flatten to one feature vector per example.
    merged = tf.concat(branches, 3)
    return tf.reshape(merged, [-1, self.n_filter_total])
def __init__(self):
    """Build the transformer-based multi-label classification graph.

    Two encoders (title and description) share one token lookup table. Each
    adds a learned position embedding, applies dropout, runs
    ``settings.num_blocks`` blocks of multi-head self-attention plus
    feed-forward, and sums over the sequence axis. The two encodings are
    concatenated, passed through a ReLU fully connected layer and a linear
    output layer, and trained with sigmoid cross-entropy using Adam.

    Placeholders created: ``title_input``, ``detail_input``, ``class_input``
    and ``keep_prob`` (scalar keep probability for dropout).
    """
    self.model_name = 'transformer'
    self.settings = TransformerSetting()
    self.max_f1 = 0.0
    self.is_training = True

    with tf.name_scope('Inputs'):
        self.title_input = tf.placeholder(tf.int64,
                                          [None, self.settings.title_len],
                                          name='title_inputs')
        self.detail_input = tf.placeholder(
            tf.int64, [None, self.settings.detail_len],
            name='detail_inputs')
        self.class_input = tf.placeholder(tf.float32,
                                          [None, self.settings.class_num],
                                          name='class_input')
        self.keep_prob = tf.placeholder(tf.float32, [])

    # tf.layers.dropout takes the probability of DROPPING a unit (`rate`),
    # not of keeping it. Passing keep_prob directly inverted the meaning:
    # feeding keep_prob=1.0 dropped everything. Convert once and reuse.
    # NOTE(review): multihead_attention's `dropout_rate` argument is
    # presumably a drop probability as well (it is named like one) --
    # confirm against the helper.
    dropout_rate = 1.0 - self.keep_prob

    # =========== title encoder start ================
    # Build the embedding layer: token embedding (this call also returns the
    # lookup table reused by the description encoder) ...
    self.title_embedded, self.lookup_table = embedding(
        self.title_input,
        vocab_size=self.settings.voc_size,
        num_units=self.settings.embedding_dim,
        scale=True,
        scope="title_embedding")
    # ... plus a learned position embedding.
    # NOTE(review): the position ids are tiled to settings.batch_size, so
    # the fed batch must have exactly that many rows even though the
    # placeholders declare a None batch dimension.
    self.title_embedded += embedding(
        tf.tile(tf.expand_dims(tf.range(self.settings.title_len), 0),
                [self.settings.batch_size, 1]),
        vocab_size=self.settings.title_len,
        num_units=self.settings.embedding_dim,
        zero_pad=False,
        scale=False,
        scope="title_position_embedding")[0]

    # Dropout
    self.title_embedded = tf.layers.dropout(
        self.title_embedded,
        rate=dropout_rate,
        training=tf.convert_to_tensor(self.is_training))

    # Blocks
    for i in range(self.settings.num_blocks):
        with tf.variable_scope("title_num_blocks_{}".format(i)):
            # Multihead Attention
            self.title_embedded = multihead_attention(
                queries=self.title_embedded,
                keys=self.title_embedded,
                num_units=self.settings.hidden_dim,
                num_heads=self.settings.num_heads,
                dropout_rate=dropout_rate,
                is_training=self.is_training,
                causality=False)

            # Feed Forward
            self.title_embedded = feedforward(
                self.title_embedded,
                num_units=[
                    4 * self.settings.hidden_dim, self.settings.hidden_dim
                ])

    # Sum over the sequence axis
    self.title_encoder = tf.reduce_sum(self.title_embedded, axis=1)
    # =========== title encoder end ================

    # =========== description encoder start ================
    # Build the embedding layer: reuse the title lookup table, scaled by
    # sqrt(embedding_dim) ...
    self.description_embedded = tf.nn.embedding_lookup(
        self.lookup_table,
        self.detail_input) * (self.settings.embedding_dim**0.5)
    # ... plus a learned position embedding.
    self.description_embedded += embedding(
        tf.tile(tf.expand_dims(tf.range(self.settings.detail_len), 0),
                [self.settings.batch_size, 1]),
        vocab_size=self.settings.detail_len,
        num_units=self.settings.embedding_dim,
        zero_pad=False,
        scale=False,
        scope="description_position_embedding")[0]

    # Dropout
    self.description_embedded = tf.layers.dropout(
        self.description_embedded,
        rate=dropout_rate,
        training=tf.convert_to_tensor(self.is_training))

    # Blocks
    for i in range(self.settings.num_blocks):
        with tf.variable_scope("description_num_blocks_{}".format(i)):
            # Multihead Attention
            self.description_embedded = multihead_attention(
                queries=self.description_embedded,
                keys=self.description_embedded,
                num_units=self.settings.hidden_dim,
                num_heads=self.settings.num_heads,
                dropout_rate=dropout_rate,
                is_training=self.is_training,
                causality=False)

            # Feed Forward
            self.description_embedded = feedforward(
                self.description_embedded,
                num_units=[
                    4 * self.settings.hidden_dim, self.settings.hidden_dim
                ])

    # Sum over the sequence axis
    self.description_encoder = tf.reduce_sum(self.description_embedded,
                                             axis=1)
    # =========== description encoder end ================

    # Build the fully connected layer
    with tf.variable_scope('fc'):
        concat_output = tf.concat(
            [self.title_encoder, self.description_encoder], axis=1)
        W_fc = weight_variable(
            [self.settings.hidden_dim * 2, self.settings.fc_hidden_dim],
            name='Weight_fc')
        fc_output = tf.matmul(concat_output, W_fc, name='h_fc')
        fc_relu = tf.nn.relu(fc_output, name="relu")

    # Build the output layer
    with tf.variable_scope('output'):
        W_out = weight_variable(
            [self.settings.fc_hidden_dim, self.settings.class_num],
            name='Weight_out')
        b_out = bias_variable([self.settings.class_num], name='bias_out')
        # Raw logits; the sigmoid is kept separately for prediction
        self.y_pred = tf.nn.xw_plus_b(fc_relu, W_out, b_out, name='y_pred')
        self.sigmoid_y_pred = tf.nn.sigmoid(self.y_pred)

    # Loss
    with tf.variable_scope('loss'):
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=self.y_pred, labels=self.class_input))

    # Train
    with tf.variable_scope('training_ops'):
        self.global_step = tf.Variable(0,
                                       name='global_step',
                                       trainable=False)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.settings.lr,
            beta1=0.9,
            beta2=0.98,
            epsilon=1e-8)
        self.train_op = self.optimizer.minimize(
            self.loss, global_step=self.global_step)

    self.saver = tf.train.Saver(max_to_keep=1, name='cnn')
    print(f'{self.model_name} init finish')