def test_measurement():
    """Check YFOptimizer's internal measurements against NumPy references.

    Feeds synthetic, deterministic gradients for n_iter steps and asserts the
    optimizer's curvature max/min, gradient variance, and distance-to-optimum
    EMAs track the analytically computed targets to 1e-3 relative tolerance.
    Assumes module-level ints ``n_dim`` and ``n_iter`` — TODO confirm.
    """
    opt = YFOptimizer(zero_debias=False)
    w = tf.Variable(np.ones([n_dim, ]), dtype=tf.float32, name="w",
                    trainable=True)
    b = tf.Variable(np.ones([1, ], dtype=np.float32), dtype=tf.float32,
                    name="b", trainable=True)
    x = tf.constant(np.ones([n_dim, ], dtype=np.float32), dtype=tf.float32)
    loss = tf.multiply(w, x) + b
    tvars = tf.trainable_variables()

    # Gradients are fed directly via placeholders rather than computed.
    w_grad_val = tf.placeholder(tf.float32, shape=(n_dim, ))
    b_grad_val = tf.placeholder(tf.float32, shape=(1, ))
    apply_op = opt.apply_gradients(zip([w_grad_val, b_grad_val], tvars))

    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        target_h_max = 0.0
        target_h_min = 0.0
        g_norm_squared_avg = 0.0
        g_norm_avg = 0.0
        g_avg = 0.0
        target_dist = 0.0
        for i in range(n_iter):
            feed_dict = {
                w_grad_val: (i + 1) * np.ones([n_dim, ], dtype=np.float32),
                b_grad_val: (i + 1) * np.ones([1, ], dtype=np.float32)
            }
            res = sess.run(
                [opt._curv_win, opt._h_max, opt._h_min, opt._grad_var,
                 opt._dist_to_opt_avg, apply_op],
                feed_dict=feed_dict)
            # Reference EMAs with the optimizer's 0.999 decay (no debiasing).
            g_norm_squared_avg = 0.999 * g_norm_squared_avg \
                + 0.001 * np.sum(((i + 1) * np.ones([n_dim + 1, ]))**2)
            g_norm_avg = 0.999 * g_norm_avg \
                + 0.001 * np.linalg.norm((i + 1) * np.ones([n_dim + 1, ]))
            g_avg = 0.999 * g_avg + 0.001 * (i + 1)
            target_h_max = 0.999 * target_h_max \
                + 0.001 * (i + 1)**2 * (n_dim + 1)
            # `i + 2 - 20` presumably reflects a 20-step curvature window in
            # the optimizer — TODO confirm against YFOptimizer's curv_win size.
            target_h_min = 0.999 * target_h_min \
                + 0.001 * max(1, i + 2 - 20)**2 * (n_dim + 1)
            target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
            target_dist = 0.999 * target_dist \
                + 0.001 * g_norm_avg / g_norm_squared_avg
            assert np.abs(target_h_max - res[1]) < np.abs(target_h_max) * 1e-3
            assert np.abs(target_h_min - res[2]) < np.abs(target_h_min) * 1e-3
            assert np.abs(target_var - res[3]) < np.abs(res[3]) * 1e-3
            assert np.abs(target_dist - res[4]) < np.abs(res[4]) * 1e-3
    # Fixed: was a Python 2 print statement, which is a SyntaxError on Py3.
    print("sync measurement test passed!")
def __init__(self, is_training, config):
    """Build a PTB-style unrolled LSTM language model.

    Args:
        is_training: bool; when False, no optimizer or train op is built.
        config: hyperparameter object providing batch_size, num_steps,
            hidden_size, vocab_size, keep_prob, num_layers, max_grad_norm,
            opt_method and log_dir.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    def lstm_cell():
        # With the latest TensorFlow source code (as of Mar 27, 2017),
        # the BasicLSTMCell will need a reuse parameter which is
        # unfortunately not defined in TensorFlow 1.0. To maintain backwards
        # compatibility, we add an argument check here:
        if 'reuse' in inspect.getargspec(
                tf.contrib.rnn.BasicLSTMCell.__init__).args:
            return tf.contrib.rnn.BasicLSTMCell(
                size, forget_bias=1.0, state_is_tuple=True,
                reuse=tf.get_variable_scope().reuse)
        else:
            return tf.contrib.rnn.BasicLSTMCell(
                size, forget_bias=1.0, state_is_tuple=True)

    attn_cell = lstm_cell
    if is_training and config.keep_prob < 1:
        # Wrap each cell with output dropout during training.
        def attn_cell():
            return tf.contrib.rnn.DropoutWrapper(
                lstm_cell(), output_keep_prob=config.keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell(
        [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    # Embedding lookup is pinned to CPU (no GPU kernel in TF 1.0-era).
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, size])
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Manually unrolled RNN; variables are reused after the first step.
    outputs = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(inputs[:, time_step, :], state)
            outputs.append(cell_output)

    output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
    softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.matmul(output, softmax_w) + softmax_b
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(self._targets, [-1])],
        [tf.ones([batch_size * num_steps])])
    cost = tf.reduce_sum(loss) / batch_size
    self._norm_loss = cost / num_steps
    # NOTE(review): _cost holds the per-example loss *vector*, not the
    # scalar `cost` — looks deliberate but worth confirming against callers.
    self._cost = loss
    self._final_state = state

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self._norm_loss, tvars), config.max_grad_norm)
    if config.opt_method == "Adam":
        print("using Adam")
        optimizer = tf.train.AdamOptimizer(self.lr)
    elif config.opt_method == "YF":
        print("using YF")
        self.optimizer = optimizer = YFOptimizer()
    elif config.opt_method == "momSGD":
        print("using mom SGD")
        optimizer = tf.train.MomentumOptimizer(self.lr, 0.9)
    elif config.opt_method == "SGD":
        print("using SGD")
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
    elif config.opt_method == "Adagrad":
        print("using adagrad")
        optimizer = tf.train.AdagradOptimizer(self.lr)
    else:
        # Previously only printed a warning, then crashed with a NameError
        # on the undefined `optimizer`. Fail fast with a clear error.
        raise ValueError(
            "Optimizer %s is not supported" % config.opt_method)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    self.train_loss_summary = tf.summary.scalar('train_loss', self._norm_loss)
    self.writer = tf.summary.FileWriter(
        os.path.join(config.log_dir, time.strftime("%Y-%m-%d-%H-%M-%S")))
def _init_graph(self):
    """Build the DeepFM graph: FM first/second-order terms plus a deep MLP,
    a concat projection head, the loss with optional L2 regularization,
    the chosen optimizer, and a fresh session.

    Shapes below use F = field_size, K = embedding_size, None = batch.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        tf.set_random_seed(self.random_seed)
        # placeholders
        self.feat_index = tf.placeholder(
            tf.int32, shape=[None, self.field_size],
            name="feat_index")  # None * F
        self.feat_value = tf.placeholder(
            tf.float32, shape=[None, self.field_size],
            name="feat_value")  # None * F
        logger.info(self.feat_index.shape)
        logger.info(self.feat_value.shape)
        self.label = tf.placeholder(tf.float32, shape=[None, 1],
                                    name="label")  # None * 1
        self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None],
                                              name="dropout_keep_fm")
        self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None],
                                                name="dropout_keep_deep")
        self.train_phase = tf.placeholder(tf.bool, name="train_phase")

        # Initialize model parameters.
        self.weights = self._initialize_weights()
        pprint(self.weights)

        # model: look up each field's embedding row, then scale it by the
        # (continuous) feature value.
        self.embeddings = tf.nn.embedding_lookup(
            self.weights["feature_embeddings"],
            self.feat_index)  # None * F * K
        feat_value = tf.reshape(self.feat_value,
                                shape=[-1, self.field_size, 1])  # None * F * 1
        self.embeddings = tf.multiply(self.embeddings, feat_value)
        logger.info(self.embeddings)  # None * F * K

        # ---------- first order term ----------
        self.y_first_order = tf.nn.embedding_lookup(
            self.weights["feature_bias"], self.feat_index)  # None * F * 1
        self.y_first_order = tf.reduce_sum(
            tf.multiply(self.y_first_order, feat_value), 2)  # None * F
        self.y_first_order = tf.nn.dropout(
            self.y_first_order, self.dropout_keep_fm[0])  # None * F

        # ---------- second order term ----------
        # sum-then-square part
        self.summed_features_emb = tf.reduce_sum(self.embeddings,
                                                 1)  # None * K
        self.summed_features_emb_square = tf.square(
            self.summed_features_emb)  # None * K
        # square-then-sum part
        self.squared_features_emb = tf.square(self.embeddings)
        self.squared_sum_features_emb = tf.reduce_sum(
            self.squared_features_emb, 1)  # None * K
        # 0.5 * ((sum v)^2 - sum v^2): the standard FM pairwise identity.
        self.y_second_order = 0.5 * tf.subtract(
            self.summed_features_emb_square,
            self.squared_sum_features_emb)  # None * K
        self.y_second_order = tf.nn.dropout(
            self.y_second_order, self.dropout_keep_fm[1])  # None * K

        # ---------- Deep component (shares the FM embeddings as input) ----
        self.y_deep = tf.reshape(
            self.embeddings,
            shape=[-1, self.field_size * self.embedding_size])  # None * (F*K)
        self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
        for i in range(0, len(self.deep_layers)):
            self.y_deep = tf.add(
                tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                self.weights["bias_%d" % i])  # None * layer[i]
            if self.batch_norm:
                self.y_deep = self.batch_norm_layer(
                    self.y_deep, train_phase=self.train_phase,
                    scope_bn="bn_%d" % i)
            self.y_deep = self.deep_layers_activation(self.y_deep)
            self.y_deep = tf.nn.dropout(
                self.y_deep,
                self.dropout_keep_deep[1 + i])  # dropout at each Deep layer

        # ---------- DeepFM: concatenate enabled components ----------
        if self.use_fm and self.use_deep:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order, self.y_deep],
                axis=1)  # None * (F + K + deep_layers[-1])
        elif self.use_fm:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order], axis=1)
        elif self.use_deep:
            concat_input = self.y_deep
        logger.info(concat_input)
        self.out = tf.add(
            tf.matmul(concat_input, self.weights["concat_projection"]),
            self.weights["concat_bias"])

        # loss
        if self.loss_type == "logloss":
            self.out = tf.nn.sigmoid(self.out)
            self.loss = tf.losses.log_loss(self.label, self.out)
        elif self.loss_type == "mse":
            self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
        else:
            # Previously an unknown loss_type fell through silently, leaving
            # self.loss undefined and crashing later in minimize().
            raise ValueError("loss_type %s is not supported" % self.loss_type)
        # l2 regularization on weights
        if self.l2_reg > 0:
            self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                self.weights["concat_projection"])
            if self.use_deep:
                for i in range(len(self.deep_layers)):
                    self.loss += tf.contrib.layers.l2_regularizer(
                        self.l2_reg)(self.weights["layer_%d" % i])

        # optimizer
        if self.optimizer_type == "adam":
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                epsilon=1e-8).minimize(self.loss)
        elif self.optimizer_type == "adagrad":
            self.optimizer = tf.train.AdagradOptimizer(
                learning_rate=self.learning_rate,
                initial_accumulator_value=1e-8).minimize(self.loss)
        elif self.optimizer_type == "gd":
            self.optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.learning_rate).minimize(self.loss)
        elif self.optimizer_type == "momentum":
            self.optimizer = tf.train.MomentumOptimizer(
                learning_rate=self.learning_rate,
                momentum=0.95).minimize(self.loss)
        elif self.optimizer_type == "yellowfin":
            self.optimizer = YFOptimizer(
                learning_rate=self.learning_rate,
                momentum=0.0).minimize(self.loss)
        else:
            # Same fail-fast treatment as loss_type above.
            raise ValueError(
                "optimizer_type %s is not supported" % self.optimizer_type)

        # init
        self.saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        self.sess = self._init_session()
        self.sess.run(init)

        # number of params
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                # TF1 Dimension objects: .value yields the Python int.
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        if self.verbose > 0:
            print("#params: %d" % total_parameters)
cudnn.benchmark = True criterion = nn.CrossEntropyLoss() if args.opt_method == "SGD": logging.info("using SGD") optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) elif args.opt_method == "Adam": logging.info("using Adam") optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=5e-4) elif args.opt_method == "YF": logging.info("using YF") optimizer = YFOptimizer(net.parameters(), lr=args.lr, mu=args.mu, weight_decay=5e-4) else: raise Exception("Optimizer not supported") # Training def train(epoch, opt, loss_list,\ local_curv_list,\ max_curv_list,\ min_curv_list,\ lr_list,\ lr_t_list,\ mu_t_list,\ dr_list,\
def test_measurement():
    """Verify YFOptimizer's measurement EMAs (h_max, h_min, grad variance,
    dist-to-opt) against NumPy references over n_iter deterministic steps.

    Assumes module-level ints ``n_dim`` and ``n_iter`` — TODO confirm.
    """
    opt = YFOptimizer(zero_debias=False)
    w = tf.Variable(np.ones([
        n_dim,
    ]), dtype=tf.float32, name="w", trainable=True)
    b = tf.Variable(np.ones([
        1,
    ], dtype=np.float32), dtype=tf.float32, name="b", trainable=True)
    x = tf.constant(np.ones([
        n_dim,
    ], dtype=np.float32), dtype=tf.float32)
    loss = tf.multiply(w, x) + b
    tvars = tf.trainable_variables()

    # Gradients are injected through placeholders, not computed from loss.
    w_grad_val = tf.placeholder(tf.float32, shape=(n_dim, ))
    b_grad_val = tf.placeholder(tf.float32, shape=(1, ))
    apply_op = opt.apply_gradients(zip([w_grad_val, b_grad_val], tvars))

    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        target_h_max = 0.0
        target_h_min = 0.0
        g_norm_squared_avg = 0.0
        g_norm_avg = 0.0
        g_avg = 0.0
        target_dist = 0.0
        for i in range(n_iter):
            feed_dict = {
                w_grad_val: (i + 1) * np.ones([
                    n_dim,
                ], dtype=np.float32),
                b_grad_val: (i + 1) * np.ones([
                    1,
                ], dtype=np.float32)
            }
            res = sess.run([
                opt._curv_win, opt._h_max, opt._h_min, opt._grad_var,
                opt._dist_to_opt_avg, apply_op
            ], feed_dict=feed_dict)
            # Reference EMAs using the optimizer's 0.999 decay (no debias).
            g_norm_squared_avg = 0.999 * g_norm_squared_avg \
                + 0.001 * np.sum(((i + 1) * np.ones([n_dim + 1, ]))**2)
            g_norm_avg = 0.999 * g_norm_avg \
                + 0.001 * np.linalg.norm((i + 1) * np.ones([n_dim + 1, ]))
            g_avg = 0.999 * g_avg + 0.001 * (i + 1)
            target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2 * (n_dim + 1)
            # `i + 2 - 20`: presumably a 20-step curvature window — confirm
            # against YFOptimizer's curv_win size.
            target_h_min = 0.999 * target_h_min + 0.001 * max(
                1, i + 2 - 20)**2 * (n_dim + 1)
            target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
            target_dist = 0.999 * target_dist + 0.001 * g_norm_avg / g_norm_squared_avg
            assert np.abs(target_h_max - res[1]) < np.abs(target_h_max) * 1e-3
            assert np.abs(target_h_min - res[2]) < np.abs(target_h_min) * 1e-3
            assert np.abs(target_var - res[3]) < np.abs(res[3]) * 1e-3
            assert np.abs(target_dist - res[4]) < np.abs(res[4]) * 1e-3
    # Fixed: was a Python 2 print statement (SyntaxError under Python 3).
    print("sync measurement test passed!")
def __init__(self, is_training, config, input_, opt_method='sgd'):
    """Build a PTB LSTM language model with a tunable optimizer.

    Args:
        is_training: bool; when False, no optimizer or train op is built.
        config: hyperparameter object (hidden_size, vocab_size, keep_prob,
            num_layers, ...).
        input_: input pipeline object exposing batch_size, num_steps,
            input_data and targets.
        opt_method: one of 'sgd', 'mom', 'adam', 'YF'.
    """
    self._input = input_

    batch_size = input_.batch_size
    num_steps = input_.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size

    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    def lstm_cell():
        # With the latest TensorFlow source code (as of Mar 27, 2017),
        # the BasicLSTMCell will need a reuse parameter which is
        # unfortunately not defined in TensorFlow 1.0. To maintain backwards
        # compatibility, we add an argument check here:
        if 'reuse' in inspect.getargspec(
                tf.contrib.rnn.BasicLSTMCell.__init__).args:
            return tf.contrib.rnn.BasicLSTMCell(
                size, forget_bias=0.0, state_is_tuple=True,
                reuse=tf.get_variable_scope().reuse)
        else:
            return tf.contrib.rnn.BasicLSTMCell(
                size, forget_bias=0.0, state_is_tuple=True)

    attn_cell = lstm_cell
    if is_training and config.keep_prob < 1:
        def attn_cell():
            return tf.contrib.rnn.DropoutWrapper(
                lstm_cell(), output_keep_prob=config.keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell(
        [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)

    self._initial_state = cell.zero_state(batch_size, data_type())

    # Fixed: was tf.device("cpu:0"); use the canonical "/cpu:0" form for
    # consistency with the sibling model in this file.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, size],
                                    dtype=data_type())
        inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
    # This builds an unrolled LSTM for tutorial purposes only.
    # In general, use the rnn() or state_saving_rnn() from rnn.py.
    outputs = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(inputs[:, time_step, :], state)
            outputs.append(cell_output)

    output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
    softmax_w = tf.get_variable("softmax_w", [size, vocab_size],
                                dtype=data_type())
    softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
    logits = tf.matmul(output, softmax_w) + softmax_b
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(input_.targets, [-1])],
        [tf.ones([batch_size * num_steps], dtype=data_type())])
    # Per-timestep average loss (cost normalized by steps as well as batch).
    self._cost = cost = tf.reduce_sum(loss) / (batch_size * num_steps)
    self._final_state = state

    if not is_training:
        return

    # Externally assignable training knobs (see the *_update ops below).
    self._lr = tf.Variable(0.0, trainable=False)
    self._mu = tf.Variable(0.0, trainable=False)
    self._grad_norm_thresh = tf.Variable(0.0, trainable=False)

    tvars = tf.trainable_variables()
    self.tvars = tvars
    self.grads = tf.gradients(cost, tvars)
    grads_clip, self.grad_norm = tf.clip_by_global_norm(
        self.grads, self._grad_norm_thresh)

    if opt_method == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads_clip, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())
    elif opt_method == 'mom':
        print("using sgd mom")
        optimizer = tf.train.MomentumOptimizer(self._lr, self._mu)
        self._train_op = optimizer.apply_gradients(
            zip(grads_clip, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())
    elif opt_method == 'adam':
        optimizer = tf.train.AdamOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads_clip, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())
    elif opt_method == 'YF':
        # YellowFin tunes lr/momentum itself; note it gets the *unclipped*
        # gradients here — presumably intentional, confirm with callers.
        optimizer = YFOptimizer(lr=1.0, mu=0.0)
        self._train_op = optimizer.apply_gradients(zip(self.grads, tvars))
    else:
        raise Exception("optimizer not supported")

    # Placeholder/assign pairs so training code can update the knobs.
    self._new_lr = tf.placeholder(tf.float32, shape=[],
                                  name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
    self._new_mu = tf.placeholder(tf.float32, shape=[], name="new_momentum")
    self._mu_update = tf.assign(self._mu, self._new_mu)
    self._new_grad_norm_thresh = tf.placeholder(
        tf.float32, shape=[], name="new_grad_norm_thresh")
    self._grad_norm_thresh_update = tf.assign(self._grad_norm_thresh,
                                              self._new_grad_norm_thresh)
def learn(dataset, rank=2, scale=1., learning_rate=1e-1, tol=1e-8,
          epochs=100, use_yellowfin=False, use_adagrad=False, print_freq=1,
          model_save_file=None, model_load_file=None, batch_size=16,
          num_workers=None, lazy_generation=False, log_name=None,
          warm_start=None, learn_scale=False, checkpoint_freq=1000,
          sample=1., subsample=None, exponential_rescale=None, extra_steps=1,
          use_svrg=False, T=10, use_hmds=False):
    """Train a hyperbolic embedding of the graph in `dataset`.

    Loads (or warm-starts) a Hyperbolic_Emb model, builds a distance
    DataLoader (lazily sampled or from a precomputed matrix), optimizes with
    SGD / YellowFin / Adagrad / SVRG, and periodically checkpoints stats and
    the model.
    """
    # Log configuration
    formatter = logging.Formatter('%(asctime)s %(message)s')
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(message)s',
        datefmt='%FT%T',
    )
    if log_name is not None:
        logging.info(f"Logging to {log_name}")
        log = logging.getLogger()
        fh = logging.FileHandler(log_name)
        fh.setFormatter(formatter)
        log.addHandler(fh)

    logging.info(f"Commandline {sys.argv}")
    if model_save_file is None:
        # logging.warn is a deprecated alias of warning.
        logging.warning("No Model Save selected!")
    G = load_graph.load_graph(dataset)
    GM = nx.to_scipy_sparse_matrix(G)

    # grab scale if warm starting:
    if warm_start:
        scale = pandas.read_csv(warm_start, index_col=0).as_matrix()[0, -1]

    n = G.order()
    logging.info(f"Loaded Graph {dataset} with {n} nodes scale={scale}")

    Z = None

    def collate(ls):
        x, y = zip(*ls)
        return torch.cat(x), torch.cat(y)

    if lazy_generation:
        if subsample is not None:
            z = DataLoader(GraphRowSubSampler(G, scale, subsample),
                           batch_size, shuffle=True, collate_fn=collate)
        else:
            z = DataLoader(GraphRowSampler(G, scale), batch_size,
                           shuffle=True, collate_fn=collate)
        logging.info("Built Data Sampler")
    else:
        # Materialize the whole distance matrix up front.
        Z = gh.build_distance(
            G, scale,
            num_workers=int(num_workers) if num_workers is not None else 16)
        logging.info(f"Built distance matrix with {scale} factor")

        if subsample is not None:
            z = DataLoader(GraphRowSubSampler(G, scale, subsample, Z=Z),
                           batch_size, shuffle=True, collate_fn=collate)
        else:
            idx = torch.LongTensor([(i, j) for i in range(n)
                                    for j in range(i + 1, n)])
            Z_sampled = gh.dist_sample_rebuild_pos_neg(
                Z, sample) if sample < 1 else Z
            vals = torch.DoubleTensor(
                [Z_sampled[i, j] for i in range(n) for j in range(i + 1, n)])
            z = DataLoader(TensorDataset(idx, vals), batch_size=batch_size,
                           shuffle=True,
                           pin_memory=torch.cuda.is_available())
        logging.info("Built data loader")

    if model_load_file is not None:
        logging.info(f"Loading {model_load_file}...")
        m = cudaify(torch.load(model_load_file))
        logging.info(
            f"Loaded scale {m.scale.data[0]} {torch.sum(m.w.data)} {m.epoch}")
    else:
        logging.info(f"Creating a fresh model warm_start?={warm_start}")
        m_init = None
        if warm_start:
            # load from DataFrame; assume that the julia combinatorial
            # embedding has been saved
            ws_data = pandas.read_csv(warm_start, index_col=0).as_matrix()
            scale = ws_data[0, ws_data.shape[1] - 1]
            m_init = torch.DoubleTensor(
                ws_data[:, range(ws_data.shape[1] - 1)])
        elif use_hmds:
            m_init = torch.DoubleTensor(
                mds_warmstart.get_model(dataset, rank, scale)[1])

        logging.info(
            f"\t Warmstarting? {warm_start} {m_init.size() if warm_start else None} {G.order()}"
        )
        m = cudaify(
            Hyperbolic_Emb(G.order(), rank, initialize=m_init,
                           learn_scale=learn_scale,
                           exponential_rescale=exponential_rescale))
        m.normalize()
        m.epoch = 0
    logging.info(
        f"Constructed model with rank={rank} and epochs={m.epoch} isnan={np.any(np.isnan(m.w.cpu().data.numpy()))}"
    )

    # Build the Optimizer. Plain SGD is the default; flags override it.
    # Fixed: previously the SGD default was commented out, so running with
    # no optimizer flag crashed with a NameError on `opt`.
    opt = torch.optim.SGD(m.parameters(), lr=learning_rate)
    if use_yellowfin:
        from yellowfin import YFOptimizer
        opt = YFOptimizer(m.parameters())

    if use_adagrad:
        opt = torch.optim.Adagrad(m.parameters())

    if use_svrg:
        from svrg import SVRG
        base_opt = torch.optim.Adagrad if use_adagrad else torch.optim.SGD
        opt = SVRG(m.parameters(), lr=learning_rate, T=T, data_loader=z,
                   opt=base_opt)

    logging.info(opt)

    # Log stats from import: when warmstarting, check that it matches
    # Julia's stats
    logging.info(f"*** Initial Checkpoint. Computing Stats")
    major_stats(GM, 1 + m.scale.data[0], n, m, lazy_generation, Z, z)
    logging.info("*** End Initial Checkpoint\n")

    for i in range(m.epoch, m.epoch + epochs):
        l = 0.0
        m.train(True)
        if use_svrg:
            for data in z:
                def closure(data=data, target=None):
                    _data = data if target is None else (data, target)
                    c = m.loss(cu_var(_data))
                    c.backward()
                    return c.data[0]

                l += opt.step(closure)
                # Projection
                m.normalize()
        else:
            opt.zero_grad()  # This is handled by the SVRG.
            for the_step in range(extra_steps):
                # Accumulate the gradient
                for u in z:
                    _loss = m.loss(cu_var(u, requires_grad=False))
                    _loss.backward()
                    l += _loss.data[0]
                Hyperbolic_Parameter.correct_metric(
                    m.parameters())  # NB: THIS IS THE NEW CALL
                opt.step()
                # Projection
                m.normalize()

        # Logging code
        if l < tol:
            # Fixed: was a plain string with literal {l}/{i} — now an f-string.
            logging.info(f"Found a {l} solution. Done at iteration {i}!")
            break
        if i % print_freq == 0:
            logging.info(f"{i} loss={l}")
        if i % checkpoint_freq == 0:
            logging.info(f"\n*** Major Checkpoint. Computing Stats and Saving")
            major_stats(GM, 1 + m.scale.data[0], n, m, True, Z, z)
            if model_save_file is not None:
                fname = f"{model_save_file}.{m.epoch}"
                logging.info(
                    f"Saving model into {fname} {torch.sum(m.w.data)} ")
                torch.save(m, fname)
            logging.info("*** End Major Checkpoint\n")
        m.epoch += 1

    logging.info(f"final loss={l}")

    if model_save_file is not None:
        fname = f"{model_save_file}.final"
        logging.info(
            f"Saving model into {fname}-final {torch.sum(m.w.data)} {m.scale.data[0]}"
        )
        torch.save(m, fname)

    major_stats(GM, 1 + m.scale.data[0], n, m, lazy_generation, Z, z)
def _init_graph(self):
    """Build the DeepFM graph (wide FM terms + deep MLP), loss, optimizer,
    summary writer and session.

    Shapes below use F = field_size, K = embedding_size, None = batch.
    """
    # The new graph becomes the default graph for the whole TF runtime here.
    self.graph = tf.Graph()
    with self.graph.as_default():
        tf.set_random_seed(self.random_seed)
        # 2-D placeholders: batch_size * F (F fields per sample).
        self.feat_index = tf.placeholder(tf.int32, shape=[None, None],
                                         name="feat_index")  # None * F
        self.feat_value = tf.placeholder(tf.float32, shape=[None, None],
                                         name="feat_value")  # None * F
        self.label = tf.placeholder(tf.float32, shape=[None, 1],
                                    name="label")  # None * 1
        self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None],
                                              name="dropout_keep_fm")
        self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None],
                                                name="dropout_keep_deep")
        self.train_phase = tf.placeholder(tf.bool, name="train_phase")

        # Randomly initialize the weights.
        self.weights = self._initialize_weights()

        # Embedding lookup: None * F -> None * F * K.
        self.embeddings = tf.nn.embedding_lookup(
            self.weights["feature_embeddings"], self.feat_index)
        # Reshape feat_value to None * F * 1 so it broadcasts against the
        # embeddings in the multiply below.
        feat_value = tf.reshape(self.feat_value,
                                shape=[-1, self.field_size, 1])
        print(self.embeddings.shape, feat_value.shape)
        # Scale each field's embedding by its feature value (broadcast).
        self.embeddings = tf.multiply(self.embeddings, feat_value)

        # ---------- wide fm: first order term ----------
        # Look up F rows (feat_index) from the feature_bias table.
        self.y_first_order = tf.nn.embedding_lookup(
            self.weights["feature_bias"], self.feat_index)  # None * F * 1
        self.y_first_order_tmp = self.y_first_order
        # Both operands are None * F * 1; summing axis 2 gives None * F.
        self.y_first_order = tf.reduce_sum(
            tf.multiply(self.y_first_order, feat_value), 2)  # None * F
        # NOTE: this first-order vector is concatenated with the second-order
        # and deep outputs below and projected again, so the linear part is
        # effectively computed twice — mathematically equivalent to once.
        self.y_first_order = tf.nn.dropout(
            self.y_first_order, self.dropout_keep_fm[0])  # None * F

        # ---------- wide fm: second order term ----------
        # sum-then-square part: None * K
        self.summed_features_emb = tf.reduce_sum(self.embeddings,
                                                 1)  # None * K
        self.summed_features_emb_square = tf.square(
            self.summed_features_emb)  # None * K
        # square-then-sum part: None * K
        self.squared_features_emb = tf.square(self.embeddings)
        self.squared_sum_features_emb = tf.reduce_sum(
            self.squared_features_emb, 1)  # None * K
        # 0.5 * ((sum v)^2 - sum v^2): each sample becomes a K-vector.
        self.y_second_order = 0.5 * tf.subtract(
            self.summed_features_emb_square,
            self.squared_sum_features_emb)  # None * K
        self.y_second_order = tf.nn.dropout(
            self.y_second_order, self.dropout_keep_fm[1])  # None * K

        # ---------- Deep component ----------
        self.y_deep = tf.reshape(
            self.embeddings,
            shape=[-1, self.field_size * self.embedding_size])  # None * (F*K)
        self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
        for i in range(0, len(self.deep_layers)):
            self.y_deep = tf.add(
                tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                self.weights["bias_%d" % i])  # None * layer[i]
            if self.batch_norm:
                self.y_deep = self.batch_norm_layer(
                    self.y_deep, train_phase=self.train_phase,
                    scope_bn="bn_%d" % i)
            self.y_deep = self.deep_layers_activation(self.y_deep)
            self.y_deep = tf.nn.dropout(
                self.y_deep,
                self.dropout_keep_deep[1 + i])  # dropout at each Deep layer

        # ---------- DeepFM ----------
        if self.use_fm and self.use_deep:
            # Concatenate first-order, second-order and deep outputs,
            # e.g. (1024, 39)+(1024, 8)+(1024, 32) -> (1024, 79).
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order, self.y_deep],
                axis=1)
        elif self.use_fm:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order], axis=1)
        elif self.use_deep:
            concat_input = self.y_deep
        self.concat_input = concat_input
        # Project the concatenation down to a scalar per sample.
        neiji = tf.matmul(concat_input, self.weights["concat_projection"])
        self.out = tf.add(neiji, self.weights["concat_bias"])

        # loss
        if self.loss_type == "logloss":
            self.out = tf.nn.sigmoid(self.out)
            self.loss = tf.losses.log_loss(self.label, self.out)
        elif self.loss_type == "mse":
            self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
        else:
            # Previously an unknown loss_type fell through silently, leaving
            # self.loss undefined and crashing later in minimize().
            raise ValueError("loss_type %s is not supported" % self.loss_type)
        # l2 regularization on weights
        if self.l2_reg > 0:
            self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                self.weights["concat_projection"])
            if self.use_deep:
                for i in range(len(self.deep_layers)):
                    self.loss += tf.contrib.layers.l2_regularizer(
                        self.l2_reg)(self.weights["layer_%d" % i])

        # optimizer
        if self.optimizer_type == "adam":
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                epsilon=1e-8).minimize(self.loss)
        elif self.optimizer_type == "adagrad":
            self.optimizer = tf.train.AdagradOptimizer(
                learning_rate=self.learning_rate,
                initial_accumulator_value=1e-8).minimize(self.loss)
        elif self.optimizer_type == "gd":
            self.optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.learning_rate).minimize(self.loss)
        elif self.optimizer_type == "momentum":
            self.optimizer = tf.train.MomentumOptimizer(
                learning_rate=self.learning_rate,
                momentum=0.95).minimize(self.loss)
        elif self.optimizer_type == "yellowfin":
            self.optimizer = YFOptimizer(
                learning_rate=self.learning_rate,
                momentum=0.0).minimize(self.loss)
        else:
            # Same fail-fast treatment as loss_type above.
            raise ValueError(
                "optimizer_type %s is not supported" % self.optimizer_type)

        # init
        self.saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        self.sess = self._init_session()
        # First argument is the directory the event files are written to.
        writer = tf.summary.FileWriter("logs/", self.sess.graph)
        self.sess.run(init)

        # Count the total number of parameters.
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                # TF1 Dimension objects: .value yields the Python int.
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        if self.verbose > 0:
            print("#params: %d" % total_parameters)
def _init_graph(self):
    """Build the TF1 computation graph for a DeepFM-style model.

    Assembles, on a fresh `tf.Graph`:
      * input placeholders (sparse feature indices/values, label, dropout
        keep-probabilities, train-phase flag),
      * the FM first- and second-order terms,
      * an optional xDeepFM/CIN-style interaction tower (`use_xfm`),
      * the deep MLP component,
      * loss (logloss or MSE) with optional L2 regularization,
      * the chosen optimizer, saver, session, and a parameter count.

    Side effects: sets many attributes on `self` (placeholders, `self.out`,
    `self.loss`, `self.optimizer`, `self.sess`, `self.saver`, ...) and runs
    variable initialization.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        tf.set_random_seed(self.random_seed)
        # ---------- input placeholders ----------
        self.feat_index = tf.placeholder(tf.int32, shape=[None, None], name="feat_index")  # None * F
        self.feat_value = tf.placeholder(tf.float32, shape=[None, None], name="feat_value")  # None * F
        self.label = tf.placeholder(tf.float32, shape=[None, 1], name="label")  # None * 1
        self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_fm")
        self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_deep")
        self.train_phase = tf.placeholder(tf.bool, name="train_phase")

        self.weights = self._initialize_weights()

        # model: look up per-feature embeddings and scale by the raw feature value
        # (for one-hot categorical fields the value is 1, so this only matters
        # for numeric fields).
        self.embeddings = tf.nn.embedding_lookup(
            self.weights["feature_embeddings"], self.feat_index)  # None * F * K
        feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
        self.embeddings = tf.multiply(self.embeddings, feat_value)

        # ---------- first order term ----------
        self.y_first_order = tf.nn.embedding_lookup(
            self.weights["feature_bias"], self.feat_index)  # None * F * 1
        self.y_first_order = tf.reduce_sum(
            tf.multiply(self.y_first_order, feat_value), 2)  # None * F
        self.y_first_order = tf.nn.dropout(
            self.y_first_order, self.dropout_keep_fm[0])  # None * F

        # ---------- second order term ----------
        # FM pairwise interactions via the identity
        # sum_i sum_j <v_i, v_j> x_i x_j = (sum_i v_i x_i)^2 - sum_i (v_i x_i)^2.
        # sum_square part
        self.summed_features_emb = tf.reduce_sum(self.embeddings, 1)  # None * K
        self.summed_features_emb_square = tf.square(
            self.summed_features_emb)  # None * K
        # square_sum part
        self.squared_features_emb = tf.square(self.embeddings)
        self.squared_sum_features_emb = tf.reduce_sum(
            self.squared_features_emb, 1)  # None * K
        # second order
        self.y_second_order = 0.5 * tf.subtract(
            self.summed_features_emb_square, self.squared_sum_features_emb)  # None * K
        self.y_second_order = tf.nn.dropout(
            self.y_second_order, self.dropout_keep_fm[1])  # None * K

        # high order
        if self.use_fm and self.use_deep:
            z = tf.layers.Dense(
                self.embedding_size,
                kernel_initializer=tf.glorot_uniform_initializer(seed=2017),
                dtype=tf.float32,
                bias_initializer=tf.zeros_initializer())(self.y_second_order)
            z = tf.nn.relu(z)
            # NOTE(review): this assigns a LOCAL `y_second_order` (no `self.`),
            # and the concat below reads `self.y_second_order`, so this
            # Dense/relu/dropout branch appears to be dead — confirm intent.
            y_second_order = tf.nn.dropout(z, 0.5)
        if self.use_xfm:
            # xDeepFM-style compressed interaction tower. Three layers, each of
            # width `field_size`; outputs are concatenated and summed over the
            # embedding axis, then passed through a small residual-style MLP.
            field_nums = [self.field_size]
            final_len = 0
            # self.embeddings = None * F * K
            hidden_nn_layers = [self.embeddings]
            final_result = []
            # split along the embedding axis into K tensors of shape None * F * 1
            split_tensor0 = tf.split(hidden_nn_layers[-1], self.embedding_size * [1], 2)
            for idx, layer_size in enumerate([self.field_size] * 3):
                split_tensor = tf.split(hidden_nn_layers[-1], self.embedding_size * [1], 2)
                # outer products between layer-0 fields and current fields,
                # per embedding dimension
                dot_result_m = tf.matmul(split_tensor0, split_tensor, transpose_b=True)
                dot_result_o = tf.reshape(
                    dot_result_m,
                    shape=[self.embedding_size, -1, field_nums[0] * field_nums[-1]])
                dot_result = tf.transpose(dot_result_o, perm=[1, 0, 2])
                # 1x1 conv over the interaction maps compresses them to `layer_size` channels
                filters = tf.get_variable(
                    name="f_" + str(idx),
                    shape=[1, field_nums[-1] * field_nums[0], layer_size],
                    dtype=tf.float32)
                curr_out = tf.nn.conv1d(dot_result, filters=filters, stride=1, padding='VALID')
                # bias (comment "if bians:" in the original suggests this was once optional)
                b = tf.get_variable(name="f_b" + str(idx),
                                    shape=[layer_size],
                                    dtype=tf.float32,
                                    initializer=tf.zeros_initializer())
                curr_out = tf.nn.bias_add(curr_out, b)
                curr_out = tf.nn.relu(curr_out)
                curr_out = tf.transpose(curr_out, perm=[0, 2, 1])
                direct_connect = curr_out
                next_hidden = curr_out
                final_len += layer_size
                field_nums.append(int(layer_size))
                final_result.append(direct_connect)
                hidden_nn_layers.append(next_hidden)
            result = tf.concat(final_result, axis=1)
            result = tf.reduce_sum(result, -1)
            # res net: project to 128, dropout+relu, then concat back with the
            # raw CIN output before the final projection to embedding_size.
            w_nn_output1 = tf.get_variable(name='w_nn_output1',
                                           shape=[final_len, 128],
                                           dtype=tf.float32)
            b_nn_output1 = tf.get_variable(
                name='b_nn_output1', shape=[128], dtype=tf.float32,
                initializer=tf.zeros_initializer())
            exFM_out0 = tf.nn.xw_plus_b(result, w_nn_output1, b_nn_output1)
            exFM_out1 = tf.nn.dropout(exFM_out0, 0.3)  # NOTE(review): keep_prob=0.3 is aggressive — confirm
            exFM_out1 = tf.nn.relu(exFM_out1)
            w_nn_output2 = tf.get_variable(
                name='w_nn_output2',
                shape=[128 + final_len, self.embedding_size],
                dtype=tf.float32)
            b_nn_output2 = tf.get_variable(
                name='b_nn_output2',
                shape=[self.embedding_size],
                dtype=tf.float32,
                initializer=tf.zeros_initializer())
            exFM_in = tf.concat([exFM_out1, result], axis=1, name="user_emb")
            self.exFM_out = tf.nn.xw_plus_b(exFM_in, w_nn_output2, b_nn_output2)

        # ---------- Deep component ----------
        self.y_deep = tf.reshape(
            self.embeddings,
            shape=[-1, self.field_size * self.embedding_size])  # None * (F*K)
        self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
        for i in range(0, len(self.deep_layers)):
            self.y_deep = tf.add(
                tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                self.weights["bias_%d" % i])  # None * layer[i] * 1
            if self.batch_norm:
                self.y_deep = self.batch_norm_layer(
                    self.y_deep, train_phase=self.train_phase,
                    scope_bn="bn_%d" % i)  # None * layer[i] * 1
            self.y_deep = self.deep_layers_activation(self.y_deep)
            self.y_deep = tf.nn.dropout(
                self.y_deep, self.dropout_keep_deep[1 + i])  # dropout at each Deep layer

        # ---------- DeepFM: concatenate the enabled components ----------
        if self.use_fm and self.use_deep:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order, self.y_deep], axis=1)
        elif self.use_xfm and self.use_deep:
            concat_input = tf.concat(
                [self.y_first_order, self.exFM_out, self.y_deep], axis=1)
        elif self.use_fm:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order], axis=1)
        elif self.use_deep:
            concat_input = self.y_deep
        self.out = tf.add(
            tf.matmul(concat_input, self.weights["concat_projection"]),
            self.weights["concat_bias"])

        # loss
        if self.loss_type == "logloss":
            self.out = tf.nn.sigmoid(self.out)
            self.loss = tf.losses.log_loss(self.label, self.out)
        elif self.loss_type == "mse":
            self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
        # l2 regularization on weights (projection + each deep layer)
        if self.l2_reg > 0:
            self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                self.weights["concat_projection"])
            if self.use_deep:
                for i in range(len(self.deep_layers)):
                    self.loss += tf.contrib.layers.l2_regularizer(
                        self.l2_reg)(self.weights["layer_%d" % i])

        # optimizer
        if self.optimizer_type == "adam":
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                epsilon=1e-8).minimize(self.loss)
        elif self.optimizer_type == "adagrad":
            self.optimizer = tf.train.AdagradOptimizer(
                learning_rate=self.learning_rate,
                initial_accumulator_value=1e-8).minimize(self.loss)
        elif self.optimizer_type == "gd":
            self.optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.learning_rate).minimize(self.loss)
        elif self.optimizer_type == "momentum":
            self.optimizer = tf.train.MomentumOptimizer(
                learning_rate=self.learning_rate,
                momentum=0.95).minimize(self.loss)
        elif self.optimizer_type == "yellowfin":
            self.optimizer = YFOptimizer(learning_rate=self.learning_rate,
                                         momentum=0.0).minimize(self.loss)

        # init: saver, session, variable initialization
        self.model_path = './model'
        self.saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        self.sess = self._init_session()
        self.sess.run(init)

        # number of params (product of each weight tensor's static dims)
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        if self.verbose > 0:
            print("#params: %d" % total_parameters)
def _init_graph(self):
    """Build the TF1 computation graph for a plain DeepFM model.

    Creates input placeholders, the FM first- and second-order terms, the
    deep MLP, loss (logloss or MSE) with optional L2 regularization, the
    optimizer selected by `self.opt_type`, a saver, and the session; then
    initializes variables and prints the parameter count.

    Side effects: sets placeholders, `self.out`, `self.loss`, `self.opt`,
    `self.sess`, `self.saver` on `self`.

    Fixes vs. original:
      * `tf.train.MomentumOptimizer` was called without its required
        `momentum` argument (a `TypeError` at graph-build time); now passes
        momentum=0.95, matching the sibling `_init_graph` implementations.
      * parameter counting now uses `dim.value` (plain int) instead of the
        TF1 `Dimension` object, consistent with the sibling blocks.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        tf.set_random_seed(self.rand_seed)
        # ---------- input placeholders ----------
        self.feat_index = tf.placeholder(tf.int32, shape=[None, None], name="feat_index")
        self.feat_value = tf.placeholder(tf.float32, shape=[None, None], name="feat_value")
        self.label = tf.placeholder(tf.float32, shape=[None, 1], name="label")
        self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_fm")
        self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_deep")
        self.train_phase = tf.placeholder(tf.bool, name="train_phase")

        self.weight = self._init_weight()

        # Build the DeepFM graph: embeddings scaled by raw feature values
        # (value is 1 for one-hot categorical fields).
        self.embeddings = tf.nn.embedding_lookup(
            self.weight['feat_embeddings'], self.feat_index)
        feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
        self.embeddings = tf.multiply(self.embeddings, feat_value)  # None*F*K

        # ---------- first order term ----------
        self.y_first_order = tf.nn.embedding_lookup(
            self.weight["feat_bias"], self.feat_index)
        self.y_first_order = tf.reduce_sum(
            tf.multiply(self.y_first_order, feat_value), 2)
        self.y_first_order = tf.nn.dropout(self.y_first_order,
                                           self.dropout_keep_fm[0])

        # ---------- second order term ----------
        # FM identity: pairwise interactions = 0.5 * ((sum)^2 - sum of squares)
        # sum first, then square
        sum_feat_emb = tf.reduce_sum(self.embeddings, 1)
        self.sum_square_feat_emb = tf.square(sum_feat_emb)
        # square first, then sum
        sqrt_feat_emb = tf.square(self.embeddings)
        self.sqrt_sum_feat_emb = tf.reduce_sum(sqrt_feat_emb, 1)
        self.y_second_order = 0.5 * tf.subtract(self.sum_square_feat_emb,
                                                self.sqrt_sum_feat_emb)
        self.y_second_order = tf.nn.dropout(self.y_second_order,
                                            self.dropout_keep_fm[1])

        # ---------- deep component ----------
        self.y_deep = tf.reshape(
            self.embeddings, [-1, self.field_size * self.embedding_size])
        self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
        for i in range(0, len(self.deep_layer)):
            self.y_deep = tf.add(
                tf.matmul(self.y_deep, self.weight["layer_%d" % i]),
                self.weight["bias_%d" % i])
            if self.batch_norm:
                self.y_deep = self.batch_norm_layer(self.y_deep,
                                                    self.train_phase,
                                                    scope_bn="bn_%d" % i)
            # NOTE(review): unlike the sibling _init_graph, no per-layer
            # dropout after activation here — presumably intentional; confirm.
            self.y_deep = self.deep_layer_activation(self.y_deep)

        # ---------- concatenate the enabled components ----------
        if self.use_deep and self.use_fm:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order, self.y_deep], axis=1)
        elif self.use_fm:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order], axis=1)
        elif self.use_deep:
            concat_input = self.y_deep
        self.out = tf.add(
            tf.matmul(concat_input, self.weight['concat_projection']),
            self.weight['concat_bias'])

        # ---------- loss ----------
        if self.loss_type == 'logloss':
            self.out = tf.sigmoid(self.out)
            self.loss = tf.losses.log_loss(self.label, self.out)
        elif self.loss_type == 'mse':
            self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
        # optional L2 regularization on projection + deep-layer weights
        if self.l2_reg > 0:
            self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                self.weight["concat_projection"])
            if self.use_deep:
                for i in range(len(self.deep_layer)):
                    self.loss += tf.contrib.layers.l2_regularizer(
                        self.l2_reg)(self.weight["layer_%d" % i])

        # ---------- optimizer ----------
        # NOTE: 'adagrade' (sic) is the key callers pass; kept for compatibility.
        if self.opt_type == 'adam':
            self.opt = tf.train.AdamOptimizer(self.learning_rate).minimize(
                self.loss)
        elif self.opt_type == 'adagrade':
            self.opt = tf.train.AdagradOptimizer(
                self.learning_rate).minimize(self.loss)
        elif self.opt_type == 'gd':
            self.opt = tf.train.GradientDescentOptimizer(
                self.learning_rate).minimize(self.loss)
        elif self.opt_type == 'momentum':
            # FIX: `momentum` is a required argument of MomentumOptimizer;
            # the original call omitted it and would raise TypeError.
            # 0.95 matches the sibling _init_graph implementations.
            self.opt = tf.train.MomentumOptimizer(
                self.learning_rate, momentum=0.95).minimize(self.loss)
        elif self.opt_type == 'yellowfin':
            self.opt = YFOptimizer(
                learning_rate=self.learning_rate).minimize(self.loss)

        # ---------- init: saver, session, variables ----------
        self.saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        self.sess = self._init_session()
        self.sess.run(init)

        # parameter count (product of each weight tensor's static dims)
        total_parametres = 0
        for variable in self.weight.values():
            shape = variable.get_shape()
            variable_parametres = 1
            for dim in shape:
                # FIX: use the plain int (`.value`) rather than the TF1
                # Dimension object, consistent with the sibling blocks.
                variable_parametres *= dim.value
            total_parametres += variable_parametres
        if self.verbose > 0:
            print("# params: %d" % total_parametres)
for p in m2.parameters(): p.requires_grad = args.learn_bn if args.learn_inhibition: for p in model.module.parameters(): p.requires_grad=True params = trainableParams(model) print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters() if p.requires_grad)/1000000.0)) if opt_ == 'sgd': print('optimizer.... - sgd') optimizer = optim.SGD(params , lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif opt_ == 'adam': optimizer = optim.Adam(params) elif opt_ == 'yf': print('USING YF OPTIMIZER') optimizer = YFOptimizer( params, lr=args.lr, mu=0.0, weight_decay=args.weight_decay, clip_thresh=2.0, curv_win_width=20) optimizer._sparsity_debias = False else: raise Exception('unsupported optimizer type',opt_) nParamsPath = os.path.join(args.checkpoint, 'n_params.txt') with open(nParamsPath, 'w') as f: s1 = 'active_params {} \n'.format(sum(p.numel() for p in model.parameters() if p.requires_grad)) f.write(s1) s2 = 'total_params {} \n'.format(sum(p.numel() for p in model.parameters())) f.write(s2) if args.print_params_and_exit: exit() # Resume title = 'cifar-10-' + args.arch
def train_eval_model(graph_hyper_params):
    """Train and evaluate the CTR model described by `graph_hyper_params`.

    Python 2 / TF1 training driver: loads the dataset, re-splits train/dev,
    builds the graph (placeholders + `inference(...)`), selects an optimizer,
    then runs the epoch loop with periodic AUC/Gini evaluation on dev.

    `graph_hyper_params` is a dict; keys read here include: 'neg_start',
    'neg_size', 'formal', 'o_dev_size', 'only_train', 'creativeSize_pro',
    'learn_rate', 'opt', 'model', 'batch_size', 'epoch', 'debug',
    'show_peroid' (sic — the typo is the actual key callers use).

    NOTE(review): relies on Python 2 integer division for batch counts
    (`len(data) / batch_size`) — would need `//` under Python 3.
    """
    def construct_train_data(pos_train_data, neg_train_data, graph_hyper_params):
        """Pair all positives with the next window of negatives.

        Takes `neg_size * len(pos)` negatives starting at offset
        `neg_start * len(pos)`, wrapping to the front when the window runs
        past the end, then shuffles the combined frame.
        """
        # global pos_train_data, neg_train_data, start_neg
        pos_len, neg_len = len(pos_train_data), len(neg_train_data)
        # print start_neg, pos_len, neg_len
        if graph_hyper_params['neg_start'] * pos_len + graph_hyper_params['neg_size'] * pos_len < neg_len:
            this_neg_train_data = neg_train_data[graph_hyper_params['neg_start'] * pos_len:
                                                 graph_hyper_params['neg_start'] * pos_len + graph_hyper_params['neg_size'] * pos_len]
        else:
            # window exceeds the negative pool: wrap around to the front
            print 'fianl ! fianl ! fianl ! fianl !'
            this_neg_train_data = pd.concat([neg_train_data[graph_hyper_params['neg_start'] * pos_len:],
                                             neg_train_data[: pos_len - max(0, neg_len - graph_hyper_params['neg_start'] * pos_len)]])
        train_data = pd.concat([pos_train_data, this_neg_train_data])
        return shuffle(train_data)

    print graph_hyper_params
    print 'read data start !'
    pos_train_data, neg_train_data, predict_data1, predict_data2, user_data, ad_data, feature_conf_dict, uid_map, aid_map = get_prod_dataset(graph_hyper_params['formal'])
    print 'read data done !'

    # Re-split train/dev: the first `o_dev_size` positives and negatives
    # become the dev set; the rest stays for training.
    o_dev_size = graph_hyper_params['o_dev_size']
    dev_data = pd.concat([pos_train_data[:o_dev_size], neg_train_data[:o_dev_size]])
    pos_train_data, neg_train_data = pos_train_data[o_dev_size:], neg_train_data[o_dev_size:]
    print 'dev_size:', len(dev_data)
    print 'pos-neg-len:', len(pos_train_data), len(neg_train_data)
    train_data = construct_train_data(pos_train_data, neg_train_data, graph_hyper_params)

    if graph_hyper_params['only_train']:
        # Keep only the user rows actually referenced by train/dev to save memory
        # (the extra ids [1,2,3,4] match the dummy ids used in non-formal mode).
        if graph_hyper_params['formal']:
            formal_set = set(list(train_data['uid']) + list(dev_data['uid']))
        else:
            formal_set = set(list(train_data['uid']) + list(dev_data['uid']) + [1, 2, 3, 4])
        user_data = user_data[user_data['uid'].isin(formal_set)]
        import gc
        gc.collect()

    # Map uid/aid -> positional row index for fast .iloc lookups in get_fed_dict
    print 'map row start'
    uid_map_row, aid_map_row = dict(zip(user_data['uid'].values, np.arange(len(user_data)))), dict(zip(ad_data['aid'].values, np.arange(len(ad_data))))
    print 'map row end'
    print feature_conf_dict

    graph = tf.Graph()
    with graph.as_default():
        # Preprocessing for the single continuous feature `creativeSize`:
        # either min-max normalize (float placeholder) or discretize via
        # ShrinkSep (int placeholder).
        if graph_hyper_params['creativeSize_pro'] == 'min_max':
            print 'min-max norm creativeSize', ad_data['creativeSize'].max(), ad_data['creativeSize'].min()
            norm_cs = (ad_data['creativeSize'] * 1.0 - ad_data['creativeSize'].min()) / (
                ad_data['creativeSize'].max() - ad_data['creativeSize'].min())
            ad_data = ad_data.drop(['creativeSize'], axis=1)
            ad_data['creativeSize'] = norm_cs
            creativesize_p = tf.placeholder(tf.float32, [None, 1], name="creativeSize")
        elif graph_hyper_params['creativeSize_pro'] == 'li_san':
            print '离散化 creativeSize'
            sh = ShrinkSep()
            ad_data['creativeSize'] = ad_data['creativeSize'].apply(sh)
            feature_conf_dict['creativeSize'] = len(sh.d) + 1
            creativesize_p = tf.placeholder(tf.int32, [None, 1], name="creativeSize")
        else:
            # NOTE(review): falling through here leaves `creativesize_p`
            # undefined, which crashes at the inference(...) call — confirm
            # 'creativeSize_pro' is always one of the two values above.
            print 'no process creativeSize'

        # ****************************************************************** place holder start
        # Single-valued user profile fields (one int id each)
        uid_p = tf.placeholder(tf.int32, [None, 1], name="uid")
        lbs_p = tf.placeholder(tf.int32, [None, 1], name="LBS")
        age_p = tf.placeholder(tf.int32, [None, 1], name="age")
        carrier_p = tf.placeholder(tf.int32, [None, 1], name="carrier")
        consumptionability_p = tf.placeholder(tf.int32, [None, 1], name="consumptionAbility")
        education_p = tf.placeholder(tf.int32, [None, 1], name="education")
        gender_p = tf.placeholder(tf.int32, [None, 1], name="gender")
        house_p = tf.placeholder(tf.int32, [None, 1], name="house")
        os_p = tf.placeholder(tf.int32, [None, 1], name="os")
        ct_p = tf.placeholder(tf.int32, [None, 1], name="ct")
        # marriagestatus_p = tf.placeholder(tf.int32, [None, 1], name="marriageStatus")

        # Multi-valued fields: an index tensor plus a value (weight) tensor.
        # NOTE(review): some fields read feature_conf_dict[...][1] and others
        # [0] for the padded length — presumably the conf tuples differ per
        # field type; verify against get_prod_dataset.
        appidaction_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['appIdAction'][1]], name="appidaction_index")
        appidaction_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['appIdAction'][1]], name="appidaction_val")
        appIdInstall_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['appIdInstall'][1]], name="appIdInstall_index")
        appIdInstall_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['appIdInstall'][1]], name="appIdInstall_val")
        marriagestatus_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['marriageStatus'][0]], name="marriageStatus_index")
        marriagestatus_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['marriageStatus'][0]], name="marriageStatus_val")
        interest1_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['interest1'][0]], name="interest1_index")
        interest1_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['interest1'][0]], name="interest1_val")
        interest2_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['interest2'][0]], name="interest2_index")
        interest2_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['interest2'][0]], name="interest2_val")
        interest3_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['interest3'][0]], name="interest3_index")
        interest3_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['interest3'][0]], name="interest3_val")
        interest4_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['interest4'][0]], name="interest4_index")
        interest4_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['interest4'][0]], name="interest4_val")
        interest5_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['interest5'][0]], name="interest5_index")
        interest5_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['interest5'][0]], name="interest5_val")
        # kmeans type
        # clu_200_p = tf.placeholder(tf.int32, [None, 1], name="clu_200_p")
        # clu_400_p = tf.placeholder(tf.int32, [None, 1], name="clu_400_p")
        kw1_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['kw1'][1]], name="kw1_index")
        kw1_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['kw1'][1]], name="kw1_val")
        kw2_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['kw2'][1]], name="kw2_index")
        kw2_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['kw2'][1]], name="kw2_val")
        kw3_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['kw3'][1]], name="kw3_index")
        kw3_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['kw3'][1]], name="kw3_val")
        topic1_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['topic1'][1]], name="topic1_index")
        topic1_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['topic1'][1]], name="topic1_val")
        topic2_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['topic2'][1]], name="topic2_index")
        topic2_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['topic2'][1]], name="topic2_val")
        topic3_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['topic3'][1]], name="topic3_index")
        topic3_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['topic3'][1]], name="topic3_val")

        # Ad-side single-valued fields
        aid_p = tf.placeholder(tf.int32, [None, 1], name="aid")
        advertiserid_p = tf.placeholder(tf.int32, [None, 1], name="advertiserId")
        campaignid_p = tf.placeholder(tf.int32, [None, 1], name="campaignId")
        creativeid_p = tf.placeholder(tf.int32, [None, 1], name="creativeId")
        adcategoryid_p = tf.placeholder(tf.int32, [None, 1], name="adCategoryId")
        productid_p = tf.placeholder(tf.int32, [None, 1], name="productId")
        producttype_p = tf.placeholder(tf.int32, [None, 1], name="productType")
        true_label = tf.placeholder(tf.float32, [None, 1], name="true_label")
        # ****************************************************************** place holder end

        # Build the model; `network_params` exposes intermediate tensors
        # used below for NaN debugging.
        pred_val, model_loss, network_params = inference(uid_p, lbs_p, age_p, carrier_p, consumptionability_p,
                                                         education_p, gender_p, house_p, os_p, ct_p,
                                                         marriagestatus_index_p, marriagestatus_val_p,
                                                         appidaction_index_p, appidaction_val_p,
                                                         appIdInstall_index_p, appIdInstall_val_p,
                                                         interest1_index_p, interest1_val_p,
                                                         interest2_index_p, interest2_val_p,
                                                         interest3_index_p, interest3_val_p,
                                                         interest4_index_p, interest4_val_p,
                                                         interest5_index_p, interest5_val_p,
                                                         kw1_index_p, kw1_val_p, kw2_index_p, kw2_val_p,
                                                         kw3_index_p, kw3_val_p,
                                                         topic1_index_p, topic1_val_p,
                                                         topic2_index_p, topic2_val_p,
                                                         topic3_index_p, topic3_val_p,
                                                         aid_p, advertiserid_p, campaignid_p, creativeid_p,
                                                         adcategoryid_p, productid_p, producttype_p,
                                                         creativesize_p, true_label, feature_conf_dict,
                                                         graph_hyper_params)
        global_step = tf.Variable(0, name="global_step", trainable=False)
        train_step = None
        # learning rate is a variable so it can be halved via the decay op below
        learning_rate = tf.Variable(float(graph_hyper_params['learn_rate']), trainable=False, dtype=tf.float32)
        learning_rate_decay_op = learning_rate.assign(learning_rate * 0.5)
        if graph_hyper_params['opt'] == 'adam':
            train_step = tf.train.AdamOptimizer(learning_rate).minimize(model_loss, global_step=global_step)
        elif graph_hyper_params['opt'] == 'adgrad':
            train_step = tf.train.AdagradOptimizer(learning_rate).minimize(model_loss, global_step=global_step)
        elif graph_hyper_params['opt'] == 'adadelta':
            train_step = tf.train.AdadeltaOptimizer(learning_rate).minimize(model_loss, global_step=global_step)
        elif graph_hyper_params['opt'] == 'ftrl':
            train_step = tf.train.FtrlOptimizer(learning_rate).minimize(model_loss, global_step=global_step)
        elif graph_hyper_params['opt'] == 'sgd':
            train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(model_loss, global_step=global_step)
        elif graph_hyper_params['opt'] == "yellowfin":
            train_step = YFOptimizer(learning_rate=learning_rate, momentum=0.0).minimize(model_loss, global_step=global_step)
        else:
            # NOTE(review): train_step stays None here and sess.run will fail later
            print 'No optimizer !'

        # Timestamped checkpoint directory per run
        time_now = 'model_' + str(graph_hyper_params['model']) + datetime.now().strftime("_%Y_%m_%d_%H_%M_%S")
        checkpoint_dir = os.path.abspath("./checkpoints/dmf_tencent/" + time_now)
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

        def get_fed_dict(b_data, split_vector_data, feature_conf_dict, predict=False):
            """Build the feed_dict for one batch.

            Resolves the batch's uids/aids to user/ad rows via the row maps,
            expands single-valued columns to (batch, 1), and runs the
            multi-valued columns through `split_vector_data` to get
            (index, value) pairs. When `predict` is True the label entry is
            omitted.
            """
            if graph_hyper_params['formal']:
                aid_list = b_data['aid'].values
                uid_list = b_data['uid'].values
            else:
                # non-formal (smoke-test) mode uses fixed dummy ids
                if len(b_data) == 4:
                    aid_list, uid_list = [1, 2, 3, 4], [1, 2, 3, 4]
                elif len(b_data) == 3:
                    aid_list, uid_list = [1, 2, 3], [1, 2, 3]
                else:
                    aid_list, uid_list = [1], [1]
            # print 11
            # d1 = datetime.now()
            b_u_d, b_a_d = [], []
            for b_uid in uid_list:
                b_u_d.append(user_data.iloc[uid_map_row[b_uid]])
            for b_aid in aid_list:
                b_a_d.append(ad_data.iloc[aid_map_row[b_aid]])
            # stack the Series as columns, then transpose back to rows
            b_u_d = pd.concat(b_u_d, axis=1).transpose()
            b_a_d = pd.concat(b_a_d, axis=1).transpose()
            # d3 = datetime.now()
            # print 12
            # pd.concat([data.iloc[1].to_frame(), data.iloc[2].to_frame()], axis=1).transpose()
            fed_dict = {}
            # user-side single-valued fields
            fed_dict[uid_p] = np.expand_dims(b_u_d['uid'], axis=1)
            fed_dict[lbs_p] = np.expand_dims(b_u_d['LBS'], axis=1)
            fed_dict[age_p] = np.expand_dims(b_u_d['age'], axis=1)
            fed_dict[carrier_p] = np.expand_dims(b_u_d['carrier'], axis=1)
            fed_dict[consumptionability_p] = np.expand_dims(b_u_d['consumptionAbility'], axis=1)
            fed_dict[education_p] = np.expand_dims(b_u_d['education'], axis=1)
            fed_dict[gender_p] = np.expand_dims(b_u_d['gender'], axis=1)
            fed_dict[house_p] = np.expand_dims(b_u_d['house'], axis=1)
            fed_dict[os_p] = np.expand_dims(b_u_d['os'], axis=1)
            fed_dict[ct_p] = np.expand_dims(b_u_d['ct'], axis=1)
            # fed_dict[marriagestatus_p] = np.expand_dims(b_u_d['marriageStatus'], axis=1)
            # print 121
            # multi-valued fields -> (index, value) pairs
            appidaction_li = split_vector_data(b_u_d['appIdAction'])
            # print 1212
            fed_dict[appidaction_index_p], fed_dict[appidaction_val_p] = appidaction_li[0], appidaction_li[1]
            appIdInstall_li = split_vector_data(b_u_d['appIdInstall'])
            fed_dict[appIdInstall_index_p], fed_dict[appIdInstall_val_p] = appIdInstall_li[0], appIdInstall_li[1]
            # print 122
            marriagestatus_li = split_vector_data(b_u_d['marriageStatus'], interest='marriageStatus', feature_config=feature_conf_dict)
            fed_dict[marriagestatus_index_p], fed_dict[marriagestatus_val_p] = marriagestatus_li[0], marriagestatus_li[1]
            interest1_li = split_vector_data(b_u_d['interest1'], interest='interest1', feature_config=feature_conf_dict)
            fed_dict[interest1_index_p], fed_dict[interest1_val_p] = interest1_li[0], interest1_li[1]
            interest2_li = split_vector_data(b_u_d['interest2'], interest='interest2', feature_config=feature_conf_dict)
            fed_dict[interest2_index_p], fed_dict[interest2_val_p] = interest2_li[0], interest2_li[1]
            interest3_li = split_vector_data(b_u_d['interest3'], interest='interest3', feature_config=feature_conf_dict)
            fed_dict[interest3_index_p], fed_dict[interest3_val_p] = interest3_li[0], interest3_li[1]
            interest4_li = split_vector_data(b_u_d['interest4'], interest='interest4', feature_config=feature_conf_dict)
            fed_dict[interest4_index_p], fed_dict[interest4_val_p] = interest4_li[0], interest4_li[1]
            interest5_li = split_vector_data(b_u_d['interest5'], interest='interest5', feature_config=feature_conf_dict)
            fed_dict[interest5_index_p], fed_dict[interest5_val_p] = interest5_li[0], interest5_li[1]
            # print 123
            kw1_li = split_vector_data(b_u_d['kw1'])
            fed_dict[kw1_index_p], fed_dict[kw1_val_p] = kw1_li[0], kw1_li[1]
            kw2_li = split_vector_data(b_u_d['kw2'])
            fed_dict[kw2_index_p], fed_dict[kw2_val_p] = kw2_li[0], kw2_li[1]
            kw3_li = split_vector_data(b_u_d['kw3'])
            fed_dict[kw3_index_p], fed_dict[kw3_val_p] = kw3_li[0], kw3_li[1]
            # print 124
            topic1_li = split_vector_data(b_u_d['topic1'])
            fed_dict[topic1_index_p], fed_dict[topic1_val_p] = topic1_li[0], topic1_li[1]
            topic2_li = split_vector_data(b_u_d['topic2'])
            fed_dict[topic2_index_p], fed_dict[topic2_val_p] = topic2_li[0], topic2_li[1]
            topic3_li = split_vector_data(b_u_d['topic3'])
            fed_dict[topic3_index_p], fed_dict[topic3_val_p] = topic3_li[0], topic3_li[1]
            # print 125
            # # ad-side fields
            fed_dict[aid_p] = np.expand_dims(b_a_d['aid'], axis=1)
            fed_dict[advertiserid_p] = np.expand_dims(b_a_d['advertiserId'], axis=1)
            fed_dict[campaignid_p] = np.expand_dims(b_a_d['campaignId'], axis=1)
            fed_dict[creativeid_p] = np.expand_dims(b_a_d['creativeId'], axis=1)
            fed_dict[adcategoryid_p] = np.expand_dims(b_a_d['adCategoryId'], axis=1)
            fed_dict[productid_p] = np.expand_dims(b_a_d['productId'], axis=1)
            fed_dict[producttype_p] = np.expand_dims(b_a_d['productType'], axis=1)
            # print 13
            # fed_dict[creativesize_p] = np.expand_dims(b_a_d['creativeSize'], axis=1)
            # creativeSize dtype must match the placeholder chosen above
            if graph_hyper_params['creativeSize_pro'] == 'min_max':
                fed_dict[creativesize_p] = np.expand_dims(b_a_d['creativeSize'], axis=1).astype(np.float32)
            elif graph_hyper_params['creativeSize_pro'] == 'li_san':
                fed_dict[creativesize_p] = np.expand_dims(b_a_d['creativeSize'], axis=1)
            else:
                print 'wrong feed'
            # label (skipped at prediction time)
            # print 14
            if not predict:
                fed_dict[true_label] = np.expand_dims(b_data['label'].values, axis=1).astype(np.float32)
            # print 15
            # d4 = datetime.now()
            # print d2-d1, d3-d2, d4-d3
            # print fed_dict[true_label]
            # print len(fed_dict[true_label]), len(fed_dict[aid_p]), len(fed_dict[uid_p]),
            return fed_dict

        def eval_on_dev(split_vector_data):
            """Score the dev set in batches; return (AUC, normalized Gini).

            Also dumps intermediate tensors when NaNs show up in predictions,
            and prints the two highest / two lowest predicted scores.
            """
            e_b_s = len(dev_data) / graph_hyper_params['batch_size']  # py2 int division
            auc_true, auc_pre = [], []
            # auc = []
            for index in tqdm(range(e_b_s)):
                start = index * graph_hyper_params['batch_size']
                end = (index + 1) * graph_hyper_params['batch_size'] if (index + 1) * graph_hyper_params['batch_size'] < len(dev_data) else len(dev_data)
                b_dev_data = dev_data[start:end]
                fed_dict = get_fed_dict(b_dev_data, split_vector_data, feature_conf_dict)
                pred_value, pre_pred_value, final_vec, uu, vv = sess.run([pred_val, network_params[0], network_params[1], network_params[2], network_params[3]], feed_dict=fed_dict)
                pre_real_val = np.array(pred_value).reshape((-1))
                auc_true = auc_true + list(b_dev_data['label'].values)
                auc_pre = auc_pre + pre_real_val.tolist()
                if True in np.isnan(pre_real_val):
                    # dump the intermediate tensors to locate the NaN source
                    print 'contain nan: ', np.array(pre_pred_value).reshape((-1))
                    print np.array(final_vec).reshape((-1))
                    print np.array(uu).reshape((-1))
                    print np.array(vv).reshape((-1))
                # auc.append()
            # auc_pre = np.array(auc_pre)
            # auc_pre = np.exp(auc_pre) / np.exp(auc_pre).sum()
            # print auc_true
            # print auc_pre
            fpr, tpr, thresholds = metrics.roc_curve(auc_true, auc_pre, pos_label=1)
            auc_v, gni = metrics.auc(fpr, tpr), gini_norm(auc_true, auc_pre)
            auc_pre_2 = np.array(auc_pre)
            auc_pre_2.sort()
            print('dev_pre_top2=%.4f %.4f min2=%.4f %.4f' % (auc_pre_2.tolist()[-1], auc_pre_2.tolist()[-2], auc_pre_2.tolist()[0], auc_pre_2.tolist()[1]))
            return auc_v, gni

        # ---------- training loop ----------
        best_auc = 0.0
        split_vector_data = SplitClass()
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        for epoch in range(graph_hyper_params['epoch']):
            # train only (typically) 1 epoch
            e_b_s = len(train_data) / graph_hyper_params['batch_size']  # py2 int division
            one_epoch_loss, one_epoch_batchnum = 0.0, 0.0
            for index in tqdm(range(e_b_s)):
                # print 0
                start = index * graph_hyper_params['batch_size']
                end = (index + 1) * graph_hyper_params['batch_size'] if (index + 1) * graph_hyper_params['batch_size'] < len(train_data) else len(train_data)
                b_data = train_data[start:end]
                # print 1
                # d1 = datetime.now()
                fed_dict = get_fed_dict(b_data, split_vector_data, feature_conf_dict)
                # d2 = datetime.now()
                # print 2
                _, loss_val, pre_tr_val = sess.run([train_step, model_loss, network_params[0]], feed_dict=fed_dict)
                # print 3
                # d3 = datetime.now()
                # print d2-d1, d3-d2
                one_epoch_loss += loss_val
                one_epoch_batchnum += 1.
                if graph_hyper_params['debug']:
                    print datetime.now(), index, loss_val
                pre_tr_val = np.array(pre_tr_val).reshape((-1))
                if graph_hyper_params['debug'] or True in np.isnan(pre_tr_val):
                    print pre_tr_val
                # periodic dev evaluation, `show_peroid` (sic) times per epoch
                if index != 0 and index % ((e_b_s - 1) / graph_hyper_params['show_peroid']) == 0:
                    split_vector_data.clean()
                    auc, gn = eval_on_dev(split_vector_data)
                    best_auc = max(auc, best_auc)
                    format_str = '%s epoch=%.2f avg_loss=%.4f auc=%.4f best_auc=%.4f gn=%.4f'
                    print (format_str % (datetime.now().strftime("%Y-%m-%d %H:%M:%S"), (epoch + 1.0 * (index + 1) / e_b_s), one_epoch_loss / one_epoch_batchnum, auc, best_auc, gn))
                    one_epoch_loss = one_epoch_batchnum = 0.0
                    # pass

    # ---- commented-out submission/prediction pass (kept as-is) ----
    # predict_data = predict_data1
    # if graph_hyper_params['formal']:
    #     graph_hyper_params['batch_size'] = 1024
    #     e_b_s = len(predict_data) / graph_hyper_params['batch_size'] if len(predict_data) % graph_hyper_params[
    #         'batch_size'] == 0 else len(predict_data) / graph_hyper_params['batch_size'] + 1
    #     # del split_vector_data
    #     # gc.collect()
    #     # split_vector_data = SplitClass()
    #     split_vector_data.clean()
    #     pred = []
    #     for index in tqdm(range(e_b_s)):
    #         start = index * graph_hyper_params['batch_size']
    #         end = (index + 1) * graph_hyper_params['batch_size'] if (index + 1) * graph_hyper_params['batch_size'] < len(predict_data) else len(predict_data) + 1
    #         b_predict_data = predict_data[start:end]
    #         # print len(b_predict_data), start, end
    #         # fed_dict = get_fed_dict(b_dev_data, split_vector_data, feature_conf_dict)
    #         fed_dict = get_fed_dict(b_predict_data, split_vector_data, feature_conf_dict, predict=True)
    #         fed_dict[train_p] = False
    #         fed_dict[dropout_p] = np.array([1.0])
    #         pred_value = sess.run([pred_val], feed_dict=fed_dict)
    #         # print pred_value
    #         pre_real_val = np.array(pred_value).reshape((-1))
    #         pred = pred + pre_real_val.tolist()
    #
    #     predict_data['pred_label'] = pred
    #     csv_data = predict_data[['ori_aid', 'ori_uid', 'pred_label']]
    #     csv_data.columns = ['aid', 'uid', 'score']
    #     csv_path = os.path.join(checkpoint_dir, 'n' + str(graph_hyper_params['neg_start']) + '_submission.csv')
    #     csv_data.to_csv(csv_path, index=False)
    #     print 'submission_path:', csv_path
    pass
def _init_graph(self):
    """Build the DeepFM computation graph, session, and optimizer.

    Creates input placeholders, the FM first/second-order terms, the deep
    tower, the combined DeepFM output, the loss (with optional L2 on the
    projection/deep weights), the optimizer selected by
    ``self.optimizer_type``, then initializes a session and, when
    ``self.verbose > 0``, prints the total parameter count.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        # set_random_seed makes random ops reproducible across sessions
        # without seeding each op individually.
        tf.set_random_seed(self.random_seed)

        self.feat_index = tf.placeholder(tf.int32, shape=[None, None],
                                         name="feat_index")  # None * F
        self.feat_value = tf.placeholder(tf.float32, shape=[None, None],
                                         name="feat_value")  # None * F
        self.label = tf.placeholder(tf.float32, shape=[None, 1],
                                    name="label")  # None * 1
        self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None],
                                              name="dropout_keep_fm")
        self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None],
                                                name="dropout_keep_deep")
        self.train_phase = tf.placeholder(tf.bool, name="train_phase")

        self.weights = self._initialize_weights()

        # model
        # Look up each field's embedding row. For one-hot categorical
        # fields exactly one index per field is active, so this selects
        # that field's embedding vector from the (M, K) table.
        self.embeddings = tf.nn.embedding_lookup(
            self.weights["feature_embeddings"], self.feat_index)  # None * F * K
        feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
        # Element-wise multiply (tf.multiply, not tf.matmul): scales each
        # embedding by the raw feature value, which matters for numeric
        # features; categorical values are 1, so they are unchanged.
        self.embeddings = tf.multiply(self.embeddings, feat_value)

        # ---------- first order term ----------
        self.y_first_order = tf.nn.embedding_lookup(
            self.weights["feature_bias"], self.feat_index)  # None * F * 1
        self.y_first_order = tf.reduce_sum(
            tf.multiply(self.y_first_order, feat_value), 2)  # None * F
        self.y_first_order = tf.nn.dropout(
            self.y_first_order, self.dropout_keep_fm[0])  # None * F

        # ---------- second order term ----------
        # FM pairwise term via the identity 0.5 * ((sum v)^2 - sum v^2)
        # over the field axis.
        # sum_square part
        self.summed_features_emb = tf.reduce_sum(self.embeddings, 1)  # None * K
        self.summed_features_emb_square = tf.square(
            self.summed_features_emb)  # None * K
        # square_sum part
        self.squared_features_emb = tf.square(self.embeddings)
        self.squared_sum_features_emb = tf.reduce_sum(
            self.squared_features_emb, 1)  # None * K
        # second order
        self.y_second_order = 0.5 * tf.subtract(
            self.summed_features_emb_square,
            self.squared_sum_features_emb)  # None * K
        self.y_second_order = tf.nn.dropout(
            self.y_second_order, self.dropout_keep_fm[1])  # None * K

        # ---------- Deep component ----------
        # The deep tower consumes the shared embeddings, so backprop
        # through it also updates the embedding table.
        self.y_deep = tf.reshape(
            self.embeddings,
            shape=[-1, self.field_size * self.embedding_size])  # None * (F*K)
        self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
        for i in range(0, len(self.deep_layers)):
            self.y_deep = tf.add(
                tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                self.weights["bias_%d" % i])  # None * layer[i] * 1
            if self.batch_norm:
                self.y_deep = self.batch_norm_layer(
                    self.y_deep, train_phase=self.train_phase,
                    scope_bn="bn_%d" % i)  # None * layer[i] * 1
            self.y_deep = self.deep_layers_activation(self.y_deep)
            self.y_deep = tf.nn.dropout(
                self.y_deep,
                self.dropout_keep_deep[1 + i])  # dropout at each Deep layer

        # ---------- DeepFM ----------
        if self.use_fm and self.use_deep:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order, self.y_deep], axis=1)
        elif self.use_fm:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order], axis=1)
        elif self.use_deep:
            concat_input = self.y_deep
        self.out = tf.add(
            tf.matmul(concat_input, self.weights["concat_projection"]),
            self.weights["concat_bias"])

        # loss
        if self.loss_type == "logloss":
            self.out = tf.nn.sigmoid(self.out)
            self.loss = tf.losses.log_loss(self.label, self.out)
        elif self.loss_type == "mse":
            self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
        # l2 regularization on weights
        # NOTE: only the concat projection and the deep layer weights are
        # regularized; the FM embeddings/biases are not.
        if self.l2_reg > 0:
            self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                self.weights["concat_projection"])
            if self.use_deep:
                for i in range(len(self.deep_layers)):
                    self.loss += tf.contrib.layers.l2_regularizer(
                        self.l2_reg)(self.weights["layer_%d" % i])

        # optimizer
        if self.optimizer_type == "adam":
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                epsilon=1e-8).minimize(self.loss)
        elif self.optimizer_type == "adagrad":
            self.optimizer = tf.train.AdagradOptimizer(
                learning_rate=self.learning_rate,
                initial_accumulator_value=1e-8).minimize(self.loss)
        elif self.optimizer_type == "gd":
            self.optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.learning_rate).minimize(self.loss)
        elif self.optimizer_type == "momentum":
            self.optimizer = tf.train.MomentumOptimizer(
                learning_rate=self.learning_rate,
                momentum=0.95).minimize(self.loss)
        elif self.optimizer_type == "yellowfin":
            # FIX: removed a stray dead `pass` statement that preceded this
            # assignment in the original.
            self.optimizer = YFOptimizer(
                learning_rate=self.learning_rate,
                momentum=0.0).minimize(self.loss)

        # init
        # NOTE(review): the saver is created here but not obviously used
        # within this method — confirm it is used by save/restore elsewhere.
        self.saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        self.sess = self._init_session()
        self.sess.run(init)

        # number of params — informational only; does not affect training.
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        if self.verbose > 0:
            print("#params: %d" % total_parameters)
verbose=False) if torch.cuda.is_available(): newmodel.cuda() print('newmodel', newmodel) ####################################### # INSTANTIATE LOSS AND OPTIMIZER CLASS# ####################################### criterion = nn.CrossEntropyLoss() params = [p for p in newmodel.parameters() if p.requires_grad] wnd_size = 40 learning_rate = .5 # TODO: here learning rate is fixed, so need to find out some methods, maybe not fixed? # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) optimizer = YFOptimizer(params, lr=learning_rate, mu=0.0, weight_decay=5e-4, clip_thresh=2.0, curv_win_width=wnd_size) optimizer._sparsity_debias = True ######################### # TRAINING WITH NEWMODEL# ######################### iter = 0 for epoch in range(num_epoches): for i, (images, labels) in enumerate(train_loader): if torch.cuda.is_available(): images = Variable(images.cuda()) labels = Variable(labels.cuda()) else: images = Variable(images)
def __init__(self, args, training=True, opt_method="Adam"): self.args = args if not training: args.batch_size = 1 args.seq_length = 1 if args.model == 'rnn': cell_fn = rnn.BasicRNNCell elif args.model == 'gru': cell_fn = rnn.GRUCell elif args.model == 'lstm': cell_fn = rnn.BasicLSTMCell elif args.model == 'nas': cell_fn = rnn.NASCell else: raise Exception("model type not supported: {}".format(args.model)) cells = [] for _ in range(args.num_layers): cell = cell_fn(args.rnn_size) if training and (args.output_keep_prob < 1.0 or args.input_keep_prob < 1.0): cell = rnn.DropoutWrapper(cell, input_keep_prob=args.input_keep_prob, output_keep_prob=args.output_keep_prob) cells.append(cell) self.cell = cell = rnn.MultiRNNCell(cells, state_is_tuple=True) self.input_data = tf.placeholder( tf.int32, [args.batch_size, args.seq_length]) self.targets = tf.placeholder( tf.int32, [args.batch_size, args.seq_length]) self.initial_state = cell.zero_state(args.batch_size, tf.float32) with tf.variable_scope('rnnlm'): softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size]) softmax_b = tf.get_variable("softmax_b", [args.vocab_size]) embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size]) inputs = tf.nn.embedding_lookup(embedding, self.input_data) # dropout beta testing: double check which one should affect next line if training and args.output_keep_prob: inputs = tf.nn.dropout(inputs, args.output_keep_prob) inputs = tf.split(inputs, args.seq_length, 1) inputs = [tf.squeeze(input_, [1]) for input_ in inputs] def loop(prev, _): prev = tf.matmul(prev, softmax_w) + softmax_b prev_symbol = tf.stop_gradient(tf.argmax(prev, 1)) return tf.nn.embedding_lookup(embedding, prev_symbol) outputs, last_state = legacy_seq2seq.rnn_decoder(inputs, self.initial_state, cell, loop_function=loop if not training else None, scope='rnnlm') output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size]) self.logits = tf.matmul(output, softmax_w) + softmax_b self.probs = 
tf.nn.softmax(self.logits) loss = legacy_seq2seq.sequence_loss_by_example( [self.logits], [tf.reshape(self.targets, [-1])], [tf.ones([args.batch_size * args.seq_length])]) self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length with tf.name_scope('cost'): self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length # for eval visualization in tensorboard self.eval_cost = tf.identity(self.cost) self.final_state = last_state self.lr = tf.Variable(0.0, trainable=False) tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), args.grad_clip) with tf.name_scope('optimizer'): if opt_method == "Adam": print "using Adam" self.optimizer = optimizer = tf.train.AdamOptimizer(self.lr) elif opt_method == "YF": print "using YF" self.optimizer = optimizer = YFOptimizer(learning_rate=args.learning_rate, momentum=0.0) elif opt_method == "SGD": print "using SGD" self.optimizer = optimizer = tf.train.MomentumOptimizer(self.lr, 0.9) else: raise Exception("please use either adam or YF") self.train_op = optimizer.apply_gradients(zip(grads, tvars)) # instrument tensorboard self.train_summary = [ \ tf.summary.histogram('logits', self.logits), tf.summary.histogram('loss', loss), tf.summary.scalar('train_loss', self.cost) ]
# Tail of a sequential CNN: conv/pool feature extractor into a dense
# classifier with softmax output.
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# Modified to use yellowfin or adam optimizer.
# YellowFin is a plain TF optimizer, so it is wrapped in
# keras.optimizers.TFOptimizer to be usable by Keras.
if yellowfinopt:
    opt = keras.optimizers.TFOptimizer(YFOptimizer())
else:
    opt = keras.optimizers.Adam()

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# Normalize pixel values to [0, 1].
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

# subsamples for quick test
#x_train2 = x_train[0:2000, :,:,:]
# mix_softmax = MixtureSoftmax(batch_size=batch_size, word_gru_hidden = 50, feature_dim = 0, n_classes=num_classes) if use_cuda: word_attn.cuda() mix_softmax.cuda() softmax = nn.Softmax() sigmoid = nn.Sigmoid() learning_rate = 0.0001 print("lr thresh", args.lr_thresh) optimizer = YFOptimizer(mix_softmax.parameters(), beta=0.999, lr=learning_rate, mu=0.0, zero_debias=False, clip_thresh=None, auto_clip_fac=None, curv_win_width=20, force_non_inc_step=False, use_disk_checkpoint=True) # word_optmizer = YFOptimizer(word_attn.parameters(), lr=learning_rate, mu=0.0, auto_clip_fac=2.0) # mix_optimizer = YFOptimizer(mix_softmax.parameters(), lr=learning_rate, mu=0.0, auto_clip_fac=2.0) criterion = nn.MultiLabelSoftMarginLoss(size_average=True) import time import math def timeSince(since):
def test_lr_mu():
    """Check YellowFin's (TF) lr/mu tuning against numpy reference EMAs.

    Drives the optimizer with deterministic synthetic gradients
    ((i + 1) * ones) on a tiny linear model and asserts that the
    optimizer's internal curvature max/min, gradient variance, distance
    estimate, and the tuned learning rate / momentum each track an
    exponential moving average (decay 0.999) computed here with numpy.

    NOTE(review): relies on module-level globals `n_dim`, `n_iter` and the
    helper `tune_everything` defined elsewhere in this file — confirm.
    """
    opt = YFOptimizer(learning_rate=0.5, momentum=0.5, zero_debias=False)
    # Tiny linear model: loss = w * x + b, so dloss/dw = x = ones.
    w = tf.Variable(np.ones([n_dim, ]), dtype=tf.float32, name="w",
                    trainable=True)
    b = tf.Variable(np.ones([1, ], dtype=np.float32), dtype=tf.float32,
                    name="b", trainable=True)
    x = tf.constant(np.ones([n_dim, ], dtype=np.float32), dtype=tf.float32)
    loss = tf.multiply(w, x) + b
    tvars = tf.trainable_variables()
    # Gradients are injected via non-trainable variables instead of being
    # computed from the loss, so each step sees exactly (i + 1) * ones.
    w_grad_val = tf.Variable(np.zeros([n_dim, ]), dtype=tf.float32,
                             trainable=False)
    b_grad_val = tf.Variable(np.zeros([1, ]), dtype=tf.float32,
                             trainable=False)
    apply_op = opt.apply_gradients(zip([w_grad_val, b_grad_val], tvars))
    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        target_h_max = 0.0
        target_h_min = 0.0
        g_norm_squared_avg = 0.0
        g_norm_avg = 0.0
        g_avg = 0.0
        target_dist = 0.0
        target_lr = 0.5
        target_mu = 0.5
        for i in range(n_iter):
            sess.run(
                tf.assign(w_grad_val,
                          (i + 1) * np.ones([n_dim, ], dtype=np.float32)))
            sess.run(
                tf.assign(b_grad_val,
                          (i + 1) * np.ones([1, ], dtype=np.float32)))
            res = sess.run([
                opt._curv_win, opt._h_max, opt._h_min, opt._grad_var,
                opt._dist_to_opt_avg, opt._lr_var, opt._mu_var, apply_op
            ])
            # Re-read lr/mu after the update op so the asserted values are
            # the post-step ones rather than whatever sess.run fetched.
            res[5] = opt._lr_var.eval()
            res[6] = opt._mu_var.eval()
            # Reference EMAs over the full gradient (n_dim w-entries plus
            # the single b-entry -> n_dim + 1 components).
            g_norm_squared_avg = 0.999 * g_norm_squared_avg \
                + 0.001 * np.sum(((i + 1) * np.ones([n_dim + 1, ]))**2)
            g_norm_avg = 0.999 * g_norm_avg \
                + 0.001 * np.linalg.norm((i + 1) * np.ones([n_dim + 1, ]))
            g_avg = 0.999 * g_avg + 0.001 * (i + 1)
            # Curvature max/min over a sliding window of 20 steps (hence
            # the max(1, i + 2 - 20) for the oldest retained entry).
            target_h_max = 0.999 * target_h_max \
                + 0.001 * (i + 1)**2 * (n_dim + 1)
            target_h_min = 0.999 * target_h_min \
                + 0.001 * max(1, i + 2 - 20)**2 * (n_dim + 1)
            target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
            target_dist = 0.999 * target_dist \
                + 0.001 * g_norm_avg / g_norm_squared_avg
            if i > 0:
                lr, mu = tune_everything(target_dist**2, target_var, 1,
                                         target_h_min, target_h_max)
                target_lr = 0.999 * target_lr + 0.001 * lr
                target_mu = 0.999 * target_mu + 0.001 * mu
            # print "iter ", i, " h max ", res[1], target_h_max, " h min ", res[2], target_h_min, \
            #   " var ", res[3], target_var, " dist ", res[4], target_dist
            # print "iter ", i, " lr ", res[5], target_lr, " mu ", res[6], target_mu
            assert np.abs(target_h_max - res[1]) < np.abs(target_h_max) * 1e-3
            assert np.abs(target_h_min - res[2]) < np.abs(target_h_min) * 1e-3
            assert np.abs(target_var - res[3]) < np.abs(res[3]) * 1e-3
            assert np.abs(target_dist - res[4]) < np.abs(res[4]) * 1e-3
            assert target_lr == 0.0 \
                or np.abs(target_lr - res[5]) < np.abs(res[5]) * 1e-3
            assert target_mu == 0.0 \
                or np.abs(target_mu - res[6]) < np.abs(res[6]) * 5e-3
    print "lr and mu computing test passed!"
def test_lr_mu(zero_debias=False):
    """Check YellowFin's (PyTorch) lr/mu tuning against numpy reference EMAs.

    Overwrites the autograd gradients with deterministic values
    ((i + 1) * ones) each step and asserts the optimizer's internal
    curvature max/min, gradient variance, distance estimate, and tuned
    lr/mu each track a 0.999-decay exponential moving average computed
    here with numpy; with zero_debias the targets are bias-corrected by
    (1 - 0.999**(i + 1)).

    NOTE(review): relies on module-level globals `n_dim`, `n_iter` and the
    helper `tune_everything` defined elsewhere in this file — confirm.
    """
    dtype = torch.FloatTensor
    w = Variable(torch.ones(n_dim, 1).type(dtype), requires_grad=True)
    b = Variable(torch.ones(1).type(dtype), requires_grad=True)
    x = Variable(torch.ones(1, n_dim).type(dtype), requires_grad=False)
    opt = YFOptimizer([w, b], lr=1.0, mu=0.0, zero_debias=zero_debias)
    target_h_max = 0.0
    target_h_min = 0.0
    g_norm_squared_avg = 0.0
    g_norm_avg = 0.0
    g_avg = 0.0
    target_dist = 0.0
    target_lr = 1.0
    target_mu = 0.0
    for i in range(n_iter):
        opt.zero_grad()
        loss = (x.mm(w) + b).sum()
        loss.backward()
        # Overwrite autograd's gradients so each step sees exactly
        # (i + 1) * ones, making the optimizer statistics predictable.
        w.grad.data = (i + 1) * torch.ones([n_dim, ]).type(dtype)
        b.grad.data = (i + 1) * torch.ones([1, ]).type(dtype)
        opt.step()
        res = [
            opt._h_max, opt._h_min, opt._grad_var, opt._dist_to_opt,
            opt._lr, opt._mu
        ]
        # Reference EMAs over the full gradient (n_dim + 1 components).
        g_norm_squared_avg = 0.999 * g_norm_squared_avg \
            + 0.001 * np.sum(((i + 1) * np.ones([n_dim + 1, ]))**2)
        g_norm_avg = 0.999 * g_norm_avg \
            + 0.001 * np.linalg.norm((i + 1) * np.ones([n_dim + 1, ]))
        g_avg = 0.999 * g_avg + 0.001 * (i + 1)
        # Curvature window of 20 steps -> oldest retained entry is
        # max(1, i + 2 - 20).
        target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2 * (n_dim + 1)
        target_h_min = 0.999 * target_h_min \
            + 0.001 * max(1, i + 2 - 20)**2 * (n_dim + 1)
        if zero_debias:
            target_var = g_norm_squared_avg / (1 - 0.999**(i + 1)) \
                - g_avg**2 * (n_dim + 1) / (1 - 0.999**(i + 1))**2
        else:
            target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
        target_dist = 0.999 * target_dist \
            + 0.001 * g_norm_avg / g_norm_squared_avg
        if i == 0:
            # First iteration has no meaningful statistics yet.
            continue
        if zero_debias:
            # print "iter ", i, " h max ", res[0], target_h_max/(1-0.999**(i + 1) ), \
            #   " h min ", res[1], target_h_min/(1-0.999**(i + 1) ), \
            #   " var ", res[2], target_var, \
            #   " dist ", res[3], target_dist/(1-0.999**(i + 1) )
            assert np.abs(target_h_max / (1 - 0.999**(i + 1))
                          - res[0]) < np.abs(res[0]) * 1e-3
            assert np.abs(target_h_min / (1 - 0.999**(i + 1))
                          - res[1]) < np.abs(res[1]) * 1e-3
            assert np.abs(target_var - res[2]) < np.abs(target_var) * 1e-3
            assert np.abs(target_dist / (1 - 0.999**(i + 1))
                          - res[3]) < np.abs(res[3]) * 1e-3
        else:
            # print "iter ", i, " h max ", res[0], target_h_max, " h min ", res[1], target_h_min, \
            #   " var ", res[2], target_var, " dist ", res[3], target_dist
            assert np.abs(target_h_max - res[0]) < np.abs(target_h_max) * 1e-3
            assert np.abs(target_h_min - res[1]) < np.abs(target_h_min) * 1e-3
            assert np.abs(target_var - res[2]) < np.abs(res[2]) * 1e-3
            assert np.abs(target_dist - res[3]) < np.abs(res[3]) * 1e-3
        if i > 0:
            if zero_debias:
                lr, mu = tune_everything(
                    (target_dist / (1 - 0.999**(i + 1)))**2, target_var, 1,
                    target_h_min / (1 - 0.999**(i + 1)),
                    target_h_max / (1 - 0.999**(i + 1)))
            else:
                lr, mu = tune_everything(target_dist**2, target_var, 1,
                                         target_h_min, target_h_max)
            # tune_everything may return complex roots; keep the real part.
            lr = np.real(lr)
            mu = np.real(mu)
            target_lr = 0.999 * target_lr + 0.001 * lr
            target_mu = 0.999 * target_mu + 0.001 * mu
            # print "lr ", target_lr, res[4], " mu ", target_mu, res[5]
            assert target_lr == 0.0 \
                or np.abs(target_lr - res[4]) < np.abs(res[4]) * 1e-3
            assert target_mu == 0.0 \
                or np.abs(target_mu - res[5]) < np.abs(res[5]) * 5e-3
    print "lr and mu computing test passed!"
dist_list,\ grad_var_list,\ lr_g_norm_list,\ lr_g_norm_squared_list,\ move_lr_g_norm_list,\ move_lr_g_norm_squared_list,\ lr_grad_norm_clamp_act_list,\ fast_view_act_list if args.opt_method == "SGD": optimizer = torch.optim.SGD(model.parameters(), lr, momentum=0.0) elif args.opt_method == "momSGD": optimizer = torch.optim.SGD(model.parameters(), lr, momentum=0.9) elif args.opt_method == "YF": optimizer = YFOptimizer(model.parameters() ) elif args.opt_method == "Adam": optimizer = torch.optim.Adam(model.parameters(), lr) best_val_loss = None train_loss_list = [] val_loss_list = [] lr_list = [] mu_list = [] loss_list = [] local_curv_list = [] max_curv_list = [] min_curv_list = [] lr_g_norm_list = []
def test_measurement(zero_debias=True):
    """Check YellowFin's (PyTorch) internal measurements against numpy EMAs.

    Overwrites the autograd gradients with deterministic values
    ((i + 1) * ones) and asserts the optimizer's curvature max/min,
    gradient variance, and distance-to-optimum estimate each track a
    0.999-decay exponential moving average computed here with numpy; with
    zero_debias the targets are bias-corrected by (1 - 0.999**(i + 1)).

    NOTE(review): relies on module-level globals `n_dim` and `n_iter`
    defined elsewhere in this file — confirm.
    """
    dtype = torch.FloatTensor
    w = Variable(torch.ones(n_dim, 1).type(dtype), requires_grad=True)
    b = Variable(torch.ones(1).type(dtype), requires_grad=True)
    x = Variable(torch.ones(1, n_dim).type(dtype), requires_grad=False)
    opt = YFOptimizer([w, b], lr=1.0, mu=0.0, zero_debias=zero_debias)
    target_h_max = 0.0
    target_h_min = 0.0
    g_norm_squared_avg = 0.0
    g_norm_avg = 0.0
    g_avg = 0.0
    target_dist = 0.0
    for i in range(n_iter):
        opt.zero_grad()
        loss = (x.mm(w) + b).sum()
        loss.backward()
        # Overwrite autograd's gradients with a known sequence so the
        # optimizer statistics are exactly predictable.
        w.grad.data = (i + 1) * torch.ones([n_dim, ]).type(dtype)
        b.grad.data = (i + 1) * torch.ones([1, ]).type(dtype)
        opt.step()
        res = [opt._h_max, opt._h_min, opt._grad_var, opt._dist_to_opt]
        # Reference EMAs over the full gradient (n_dim + 1 components).
        g_norm_squared_avg = 0.999 * g_norm_squared_avg \
            + 0.001 * np.sum(((i + 1) * np.ones([n_dim + 1, ]))**2)
        g_norm_avg = 0.999 * g_norm_avg \
            + 0.001 * np.linalg.norm((i + 1) * np.ones([n_dim + 1, ]))
        g_avg = 0.999 * g_avg + 0.001 * (i + 1)
        # Curvature window of 20 steps -> oldest retained entry is
        # max(1, i + 2 - 20).
        target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2 * (n_dim + 1)
        target_h_min = 0.999 * target_h_min \
            + 0.001 * max(1, i + 2 - 20)**2 * (n_dim + 1)
        if zero_debias:
            target_var = g_norm_squared_avg / (1 - 0.999**(i + 1)) \
                - g_avg**2 * (n_dim + 1) / (1 - 0.999**(i + 1))**2
        else:
            target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
        target_dist = 0.999 * target_dist \
            + 0.001 * g_norm_avg / g_norm_squared_avg
        if i == 0:
            # First iteration has no meaningful statistics yet.
            continue
        if zero_debias:
            # print "iter ", i, " h max ", res[0], target_h_max/(1-0.999**(i + 1) ), \
            #   " h min ", res[1], target_h_min/(1-0.999**(i + 1) ), \
            #   " var ", res[2], target_var, \
            #   " dist ", res[3], target_dist/(1-0.999**(i + 1) )
            assert np.abs(target_h_max / (1 - 0.999**(i + 1))
                          - res[0]) < np.abs(res[0]) * 1e-3
            assert np.abs(target_h_min / (1 - 0.999**(i + 1))
                          - res[1]) < np.abs(res[1]) * 1e-3
            assert np.abs(target_var - res[2]) < np.abs(target_var) * 1e-3
            assert np.abs(target_dist / (1 - 0.999**(i + 1))
                          - res[3]) < np.abs(res[3]) * 1e-3
        else:
            # print "iter ", i, " h max ", res[0], target_h_max, " h min ", res[1], target_h_min, \
            #   " var ", res[2], target_var, " dist ", res[3], target_dist
            assert np.abs(target_h_max - res[0]) < np.abs(target_h_max) * 1e-3
            assert np.abs(target_h_min - res[1]) < np.abs(target_h_min) * 1e-3
            assert np.abs(target_var - res[2]) < np.abs(res[2]) * 1e-3
            assert np.abs(target_dist - res[3]) < np.abs(res[3]) * 1e-3
    print "sync measurement test passed!"
def _init_graph(self):
    """Build the DeepFM graph, session and optimizer.

    Defines placeholders, FM first/second-order terms, the deep tower,
    the combined output, loss with optional L2, the configured optimizer,
    then initializes a session and prints the parameter count when
    verbose.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        tf.set_random_seed(self.random_seed)
        self.feat_index = tf.placeholder(tf.int32,
                                         shape=[None, None],
                                         name="feat_index")  # None * F, i.e. batch_size * field_size
        self.feat_value = tf.placeholder(tf.float32,
                                         shape=[None, None],
                                         name="feat_value")  # None * F, i.e. batch_size * field_size
        self.label = tf.placeholder(tf.float32, shape=[None, 1],
                                    name="label")  # None * 1
        self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None],
                                              name="dropout_keep_fm")
        self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None],
                                                name="dropout_keep_deep")
        self.train_phase = tf.placeholder(tf.bool, name="train_phase")

        # Initialize weight tensors (embedding table, biases, deep layers,
        # concat projection).
        self.weights = self._initialize_weights()

        # model
        # Embedding lookup: since the input is one-hot encoded, this pulls
        # one row per field -> batch_size * field_size * embedding_size.
        self.embeddings = tf.nn.embedding_lookup(self.weights["feature_embeddings"],  # table is [feature_size, embedding_size]
                                                 self.feat_index)  # None * F * K
        feat_value = tf.reshape(self.feat_value,
                                shape=[-1, self.field_size, 1])
        # Element-wise multiply: scales each feature's embedding by its
        # raw value (relevant for numeric features).
        self.embeddings = tf.multiply(self.embeddings, feat_value)

        # ---------- first order term (essentially w * x) ----------
        # Lookup yields batch_size * field_size * 1.
        self.y_first_order = tf.nn.embedding_lookup(self.weights["feature_bias"],
                                                    self.feat_index)  # None * F * 1, table is [feature_size, 1]
        # feat_value is batch_size * field_size * 1; reduce over the last
        # axis -> batch_size * field_size.
        self.y_first_order = tf.reduce_sum(
            tf.multiply(self.y_first_order, feat_value), 2)  # None * F
        self.y_first_order = tf.nn.dropout(
            self.y_first_order, self.dropout_keep_fm[0])  # None * F

        # ---------- second order term ----------
        # Pairwise FM term via 0.5 * ((sum v)^2 - sum v^2) over fields.
        # sum_square part (embeddings: batch_size * field_size * embedding_size)
        self.summed_features_emb = tf.reduce_sum(self.embeddings, 1)  # None * K
        self.summed_features_emb_square = tf.square(
            self.summed_features_emb)  # None * K
        # square_sum part
        self.squared_features_emb = tf.square(self.embeddings)
        self.squared_sum_features_emb = tf.reduce_sum(
            self.squared_features_emb, 1)  # None * K
        # second order
        self.y_second_order = 0.5 * tf.subtract(
            self.summed_features_emb_square,
            self.squared_sum_features_emb)  # None * K
        self.y_second_order = tf.nn.dropout(
            self.y_second_order, self.dropout_keep_fm[1])  # None * K

        # ---------- Deep component ----------
        # embeddings are batch_size * field_size * embedding_size;
        # flatten into one vector per example.
        self.y_deep = tf.reshape(
            self.embeddings,
            shape=[-1, self.field_size * self.embedding_size])  # None * (F*K)
        self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
        for i in range(0, len(self.deep_layers)):
            self.y_deep = tf.add(
                tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                self.weights["bias_%d" % i])  # None * layer[i] * 1
            if self.batch_norm:
                self.y_deep = self.batch_norm_layer(
                    self.y_deep, train_phase=self.train_phase,
                    scope_bn="bn_%d" % i)  # None * layer[i] * 1
            self.y_deep = self.deep_layers_activation(self.y_deep)
            self.y_deep = tf.nn.dropout(
                self.y_deep,
                self.dropout_keep_deep[1 + i])  # dropout at each Deep layer

        # ---------- DeepFM ----------
        if self.use_fm and self.use_deep:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order, self.y_deep],
                axis=1)
        elif self.use_fm:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order], axis=1)
        elif self.use_deep:
            concat_input = self.y_deep
        # Project the concatenated features to a single output.
        self.out = tf.add(
            tf.matmul(concat_input, self.weights["concat_projection"]),
            self.weights["concat_bias"])

        # loss
        if self.loss_type == "logloss":
            self.out = tf.nn.sigmoid(self.out)
            self.loss = tf.losses.log_loss(self.label, self.out)
        elif self.loss_type == "mse":
            self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
        # l2 regularization on weights (projection + deep layers only)
        if self.l2_reg > 0:
            self.loss += tf.contrib.layers.l2_regularizer(
                self.l2_reg)(self.weights["concat_projection"])
            if self.use_deep:
                for i in range(len(self.deep_layers)):
                    self.loss += tf.contrib.layers.l2_regularizer(
                        self.l2_reg)(self.weights["layer_%d" % i])

        # optimizer
        if self.optimizer_type == "adam":
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                epsilon=1e-8).minimize(self.loss)
        elif self.optimizer_type == "adagrad":
            self.optimizer = tf.train.AdagradOptimizer(
                learning_rate=self.learning_rate,
                initial_accumulator_value=1e-8).minimize(self.loss)
        elif self.optimizer_type == "gd":
            self.optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.learning_rate).minimize(self.loss)
        elif self.optimizer_type == "momentum":
            self.optimizer = tf.train.MomentumOptimizer(
                learning_rate=self.learning_rate, momentum=0.95).minimize(
                    self.loss)
        elif self.optimizer_type == "yellowfin":
            self.optimizer = YFOptimizer(
                learning_rate=self.learning_rate, momentum=0.0).minimize(
                    self.loss)

        # init
        self.saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        self.sess = self._init_session()
        self.sess.run(init)

        # number of params — informational only.
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        if self.verbose > 0:
            print("#params: %d" % total_parameters)
curParams = [p for p in net.parameters() if p.requires_grad] if args.optimizer.lower() == 'sgd': optimizer = optim.SGD(curParams, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'adam': #args.lr=1e-3 optimizer = optim.Adam(params=curParams) elif args.optimizer.lower() == 'rmsprop': #args.lr=1e-2 optimizer = optim.RMSprop(params=curParams) elif args.optimizer.lower() in ['yellowfin', 'yf']: optimizer = YFOptimizer(curParams, lr=args.lr, mu=0.0, weight_decay=weight_decay, clip_thresh=2.0, curv_win_width=20) else: raise Exception('Unsupported optimizer type encountered:' + args.optimizer) criterion = MultiBoxLoss(num_classes, 0.5, True, 0, True, 3, 0.5, False, args.cuda) def train(): #import cProfile, pstats #from io import StringIO #pr = cProfile.Profile() #pr.enable() net.train()
def _init_graph(self):
    """Build the DeepFM graph with TensorBoard name scopes and summaries.

    Same DeepFM structure as the other variants, but groups ops under
    name scopes (input / FM-model / Deep-model / DeepFM-model / loss /
    train), registers a log_loss scalar summary, and writes the graph to
    ./graphs for TensorBoard.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        tf.set_random_seed(self.random_seed)
        with tf.name_scope('input'):
            self.feat_index = tf.placeholder(tf.int32,
                                             shape=[None, None],
                                             name="feat_index")  # None * F
            self.feat_value = tf.placeholder(tf.float32,
                                             shape=[None, None],
                                             name="feat_value")  # None * F
            self.label = tf.placeholder(tf.float32, shape=[None, 1],
                                        name="label")  # None * 1
            self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None],
                                                  name="dropout_keep_fm")
            self.dropout_keep_deep = tf.placeholder(
                tf.float32, shape=[None], name="dropout_keep_deep")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

        self.weights = self._initialize_weights()

        # model
        # One embedding row per field -> None * F * K; scaled element-wise
        # by the raw feature values (matters for numeric features).
        self.embeddings = tf.nn.embedding_lookup(
            self.weights["feature_embeddings"],
            self.feat_index)  # None * F * K
        feat_value = tf.reshape(self.feat_value,
                                shape=[-1, self.field_size, 1],
                                name="reshape_feat_value")
        self.embeddings = tf.multiply(self.embeddings, feat_value)

        with tf.name_scope("FM-model"):
            # ---------- first order term ----------
            self.y_first_order = tf.nn.embedding_lookup(
                self.weights["feature_bias"], self.feat_index)  # None * F * 1
            self.y_first_order = tf.reduce_sum(
                tf.multiply(self.y_first_order, feat_value), 2)  # None * F
            self.y_first_order = tf.nn.dropout(
                self.y_first_order, self.dropout_keep_fm[0])  # None * F

            # ---------- second order term ----------
            # Pairwise FM term via 0.5 * ((sum v)^2 - sum v^2) over fields.
            # sum_square part
            self.summed_features_emb = tf.reduce_sum(self.embeddings,
                                                     1)  # None * K
            self.summed_features_emb_square = tf.square(
                self.summed_features_emb)  # None * K
            # square_sum part
            self.squared_features_emb = tf.square(self.embeddings)
            self.squared_sum_features_emb = tf.reduce_sum(
                self.squared_features_emb, 1)  # None * K
            # second order
            self.y_second_order = 0.5 * tf.subtract(
                self.summed_features_emb_square,
                self.squared_sum_features_emb)  # None * K
            self.y_second_order = tf.nn.dropout(
                self.y_second_order, self.dropout_keep_fm[1])  # None * K

        with tf.name_scope("Deep-model"):
            # ---------- Deep component ----------
            self.y_deep = tf.reshape(
                self.embeddings,
                shape=[-1, self.field_size * self.embedding_size
                       ])  # None * (F*K)
            self.y_deep = tf.nn.dropout(self.y_deep,
                                        self.dropout_keep_deep[0])
            for i in range(0, len(self.deep_layers)):
                self.y_deep = tf.add(
                    tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                    self.weights["bias_%d" % i])  # None * layer[i] * 1
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(
                        self.y_deep, train_phase=self.train_phase,
                        scope_bn="bn_%d" % i)  # None * layer[i] * 1
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(
                    self.y_deep, self.dropout_keep_deep[
                        1 + i])  # dropout at each Deep layer

        with tf.name_scope("DeepFM-model"):
            # ---------- DeepFM ----------
            if self.use_fm and self.use_deep:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order, self.y_deep],
                    axis=1)
            elif self.use_fm:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order], axis=1)
            elif self.use_deep:
                concat_input = self.y_deep
            self.out = tf.add(
                tf.matmul(concat_input, self.weights["concat_projection"]),
                self.weights["concat_bias"])

        # loss
        with tf.name_scope("loss"):
            if self.loss_type == "logloss":
                self.out = tf.nn.sigmoid(self.out)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(
                    self.label, self.out))
            # l2 regularization on weights (projection + deep layers only)
            if self.l2_reg > 0:
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                    self.weights["concat_projection"])
                if self.use_deep:
                    for i in range(len(self.deep_layers)):
                        self.loss += tf.contrib.layers.l2_regularizer(
                            self.l2_reg)(self.weights["layer_%d" % i])

        # optimizer
        with tf.name_scope("train"):
            if self.optimizer_type == "adam":
                self.optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate, beta1=0.9,
                    beta2=0.999, epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.train.AdagradOptimizer(
                    learning_rate=self.learning_rate,
                    initial_accumulator_value=1e-8).minimize(self.loss)
            elif self.optimizer_type == "gd":
                self.optimizer = tf.train.GradientDescentOptimizer(
                    learning_rate=self.learning_rate).minimize(self.loss)
            elif self.optimizer_type == "momentum":
                self.optimizer = tf.train.MomentumOptimizer(
                    learning_rate=self.learning_rate,
                    momentum=0.95).minimize(self.loss)
            elif self.optimizer_type == "yellowfin":
                self.optimizer = YFOptimizer(
                    learning_rate=self.learning_rate,
                    momentum=0.0).minimize(self.loss)

        # init
        init = tf.global_variables_initializer()
        self.sess = self._init_session()
        self.sess.run(init)
        # save model
        self.saver = tf.train.Saver()
        # save summary
        tf.summary.scalar('log_loss', self.loss)
        self.merge_summary = tf.summary.merge_all(
        )  # fetched via sess.run to produce one training step's summary data
        self.writer = tf.summary.FileWriter("./graphs", self.sess.graph)

        # number of params — informational only.
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        if self.verbose > 0:
            print("#params: %d" % total_parameters)
def learn(dataset,
          dim=2,
          hyp=1,
          edim=1,
          euc=0,
          sdim=1,
          sph=0,
          scale=1.,
          riemann=False,
          learning_rate=1e-1,
          decay_length=1000,
          decay_step=1.0,
          momentum=0.0,
          tol=1e-8,
          epochs=100,
          burn_in=0,
          use_yellowfin=False,
          use_adagrad=False,
          resample_freq=1000,
          print_freq=1,
          model_save_file=None,
          model_load_file=None,
          batch_size=16,
          num_workers=None,
          lazy_generation=False,
          log_name=None,
          log=False,
          warm_start=None,
          learn_scale=False,
          checkpoint_freq=100,
          sample=1.,
          subsample=None,
          logloss=False,
          distloss=False,
          squareloss=False,
          symloss=False,
          exponential_rescale=None,
          extra_steps=1,
          use_svrg=False,
          T=10,
          use_hmds=False,
          visualize=False):
    """Train a product-manifold (hyperbolic x Euclidean x spherical) embedding
    of the graph stored in ``dataset``.

    The routine: configures logging, loads the graph, builds the edge-distance
    dataset, constructs (or loads) a ``ProductEmbedding`` model, sets up one of
    several optimizers (SGD / YellowFin / Adagrad / SVRG), then runs the
    training loop with periodic statistics checkpoints and optional model
    saving / visualization.

    NOTE(review): relies on module-level names not visible here
    (``load_graph``, ``build_dataset``, ``ProductEmbedding``, ``major_stats``,
    ``cu_var``, ``unwrap``, ``vis``, ``device``, ``seed``, ``RParameter``,
    ``nx``, ``pandas``); behavior of those is assumed, not verified.
    """
    # Log configuration: timestamped messages to stderr, optionally to a file.
    formatter = logging.Formatter('%(asctime)s %(message)s')
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(message)s',
        datefmt='%FT%T',
    )
    if log_name is None and log:
        # Derive a default log filename from the dataset and hyperparameters.
        log_name = f"{os.path.splitext(dataset)[0]}.H{dim}-{hyp}.E{edim}-{euc}.S{sdim}-{sph}.lr{learning_rate}.log"
    if log_name is not None:
        logging.info(f"Logging to {log_name}")
        # NOTE(review): rebinds the boolean parameter `log` to a Logger object.
        log = logging.getLogger()
        fh = logging.FileHandler(log_name)
        fh.setFormatter(formatter)
        log.addHandler(fh)

    logging.info(f"Commandline {sys.argv}")
    if model_save_file is None:
        # NOTE(review): logging.warn is deprecated in favor of logging.warning.
        logging.warn("No Model Save selected!")
    G = load_graph.load_graph(dataset)
    # Dense-ordered sparse adjacency matrix, used by major_stats below.
    GM = nx.to_scipy_sparse_matrix(G, nodelist=list(range(G.order())))

    # grab scale if warm starting:
    if warm_start:
        # NOTE(review): DataFrame.as_matrix() was removed in pandas >= 1.0;
        # modern pandas needs .to_numpy() here — confirm pinned version.
        scale = pandas.read_csv(warm_start, index_col=0).as_matrix()[0, -1]

    n = G.order()
    logging.info(f"Loaded Graph {dataset} with {n} nodes scale={scale}")

    if exponential_rescale is not None:
        # torch.exp(exponential_rescale * -d)
        # NOTE(review): despite the name, this piecewise weighting ignores the
        # actual `exponential_rescale` value — presumably a stand-in; confirm.
        def weight_fn(d):
            if d <= 2.0:
                return 5.0
            elif d > 4.0:
                return 0.01
            else:
                return 1.0
    else:
        # Uniform edge weighting.
        def weight_fn(d):
            return 1.0

    # Z: full dataset; z: (possibly subsampled) DataLoader over edges.
    Z, z = build_dataset(G, lazy_generation, sample, subsample, scale,
                         batch_size, weight_fn, num_workers)

    if model_load_file is not None:
        # Resume from a previously saved model.
        logging.info(f"Loading {model_load_file}...")
        m = torch.load(model_load_file).to(device)
        logging.info(
            f"Loaded scale {unwrap(m.scale())} {torch.sum(m.embedding().data)} {m.epoch}"
        )
    else:
        logging.info(f"Creating a fresh model warm_start?={warm_start}")
        m_init = None
        if warm_start:
            # load from DataFrame; assume that the julia combinatorial embedding has been saved
            raise NotImplementedError("Removed from this branch")
        elif use_hmds:
            # m_init = torch.DoubleTensor(mds_warmstart.get_normalized_hyperbolic(mds_warmstart.get_model(dataset,dim,scale)[1]))
            raise NotImplementedError("Removed from this branch")

        logging.info(
            f"\t Warmstarting? {warm_start} {m_init.size() if warm_start else None} {G.order()}"
        )
        initial_scale = z.dataset.max_dist / 3.0
        print("MAX DISTANCE", z.dataset.max_dist)
        print("AVG DISTANCE", torch.mean(z.dataset.val_cache))
        # NOTE(review): this overwrite makes the max_dist-based initial_scale
        # above dead code — looks intentional (experiment), but confirm.
        initial_scale = 0.0
        m = ProductEmbedding(G.order(),
                             dim,
                             hyp,
                             edim,
                             euc,
                             sdim,
                             sph,
                             initialize=m_init,
                             learn_scale=learn_scale,
                             initial_scale=initial_scale,
                             logrel_loss=logloss,
                             dist_loss=distloss,
                             square_loss=squareloss,
                             sym_loss=symloss,
                             exponential_rescale=exponential_rescale,
                             riemann=riemann).to(device)
        m.normalize()
        m.epoch = 0
        logging.info(
            f"Constructed model with dim={dim} and epochs={m.epoch} isnan={np.any(np.isnan(m.embedding().cpu().data.numpy()))}"
        )

    if visualize:
        # NOTE(review): `seed` is not a parameter of this function — it must be
        # a module-level global; confirm it is defined when visualize=True.
        name = 'animations/' + f"{os.path.split(os.path.splitext(dataset)[0])[1]}.H{dim}-{hyp}.E{edim}-{euc}.S{sdim}-{sph}.lr{learning_rate}.ep{epochs}.seed{seed}"
        fig, ax, writer = vis.setup_plot(m=m, name=name, draw_circle=True)
    else:
        fig = None
        ax = None
        writer = None

    #
    # Build the Optimizer
    #
    # TODO: Redo this in a sensible way!!
    #

    # per-parameter learning rates: split model parameters by manifold and by
    # whether they take exponential-map updates (use_exp) or plain SGD steps.
    exp_params = [p for p in m.embed_params if p.use_exp]
    learn_params = [p for p in m.embed_params if not p.use_exp]
    hyp_params = [p for p in m.hyp_params if not p.use_exp]
    euc_params = [p for p in m.euc_params if not p.use_exp]
    sph_params = [p for p in m.sph_params if not p.use_exp]
    scale_params = m.scale_params
    # model_params = [{'params': m.embed_params}, {'params': m.scale_params, 'lr': 1e-4*learning_rate}]
    # model_params = [{'params': learn_params}, {'params': m.scale_params, 'lr': 1e-4*learning_rate}]
    model_params = [{
        'params': hyp_params
    }, {
        'params': euc_params
    }, {
        'params': sph_params,
        'lr': 0.1 * learning_rate  # spherical params get a reduced rate
    }, {
        'params': m.scale_params,
        'lr': 1e-4 * learning_rate  # scale learns much more slowly
    }]
    # opt = None
    if len(model_params) > 0:
        opt = torch.optim.SGD(model_params,
                              lr=learning_rate / 10,
                              momentum=momentum)
    # opt = torch.optim.SGD(learn_params, lr=learning_rate/10, momentum=momentum)
    # opt = torch.optim.SGD(model_params, lr=learning_rate/10, momentum=momentum)
    # exp = None
    # if len(exp_params) > 0:
    #     exp = torch.optim.SGD(exp_params, lr=1.0) # dummy for zeroing
    if len(scale_params) > 0:
        scale_opt = torch.optim.SGD(scale_params, lr=1e-3 * learning_rate)
        scale_decay = torch.optim.lr_scheduler.StepLR(scale_opt,
                                                      step_size=1,
                                                      gamma=.99)
    else:
        scale_opt = None
        scale_decay = None
    # Bump the LR by 10x once past the burn-in epoch.
    lr_burn_in = torch.optim.lr_scheduler.MultiStepLR(opt,
                                                      milestones=[burn_in],
                                                      gamma=10)
    # lr_decay = torch.optim.lr_scheduler.StepLR(opt, decay_length, decay_step) #TODO reconcile multiple LR schedulers
    # NOTE(review): the branches below *replace* `opt` after lr_burn_in was
    # built on the SGD instance — the scheduler keeps driving the old optimizer.
    if use_yellowfin:
        from yellowfin import YFOptimizer
        opt = YFOptimizer(model_params)

    if use_adagrad:
        opt = torch.optim.Adagrad(model_params)

    if use_svrg:
        from svrg import SVRG
        base_opt = torch.optim.Adagrad if use_adagrad else torch.optim.SGD
        opt = SVRG(m.parameters(),
                   lr=learning_rate,
                   T=T,
                   data_loader=z,
                   opt=base_opt)
        # TODO add ability for SVRG to take parameter groups
    logging.info(opt)

    # Log stats from import: when warmstarting, check that it matches Julia's stats
    logging.info(f"*** Initial Checkpoint. Computing Stats")
    major_stats(GM, n, m, lazy_generation, Z, z, fig, ax, writer, visualize,
                subsample)
    logging.info("*** End Initial Checkpoint\n")

    # track best stats
    best_loss = 1.0e10
    best_dist = 1.0e10
    best_wcdist = 1.0e10
    best_map = 0.0
    for i in range(m.epoch + 1, m.epoch + epochs + 1):
        lr_burn_in.step()
        # lr_decay.step()
        # scale_decay.step()
        # print(scale_opt.param_groups[0]['lr'])
        # for param_group in opt.param_groups:
        #     print(param_group['lr'])
        # print(type(opt.param_groups), opt.param_groups)
        l, n_edges = 0.0, 0.0  # track average loss per edge
        m.train(True)
        if use_svrg:
            for data in z:
                # SVRG wants a closure that re-evaluates loss and gradients.
                def closure(data=data, target=None):
                    _data = data if target is None else (data, target)
                    c = m.loss(_data.to(device))
                    c.backward()
                    return c.data[0]

                l += opt.step(closure)

                # Projection back onto the manifold after each step.
                m.normalize()
        else:
            # scale_opt.zero_grad()
            for the_step in range(extra_steps):
                # Accumulate the gradient
                for u in z:
                    # Zero out the gradients
                    # if opt is not None: opt.zero_grad() # This is handled by the SVRG.
                    # if exp is not None: exp.zero_grad()
                    opt.zero_grad()
                    # Exponential-map parameters are zeroed manually.
                    for p in exp_params:
                        if p.grad is not None:
                            p.grad.detach_()
                            p.grad.zero_()
                    # Compute loss
                    _loss = m.loss(cu_var(u))
                    _loss.backward()
                    # Weight per-batch loss by batch size (edges in u[0]).
                    l += _loss.item() * u[0].size(0)
                    # print(weight)
                    n_edges += u[0].size(0)
                    # modify gradients if necessary
                    RParameter.correct_metric(m.parameters())
                    # step
                    opt.step()
                    # Exponential-map parameters step via their own exp update.
                    for p in exp_params:
                        lr = opt.param_groups[0]['lr']
                        p.exp(lr)
                    # Projection
                    m.normalize()
        # scale_opt.step()
        l /= n_edges

        # m.epoch refers to num of training epochs finished
        m.epoch += 1
        # Logging code
        # if l < tol:
        #     logging.info("Found a {l} solution. Done at iteration {i}!")
        #     break
        if i % print_freq == 0:
            logging.info(f"{i} loss={l}")
        # During burn-in, checkpoint 5x as often.
        if (i <= burn_in
                and i % (checkpoint_freq / 5) == 0) or i % checkpoint_freq == 0:
            logging.info(f"\n*** Major Checkpoint. Computing Stats and Saving")
            avg_dist, wc_dist, me, mc, mapscore = major_stats(
                GM, n, m, True, Z, z, fig, ax, writer, visualize, subsample)
            best_loss = min(best_loss, l)
            best_dist = min(best_dist, avg_dist)
            best_wcdist = min(best_wcdist, wc_dist)
            best_map = max(best_map, mapscore)
            if model_save_file is not None:
                fname = f"{model_save_file}.{m.epoch}"
                logging.info(
                    f"Saving model into {fname} {torch.sum(m.embedding().data)} "
                )
                torch.save(m, fname)
            logging.info("*** End Major Checkpoint\n")
        if i % resample_freq == 0:
            # Rebuild the dataset only if it was sampled/subsampled.
            if sample < 1. or subsample is not None:
                Z, z = build_dataset(G, lazy_generation, sample, subsample,
                                     scale, batch_size, weight_fn, num_workers)

    logging.info(f"final loss={l}")
    logging.info(
        f"best loss={best_loss}, distortion={best_dist}, map={best_map}, wc_dist={best_wcdist}"
    )

    final_dist, final_wc, final_me, final_mc, final_map = major_stats(
        GM, n, m, lazy_generation, Z, z, fig, ax, writer, False, subsample)

    if log_name is not None:
        # Append a one-line summary of best and final statistics.
        with open(log_name + '.stat', "w") as f:
            f.write("Best-loss MAP dist wc Final-_MAP dist wc me mc\n")
            f.write(
                f"{best_loss:10.6f} {best_map:8.4f} {best_dist:8.4f} {best_wcdist:8.4f} {l:10.6f} {final_map:8.4f} {final_dist:8.4f} {final_wc:8.4f} {final_me:8.4f} {final_mc:8.4f}"
            )
    if visualize:
        writer.finish()

    if model_save_file is not None:
        fname = f"{model_save_file}.final"
        logging.info(
            f"Saving model into {fname}-final {torch.sum(m.embedding().data)} {unwrap(m.scale())}"
        )
        torch.save(m, fname)
# Use the nn package to define our model and loss function. model = torch.nn.Sequential( torch.nn.Linear(D_in, H), torch.nn.ReLU(), torch.nn.Linear(H, D_out), ) loss_fn = torch.nn.MSELoss(size_average=False) # Use the optim package to define an Optimizer that will update the weights of # the model for us. Here we will use Adam; the optim package contains many other # optimization algoriths. The first argument to the Adam constructor tells the # optimizer which Variables it should update. min_loss_so_far = np.inf optimizer = YFOptimizer(model.parameters(), lr=0.0001, mu=0.0) for t in range(6600): # Forward pass: compute predicted y by passing x to the model. y_pred = model(x) # Compute and print loss. loss = loss_fn(y_pred, y) # Before the backward pass, use the optimizer object to zero all of the # gradients for the variables it will update (which are the learnable weights # of the model) optimizer.zero_grad() # Backward pass: compute gradient of the loss with respect to model parameters loss.backward()
def _init_graph(self):
    """Build the DeepFM computation graph (TF2 with compat.v1 shims).

    Creates placeholders, the FM first/second-order terms, the deep MLP tower,
    their concatenation and projection to the output, the loss (+ optional L2
    regularization), the optimizer, and a session with initialized variables.
    Side effects: sets self.graph, self.weights, self.out, self.loss,
    self.optimizer, self.save, self.sess.
    """
    self.graph = tf.Graph()  # fresh computation graph
    with self.graph.as_default():  # use it as the default graph
        tf.compat.v1.set_random_seed(self.random_seed)  # reproducible init

        # ---- placeholders ----
        self.feat_index = tf.compat.v1.placeholder(
            tf.int32, shape=[None, None], name="feat_index")  # None * F
        self.feat_value = tf.compat.v1.placeholder(
            tf.float32, shape=[None, None], name="feat_value")  # None * F
        self.label = tf.compat.v1.placeholder(tf.float32,
                                              shape=[None, 1],
                                              name="label")  # None * 1
        self.dropout_keep_fm = tf.compat.v1.placeholder(
            tf.float32, shape=[None], name="dropout_keep_fm")
        self.dropout_keep_deep = tf.compat.v1.placeholder(
            tf.float32, shape=[None], name="dropout_keep_deep")
        self.train_phase = tf.compat.v1.placeholder(
            tf.bool, name="train_phase")  # scalar (shape defaults to None)

        # BUG FIX: this call was commented out, but self.weights is read
        # immediately below — without it the lookups raise AttributeError.
        self.weights = self._initialize_weights()

        # ---- embedding lookup (input layer) ----
        self.embeddings = tf.nn.embedding_lookup(
            self.weights["feature_embeddings"], self.feat_index)  # None*F*K
        feat_value = tf.reshape(self.feat_value,
                                shape=[-1, self.field_size, 1])
        self.embeddings = tf.multiply(self.embeddings, feat_value)

        # ---- first order term ----
        self.y_first_order = tf.nn.embedding_lookup(
            self.weights["feature_bias"], self.feat_index)  # None*F*1
        self.y_first_order = tf.reduce_sum(
            tf.multiply(self.y_first_order, feat_value), 2)
        self.y_first_order = tf.compat.v1.nn.dropout(
            self.y_first_order, rate=1 - self.dropout_keep_fm[0])  # None * F

        # ---- second order term: 0.5*((sum v)^2 - sum v^2) ----
        self.summed_features_emb = tf.reduce_sum(self.embeddings,
                                                 1)  # None * K
        self.summed_features_emb_square = tf.square(
            self.summed_features_emb)  # None * K
        self.squared_features_emb = tf.square(self.embeddings)  # None*F*K
        self.squared_sum_features_emb = tf.reduce_sum(
            self.squared_features_emb, 1)  # None * K
        self.y_second_order = 0.5 * tf.subtract(
            self.summed_features_emb_square,
            self.squared_sum_features_emb)  # None * K
        # BUG FIX: tf.nn.dropout's 2nd argument is a drop *rate* in TF2, but
        # dropout_keep_fm holds *keep* probabilities — convert, matching the
        # first-order branch above.
        self.y_second_order = tf.compat.v1.nn.dropout(
            self.y_second_order, rate=1 - self.dropout_keep_fm[1])  # None * K

        # ---- deep component: flatten embeddings then MLP ----
        self.y_deep = tf.reshape(
            self.embeddings,
            shape=[-1, self.field_size * self.embedding_size])  # None*(F*K)
        # BUG FIX: same rate-vs-keep conversion as above.
        self.y_deep = tf.compat.v1.nn.dropout(
            self.y_deep, rate=1 - self.dropout_keep_deep[0])
        for i in range(0, len(self.deep_layers)):
            self.y_deep = tf.add(
                tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                self.weights["bias_%d" % i])  # None * layer[i]
            if self.batch_norm:
                self.y_deep = self.batch_norm_layer(
                    self.y_deep,
                    train_phase=self.train_phase,
                    scope_bn="bn_%d" % i)
            self.y_deep = self.deep_layers_activation(self.y_deep)
            # dropout at each Deep layer (BUG FIX: rate-vs-keep conversion)
            self.y_deep = tf.compat.v1.nn.dropout(
                self.y_deep, rate=1 - self.dropout_keep_deep[1 + i])

        # ---- DeepFM combination ----
        if self.use_fm and self.use_deep:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order, self.y_deep],
                axis=1)
        elif self.use_fm:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order], axis=1)
        elif self.use_deep:
            concat_input = self.y_deep
        else:
            raise AttributeError  # neither FM nor deep component selected
        self.out = tf.add(
            tf.matmul(concat_input, self.weights["concat_projection"]),
            self.weights["concat_bias"])

        # ---- loss ----
        if self.loss_type == "logloss":
            self.out = tf.nn.sigmoid(self.out)
            self.loss = tf.compat.v1.losses.log_loss(self.label, self.out)
        elif self.loss_type == "mse":
            self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
        # l2 regularization on weights
        if self.l2_reg > 0:
            # NOTE(review): tf.contrib does not exist in TF2 proper; this only
            # runs under TF1.x / contrib-enabled builds — confirm target version.
            self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                self.weights["concat_projection"])
            if self.use_deep:
                for i in range(len(self.deep_layers)):
                    self.loss += tf.contrib.layers.l2_regularizer(
                        self.l2_reg)(self.weights["layer_%d" % i])

        # ---- optimizer ----
        if self.optimizer_type == "adam":
            self.optimizer = tf.compat.v1.train.AdamOptimizer(
                learning_rate=self.learning_rate,
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-8).minimize(self.loss)
        elif self.optimizer_type == "adagrad":
            self.optimizer = tf.compat.v1.train.AdagradOptimizer(
                learning_rate=self.learning_rate,
                initial_accumulator_value=1e-8).minimize(self.loss)
        elif self.optimizer_type == "gd":
            self.optimizer = tf.compat.v1.train.GradientDescentOptimizer(
                learning_rate=self.learning_rate).minimize(self.loss)
        elif self.optimizer_type == "momentum":
            self.optimizer = tf.compat.v1.train.MomentumOptimizer(
                learning_rate=self.learning_rate,
                momentum=0.95).minimize(self.loss)
        elif self.optimizer_type == "yellowfin":
            self.optimizer = YFOptimizer(
                learning_rate=self.learning_rate,
                momentum=0.0).minimize(self.loss)

        # ---- saver / variable init / session ----
        self.save = tf.compat.v1.train.Saver()  # instantiate before init
        init = tf.compat.v1.global_variables_initializer()
        self.sess = self._init_session()
        self.sess.run(init)

        # ---- number of params ----
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                # BUG FIX: in TF2 TensorShape yields plain ints (no .value);
                # int(dim) works for both TF1 Dimension and TF2 int.
                variable_parameters *= int(dim)
            total_parameters += variable_parameters
        if self.verbose > 0:
            print("#params: %d" % total_parameters)
# Create the log directory if it does not exist yet.
if not os.path.isdir(args.logdir):
    os.mkdir(args.logdir)

# Per-epoch histories (lr/mu lists are for YellowFin diagnostics).
train_loss_list = []
val_loss_list = []
lr_list = []
mu_list = []
# Select the optimizer from the command line; note YF ignores the script's
# `lr` and starts from lr=1.0, mu=0.0 (it tunes both itself).
if args.opt_method == "SGD":
    print("using SGD")
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=0.0)
elif args.opt_method == "momSGD":
    print("using mom SGD")
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=0.9)
elif args.opt_method == "YF":
    print("using YF")
    optimizer = YFOptimizer(model.parameters(), lr=1.0, mu=0.0)
elif args.opt_method == "Adagrad":
    print("using Adagrad")
    optimizer = torch.optim.Adagrad(model.parameters(), lr)
elif args.opt_method == "Adam":
    print("using Adam")
    optimizer = torch.optim.Adam(model.parameters(), lr)

# Main training loop: one train() pass plus validation per epoch.
# NOTE(review): this excerpt is truncated mid-print at the chunk boundary.
for epoch in range(1, args.epochs + 1):
    epoch_start_time = time.time()
    train_loss = train()
    train_loss_list += train_loss
    val_loss = evaluate(val_data)
    val_loss_list.append(val_loss)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch,
def _init_graph(self):
    """Build the DeepFM graph (TF1 API): placeholders, FM first/second-order
    terms, the deep MLP tower, combined output, loss, optimizer, and a session
    with initialized variables.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        tf.set_random_seed(self.random_seed)
        # Sparse storage: indices range over [0, feature_size] but only
        # field_size of them appear per sample, i.e. each field has exactly one
        # value; how multi-valued categorical fields are handled is open here.
        self.feat_index = tf.placeholder(tf.int32,
                                         shape=[None, None],
                                         name="feat_index")  # None * F
        self.feat_value = tf.placeholder(tf.float32,
                                         shape=[None, None],
                                         name="feat_value")  # None * F
        self.label = tf.placeholder(tf.float32, shape=[None, 1],
                                    name="label")  # None * 1
        self.dropout_keep_fm = tf.placeholder(tf.float32,
                                              shape=[None],
                                              name="dropout_keep_fm")
        self.dropout_keep_deep = tf.placeholder(tf.float32,
                                                shape=[None],
                                                name="dropout_keep_deep")
        self.train_phase = tf.placeholder(tf.bool, name="train_phase")

        self.weights = self._initialize_weights()

        # Look up field_size embedding vectors out of the
        # feature_size x embedding_size table; combined with the
        # embedding * feat_value product this acts like an MLP's first layer.
        self.embeddings = tf.nn.embedding_lookup(
            self.weights["feature_embeddings"],
            self.feat_index)  # None * F * K
        feat_value = tf.reshape(self.feat_value,
                                shape=[-1, self.field_size, 1])
        self.embeddings = tf.multiply(self.embeddings, feat_value)

        # ---------- first order term ----------
        # weights['feature_bias'] is really w; the global bias w0 is not here.
        self.y_first_order = tf.nn.embedding_lookup(
            self.weights["feature_bias"], self.feat_index)  # None * F * 1
        self.y_first_order = tf.reduce_sum(
            tf.multiply(self.y_first_order, feat_value), 2)  # None * F
        self.y_first_order = tf.nn.dropout(self.y_first_order,
                                           self.dropout_keep_fm[0])  # None * F

        # ---------- second order term ---------------
        # v_i already absorbs x_i, so feature_value does not appear explicitly.
        # sum_square part
        self.summed_features_emb = tf.reduce_sum(self.embeddings,
                                                 1)  # None * K
        self.summed_features_emb_square = tf.square(
            self.summed_features_emb)  # None * K
        # square_sum part
        self.squared_features_emb = tf.square(self.embeddings)
        self.squared_sum_features_emb = tf.reduce_sum(
            self.squared_features_emb, 1)  # None * K
        # second order: 0.5 * ((sum v)^2 - sum v^2)
        self.y_second_order = 0.5 * tf.subtract(
            self.summed_features_emb_square,
            self.squared_sum_features_emb)  # None * K
        self.y_second_order = tf.nn.dropout(
            self.y_second_order, self.dropout_keep_fm[1])  # None * K

        # ---------- Deep component ----------
        # Flatten the [field_size, embedding_size] input.
        self.y_deep = tf.reshape(
            self.embeddings,
            shape=[-1, self.field_size * self.embedding_size])  # None * (F*K)
        self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
        # A plain MLP from here on.
        for i in range(0, len(self.deep_layers)):
            self.y_deep = tf.add(
                tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                self.weights["bias_%d" % i])  # None * layer[i] * 1
            if self.batch_norm:
                self.y_deep = self.batch_norm_layer(
                    self.y_deep,
                    train_phase=self.train_phase,
                    scope_bn="bn_%d" % i)  # None * layer[i] * 1
            self.y_deep = self.deep_layers_activation(self.y_deep)
            self.y_deep = tf.nn.dropout(
                self.y_deep,
                self.dropout_keep_deep[1 + i])  # dropout at each Deep layer

        # ---------- DeepFM ----------
        if self.use_fm and self.use_deep:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order, self.y_deep],
                axis=1)
        elif self.use_fm:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order], axis=1)
        elif self.use_deep:
            concat_input = self.y_deep
        self.out = tf.add(
            tf.matmul(concat_input, self.weights["concat_projection"]),
            self.weights["concat_bias"])

        # loss
        if self.loss_type == "logloss":
            self.out = tf.nn.sigmoid(self.out)
            self.loss = tf.losses.log_loss(self.label, self.out)
        elif self.loss_type == "mse":
            self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
        # l2 regularization on weights
        if self.l2_reg > 0:
            self.loss += tf.contrib.layers.l2_regularizer(
                self.l2_reg)(self.weights["concat_projection"])
            if self.use_deep:
                for i in range(len(self.deep_layers)):
                    self.loss += tf.contrib.layers.l2_regularizer(
                        self.l2_reg)(self.weights["layer_%d" % i])

        # optimizer
        if self.optimizer_type == "adam":
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate,
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-8).minimize(self.loss)
        elif self.optimizer_type == "adagrad":
            self.optimizer = tf.train.AdagradOptimizer(
                learning_rate=self.learning_rate,
                initial_accumulator_value=1e-8).minimize(self.loss)
        elif self.optimizer_type == "gd":
            self.optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.learning_rate).minimize(self.loss)
        elif self.optimizer_type == "momentum":
            self.optimizer = tf.train.MomentumOptimizer(
                learning_rate=self.learning_rate,
                momentum=0.95).minimize(self.loss)
        elif self.optimizer_type == "yellowfin":
            self.optimizer = YFOptimizer(
                learning_rate=self.learning_rate,
                momentum=0.0).minimize(self.loss)

        # init
        self.saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        self.sess = self._init_session()
        self.sess.run(init)

        # number of params
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                # NOTE(review): dim.value assumes TF1 Dimension objects.
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        if self.verbose > 0:
            print("#params: %d" % total_parameters)
def test_lr_mu():
    """Check YellowFin's measured statistics (curvature range, gradient
    variance, distance-to-optimum) and the learning rate / momentum it derives
    from them against a NumPy reimplementation of the same moving averages.

    Python 2 test; relies on module-level n_dim, n_iter and tune_everything.
    """
    opt = YFOptimizer(zero_debias=False)
    # Toy linear model so that gradients can be injected directly.
    w = tf.Variable(np.ones([n_dim, ]),
                    dtype=tf.float32,
                    name="w",
                    trainable=True)
    b = tf.Variable(np.ones([1, ], dtype=np.float32),
                    dtype=tf.float32,
                    name="b",
                    trainable=True)
    x = tf.constant(np.ones([n_dim, ], dtype=np.float32), dtype=tf.float32)
    loss = tf.multiply(w, x) + b
    tvars = tf.trainable_variables()

    # Gradients are fed via non-trainable variables (unlike test_measurement,
    # which uses placeholders) so they can be assigned per iteration.
    w_grad_val = tf.Variable(np.zeros([n_dim, ]),
                             dtype=tf.float32,
                             trainable=False)
    b_grad_val = tf.Variable(np.zeros([1, ]),
                             dtype=tf.float32,
                             trainable=False)
    apply_op = opt.apply_gradients(zip([w_grad_val, b_grad_val], tvars))

    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        # Reference moving averages mirroring the optimizer's internals.
        target_h_max = 0.0
        target_h_min = 0.0
        g_norm_squared_avg = 0.0
        g_norm_avg = 0.0
        g_avg = 0.0
        target_dist = 0.0
        target_lr = 0.1
        target_mu = 0.0
        for i in range(n_iter):
            # Inject gradient (i+1) * ones into both variables.
            sess.run(tf.assign(w_grad_val,
                               (i + 1) * np.ones([n_dim, ], dtype=np.float32)))
            sess.run(tf.assign(b_grad_val,
                               (i + 1) * np.ones([1, ], dtype=np.float32)))
            res = sess.run([
                opt._curv_win, opt._h_max, opt._h_min, opt._grad_var,
                opt._dist_to_opt_avg, opt._lr_var, opt._mu_var, apply_op
            ])
            # Re-read lr/mu after apply_op so the post-update values are used.
            res[5] = opt._lr_var.eval()
            res[6] = opt._mu_var.eval()

            # Update reference statistics (decay 0.999, same as the optimizer).
            g_norm_squared_avg = 0.999 * g_norm_squared_avg \
                + 0.001 * np.sum(((i + 1) * np.ones([n_dim + 1, ]))**2)
            g_norm_avg = 0.999 * g_norm_avg \
                + 0.001 * np.linalg.norm((i + 1) * np.ones([n_dim + 1, ]))
            g_avg = 0.999 * g_avg + 0.001 * (i + 1)
            target_h_max = 0.999 * target_h_max \
                + 0.001 * (i + 1)**2 * (n_dim + 1)
            # Curvature window is 20 wide, hence the i + 2 - 20 lower edge.
            target_h_min = 0.999 * target_h_min \
                + 0.001 * max(1, i + 2 - 20)**2 * (n_dim + 1)
            target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
            target_dist = 0.999 * target_dist \
                + 0.001 * g_norm_avg / g_norm_squared_avg
            if i > 0:
                # lr/mu are only tuned after the first step.
                lr, mu = tune_everything(target_dist**2, target_var, 1,
                                         target_h_min, target_h_max)
                target_lr = 0.999 * target_lr + 0.001 * lr
                target_mu = 0.999 * target_mu + 0.001 * mu

            # print "iter ", i, " h max ", res[1], target_h_max, " h min ", res[2], target_h_min, \
            #   " var ", res[3], target_var, " dist ", res[4], target_dist
            # print "iter ", i, " lr ", res[5], target_lr, " mu ", res[6], target_mu
            # All measured quantities must track the references to ~0.1%.
            assert np.abs(target_h_max - res[1]) < np.abs(target_h_max) * 1e-3
            assert np.abs(target_h_min - res[2]) < np.abs(target_h_min) * 1e-3
            assert np.abs(target_var - res[3]) < np.abs(res[3]) * 1e-3
            assert np.abs(target_dist - res[4]) < np.abs(res[4]) * 1e-3
            assert target_lr == 0.0 \
                or np.abs(target_lr - res[5]) < np.abs(res[5]) * 1e-3
            assert target_mu == 0.0 \
                or np.abs(target_mu - res[6]) < np.abs(res[6]) * 5e-3
    print "lr and mu computing test passed!"
maxlen=maxlen, minlen=length) valid = TextIterator(valid_dataset, dictionary, n_words_source=n_words, batch_size=valid_batch_size, maxlen=maxlen, minlen=length) rnn = RNN_LSTM(input_size, rnn_dim, num_layers, num_classes) rnn.cuda() criterion = nn.CrossEntropyLoss() #opt = torch.optim.Adam(rnn.parameters(), lr=lr) opt = YFOptimizer(rnn.parameters()) def evaluate_valid(valid): valid_loss = [] acc = 0.0 N = 0 for x in valid: x = numpy.asarray(x, dtype=numpy.float32) x = torch.from_numpy(x) x = x.view(x.size()[0], x.size()[1], input_size) y = torch.cat((x[:, 1:, :], torch.zeros([x.size()[0], 1, input_size])), 1) images = Variable(x).cuda() labels = Variable(y).long().cuda() opt.zero_grad()