def train(dataset, vectors_path, lr_file, ckpt_dir, checkpoint, idx2vocab,
          vocab_unigrams, embedding_size, neg_sampled, distortion_power,
          batch_size, initial_learning_rate, decay_epochs, decay_rate,
          iter_epochs, allow_soft_placement, log_device_placement,
          gpu_memory_fraction, using_gpu, allow_growth, loss_interval,
          summary_steps, ckpt_interval, ckpt_epochs, summary_interval,
          decay_interval, train_workers):
    num_steps_per_epoch = int(dataset.num_examples / batch_size)
    iter_steps = iter_epochs * num_steps_per_epoch
    decay_steps = int(decay_epochs * num_steps_per_epoch)
    ckpt_steps = int(ckpt_epochs * num_steps_per_epoch)

    LR = utils.LearningRateGenerator(
        initial_learning_rate=initial_learning_rate,
        initial_steps=0,
        decay_rate=decay_rate,
        decay_steps=decay_steps)

    with tf.Graph().as_default(), tf.device(
            '/gpu:0' if using_gpu else '/cpu:0'):
        global_step = tf.Variable(0, trainable=False, name="global_step")
        inputs = tf.placeholder(tf.int32, shape=[batch_size], name='inputs')
        labels = tf.placeholder(tf.int32, shape=[batch_size], name='labels')
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')

        model = Word2Vec(vocab_size=len(idx2vocab),
                         embedding_size=embedding_size,
                         vocab_unigrams=vocab_unigrams,
                         neg_sampled=neg_sampled,
                         distortion_power=distortion_power,
                         batch_size=batch_size)

        train_op, loss = model.train(inputs, labels, global_step,
                                     learning_rate)

        # Create a saver.
        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=5)

        summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init_op = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU implementations.
        config = tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                                log_device_placement=log_device_placement)
        config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction
        config.gpu_options.allow_growth = allow_growth
        # config.gpu_options.visible_device_list = visible_device_list

        with tf.Session(config=config) as sess:
            # first_step = 0
            if checkpoint == '0':  # new train
                sess.run(init_op)
            elif checkpoint == '-1':  # choose the latest one
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    # new_saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path + '.meta')
                    # Restores from checkpoint
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    # global_step_for_restore = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
                    # first_step = int(global_step_for_restore) + 1
                else:
                    logger.warning('No checkpoint file found')
                    return
            else:
                if os.path.exists(
                        os.path.join(ckpt_dir,
                                     'model.ckpt-' + checkpoint + '.index')):
                    # new_saver = tf.train.import_meta_graph(
                    #     os.path.join(ckpt_dir, 'model.ckpt-' + checkpoint + '.meta'))
                    saver.restore(
                        sess, os.path.join(ckpt_dir,
                                           'model.ckpt-' + checkpoint))
                    # first_step = int(checkpoint) + 1
                else:
                    logger.warning(
                        'checkpoint {} not found'.format(checkpoint))
                    return

            summary_writer = tf.summary.FileWriter(ckpt_dir, sess.graph)

            ## train
            executor_workers = train_workers - 1
            if executor_workers > 0:
                executor = ThreadPoolExecutor(max_workers=executor_workers)
                for _ in range(executor_workers):
                    executor.submit(_train_thread_body, dataset, batch_size,
                                    inputs, labels, sess, train_op,
                                    iter_steps, global_step, learning_rate,
                                    LR)

            last_loss_time = time.time() - loss_interval
            last_summary_time = time.time() - summary_interval
            last_decay_time = last_checkpoint_time = time.time()
            last_decay_step = last_summary_step = last_checkpoint_step = 0
            while True:
                start_time = time.time()
                batch_data, batch_labels = dataset.next_batch(
                    batch_size, keep_strict_batching=True)
                feed_dict = {
                    inputs: batch_data,
                    labels: batch_labels,
                    learning_rate: LR.learning_rate
                }
                _, loss_value, cur_step = sess.run(
                    [train_op, loss, global_step], feed_dict=feed_dict)
                now = time.time()

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                epoch, epoch_step = divmod(cur_step, num_steps_per_epoch)

                if now - last_loss_time >= loss_interval:
                    format_str = '%s: step=%d(%d/%d), lr=%.6f, loss=%.6f, duration/step=%.4fs'
                    logger.info(format_str %
                                (time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(time.time())),
                                 cur_step, epoch_step, epoch,
                                 LR.learning_rate, loss_value,
                                 now - start_time))
                    last_loss_time = time.time()
                if now - last_summary_time >= summary_interval or cur_step - last_summary_step >= summary_steps or cur_step >= iter_steps:
                    summary_str = sess.run(summary_op, feed_dict=feed_dict)
                    summary_writer.add_summary(summary_str, cur_step)
                    last_summary_time = time.time()
                    last_summary_step = cur_step
                ckpted = False
                # Save the model checkpoint periodically. (named 'model.ckpt-global_step.meta')
                if now - last_checkpoint_time >= ckpt_interval or cur_step - last_checkpoint_step >= ckpt_steps or cur_step >= iter_steps:
                    checkpoint_path = os.path.join(ckpt_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=cur_step)
                    # embedding_vectors = sess.run(model.vectors, feed_dict=feed_dict)
                    vecs, weights, biases = sess.run([
                        model.vectors, model.context_weights,
                        model.context_biases
                    ], feed_dict=feed_dict)
                    save_word2vec_format(vectors_path, vecs, idx2vocab)
                    np.savetxt(vectors_path + ".contexts", weights)
                    np.savetxt(vectors_path + ".context_biases", biases)
                    last_checkpoint_time = time.time()
                    last_checkpoint_step = cur_step
                    ckpted = True
                # update learning rate
                if ckpted or now - last_decay_time >= decay_interval or cur_step - last_decay_step >= decay_steps:
                    lr_info = np.loadtxt(lr_file, dtype=float)
                    if np.abs(lr_info[1] - decay_epochs) >= 1e-7:
                        decay_epochs = lr_info[1]
                        decay_steps = int(decay_epochs * num_steps_per_epoch)
                    if np.abs(lr_info[2] - decay_rate) >= 1e-7:
                        decay_rate = lr_info[2]
                    if np.abs(lr_info[0] - initial_learning_rate) < 1e-7:
                        LR.exponential_decay(cur_step,
                                             decay_rate=decay_rate,
                                             decay_steps=decay_steps)
                    else:
                        initial_learning_rate = lr_info[0]
                        LR.reset(initial_learning_rate=initial_learning_rate,
                                 initial_steps=cur_step,
                                 decay_rate=decay_rate,
                                 decay_steps=decay_steps)
                    last_decay_time = time.time()
                    last_decay_step = cur_step

                if cur_step >= iter_steps:
                    break
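# Note: _train_thread_body is submitted to the ThreadPoolExecutor above but is
# not shown in this section. The sketch below is only an assumption about what
# the worker-thread body might look like, inferred from the arguments passed to
# executor.submit(); the actual helper in this repository may differ.
def _train_thread_body(dataset, batch_size, inputs, labels, sess, train_op,
                       iter_steps, global_step, learning_rate, LR):
    # Each worker repeatedly pulls a batch and runs one training step until
    # the shared global_step reaches iter_steps.
    while True:
        batch_data, batch_labels = dataset.next_batch(
            batch_size, keep_strict_batching=True)
        feed_dict = {
            inputs: batch_data,
            labels: batch_labels,
            learning_rate: LR.learning_rate
        }
        _, cur_step = sess.run([train_op, global_step], feed_dict=feed_dict)
        if cur_step >= iter_steps:
            break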
def train(net, vectors_path, lr_file, ckpt_dir, checkpoint, embedding_size,
          neg_sampled, order, distortion_power, iter_epochs, batch_size,
          initial_learning_rate, decay_epochs, decay_interval, decay_rate,
          allow_soft_placement, log_device_placement, gpu_memory_fraction,
          using_gpu, allow_growth, loss_interval, summary_steps,
          summary_interval, ckpt_epochs, ckpt_interval, train_workers):
    edge_sampler = Edge_sampler(net, batch_size)
    edges_size = edge_sampler.edges_size
    nodes_size = net.get_nodes_size()
    num_steps_per_epoch = int(edges_size / batch_size)
    iter_steps = round(
        iter_epochs *
        num_steps_per_epoch)  # iter_epochs should be big enough to converge.
    decay_steps = round(decay_epochs * num_steps_per_epoch)
    ckpt_steps = round(ckpt_epochs * num_steps_per_epoch)
    nodes_degrees = [net.get_degrees(v) for v in range(nodes_size)]

    LR = utils.LearningRateGenerator(
        initial_learning_rate=initial_learning_rate,
        initial_steps=0,
        decay_rate=decay_rate,
        decay_steps=decay_steps,
        iter_steps=iter_steps)

    with tf.Graph().as_default(), tf.device(
            '/gpu:0' if using_gpu else '/cpu:0'):
        inputs = tf.placeholder(tf.int32, shape=[batch_size], name='inputs')
        labels = tf.placeholder(tf.int32, shape=[batch_size], name='labels')
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')

        model_list = []
        trains_list = []
        if order == "1":
            with tf.name_scope("1st_order"):
                model = SGNS(vocab_size=nodes_size,
                             embedding_size=embedding_size,
                             vocab_unigrams=nodes_degrees,
                             distortion_power=distortion_power,
                             neg_sampled=neg_sampled,
                             batch_size=batch_size,
                             order=1)
                global_step = tf.Variable(0,
                                          trainable=False,
                                          name="global_step")
                train_op, loss = model.train(inputs, labels, global_step,
                                             learning_rate)
                model_list.append(model)
                trains_list.append((train_op, loss, global_step))
        elif order == "2":
            with tf.name_scope("2st_order"):
                model = SGNS(vocab_size=nodes_size,
                             embedding_size=embedding_size,
                             vocab_unigrams=nodes_degrees,
                             distortion_power=distortion_power,
                             neg_sampled=neg_sampled,
                             batch_size=batch_size,
                             order=2)
                global_step = tf.Variable(0,
                                          trainable=False,
                                          name="global_step")
                train_op, loss = model.train(inputs, labels, global_step,
                                             learning_rate)
                model_list.append(model)
                trains_list.append((train_op, loss, global_step))
        elif order == "3":
            with tf.name_scope("1st_order"):
                model = SGNS(vocab_size=nodes_size,
                             embedding_size=embedding_size // 2,
                             vocab_unigrams=nodes_degrees,
                             distortion_power=distortion_power,
                             neg_sampled=neg_sampled,
                             batch_size=batch_size,
                             order=1)
                global_step = tf.Variable(0,
                                          trainable=False,
                                          name="global_step")
                train_op, loss = model.train(inputs, labels, global_step,
                                             learning_rate)
                model_list.append(model)
                trains_list.append((train_op, loss, global_step))
            with tf.name_scope("2st_order"):
                model = SGNS(vocab_size=nodes_size,
                             embedding_size=embedding_size // 2,
                             vocab_unigrams=nodes_degrees,
                             distortion_power=distortion_power,
                             neg_sampled=neg_sampled,
                             batch_size=batch_size,
                             order=2)
                global_step = tf.Variable(0,
                                          trainable=False,
                                          name="global_step")
                train_op, loss = model.train(inputs, labels, global_step,
                                             learning_rate)
                model_list.append(model)
                trains_list.append((train_op, loss, global_step))
        else:
            logger.error("invalid order in LINE: '%s'." % order)
            sys.exit()

        # Create a saver.
        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=5)

        summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init_op = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU implementations.
        config = tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                                log_device_placement=log_device_placement)
        config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction
        config.gpu_options.allow_growth = allow_growth
        # config.gpu_options.visible_device_list = visible_device_list

        with tf.Session(config=config) as sess:
            # first_step = 0
            if checkpoint == '0':  # new train
                sess.run(init_op)
            elif checkpoint == '-1':  # choose the latest one
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    # new_saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path + '.meta')
                    # Restores from checkpoint
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    # global_step_for_restore = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
                    # first_step = int(global_step_for_restore) + 1
                else:
                    logger.warning('No checkpoint file found')
                    return
            else:
                if os.path.exists(
                        os.path.join(ckpt_dir,
                                     'model.ckpt-' + checkpoint + '.index')):
                    # new_saver = tf.train.import_meta_graph(
                    #     os.path.join(ckpt_dir, 'model.ckpt-' + checkpoint + '.meta'))
                    saver.restore(
                        sess, os.path.join(ckpt_dir,
                                           'model.ckpt-' + checkpoint))
                    # first_step = int(checkpoint) + 1
                else:
                    logger.warning(
                        'checkpoint {} not found'.format(checkpoint))
                    return

            summary_writer = tf.summary.FileWriter(ckpt_dir, sess.graph)

            ## train
            executor_workers = train_workers - 1
            if executor_workers > 0:
                futures = set()
                executor = ThreadPoolExecutor(max_workers=executor_workers)
                for _ in range(executor_workers):
                    future = executor.submit(_train_thread_body, edge_sampler,
                                             inputs, labels, sess,
                                             trains_list, learning_rate, LR)
                    logger.info("open a new training thread: %s" % future)
                    futures.add(future)

            last_loss_time = time.time() - loss_interval
            last_summary_time = time.time() - summary_interval
            last_decay_time = last_checkpoint_time = time.time()
            last_decay_step = last_summary_step = last_checkpoint_step = 0
            while True:
                start_time = time.time()
                batch_data, batch_labels = edge_sampler.next_batch()
                feed_dict = {
                    inputs: batch_data,
                    labels: batch_labels,
                    learning_rate: LR.learning_rate
                }
                loss_value_list = []
                for train_op, loss, global_step in trains_list:
                    _, loss_value, cur_step = sess.run(
                        [train_op, loss, global_step], feed_dict=feed_dict)
                    assert not np.isnan(
                        loss_value), 'Model diverged with loss = NaN'
                    loss_value_list.append(loss_value)
                now = time.time()

                epoch, epoch_step = divmod(cur_step, num_steps_per_epoch)

                if now - last_loss_time >= loss_interval:
                    if len(loss_value_list) == 1:
                        loss_str = "%.6f" % loss_value_list[0]
                    else:
                        loss_str = "[%.6f, %.6f]" % (loss_value_list[0],
                                                     loss_value_list[1])
                    format_str = '%s: step=%d(%d/%d), lr=%.6f, loss=%s, duration/step=%.4fs'
                    logger.info(format_str %
                                (time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(time.time())),
                                 cur_step, epoch_step, epoch,
                                 LR.learning_rate, loss_str,
                                 now - start_time))
                    last_loss_time = time.time()
                if now - last_summary_time >= summary_interval or cur_step - last_summary_step >= summary_steps or cur_step >= iter_steps:
                    summary_str = sess.run(summary_op, feed_dict=feed_dict)
                    summary_writer.add_summary(summary_str, cur_step)
                    last_summary_time = time.time()
                    last_summary_step = cur_step
                ckpted = False
                # Save the model checkpoint periodically. (named 'model.ckpt-global_step.meta')
                if now - last_checkpoint_time >= ckpt_interval or cur_step - last_checkpoint_step >= ckpt_steps or cur_step >= iter_steps:
                    # embedding_vectors = sess.run(model.vectors, feed_dict=feed_dict)
                    vecs_list = []
                    for model in model_list:
                        vecs = sess.run(model.vectors, feed_dict=feed_dict)
                        vecs_list.append(vecs)
                    vecs = np.concatenate(vecs_list, axis=1)
                    checkpoint_path = os.path.join(ckpt_dir, 'model.ckpt')
                    utils.save_word2vec_format_and_ckpt(
                        vectors_path, vecs, checkpoint_path, sess, saver,
                        cur_step)
                    last_checkpoint_time = time.time()
                    last_checkpoint_step = cur_step
                    ckpted = True
                # update learning rate
                if ckpted or now - last_decay_time >= decay_interval or (
                        decay_steps > 0
                        and cur_step - last_decay_step >= decay_steps):
                    lr_info = np.loadtxt(lr_file, dtype=float)
                    if np.abs(lr_info[1] - decay_epochs) > 1e-6:
                        decay_epochs = lr_info[1]
                        decay_steps = round(decay_epochs *
                                            num_steps_per_epoch)
                    if np.abs(lr_info[2] - decay_rate) > 1e-6:
                        decay_rate = lr_info[2]
                    if np.abs(lr_info[3] - iter_epochs) > 1e-6:
                        iter_epochs = lr_info[3]
                        iter_steps = round(iter_epochs * num_steps_per_epoch)
                    if np.abs(lr_info[0] - initial_learning_rate) > 1e-6:
                        initial_learning_rate = lr_info[0]
                        LR.reset(initial_learning_rate=initial_learning_rate,
                                 initial_steps=cur_step,
                                 decay_rate=decay_rate,
                                 decay_steps=decay_steps,
                                 iter_steps=iter_steps)
                    else:
                        LR.exponential_decay(cur_step,
                                             decay_rate=decay_rate,
                                             decay_steps=decay_steps,
                                             iter_steps=iter_steps)
                    last_decay_time = time.time()
                    last_decay_step = cur_step

                if cur_step >= LR.iter_steps:
                    break

            summary_writer.close()

            if executor_workers > 0:
                logger.info("waiting for the training threads to finish:")
                try:
                    for future in as_completed(futures):
                        logger.info(future)
                except KeyboardInterrupt:
                    print("stopped by hand.")
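# The learning-rate control file (lr_file) is re-read inside the training loop,
# so it can be edited while training is running to adjust the schedule. From
# the indices read above it appears to hold four whitespace-separated numbers:
#   lr_info[0] -> initial_learning_rate
#   lr_info[1] -> decay_epochs
#   lr_info[2] -> decay_rate
#   lr_info[3] -> iter_epochs
# A minimal sketch of writing such a file; the file name and the values are
# placeholders, not recommendations.
import numpy as np

lr_info = np.array([0.025, 1.0, 0.9, 100.0])  # lr, decay_epochs, decay_rate, iter_epochs
np.savetxt("learning_rate.info", lr_info.reshape(1, -1))
print(np.loadtxt("learning_rate.info", dtype=float))  # -> [2.5e-02 1.0e+00 9.0e-01 1.0e+02]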
def train(dataset, lr_file, ckpt_dir, checkpoint, options):
    nodes_size = dataset._nodes_size
    num_steps_per_epoch = int(nodes_size / options.batch_size)
    iter_epochs = options.iter_epoches
    iter_steps = round(
        iter_epochs *
        num_steps_per_epoch)  # iter_epoches should be big enough to converge.
    decay_epochs = options.decay_epochs
    decay_steps = round(decay_epochs * num_steps_per_epoch)
    ckpt_steps = round(options.ckpt_epochs * num_steps_per_epoch)
    initial_learning_rate = options.learning_rate
    decay_rate = options.decay_rate

    LR = utils.LearningRateGenerator(
        initial_learning_rate=initial_learning_rate,
        initial_steps=0,
        decay_rate=decay_rate,
        decay_steps=decay_steps,
        iter_steps=iter_steps)

    with tf.Graph().as_default(), tf.device(
            '/gpu:0' if options.using_gpu else '/cpu:0'):
        global_step = tf.Variable(0, trainable=False, name="global_step")
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        inputs = tf.placeholder(tf.float32,
                                shape=[None, options.feature_size],
                                name='inputs')
        laplacian = tf.placeholder(tf.float32, [None, None],
                                   name="laplacian_matrix")
        if options.using_label:
            labels = tf.placeholder(tf.int32,
                                    shape=[None, options.label_size],
                                    name='labels')
        else:
            labels = tf.placeholder(tf.int32,
                                    shape=[None, None],
                                    name='adjacency')

        model = GCN(dropout=options.dropout,
                    feature_size=options.feature_size,
                    using_label=options.using_label,
                    embedding_size=options.embedding_size,
                    hidden_size_list=options.hidden_size_list,
                    label_size=options.label_size,
                    weight_decay=options.weight_decay)

        train_op, loss = model.train(inputs, laplacian, labels, global_step,
                                     learning_rate)

        # Create a saver.
        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=6)

        summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init_op = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU implementations.
        config = tf.ConfigProto(
            allow_soft_placement=options.allow_soft_placement,
            log_device_placement=options.log_device_placement)
        config.gpu_options.per_process_gpu_memory_fraction = options.gpu_memory_fraction
        config.gpu_options.allow_growth = options.allow_growth
        # config.gpu_options.visible_device_list = visible_device_list

        with tf.Session(config=config) as sess:
            # first_step = 0
            if checkpoint == '0':  # new train
                sess.run(init_op)
            elif checkpoint == '-1':  # choose the latest one
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    # new_saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path + '.meta')
                    # Restores from checkpoint
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    # global_step_for_restore = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
                    # first_step = int(global_step_for_restore) + 1
                else:
                    logger.warning('No checkpoint file found')
                    return
            else:
                if os.path.exists(
                        os.path.join(ckpt_dir,
                                     'model.ckpt-' + checkpoint + '.index')):
                    # new_saver = tf.train.import_meta_graph(
                    #     os.path.join(ckpt_dir, 'model.ckpt-' + checkpoint + '.meta'))
                    saver.restore(
                        sess, os.path.join(ckpt_dir,
                                           'model.ckpt-' + checkpoint))
                    # first_step = int(checkpoint) + 1
                else:
                    logger.warning(
                        'checkpoint {} not found'.format(checkpoint))
                    return

            summary_writer = tf.summary.FileWriter(ckpt_dir, sess.graph)

            last_loss_time = time.time() - options.loss_interval
            last_summary_time = time.time() - options.summary_interval
            last_decay_time = last_checkpoint_time = time.time()
            last_decay_step = last_summary_step = last_checkpoint_step = 0
            while True:
                start_time = time.time()
                batch_features, batch_adj, batch_labels = dataset.next_batch(
                    options.batch_size)
                feed_dict = {
                    inputs: batch_features,
                    laplacian: batch_adj,
                    labels: batch_labels,
                    learning_rate: LR.learning_rate
                }
                _, loss_value, cur_step = sess.run(
                    [train_op, loss, global_step], feed_dict=feed_dict)
                now = time.time()

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                epoch, epoch_step = divmod(cur_step, num_steps_per_epoch)

                if now - last_loss_time >= options.loss_interval:
                    format_str = '%s: step=%d(%d/%d), lr=%.6f, loss=%.6f, duration/step=%.4fs'
                    logger.info(format_str %
                                (time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(time.time())),
                                 cur_step, epoch_step, epoch,
                                 LR.learning_rate, loss_value,
                                 now - start_time))
                    last_loss_time = time.time()
                if now - last_summary_time >= options.summary_interval or cur_step - last_summary_step >= options.summary_steps or cur_step >= iter_steps:
                    summary_str = sess.run(summary_op, feed_dict=feed_dict)
                    summary_writer.add_summary(summary_str, cur_step)
                    last_summary_time = time.time()
                    last_summary_step = cur_step
                ckpted = False
                # Save the model checkpoint periodically. (named 'model.ckpt-global_step.meta')
                if now - last_checkpoint_time >= options.ckpt_interval or cur_step - last_checkpoint_step >= ckpt_steps or cur_step >= iter_steps:
                    if options.batch_size == nodes_size:
                        batch_features, batch_adj, batch_labels = dataset.get_full()
                        feed_dict = {
                            inputs: batch_features,
                            laplacian: batch_adj,
                            labels: batch_labels,
                            learning_rate: LR.learning_rate
                        }
                        vecs = sess.run(model.vectors, feed_dict=feed_dict)
                    else:
                        vecs = []
                        start = 0
                        while start < nodes_size:
                            end = min(nodes_size, start + options.batch_size)
                            index = np.arange(start, end)
                            start = end
                            batch_features, batch_adj, batch_labels = dataset.get_batch(
                                index)
                            feed_dict = {
                                inputs: batch_features,
                                laplacian: batch_adj,
                                labels: batch_labels,
                                learning_rate: LR.learning_rate
                            }
                            batch_embeddings = sess.run(model.vectors,
                                                        feed_dict=feed_dict)
                            vecs.append(batch_embeddings)
                        vecs = np.concatenate(vecs, axis=0)
                    checkpoint_path = os.path.join(ckpt_dir, 'model.ckpt')
                    utils.save_word2vec_format_and_ckpt(
                        options.vectors_path, vecs, checkpoint_path, sess,
                        saver, cur_step)
                    last_checkpoint_time = time.time()
                    last_checkpoint_step = cur_step
                    ckpted = True
                # update learning rate
                if ckpted or now - last_decay_time >= options.decay_interval or (
                        decay_steps > 0
                        and cur_step - last_decay_step >= decay_steps):
                    lr_info = np.loadtxt(lr_file, dtype=float)
                    if np.abs(lr_info[1] - decay_epochs) > 1e-6:
                        decay_epochs = lr_info[1]
                        decay_steps = round(decay_epochs *
                                            num_steps_per_epoch)
                    if np.abs(lr_info[2] - decay_rate) > 1e-6:
                        decay_rate = lr_info[2]
                    if np.abs(lr_info[3] - iter_epochs) > 1e-6:
                        iter_epochs = lr_info[3]
                        iter_steps = round(iter_epochs * num_steps_per_epoch)
                    if np.abs(lr_info[0] - initial_learning_rate) > 1e-6:
                        initial_learning_rate = lr_info[0]
                        LR.reset(initial_learning_rate=initial_learning_rate,
                                 initial_steps=cur_step,
                                 decay_rate=decay_rate,
                                 decay_steps=decay_steps,
                                 iter_steps=iter_steps)
                    else:
                        LR.exponential_decay(cur_step,
                                             decay_rate=decay_rate,
                                             decay_steps=decay_steps,
                                             iter_steps=iter_steps)
                    last_decay_time = time.time()
                    last_decay_step = cur_step

                if cur_step >= LR.iter_steps:
                    break

            summary_writer.close()
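# utils.LearningRateGenerator is used by every trainer here, but its
# implementation is not part of this section. The sketch below is an assumption
# about its behaviour, inferred only from how it is called above (an exponential
# decay anchored at initial_steps, with reset() restarting the schedule); the
# real utils class may differ in details.
class LearningRateGenerator(object):
    def __init__(self, initial_learning_rate, initial_steps, decay_rate,
                 decay_steps, iter_steps=None):
        self.iter_steps = iter_steps
        self.reset(initial_learning_rate=initial_learning_rate,
                   initial_steps=initial_steps,
                   decay_rate=decay_rate,
                   decay_steps=decay_steps,
                   iter_steps=iter_steps)

    def reset(self, initial_learning_rate, initial_steps, decay_rate,
              decay_steps, iter_steps=None):
        # Restart the schedule from a new base learning rate at initial_steps.
        self._initial_learning_rate = initial_learning_rate
        self._initial_steps = initial_steps
        self._decay_rate = decay_rate
        self._decay_steps = decay_steps
        if iter_steps is not None:
            self.iter_steps = iter_steps
        self.learning_rate = initial_learning_rate

    def exponential_decay(self, global_step, decay_rate, decay_steps,
                          iter_steps=None):
        # lr = initial_lr * decay_rate ** ((global_step - initial_steps) / decay_steps)
        self._decay_rate = decay_rate
        self._decay_steps = decay_steps
        if iter_steps is not None:
            self.iter_steps = iter_steps
        exponent = (global_step - self._initial_steps) / float(decay_steps)
        self.learning_rate = self._initial_learning_rate * decay_rate ** exponent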
def train(walker, lr_file, ckpt_dir, checkpoint, options):
    vocab_size = walker.nodes_size
    types_size = walker.node_types_size
    num_steps_per_epoch = int(
        vocab_size * options.train_workers /
        options.batch_size)  # a rough estimate of steps per epoch in RWR
    iter_epochs = options.iter_epoches
    iter_steps = round(
        iter_epochs *
        num_steps_per_epoch)  # iter_epoches should be big enough to converge.
    decay_epochs = options.decay_epochs
    decay_steps = round(decay_epochs * num_steps_per_epoch)
    ckpt_steps = round(options.ckpt_epochs * num_steps_per_epoch)
    initial_learning_rate = options.learning_rate
    decay_rate = options.decay_rate

    LR = utils.LearningRateGenerator(
        initial_learning_rate=initial_learning_rate,
        initial_steps=0,
        decay_rate=decay_rate,
        decay_steps=decay_steps,
        iter_steps=iter_steps)

    with tf.Graph().as_default(), tf.device(
            '/gpu:0' if options.using_gpu else '/cpu:0'):
        global_step = tf.Variable(0, trainable=False, name="global_step")

        # inputs(center_nodes), labels(context_nodes), labels_type(context_nodes_type), neg_labels(neg_nodes)
        inputs = tf.placeholder(tf.int32, name='inputs')  # center_nodes
        labels = [
            tf.placeholder(tf.int32,
                           shape=[None],
                           name='labels_T{}'.format(type_i))
            for type_i in range(types_size)
        ]
        labels_mask = [
            tf.placeholder(tf.float32, name='labels_mask_T{}'.format(type_i))
            for type_i in range(types_size)
        ]
        neg_labels = [
            tf.placeholder(tf.int32,
                           shape=[None],
                           name='neg_labels_T{}'.format(type_i))
            for type_i in range(types_size)
        ]
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')

        model = SGNS(vocab_size=vocab_size,
                     embedding_size=options.embedding_size,
                     type_size=types_size)

        train_op, loss = model.train(inputs, labels, labels_mask, neg_labels,
                                     global_step, learning_rate)

        # Create a saver.
        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=6)

        summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init_op = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU implementations.
        config = tf.ConfigProto(
            allow_soft_placement=options.allow_soft_placement,
            log_device_placement=options.log_device_placement)
        config.gpu_options.per_process_gpu_memory_fraction = options.gpu_memory_fraction
        config.gpu_options.allow_growth = options.allow_growth
        # config.gpu_options.visible_device_list = visible_device_list

        with tf.Session(config=config) as sess:
            # first_step = 0
            if checkpoint == '0':  # new train
                sess.run(init_op)
            elif checkpoint == '-1':  # choose the latest one
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    # new_saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path + '.meta')
                    # Restores from checkpoint
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    # global_step_for_restore = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
                    # first_step = int(global_step_for_restore) + 1
                else:
                    logger.warning('No checkpoint file found')
                    return
            else:
                if os.path.exists(
                        os.path.join(ckpt_dir,
                                     'model.ckpt-' + checkpoint + '.index')):
                    # new_saver = tf.train.import_meta_graph(
                    #     os.path.join(ckpt_dir, 'model.ckpt-' + checkpoint + '.meta'))
                    saver.restore(
                        sess, os.path.join(ckpt_dir,
                                           'model.ckpt-' + checkpoint))
                    # first_step = int(checkpoint) + 1
                else:
                    logger.warning(
                        'checkpoint {} not found'.format(checkpoint))
                    return

            summary_writer = tf.summary.FileWriter(ckpt_dir, sess.graph)

            last_loss_time = time.time() - options.loss_interval
            last_summary_time = time.time() - options.summary_interval
            last_decay_time = last_checkpoint_time = time.time()
            last_decay_step = last_summary_step = last_checkpoint_step = 0
            rwrgenerator = RWRGenerator(walker=walker,
                                        walk_times=options.walk_times)
            while True:
                start_time = time.time()
                batch_inputs, batch_labels, batch_labels_mask, batch_neg_labels = rwrgenerator.next_batch()
                feed_dict = {
                    inputs: batch_inputs,
                    learning_rate: LR.learning_rate
                }
                for type_i in range(types_size):
                    feed_dict[labels[type_i]] = batch_labels[type_i]
                    feed_dict[labels_mask[type_i]] = batch_labels_mask[type_i]
                    feed_dict[neg_labels[type_i]] = batch_neg_labels[type_i]
                _, loss_value, cur_step = sess.run(
                    [train_op, loss, global_step], feed_dict=feed_dict)
                now = time.time()

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                epoch, epoch_step = divmod(cur_step, num_steps_per_epoch)

                if now - last_loss_time >= options.loss_interval:
                    format_str = '%s: step=%d(%d/%d), lr=%.6f, loss=%.6f, duration/step=%.4fs'
                    logger.info(format_str %
                                (time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(time.time())),
                                 cur_step, epoch_step, epoch,
                                 LR.learning_rate, loss_value,
                                 now - start_time))
                    last_loss_time = time.time()
                if now - last_summary_time >= options.summary_interval or cur_step - last_summary_step >= options.summary_steps or cur_step >= iter_steps:
                    summary_str = sess.run(summary_op, feed_dict=feed_dict)
                    summary_writer.add_summary(summary_str, cur_step)
                    last_summary_time = time.time()
                    last_summary_step = cur_step
                ckpted = False
                # Save the model checkpoint periodically. (named 'model.ckpt-global_step.meta')
                if now - last_checkpoint_time >= options.ckpt_interval or cur_step - last_checkpoint_step >= ckpt_steps or cur_step >= iter_steps:
                    vecs, global_step_value = sess.run(
                        [model.vectors, global_step], feed_dict=feed_dict)
                    # vecs, weights, biases = sess.run([model.vectors, model.context_weights, model.context_biases],
                    #                                  feed_dict=feed_dict)
                    checkpoint_path = os.path.join(ckpt_dir, 'model.ckpt')
                    utils.save_word2vec_format_and_ckpt(
                        options.vectors_path, vecs, checkpoint_path, sess,
                        saver, global_step_value, types_size)
                    # save_word2vec_format(vectors_path + ".contexts", weights, walker.idx_nodes)
                    # save_word2vec_format(vectors_path + ".context_biases", np.reshape(biases, [-1, 1]), walker.idx_nodes)
                    last_checkpoint_time = time.time()
                    last_checkpoint_step = global_step_value
                    ckpted = True
                # update learning rate
                if ckpted or now - last_decay_time >= options.decay_interval or (
                        decay_steps > 0
                        and cur_step - last_decay_step >= decay_steps):
                    lr_info = np.loadtxt(lr_file, dtype=float)
                    if np.abs(lr_info[1] - decay_epochs) > 1e-6:
                        decay_epochs = lr_info[1]
                        decay_steps = round(decay_epochs *
                                            num_steps_per_epoch)
                    if np.abs(lr_info[2] - decay_rate) > 1e-6:
                        decay_rate = lr_info[2]
                    if np.abs(lr_info[3] - iter_epochs) > 1e-6:
                        iter_epochs = lr_info[3]
                        iter_steps = round(iter_epochs * num_steps_per_epoch)
                    if np.abs(lr_info[0] - initial_learning_rate) > 1e-6:
                        initial_learning_rate = lr_info[0]
                        LR.reset(initial_learning_rate=initial_learning_rate,
                                 initial_steps=cur_step,
                                 decay_rate=decay_rate,
                                 decay_steps=decay_steps,
                                 iter_steps=iter_steps)
                    else:
                        LR.exponential_decay(cur_step,
                                             decay_rate=decay_rate,
                                             decay_steps=decay_steps,
                                             iter_steps=iter_steps)
                    last_decay_time = time.time()
                    last_decay_step = cur_step

                if cur_step >= LR.iter_steps:
                    break

            summary_writer.close()
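# utils.save_word2vec_format_and_ckpt is called in every trainer above but is
# not defined in this section. The sketch below is an assumption about what it
# does: write the embeddings in the plain word2vec text format (header line
# "num_vectors dim", then one "id v1 v2 ..." line per node) and save a TF
# checkpoint. The extra types_size argument passed by the RWR trainer is
# accepted but not interpreted here, since its meaning is not visible from
# this code.
def save_word2vec_format_and_ckpt(vectors_path, vecs, checkpoint_path, sess,
                                  saver, global_step, types_size=1):
    with open(vectors_path, 'w') as f:
        f.write("{} {}\n".format(vecs.shape[0], vecs.shape[1]))
        for idx in range(vecs.shape[0]):
            f.write("{} {}\n".format(idx,
                                     ' '.join(str(x) for x in vecs[idx])))
    saver.save(sess, checkpoint_path, global_step=global_step)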
def train(dataset, vectors_path, lr_file, ckpt_dir, checkpoint,
          embedding_size, struct, alpha, beta, gamma, reg, sparse_dot,
          iter_epochs, batch_size, initial_learning_rate, decay_epochs,
          decay_interval, decay_rate, allow_soft_placement,
          log_device_placement, gpu_memory_fraction, using_gpu, allow_growth,
          loss_interval, summary_steps, summary_interval, ckpt_epochs,
          ckpt_interval, dbn_initial, dbn_epochs, dbn_batchsize,
          dbn_learning_rate, active_function="sigmoid"):
    actv_func = {
        'sigmoid': tf.sigmoid,
        'tanh': tf.tanh,
        'relu': tf.nn.relu,
        'leaky_relu': tf.nn.leaky_relu
    }[active_function]

    nodes_size = dataset.nodes_size
    num_steps_per_epoch = int(nodes_size / batch_size)
    iter_steps = round(
        iter_epochs *
        num_steps_per_epoch)  # iter_epochs should be big enough to converge.
    decay_steps = round(decay_epochs * num_steps_per_epoch)
    ckpt_steps = round(ckpt_epochs * num_steps_per_epoch)

    LR = utils.LearningRateGenerator(
        initial_learning_rate=initial_learning_rate,
        initial_steps=0,
        decay_rate=decay_rate,
        decay_steps=decay_steps,
        iter_steps=iter_steps)

    with tf.Graph().as_default(), tf.device(
            '/gpu:0' if using_gpu else '/cpu:0'):
        global_step = tf.Variable(0, trainable=False, name="global_step")

        adj_matrix = tf.placeholder(tf.float32, [None, None])
        if sparse_dot:
            inputs_sp_indices = tf.placeholder(tf.int64)
            inputs_sp_ids_val = tf.placeholder(tf.float32)
            inputs_sp_shape = tf.placeholder(tf.int64)
            inputs = tf.SparseTensor(inputs_sp_indices, inputs_sp_ids_val,
                                     inputs_sp_shape)
        else:
            inputs = tf.placeholder(tf.float32, [None, nodes_size])
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')

        model = SDNE(nodes_size=nodes_size,
                     struct=struct,
                     embedding_size=embedding_size,
                     alpha=alpha,
                     beta=beta,
                     gamma=gamma,
                     reg=reg,
                     sparse_dot=sparse_dot,
                     active_function=actv_func)

        train_op, loss, embeddings = model.train(inputs, adj_matrix,
                                                 global_step, learning_rate)

        # Create a saver.
        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=5)

        summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init_op = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU implementations.
        config = tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                                log_device_placement=log_device_placement)
        config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction
        config.gpu_options.allow_growth = allow_growth
        # config.gpu_options.visible_device_list = visible_device_list

        with tf.Session(config=config) as sess:
            # first_step = 0
            if checkpoint == '0':  # new train
                sess.run(init_op)

                if dbn_initial:
                    time_start = time.time()
                    logger.info("DBN initial start ...")
                    RBMs = []
                    for i in range(len(model._struct) - 1):
                        RBM = rbm(model._struct[i],
                                  model._struct[i + 1],
                                  batchsize=dbn_batchsize,
                                  learning_rate=dbn_learning_rate,
                                  config=config)
                        logger.info("create rbm {}-{}".format(
                            model._struct[i], model._struct[i + 1]))
                        RBMs.append(RBM)
                        for epoch in range(dbn_epochs):
                            error = 0
                            for batch in range(0, nodes_size, batch_size):
                                # unchanged from the original implementation
                                # (does this iterate over all nodes?)
                                mini_batch, _ = dataset.next_batch(batch_size)
                                for k in range(len(RBMs) - 1):
                                    mini_batch = RBMs[k].getH(mini_batch)
                                error += RBM.fit(mini_batch)
                            logger.info("rbm_" + str(len(RBMs)) + " epochs:" +
                                        str(epoch) + " error: " + str(error))

                        W, bv, bh = RBM.getWb()
                        name = "encoder" + str(i)

                        def assign(a, b, sessss):
                            op = a.assign(b)
                            sessss.run(op)

                        assign(model._weights[name], W, sess)
                        assign(model._bias[name], bh, sess)
                        name = "decoder" + str(len(model._struct) - i - 2)
                        assign(model._weights[name], W.transpose(), sess)
                        assign(model._bias[name], bv, sess)
                    logger.info("dbn_init finished in {}s.".format(
                        time.time() - time_start))

                    vecs = []
                    start = 0
                    while start < nodes_size:
                        end = min(nodes_size, start + batch_size)
                        index = np.arange(start, end)
                        start = end
                        batch_input, batch_adj = dataset.get_batch(index)
                        if sparse_dot:
                            batch_input_ind = np.vstack(
                                np.where(batch_input)).astype(np.int64).T
                            batch_input_shape = np.array(
                                batch_input.shape).astype(np.int64)
                            batch_input_val = batch_input[np.where(
                                batch_input)]
                            feed_dict = {
                                inputs_sp_indices: batch_input_ind,
                                inputs_sp_shape: batch_input_shape,
                                inputs_sp_ids_val: batch_input_val,
                                adj_matrix: batch_adj,
                                learning_rate: LR.learning_rate
                            }
                        else:
                            feed_dict = {
                                inputs: batch_input,
                                adj_matrix: batch_adj,
                                learning_rate: LR.learning_rate
                            }
                        batch_embeddings = sess.run(embeddings,
                                                    feed_dict=feed_dict)
                        vecs.append(batch_embeddings)
                    vecs = np.concatenate(vecs, axis=0)
                    checkpoint_path = os.path.join(ckpt_dir, 'model.ckpt')
                    utils.save_word2vec_format_and_ckpt(
                        vectors_path, vecs, checkpoint_path, sess, saver, 0)
            elif checkpoint == '-1':  # load the latest one
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    # new_saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path + '.meta')
                    # Restores from checkpoint
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    # global_step_for_restore = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
                    # first_step = int(global_step_for_restore) + 1
                else:
                    logger.warning('No checkpoint file found')
                    return
            else:
                if os.path.exists(
                        os.path.join(ckpt_dir,
                                     'model.ckpt-' + checkpoint + '.index')):
                    # new_saver = tf.train.import_meta_graph(
                    #     os.path.join(ckpt_dir, 'model.ckpt-' + checkpoint + '.meta'))
                    saver.restore(
                        sess, os.path.join(ckpt_dir,
                                           'model.ckpt-' + checkpoint))
                    # first_step = int(checkpoint) + 1
                else:
                    logger.warning(
                        'checkpoint {} not found'.format(checkpoint))
                    return

            summary_writer = tf.summary.FileWriter(ckpt_dir, sess.graph)

            ## train
            last_loss_time = time.time() - loss_interval
            last_summary_time = time.time() - summary_interval
            last_decay_time = last_checkpoint_time = time.time()
            last_decay_step = last_summary_step = last_checkpoint_step = 0
            while True:
                start_time = time.time()
                batch_input, batch_adj = dataset.next_batch(
                    batch_size, keep_strict_batching=True)
                if sparse_dot:
                    batch_input_ind = np.vstack(
                        np.where(batch_input)).astype(np.int64).T
                    batch_input_shape = np.array(batch_input.shape).astype(
                        np.int64)
                    batch_input_val = batch_input[np.where(batch_input)]
                    feed_dict = {
                        inputs_sp_indices: batch_input_ind,
                        inputs_sp_shape: batch_input_shape,
                        inputs_sp_ids_val: batch_input_val,
                        adj_matrix: batch_adj,
                        learning_rate: LR.learning_rate
                    }
                else:
                    feed_dict = {
                        inputs: batch_input,
                        adj_matrix: batch_adj,
                        learning_rate: LR.learning_rate
                    }
                _, loss_value, cur_step = sess.run(
                    [train_op, loss, global_step], feed_dict=feed_dict)
                now = time.time()

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                epoch, epoch_step = divmod(cur_step, num_steps_per_epoch)

                if now - last_loss_time >= loss_interval:
                    format_str = '%s: step=%d(%d/%d), lr=%.6f, loss=%.6f, duration/step=%.4fs'
                    logger.info(format_str %
                                (time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(time.time())),
                                 cur_step, epoch_step, epoch,
                                 LR.learning_rate, loss_value,
                                 now - start_time))
                    last_loss_time = time.time()
                if now - last_summary_time >= summary_interval or cur_step - last_summary_step >= summary_steps or cur_step >= iter_steps:
                    summary_str = sess.run(summary_op, feed_dict=feed_dict)
                    summary_writer.add_summary(summary_str, cur_step)
                    last_summary_time = time.time()
                    last_summary_step = cur_step
                ckpted = False
                # Save the model checkpoint periodically. (named 'model.ckpt-global_step.meta')
                if now - last_checkpoint_time >= ckpt_interval or cur_step - last_checkpoint_step >= ckpt_steps or cur_step >= iter_steps:
                    vecs = []
                    start = 0
                    while start < nodes_size:
                        end = min(nodes_size, start + batch_size)
                        index = np.arange(start, end)
                        start = end
                        batch_input, batch_adj = dataset.get_batch(index)
                        if sparse_dot:
                            batch_input_ind = np.vstack(
                                np.where(batch_input)).astype(np.int64).T
                            batch_input_shape = np.array(
                                batch_input.shape).astype(np.int64)
                            batch_input_val = batch_input[np.where(
                                batch_input)]
                            feed_dict = {
                                inputs_sp_indices: batch_input_ind,
                                inputs_sp_shape: batch_input_shape,
                                inputs_sp_ids_val: batch_input_val,
                                adj_matrix: batch_adj,
                                learning_rate: LR.learning_rate
                            }
                        else:
                            feed_dict = {
                                inputs: batch_input,
                                adj_matrix: batch_adj,
                                learning_rate: LR.learning_rate
                            }
                        batch_embeddings = sess.run(embeddings,
                                                    feed_dict=feed_dict)
                        vecs.append(batch_embeddings)
                    vecs = np.concatenate(vecs, axis=0)
                    checkpoint_path = os.path.join(ckpt_dir, 'model.ckpt')
                    utils.save_word2vec_format_and_ckpt(
                        vectors_path, vecs, checkpoint_path, sess, saver,
                        cur_step)
                    last_checkpoint_time = time.time()
                    last_checkpoint_step = cur_step
                    ckpted = True
                # update learning rate
                if ckpted or now - last_decay_time >= decay_interval or (
                        decay_steps > 0
                        and cur_step - last_decay_step >= decay_steps):
                    lr_info = np.loadtxt(lr_file, dtype=float)
                    if np.abs(lr_info[1] - decay_epochs) > 1e-6:
                        decay_epochs = lr_info[1]
                        decay_steps = round(decay_epochs *
                                            num_steps_per_epoch)
                    if np.abs(lr_info[2] - decay_rate) > 1e-6:
                        decay_rate = lr_info[2]
                    if np.abs(lr_info[3] - iter_epochs) > 1e-6:
                        iter_epochs = lr_info[3]
                        iter_steps = round(iter_epochs * num_steps_per_epoch)
                    if np.abs(lr_info[0] - initial_learning_rate) > 1e-6:
                        initial_learning_rate = lr_info[0]
                        LR.reset(initial_learning_rate=initial_learning_rate,
                                 initial_steps=cur_step,
                                 decay_rate=decay_rate,
                                 decay_steps=decay_steps,
                                 iter_steps=iter_steps)
                    else:
                        LR.exponential_decay(cur_step,
                                             decay_rate=decay_rate,
                                             decay_steps=decay_steps,
                                             iter_steps=iter_steps)
                    last_decay_time = time.time()
                    last_decay_step = cur_step

                if cur_step >= LR.iter_steps:
                    break
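# A small self-contained illustration (numpy only) of the dense-to-sparse
# conversion used for the sparse_dot feed above: tf.SparseTensor is fed with
# (indices, values, dense_shape) extracted from the dense batch.
import numpy as np

batch_input = np.array([[0., 1., 0.],
                        [2., 0., 3.]], dtype=np.float32)
batch_input_ind = np.vstack(np.where(batch_input)).astype(np.int64).T
# -> [[0 1], [1 0], [1 2]]  (row, col) of every non-zero entry
batch_input_val = batch_input[np.where(batch_input)]
# -> [1. 2. 3.]
batch_input_shape = np.array(batch_input.shape).astype(np.int64)
# -> [2 3]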