def run(save_checkpoints_steps=10,
        keep_checkpoint_max=5,
        save_summary_steps=5,
        log_step_count_steps=5,
        num_epochs=5,
        test_iterator=False,
        test_images_dir="",
        output_dir=gin.REQUIRED):
    """Train/evaluate an EAST text-detection model, or run per-image prediction.

    Gin-configurable entry point (note ``output_dir=gin.REQUIRED``).

    :param save_checkpoints_steps: steps between checkpoint saves (RunConfig).
    :param keep_checkpoint_max: max checkpoints to retain (RunConfig).
    :param save_summary_steps: steps between summary writes (RunConfig).
    :param log_step_count_steps: steps between step-count logs (RunConfig).
    :param num_epochs: number of training epochs when training.
    :param test_iterator: if True, exercise the data iterator before training.
    :param test_images_dir: directory of images to segment when FLAGS.predict.
    :param output_dir: where prediction text files / annotated images go.
    """
    model = EASTModel()
    data_iterator = CIDARIterator()

    # Session-level GPU/placement options; this ConfigProto is then wrapped
    # into the estimator RunConfig below (the name is reused deliberately).
    run_config = tf.ConfigProto()
    run_config.gpu_options.allow_growth = True
    # run_config.gpu_options.per_process_gpu_memory_fraction = 0.50
    run_config.allow_soft_placement = True
    run_config.log_device_placement = False
    model_dir = model.model_dir
    run_config = tf.estimator.RunConfig(
        session_config=run_config,
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=keep_checkpoint_max,
        save_summary_steps=save_summary_steps,
        model_dir=model_dir,
        log_step_count_steps=log_step_count_steps)

    executor = Executor(model=model,
                        data_iterator=data_iterator,
                        config=run_config,
                        train_hooks=None,
                        eval_hooks=None,
                        session_config=None)

    if test_iterator:
        executor.test_iterator()

    # NOTE(review): reaches into private iterator attributes — presumably the
    # iterator exposes no public size/batch accessors; confirm upstream.
    num_samples = data_iterator._num_train_examples
    batch_size = data_iterator._batch_size

    if not FLAGS.predict:
        # --- training path: one train_and_evaluate round per epoch ---
        for current_epoch in tqdm(range(num_epochs), desc="Epoch"):
            current_max_steps = (num_samples // batch_size) * (current_epoch + 1)
            print("\n\n Training for epoch {} with steps {}\n\n".format(
                current_epoch, current_max_steps))
            # executor.train(max_steps=None)
            print("\n\n Evaluating for epoch\n\n", current_epoch)
            # executor.evaluate(steps=None)
            executor.train_and_evaluate()
        executor.export_model(model_dir + "/exported/")
    else:
        # --- prediction path: run the estimator on each test image ---
        estimator = executor._estimator
        images = get_images(test_images_dir)
        for image_file_path in images:
            print("================> Text segmentation on :", image_file_path)
            im = cv2.imread(image_file_path)[:, :, ::-1]  # BGR -> RGB
            start_time = time.time()
            im_resized, (ratio_h, ratio_w) = resize_image(im)
            im_resized = np.expand_dims(im_resized, axis=0).astype(np.float32)

            def get_dataset():
                # Single-image dataset; the label tensor is a dummy
                # (ones_like) because predict() ignores labels.
                dataset = tf.data.Dataset.from_tensor_slices(({
                    "images": im_resized
                }, np.ones_like(im_resized)))
                dataset = dataset.batch(batch_size=1)
                print(dataset.output_shapes)
                return dataset

            start = time.time()
            timer = {'net': 0, 'restore': 0, 'nms': 0}
            predict_fn = estimator.predict(input_fn=lambda: get_dataset())
            for prediction in predict_fn:
                score = prediction["f_score"]
                geometry = prediction["f_geometry"]
                # Re-add the batch dimension that predict() strips.
                score = np.expand_dims(score, axis=0)
                geometry = np.expand_dims(geometry, axis=0)
                print("===============================")
                print(score.shape)
                print(geometry.shape)
                print("===============================")
                print(score)
                print(geometry)
                timer['net'] = time.time() - start
                boxes, timer = detect(score_map=score, geo_map=geometry, timer=timer)
                print('{} : net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                    image_file_path, timer['net'] * 1000,
                    timer['restore'] * 1000, timer['nms'] * 1000))
                if boxes is not None:
                    # Undo the resize so boxes are in original-image coordinates.
                    boxes = boxes[:, :8].reshape((-1, 4, 2))
                    boxes[:, :, 0] /= ratio_w
                    boxes[:, :, 1] /= ratio_h
                duration = time.time() - start_time
                print('[timing] {}'.format(duration))
                # save to file
                if boxes is not None:
                    res_file = os.path.join(
                        output_dir,
                        '{}.txt'.format(
                            os.path.basename(image_file_path).split('.')[0]))
                    with open(res_file, 'w') as f:
                        for box in boxes:
                            # to avoid submitting errors
                            box = sort_poly(box.astype(np.int32))
                            # Drop degenerate (near-zero-edge) quadrilaterals.
                            if np.linalg.norm(box[0] - box[1]) < 5 or np.linalg.norm(
                                    box[3] - box[0]) < 5:
                                continue
                            f.write('{},{},{},{},{},{},{},{}\r\n'.format(
                                box[0, 0], box[0, 1], box[1, 0], box[1, 1],
                                box[2, 0], box[2, 1], box[3, 0], box[3, 1],
                            ))
                            cv2.polylines(
                                im[:, :, ::-1],
                                [box.astype(np.int32).reshape((-1, 1, 2))],
                                True,
                                color=(255, 255, 0),
                                thickness=1)
                # if not FLAGS.no_write_images:
                img_path = os.path.join(output_dir, os.path.basename(image_file_path))
                cv2.imwrite(img_path, im[:, :, ::-1])
# NOTE(review): the line below is the tail of a module docstring whose opening
# `"""` lies outside this chunk — do not "fix" it in isolation; the opening
# quotes live earlier in the original file.
This is the main file for the subgraph classification task """
import tensorflow as tf
import numpy as np
import gnn_utils
import GNN as GNN
import Net_Subgraph as n
from scipy.sparse import coo_matrix

##### GPU & stuff config
import os

# Pin execution to the first GPU and let TF grow memory on demand.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

data_path = "./data"
#data_path = "./Clique"
set_name = "sub_15_7_200"  # dataset variant identifier passed to the loaders

############# training set ################
#inp, arcnode, nodegraph, nodein, labels = Library.set_load_subgraph(data_path, "train")
# set_load_general returns (inputs, arc->node map, node->graph map, node
# in-degrees, labels, extra) — the last element is unused here.
inp, arcnode, nodegraph, nodein, labels, _ = gnn_utils.set_load_general(data_path, "train", set_name=set_name)

############ test set ####################
#inp_test, arcnode_test, nodegraph_test, nodein_test, labels_test = Library.set_load_subgraph(data_path, "test")
inp_test, arcnode_test, nodegraph_test, nodein_test, labels_test, _ = gnn_utils.set_load_general(data_path, "test", set_name=set_name)

############ validation set #############
# NOTE(review): the validation-set loading is cut off in this chunk; it
# presumably mirrors the train/test loads above — confirm in the full file.
def train():
    """Train the CIFAR-10 model with a Momentum optimizer (Python 2 code).

    Builds the input queue, inference graph and loss on the device selected
    by FLAGS.device_id, then runs FLAGS.epochs worth of minibatch steps,
    logging throughput every FLAGS.log_step steps, per-epoch average loss
    every FLAGS.eval_step epochs, and checkpointing on the final step.
    """
    global parameters
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement)
    device_str = get_device_str(FLAGS.device_id)
    if device_str.find('cpu') >= 0:  # cpu version
        # On CPU, honour OMP_NUM_THREADS for intra-op parallelism.
        num_threads = os.getenv('OMP_NUM_THREADS', 1)
        print 'num_threads: ', num_threads
        config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=int(num_threads))
    with tf.Graph().as_default(), tf.device(device_str), tf.Session(config=config) as sess:
        #image_size = 32
        #images, labels = cifar10_input.distorted_inputs(FLAGS.data_dir, FLAGS.batch_size)
        # Evaluation-style (undistorted) input pipeline.
        images, labels = cifar10_input.inputs(False, FLAGS.data_dir, FLAGS.batch_size)
        print('Images: ', images)

        logits = inference(images)
        #logits = inference2(images)
        # Add a simple objective so we can calculate the backward pass.
        loss_value = loss(logits, labels)
        # Compute the gradient with respect to all the parameters.
        lr = 0.001
        #grad = tf.train.GradientDescentOptimizer(lr).minimize(loss_value)
        grad = tf.train.MomentumOptimizer(lr, 0.9).minimize(loss_value)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build an initialization operation.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph.
        sess.run(init)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        real_batch_size = FLAGS.batch_size
        # Ceiling division: batches needed to cover EPOCH_SIZE examples.
        num_batches_per_epoch = int((EPOCH_SIZE + real_batch_size - 1)/ real_batch_size)
        iterations = FLAGS.epochs * num_batches_per_epoch
        average_batch_time = 0.0

        epochs_info = []
        average_loss = 0.0
        for step in xrange(iterations):
            start_time = time.time()
            _, loss_v = sess.run([grad, loss_value])
            average_loss += loss_v
            duration = time.time() - start_time
            average_batch_time += float(duration)
            assert not np.isnan(loss_v), 'Model diverged with loss = NaN'
            if step % FLAGS.log_step == 0:
                examples_per_sec = FLAGS.batch_size / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                print (format_str % (datetime.now(), step, loss_v, examples_per_sec, sec_per_batch))
            if step > 0 and step % (FLAGS.eval_step * num_batches_per_epoch) == 0:
                # Report and reset the running average every eval window.
                average_loss /= num_batches_per_epoch * FLAGS.eval_step
                print ('epoch: %d, loss: %.2f' % (step /num_batches_per_epoch, average_loss))
                epochs_info.append('%d:_:%s'%(step/(FLAGS.eval_step*num_batches_per_epoch), average_loss))
                average_loss = 0.0
            if step == iterations-1:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

        coord.request_stop()
        coord.join(threads)
        average_batch_time /= iterations
        print 'average_batch_time: ', average_batch_time
        print ('epoch_info: %s' % ','.join(epochs_info))
def main(args):
    """Run MTCNN face detection on a single image and visualize the result.

    Depending on how many model files are found, either builds the three
    cascade networks (PNet/RNet/ONet) from checkpoints, or restores a frozen
    meta-graph and addresses the stage outputs by tensor name.  Detected
    rectangles and 5-point landmarks are drawn on the image.

    :param args: parsed CLI namespace with image_path, model_dir, minsize,
                 threshold, factor, save_image, save_name.
    """
    img = cv2.imread(args.image_path)
    file_paths = get_model_filenames(args.model_dir)
    with tf.device('/gpu:0'):
        with tf.Graph().as_default():
            config = tf.ConfigProto(allow_soft_placement=True)
            with tf.Session(config=config) as sess:
                if len(file_paths) == 3:
                    # Three separate checkpoints: build each stage explicitly.
                    image_pnet = tf.placeholder(tf.float32, [None, None, None, 3])
                    pnet = PNet({'data': image_pnet}, mode='test')
                    out_tensor_pnet = pnet.get_all_output()

                    image_rnet = tf.placeholder(tf.float32, [None, 24, 24, 3])
                    rnet = RNet({'data': image_rnet}, mode='test')
                    out_tensor_rnet = rnet.get_all_output()

                    image_onet = tf.placeholder(tf.float32, [None, 48, 48, 3])
                    onet = ONet({'data': image_onet}, mode='test')
                    out_tensor_onet = onet.get_all_output()

                    # Per-stage savers select variables by name prefix.
                    saver_pnet = tf.train.Saver([
                        v for v in tf.global_variables()
                        if v.name[0:5] == "pnet/"
                    ])
                    saver_rnet = tf.train.Saver([
                        v for v in tf.global_variables()
                        if v.name[0:5] == "rnet/"
                    ])
                    saver_onet = tf.train.Saver([
                        v for v in tf.global_variables()
                        if v.name[0:5] == "onet/"
                    ])

                    saver_pnet.restore(sess, file_paths[0])

                    def pnet_fun(img):
                        return sess.run(out_tensor_pnet,
                                        feed_dict={image_pnet: img})

                    saver_rnet.restore(sess, file_paths[1])

                    def rnet_fun(img):
                        return sess.run(out_tensor_rnet,
                                        feed_dict={image_rnet: img})

                    saver_onet.restore(sess, file_paths[2])

                    def onet_fun(img):
                        return sess.run(out_tensor_onet,
                                        feed_dict={image_onet: img})
                else:
                    # Frozen graph: restore and address tensors by name.
                    saver = tf.train.import_meta_graph(file_paths[0])
                    saver.restore(sess, file_paths[1])

                    def pnet_fun(img):
                        return sess.run(
                            ('softmax/Reshape_1:0', 'pnet/conv4-2/BiasAdd:0'),
                            feed_dict={'Placeholder:0': img})

                    def rnet_fun(img):
                        return sess.run(('softmax_1/softmax:0',
                                         'rnet/conv5-2/rnet/conv5-2:0'),
                                        feed_dict={'Placeholder_1:0': img})

                    def onet_fun(img):
                        return sess.run(('softmax_2/softmax:0',
                                         'onet/conv6-2/onet/conv6-2:0',
                                         'onet/conv6-3/onet/conv6-3:0'),
                                        feed_dict={'Placeholder_2:0': img})

                start_time = time.time()
                rectangles, points = detect_face(img, args.minsize,
                                                 pnet_fun, rnet_fun, onet_fun,
                                                 args.threshold, args.factor)
                duration = time.time() - start_time

                points = np.transpose(points)
                print('face detect cost=%ds|total rectangles=%d|points=%d' %
                      (duration, rectangles.shape[0], points.shape[0]))
                for rectangle in rectangles:
                    '''
                    cv2.putText(img, str(rectangle[4]),
                                (int(rectangle[0]), int(rectangle[1])),
                                cv2.FONT_HERSHEY_SIMPLEX,
                                0.5, (0, 255, 0))
                    '''
                    cv2.rectangle(img, (int(rectangle[0]), int(rectangle[1])),
                                  (int(rectangle[2]), int(rectangle[3])),
                                  (255, 0, 0), 1)
                for point in points:
                    # points come as 10 values per face: (x0,y0,...,x4,y4).
                    for i in range(0, 10, 2):
                        cv2.circle(img, (int(point[i]), int(point[i + 1])), 2,
                                   (0, 255, 0))
                cv2.imshow("test", img)
                if args.save_image:
                    cv2.imwrite(args.save_name, img)
                if cv2.waitKey(0) & 0xFF == ord('q'):
                    cv2.destroyAllWindows()
def Worker(index, update_game_num, Synchronizer, cluster, model_path):
    """Distributed-RL worker: joins the TF cluster and runs rollout threads.

    Starts a tf.train.Server for this worker task, builds the shared policy
    network plus THREAD_NUM agents feeding one global buffer, waits for the
    parameter server to initialize all variables, then runs THREAD_NUM
    rollout threads (the last one on the calling thread).

    :param index: worker task index within the cluster.
    :param update_game_num: total games to play per update, split across threads.
    :param Synchronizer: synchronization primitive shared with the PS/trainer.
    :param cluster: tf.train.ClusterSpec describing the cluster.
    :param model_path: directory where the PPO model is saved.
    """
    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
    )
    config.gpu_options.allow_growth = True
    worker = tf.train.Server(cluster, job_name="worker", task_index=index, config=config)
    sess = tf.Session(target=worker.target, config=config)
    Net = MiniNetwork(sess=sess, summary_writer=None, rl_training=FLAGS.training,
                      cluster=cluster, index=index, device=DEVICE[index % len(DEVICE)],
                      ppo_load_path=FLAGS.restore_model_path, ppo_save_path=model_path)

    # All agents share one buffer so experience is aggregated per update.
    global_buffer = Buffer()
    agents = []
    for i in range(THREAD_NUM):
        agent = terran_source_agent.SourceAgent(
            index=i, global_buffer=global_buffer, net=Net,
            restore_model=FLAGS.restore_model, rl_training=FLAGS.training,
            strategy_agent=None)
        agents.append(agent)

    print("Worker %d: waiting for cluster connection..." % index)
    sess.run(tf.report_uninitialized_variables())
    print("Worker %d: cluster ready!" % index)

    # Block until the parameter server has initialized every variable.
    while len(sess.run(tf.report_uninitialized_variables())):
        print("Worker %d: waiting for variable initialization..." % index)
        time.sleep(1)
    print("Worker %d: variables initialized" % index)

    # BUG FIX: the original computed np.ceil(update_game_num // THREAD_NUM).
    # `//` already floors, so the ceil was a no-op and the threads could play
    # fewer than update_game_num games in total. Use true division so the
    # per-thread quota rounds UP, and cast to int for downstream loop bounds.
    game_num = int(np.ceil(update_game_num / THREAD_NUM))

    UPDATE_EVENT.clear()
    ROLLING_EVENT.set()

    # Run threads: THREAD_NUM - 1 background threads, staggered by 3s to
    # avoid simultaneous environment startup, plus one on this thread.
    threads = []
    for i in range(THREAD_NUM - 1):
        t = threading.Thread(target=run_thread,
                             args=(agents[i], game_num, Synchronizer, FLAGS.difficulty))
        threads.append(t)
        t.daemon = True
        t.start()
        time.sleep(3)

    run_thread(agents[-1], game_num, Synchronizer, FLAGS.difficulty)

    for t in threads:
        t.join()
# Build training op, summaries and saver for the (already constructed) model.
train_op = model.create_train_op(t_learning_rate)
summary_op = tf.summary.merge_all()
summary_writer = tf.summary.FileWriter(store_dir, graph)
saver = tf.train.Saver(max_to_keep=30)

import time
import datetime

# Evaluate on a fixed 2000-example slice of the unlabeled-MNIST test split.
test_images = mnistu_test_images[:2000]
test_labels = mnistu_test_labels[:2000]
store_dir  # NOTE(review): bare expression statement — a no-op; likely leftover from an interactive session and safe to delete.
max_steps = 1502
sesh = tf.Session(graph = graph, config=tf.ConfigProto(intra_op_parallelism_threads=2, allow_soft_placement=True))
eval_interval = 100

# Accumulators for per-step diagnostics collected during training.
p_aba_list = []
match_ab_list = []
t_sup_emb_list = []
t_unsup_emb_list = []

with sesh as sess:
    sess.run(unsup_it.initializer)
    sess.run(tf.global_variables_initializer())
    epoch = 0
    # NOTE(review): the training loop that follows this point is not part of
    # this chunk — the `with` body continues in the original file.
def train_lanenet(dataset_dir, weights_path=None, net_flag='vgg'):
    """
    Train LaneNet on a single GPU.

    Builds train/val graphs (binary + instance segmentation losses, accuracy,
    FP/FN metrics, tensorboard summaries), a polynomial-decay Momentum
    optimizer, then runs CFG.TRAIN.EPOCHS steps with periodic logging,
    intermediate-result dumps and checkpoints.

    :param dataset_dir: root directory of the TuSimple-format dataset
    :param net_flag: choose which base network to use
    :param weights_path: optional checkpoint to restore before training
    :return: None
    """
    train_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        dataset_dir=dataset_dir, flags='train'
    )
    val_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        dataset_dir=dataset_dir, flags='val'
    )  # fetch the training and validation datasets

    with tf.device('/gpu:1'):
        # set lanenet
        train_net = lanenet.LaneNet(net_flag=net_flag, phase='train', reuse=False)
        val_net = lanenet.LaneNet(net_flag=net_flag, phase='val', reuse=True)

        # set compute graph node for training
        train_images, train_binary_labels, train_instance_labels = train_dataset.inputs(
            CFG.TRAIN.BATCH_SIZE, 1
        )

        train_compute_ret = train_net.compute_loss(
            input_tensor=train_images, binary_label=train_binary_labels,
            instance_label=train_instance_labels, name='lanenet_model'
        )
        train_total_loss = train_compute_ret['total_loss']
        train_binary_seg_loss = train_compute_ret['binary_seg_loss']
        train_disc_loss = train_compute_ret['discriminative_loss']
        train_pix_embedding = train_compute_ret['instance_seg_logits']

        train_prediction_logits = train_compute_ret['binary_seg_logits']
        train_prediction_score = tf.nn.softmax(logits=train_prediction_logits)
        train_prediction = tf.argmax(train_prediction_score, axis=-1)  # class with the highest probability

        train_accuracy = evaluate_model_utils.calculate_model_precision(
            train_compute_ret['binary_seg_logits'], train_binary_labels
        )  # calculate accuracy acc = correct_nums / ground_truth_nums
        train_fp = evaluate_model_utils.calculate_model_fp(
            train_compute_ret['binary_seg_logits'], train_binary_labels
        )
        train_fn = evaluate_model_utils.calculate_model_fn(
            train_compute_ret['binary_seg_logits'], train_binary_labels
        )
        train_binary_seg_ret_for_summary = evaluate_model_utils.get_image_summary(
            img=train_prediction
        )
        train_embedding_ret_for_summary = evaluate_model_utils.get_image_summary(
            img=train_pix_embedding
        )

        train_cost_scalar = tf.summary.scalar(  # scalar summaries for tensorboard
            name='train_cost', tensor=train_total_loss
        )
        train_accuracy_scalar = tf.summary.scalar(
            name='train_accuracy', tensor=train_accuracy
        )
        train_binary_seg_loss_scalar = tf.summary.scalar(
            name='train_binary_seg_loss', tensor=train_binary_seg_loss
        )
        train_instance_seg_loss_scalar = tf.summary.scalar(
            name='train_instance_seg_loss', tensor=train_disc_loss
        )
        train_fn_scalar = tf.summary.scalar(
            name='train_fn', tensor=train_fn
        )
        train_fp_scalar = tf.summary.scalar(
            name='train_fp', tensor=train_fp
        )
        train_binary_seg_ret_img = tf.summary.image(  # image summaries (prediction / embedding visualizations)
            name='train_binary_seg_ret', tensor=train_binary_seg_ret_for_summary
        )
        train_embedding_feats_ret_img = tf.summary.image(
            name='train_embedding_feats_ret', tensor=train_embedding_ret_for_summary
        )
        train_merge_summary_op = tf.summary.merge(  # merge the listed summaries into one op
            [train_accuracy_scalar, train_cost_scalar, train_binary_seg_loss_scalar,
             train_instance_seg_loss_scalar, train_fn_scalar, train_fp_scalar,
             train_binary_seg_ret_img, train_embedding_feats_ret_img]
        )

        # set compute graph node for validation
        val_images, val_binary_labels, val_instance_labels = val_dataset.inputs(
            CFG.TRAIN.VAL_BATCH_SIZE, 1
        )

        val_compute_ret = val_net.compute_loss(
            input_tensor=val_images, binary_label=val_binary_labels,
            instance_label=val_instance_labels, name='lanenet_model'
        )
        val_total_loss = val_compute_ret['total_loss']
        val_binary_seg_loss = val_compute_ret['binary_seg_loss']
        val_disc_loss = val_compute_ret['discriminative_loss']
        val_pix_embedding = val_compute_ret['instance_seg_logits']

        val_prediction_logits = val_compute_ret['binary_seg_logits']
        val_prediction_score = tf.nn.softmax(logits=val_prediction_logits)
        val_prediction = tf.argmax(val_prediction_score, axis=-1)

        val_accuracy = evaluate_model_utils.calculate_model_precision(
            val_compute_ret['binary_seg_logits'], val_binary_labels
        )
        val_fp = evaluate_model_utils.calculate_model_fp(
            val_compute_ret['binary_seg_logits'], val_binary_labels
        )
        val_fn = evaluate_model_utils.calculate_model_fn(
            val_compute_ret['binary_seg_logits'], val_binary_labels
        )
        val_binary_seg_ret_for_summary = evaluate_model_utils.get_image_summary(
            img=val_prediction
        )
        val_embedding_ret_for_summary = evaluate_model_utils.get_image_summary(
            img=val_pix_embedding
        )

        val_cost_scalar = tf.summary.scalar(
            name='val_cost', tensor=val_total_loss
        )
        val_accuracy_scalar = tf.summary.scalar(
            name='val_accuracy', tensor=val_accuracy
        )
        val_binary_seg_loss_scalar = tf.summary.scalar(
            name='val_binary_seg_loss', tensor=val_binary_seg_loss
        )
        val_instance_seg_loss_scalar = tf.summary.scalar(
            name='val_instance_seg_loss', tensor=val_disc_loss
        )
        val_fn_scalar = tf.summary.scalar(
            name='val_fn', tensor=val_fn
        )
        val_fp_scalar = tf.summary.scalar(
            name='val_fp', tensor=val_fp
        )
        val_binary_seg_ret_img = tf.summary.image(
            name='val_binary_seg_ret', tensor=val_binary_seg_ret_for_summary
        )
        val_embedding_feats_ret_img = tf.summary.image(
            name='val_embedding_feats_ret', tensor=val_embedding_ret_for_summary
        )
        val_merge_summary_op = tf.summary.merge(
            [val_accuracy_scalar, val_cost_scalar, val_binary_seg_loss_scalar,
             val_instance_seg_loss_scalar, val_fn_scalar, val_fp_scalar,
             val_binary_seg_ret_img, val_embedding_feats_ret_img]
        )

        # set optimizer
        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.polynomial_decay(
            learning_rate=CFG.TRAIN.LEARNING_RATE, global_step=global_step,
            decay_steps=CFG.TRAIN.EPOCHS, power=0.9
        )
        """
        Polynomial decay:
        global_step = min(global_step, decay_steps)
        decayed_learning_rate = (learning_rate - end_learning_rate) *
                                (1 - global_step / decay_steps) ^ power + end_learning_rate
        """
        # tf.GraphKeys.UPDATE_OPS holds ops (e.g. batch-norm moving-average
        # updates) that must run together with each training step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            # Momentum: each update is influenced by the previous one (like a
            # ball rolling downhill gaining inertia), accelerating convergence.
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate,
                momentum=CFG.TRAIN.MOMENTUM).minimize(
                loss=train_total_loss,
                var_list=tf.trainable_variables(),
                global_step=global_step
            )

    # Set tf model save path
    model_save_dir = 'model/tusimple_lanenet_{:s}'.format(net_flag)
    os.makedirs(model_save_dir, exist_ok=True)
    train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
    model_name = 'tusimple_lanenet_{:s}_{:s}.ckpt'.format(net_flag, str(train_start_time))
    model_save_path = ops.join(model_save_dir, model_name)
    saver = tf.train.Saver()

    # Set tf summary save path
    tboard_save_path = 'tboard/tusimple_lanenet_{:s}'.format(net_flag)
    os.makedirs(tboard_save_path, exist_ok=True)

    # Set sess configuration
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TRAIN.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.TRAIN.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'
    sess = tf.Session(config=sess_config)

    summary_writer = tf.summary.FileWriter(tboard_save_path)
    summary_writer.add_graph(sess.graph)

    # Set the training parameters
    train_epochs = CFG.TRAIN.EPOCHS  # 80010

    log.info('Global configuration is as follows:')
    log.info(CFG)

    with sess.as_default():
        if weights_path is None:
            log.info('Training from scratch')
            init = tf.global_variables_initializer()
            sess.run(init)
        else:
            log.info('Restore model from last model checkpoint {:s}'.format(weights_path))
            saver.restore(sess=sess, save_path=weights_path)

        # Warm-start the VGG backbone only when training from scratch.
        if net_flag == 'vgg' and weights_path is None:
            load_pretrained_weights(tf.trainable_variables(), './data/vgg16.npy', sess)

        train_cost_time_mean = []
        for epoch in range(train_epochs):
            # training part
            t_start = time.time()

            _, train_c, train_accuracy_figure, train_fn_figure, train_fp_figure, \
                lr, train_summary, train_binary_loss, \
                train_instance_loss, train_embeddings, train_binary_seg_imgs, train_gt_imgs, \
                train_binary_gt_labels, train_instance_gt_labels = \
                sess.run([optimizer, train_total_loss, train_accuracy, train_fn,
                          train_fp, learning_rate, train_merge_summary_op,
                          train_binary_seg_loss, train_disc_loss, train_pix_embedding,
                          train_prediction, train_images, train_binary_labels,
                          train_instance_labels])

            # Abort on divergence rather than wasting further steps.
            if math.isnan(train_c) or math.isnan(train_binary_loss) or math.isnan(train_instance_loss):
                log.error('cost is: {:.5f}'.format(train_c))
                log.error('binary cost is: {:.5f}'.format(train_binary_loss))
                log.error('instance cost is: {:.5f}'.format(train_instance_loss))
                return

            if epoch % 100 == 0:
                record_training_intermediate_result(
                    gt_images=train_gt_imgs, gt_binary_labels=train_binary_gt_labels,
                    gt_instance_labels=train_instance_gt_labels,
                    binary_seg_images=train_binary_seg_imgs,
                    pix_embeddings=train_embeddings
                )
            summary_writer.add_summary(summary=train_summary, global_step=epoch)

            if epoch % CFG.TRAIN.DISPLAY_STEP == 0:  # CFG.TRAIN.DISPLAY_STEP = 1
                log.info('Epoch: {:d} total_loss= {:6f} binary_seg_loss= {:6f} '
                         'instance_seg_loss= {:6f} accuracy= {:6f} fp= {:6f} fn= {:6f}'
                         ' lr= {:6f} mean_cost_time= {:5f}s '.
                         format(epoch + 1, train_c, train_binary_loss, train_instance_loss,
                                train_accuracy_figure, train_fp_figure, train_fn_figure,
                                lr, np.mean(train_cost_time_mean)))
                train_cost_time_mean.clear()

            # validation part
            val_c, val_accuracy_figure, val_fn_figure, val_fp_figure, \
                val_summary, val_binary_loss, val_instance_loss, \
                val_embeddings, val_binary_seg_imgs, val_gt_imgs, \
                val_binary_gt_labels, val_instance_gt_labels = \
                sess.run([val_total_loss, val_accuracy, val_fn, val_fp,
                          val_merge_summary_op, val_binary_seg_loss,
                          val_disc_loss, val_pix_embedding,
                          val_prediction, val_images, val_binary_labels,
                          val_instance_labels])

            if math.isnan(val_c) or math.isnan(val_binary_loss) or math.isnan(val_instance_loss):
                log.error('cost is: {:.5f}'.format(val_c))
                log.error('binary cost is: {:.5f}'.format(val_binary_loss))
                log.error('instance cost is: {:.5f}'.format(val_instance_loss))
                return

            if epoch % 100 == 0:
                record_training_intermediate_result(
                    gt_images=val_gt_imgs, gt_binary_labels=val_binary_gt_labels,
                    gt_instance_labels=val_instance_gt_labels,
                    binary_seg_images=val_binary_seg_imgs,
                    pix_embeddings=val_embeddings, flag='val'
                )

            cost_time = time.time() - t_start
            train_cost_time_mean.append(cost_time)
            summary_writer.add_summary(summary=val_summary, global_step=epoch)

            if epoch % CFG.TRAIN.VAL_DISPLAY_STEP == 0:  # CFG.TRAIN.VAL_DISPLAY_STEP = 1000
                log.info('Epoch_Val: {:d} total_loss= {:6f} binary_seg_loss= {:6f} '
                         'instance_seg_loss= {:6f} accuracy= {:6f} fp= {:6f} fn= {:6f}'
                         ' mean_cost_time= {:5f}s '.
                         format(epoch + 1, val_c, val_binary_loss, val_instance_loss,
                                val_accuracy_figure, val_fp_figure, val_fn_figure,
                                np.mean(train_cost_time_mean)))
                train_cost_time_mean.clear()

            if epoch % 2000 == 0:
                saver.save(sess=sess, save_path=model_save_path, global_step=global_step)
    return
def main(try_load_model=True):
    """Train or load a dueling DQN agent on the Distemper shelter simulation.

    Builds a Keras MLP Q-network over a 200-step observation window, wraps it
    in a keras-rl DQNAgent (Boltzmann-Gumbel exploration, greedy test policy),
    then either loads saved weights or trains for 500k steps, and finally
    reports infection-rate statistics over 30 test runs.

    :param try_load_model: if True, load 'dqn_weights.h5f' instead of training.
    """
    num_cores = 8
    config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
                            inter_op_parallelism_threads=num_cores,
                            allow_soft_placement=True,
                            device_count = {'GPU' : 1}
                            )
    session = tf.Session(config=config)
    K.set_session(session)

    # Get the environment and extract the number of actions available in the Cartpole problem
    env = Distemper()
    np.random.seed(1234)
    env.seed(1234)
    nb_actions = env.action_space.n
    batch = 200  # observation window length (see SequentialMemory below)

    def agent(states, actions):
        # Simple fully-connected Q-network over the flattened window.
        model = Sequential()
        model.add(Flatten(input_shape = (batch, states)))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(actions, activation='linear'))
        return model

    model = agent(env.observation_space.n, env.action_space.n)
    print(model.summary())

    policy = BoltzmannGumbelQPolicy(C=2)
    test_policy = GreedyQPolicy()
    memory = SequentialMemory(limit=50000, window_length=batch)
    rl_agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                        nb_steps_warmup=1024,target_model_update=1e-2,
                        #enable_double_dqn=True,
                        enable_dueling_network=True, dueling_type='avg',
                        policy=policy, test_policy=test_policy)
    rl_agent.compile(Adam(lr=1e-4), metrics = ['mse'])

    def _get_nice_display_results(rl_agent, env, runs=4):
        # Run `runs` greedy test episodes and aggregate disease statistics
        # into mean / standard-error summaries.
        results = []
        action_stats = []
        for _ in range(runs):
            env.reset()
            rl_agent.test(env, nb_episodes=1, visualize=False)
            results.append(env.simulation._get_disease_stats())
            action_stats.append(env._get_action_stats())
            print(env._get_action_stats())
        results_dataframe = pd.DataFrame.from_records(results)
        # Keep only intake (E) and infected (I) counts for reporting.
        results_dataframe = results_dataframe.drop(['S', 'IS', 'SY', 'D'], axis=1)
        results_dataframe = results_dataframe.rename(index=str,
                                                     columns={"E": "Total Intake",
                                                              "I": "Total Infected"})
        results_dataframe['Infection Rate'] = \
            results_dataframe['Total Infected'] / results_dataframe['Total Intake']
        means = results_dataframe.mean()
        stes = results_dataframe.std() / np.sqrt(len(results_dataframe))
        cols = results_dataframe.columns
        return means, stes, cols

    # Train
    if try_load_model:
        rl_agent.load_weights('dqn_weights.h5f')
    else:
        rl_agent.fit(env, nb_steps=500000, visualize=False, verbose=1)

    # Test
    m, s, c = _get_nice_display_results(rl_agent, env, runs=30)
    print(m), print(s), print(c)
    rl_agent.save_weights('dqn_weights.h5f', overwrite=True)
def train():
    """Train the VQA Answer_Generator model.

    Builds the model graph, then runs `max_itr` Adam steps with gradient
    clipping and a per-iteration exponentially-decayed learning rate,
    checkpointing every 2500 iterations and once more at the end.

    Relies on module-level hyperparameters (rnn_size, batch_size,
    learning_rate, decay_factor, max_itr, checkpoint_path, ...).
    """
    print('loading dataset...')
    dataset, img_feature, train_data = get_data()
    num_train = train_data['question'].shape[0]
    vocabulary_size = len(dataset['ix_to_word'].keys())
    print('vocabulary_size : ' + str(vocabulary_size))

    print('constructing model...')
    model = Answer_Generator(
        rnn_size=rnn_size,
        rnn_layer=rnn_layer,
        batch_size=batch_size,
        input_embedding_size=input_embedding_size,
        dim_image=dim_image,
        dim_hidden=dim_hidden,
        max_words_q=max_words_q,
        vocabulary_size=vocabulary_size,
        drop_out_rate=0.5)

    tf_loss, tf_image, tf_question, tf_answer, tf_label = model.build_model()

    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    saver = tf.train.Saver(max_to_keep=100)

    tvars = tf.trainable_variables()
    lr = tf.Variable(learning_rate)
    opt = tf.train.AdamOptimizer(learning_rate=lr)
    # gradient clipping
    gvs = opt.compute_gradients(tf_loss, tvars)
    clipped_gvs = [(tf.clip_by_value(grad, -100.0, 100.0), var) for grad, var in gvs]  ## either 100 or 10000 will result in Nan, original is 100
    train_op = opt.apply_gradients(clipped_gvs)

    # BUG FIX: the original created a fresh `lr * decay_factor` tensor and a
    # fresh `lr.assign(...)` op INSIDE the training loop, adding new nodes to
    # the graph every iteration (unbounded graph growth / memory leak).
    # Build the decay op once here and just run it each step — numerically
    # identical behavior.
    lr_decay_op = lr.assign(lr * decay_factor)

    # Modernized from the long-deprecated tf.initialize_all_variables().
    tf.global_variables_initializer().run()

    print('start training...')
    for itr in range(max_itr):
        tStart = time.time()
        # shuffle the training data: sample a random minibatch of indices.
        index = np.random.random_integers(0, num_train-1, batch_size)
        current_question = train_data['question'][index,:]
        current_length_q = train_data['length_q'][index]
        current_answer = train_data['answer'][index]
        current_length_a = train_data['length_a'][index]
        current_img_list = train_data['img_list'][index]
        current_target = train_data['target'][index]
        current_img = img_feature[current_img_list,:]

        # do the training process!!!
        _, loss = sess.run(
            [train_op, tf_loss],
            feed_dict={
                tf_image: current_img,
                tf_question: current_question,
                tf_answer: current_answer,
                tf_label: current_target
            })

        # Exponential learning-rate decay applied once per iteration.
        sess.run(lr_decay_op)

        tStop = time.time()
        if np.mod(itr, 100) == 0:
            print ("Iteration: ", itr, " Loss: ", loss, " Learning Rate: ", lr.eval())
            #print ("Iteration: ", itr, " scores: ", scores, " label: ", current_target)
            print ("Time Cost:", round(tStop - tStart,2), "s")
        if np.mod(itr, 2500) == 0:
            print ("Iteration ", itr, " is done. Saving the model ...")
            saver.save(sess, os.path.join(checkpoint_path, 'model'), global_step=itr)

    print ("Finally, saving the model ...")
    # NOTE(review): `n_epochs` and `tStart_total` are not defined in this
    # function — presumably module-level globals; confirm they exist.
    saver.save(sess, os.path.join(checkpoint_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print ("Total Time Cost:", round(tStop_total - tStart_total,2), "s")
def run(self):
    """Launch UNREAL/A3C training: build the global network and per-worker
    trainers, restore the latest checkpoint if any, then start one training
    thread per `flags.parallel_size` and block until Ctrl+C.
    """
    device = "/cpu:0"
    if USE_GPU:
        device = "/gpu:0"

    initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                        flags.initial_alpha_high,
                                        flags.initial_alpha_log_rate)

    self.global_t = 0                   # global step counter shared by threads
    self.stop_requested = False
    # NOTE(review): attribute name is misspelled ("reqested") in the original;
    # kept as-is because other methods of this class may reference it exactly.
    self.terminate_reqested = False

    action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    # Thread index -1 marks the shared (global) network.
    self.global_network = UnrealModel(action_size, -1,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      flags.pixel_change_lambda,
                                      flags.entropy_beta,
                                      device)
    self.trainers = []

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                                  decay = flags.rmsp_alpha,
                                  momentum = 0.0,
                                  epsilon = flags.rmsp_epsilon,
                                  clip_norm = flags.grad_norm_clip,
                                  device = device)

    for i in range(flags.parallel_size):
        trainer = Trainer(i,
                          self.global_network,
                          initial_learning_rate,
                          learning_rate_input,
                          grad_applier,
                          flags.env_type,
                          flags.env_name,
                          flags.use_pixel_change,
                          flags.use_value_replay,
                          flags.use_reward_prediction,
                          flags.pixel_change_lambda,
                          flags.entropy_beta,
                          flags.local_t_max,
                          flags.gamma,
                          flags.gamma_pc,
                          flags.experience_history_size,
                          flags.max_time_step,
                          device)
        self.trainers.append(trainer)

    # prepare session
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    self.sess.run(tf.global_variables_initializer())

    # summary for tensorboard
    self.score_input = tf.placeholder(tf.int32)
    tf.summary.scalar("score", self.score_input)

    self.summary_op = tf.summary.merge_all()
    self.summary_writer = tf.summary.FileWriter(flags.log_file,
                                                self.sess.graph)

    # init or load checkpoint with saver
    self.saver = tf.train.Saver(self.global_network.get_vars())

    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
        # Checkpoint paths look like ".../model-<step>"; recover the step.
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        self.global_t = int(tokens[1])
        print(">>> global step set: ", self.global_t)
        # set wall time: resume the elapsed-time clock from a sidecar file.
        wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(self.global_t)
        with open(wall_t_fname, 'r') as f:
            self.wall_t = float(f.read())
            # Round up to the next save boundary after the restored step.
            self.next_save_steps = (self.global_t + flags.save_interval_step) // flags.save_interval_step * flags.save_interval_step
    else:
        print("Could not find old checkpoint")
        # set wall time
        self.wall_t = 0.0
        self.next_save_steps = flags.save_interval_step

    # run training threads
    self.train_threads = []
    for i in range(flags.parallel_size):
        self.train_threads.append(threading.Thread(target=self.train_function,
                                                   args=(i, True)))

    signal.signal(signal.SIGINT, self.signal_handler)

    # set start time (shifted back by restored wall time so elapsed time resumes)
    self.start_time = time.time() - self.wall_t

    for t in self.train_threads:
        t.start()

    print('Press Ctrl+C to stop')
    signal.pause()
def main(argv=None):
    """Train an EAST-style text detector on cropped map images (Python 2 code:
    ``print`` statements and integer division are relied upon).

    Pipeline: build placeholders and a multi-GPU tower loss, restore or
    initialize weights, then loop over hand-rolled minibatches mixing ~50%
    positive crops (with RBOX targets from the annotation file) and ~50%
    negative crops.

    NOTE(review): ``training_list``, ``gpus``, ``reg_constant`` and the helper
    functions (``tower_loss``, ``average_gradients``, ``detect``, ``sort_poly``,
    ``checkIOU``) are module-level names defined elsewhere — confirm against
    the full file.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_list
    # Fresh checkpoint dir unless we are restoring into an existing one.
    if not tf.gfile.Exists(FLAGS.checkpoint_path):
        tf.gfile.MkDir(FLAGS.checkpoint_path)
    else:
        if not FLAGS.restore:
            tf.gfile.DeleteRecursively(FLAGS.checkpoint_path)
            tf.gfile.MkDir(FLAGS.checkpoint_path)
    # NHWC image batch plus per-pixel score/geometry/mask targets.
    input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images')
    input_score_maps = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_score_maps')
    # RBOX geometry has 5 channels (4 distances + angle), QUAD has 8.
    if FLAGS.geometry == 'RBOX':
        input_geo_maps = tf.placeholder(tf.float32, shape=[None, None, None, 5], name='input_geo_maps')
    else:
        input_geo_maps = tf.placeholder(tf.float32, shape=[None, None, None, 8], name='input_geo_maps')
    input_training_masks = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_training_masks')
    input_labels = tf.placeholder(tf.float32, shape=[None, None, 4, 2], name='input_labels')
    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    learning_rate = tf.train.exponential_decay(FLAGS.learning_rate, global_step,
                                               decay_steps=10000, decay_rate=0.01, staircase=True)
    # add summary
    tf.summary.scalar('learning_rate', learning_rate)
    opt = tf.train.AdamOptimizer(learning_rate)
    # split the batch evenly across the configured GPUs
    input_images_split = tf.split(input_images, len(gpus))
    input_score_maps_split = tf.split(input_score_maps, len(gpus))
    input_geo_maps_split = tf.split(input_geo_maps, len(gpus))
    input_training_masks_split = tf.split(input_training_masks, len(gpus))
    input_labels_split = tf.split(input_labels, len(gpus))

    tower_grads = []
    reuse_variables = None
    for i, gpu_id in enumerate(gpus):
        with tf.device('/gpu:%d' % gpu_id):
            with tf.name_scope('model_%d' % gpu_id) as scope:
                iis = input_images_split[i]
                isms = input_score_maps_split[i]
                igms = input_geo_maps_split[i]
                itms = input_training_masks_split[i]
                il = input_labels_split[i]
                total_loss, model_loss, f_score, f_geometry, _ = tower_loss(
                    iis, isms, igms, itms, il, reuse_variables)
                # f_score, f_geometry = i_am_testing(iis)
                batch_norm_updates_op = tf.group(
                    *tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope))
                #print "below..."
                #batch_norm_updates_op = tf.group(*[op for op in tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope) if 'resnet_v1_50/block4' in op.name or 'resnet_v1_50/block3' in op.name or 'feature_fusion' in op.name])
                #print "above..."
                # First tower creates the variables; later towers reuse them.
                reuse_variables = True
                #print "below.."
                #train_var = [var for var in tf.trainable_variables() if 'resnet_v1_50/block1' in var.name]
                #train_var = [var for var in tf.trainable_variables() if 'resnet_v1_50/block4' in var.name]
                #train_var += [var for var in tf.trainable_variables() if 'feature_fusion/Conv_7' in var.name]
                #train_var += [var for var in tf.trainable_variables() if 'feature_fusion/Conv_8' in var.name]
                #train_var += [var for var in tf.trainable_variables() if 'feature_fusion/Conv_9' in var.name]
                #print train_var
                #print "above..."
                # Only the feature-fusion head is trained; the backbone is frozen.
                train_var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='feature_fusion')
                grads = opt.compute_gradients(total_loss, var_list=train_var)
                tower_grads.append(grads)

    grads = average_gradients(tower_grads)
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
    summary_op = tf.summary.merge_all()
    # EMA of *all* trainable variables, even though only the head is trained.
    variable_averages = tf.train.ExponentialMovingAverage(
        FLAGS.moving_average_decay, global_step)
    #train_var = [var for var in tf.trainable_variables() if ('resnet_v1_50/block3' in var.name or 'resnet_v1_50/block4' in var.name or 'feature_fusion' in var.name)]
    variables_averages_op = variable_averages.apply(tf.trainable_variables())
    # train_op also runs the EMA update and batch-norm moving statistics.
    with tf.control_dependencies(
            [variables_averages_op, apply_gradient_op, batch_norm_updates_op]):
        train_op = tf.no_op(name='train_op')

    saver = tf.train.Saver(tf.global_variables())
    summary_writer = tf.summary.FileWriter(FLAGS.checkpoint_path, tf.get_default_graph())

    init = tf.global_variables_initializer()

    if FLAGS.pretrained_model_path is not None:
        variable_restore_op = slim.assign_from_checkpoint_fn(
            FLAGS.pretrained_model_path,
            slim.get_trainable_variables(),
            ignore_missing_vars=True)

    # Character-specific annotation file selection ("5" model here).
    my_char_l = "5"
    my_char_U = ""
    data_size = 0
    train_data_indices = []
    list_of_img_pos = []
    with open(
            'Data/cropped_annotations_new/cropped_annotations' + my_char_l + '.txt',
            'r') as f:
        annotation_file = f.readlines()
    #with open('Data/cropped_annotations_new/cropped_annotations' + my_char_U + '.txt', 'r') as f:
    #    annotation_file += f.readlines()
    # Index every "./cropped_img..." header line whose image id is in training_list.
    idx = 0
    for line in annotation_file:
        if len(line) > 1 and line[:13] == './cropped_img' and str(
                line[14:27]) in training_list:
            data_size += 1
            train_data_indices.append(idx)
            list_of_img_pos.append(line[14:].split(".")[0] + ".tiff")
        idx += 1
    # Negatives are all images that carry no positive annotation.
    list_of_img_all = os.listdir('Data/cropped_img')
    list_of_img_neg = np.array(
        list(set(list_of_img_all) - set(list_of_img_pos)))
    print "Char model: " + my_char_U + my_char_l
    print "Data size: " + str(data_size)
    epoche_size = data_size / (16 * 2)
    #print epoche_size
    print "This many steps per epoche: " + str(epoche_size)
    list_of_img_neg_char = os.listdir('Data/j')

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        if FLAGS.restore:
            ckpt_state = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
            model_path = os.path.join(
                FLAGS.checkpoint_path,
                os.path.basename(ckpt_state.model_checkpoint_path))
            saver.restore(sess, model_path)
        else:
            sess.run(init)
            if FLAGS.pretrained_model_path is not None:
                variable_restore_op(sess)
        #print "below:"
        #tvars = tf.trainable_variables()
        #g_vars = [var for var in tvars if 'resnet_v1_50/block4' in var.name]
        #print g_vars
        #print tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='resnet_v1_50')
        #return
        print FLAGS.learning_rate
        print reg_constant
        for step in range(24 * epoche_size):
            ### Generate data ###
            # data = (images, filenames, score_maps, geo_maps, training_masks)
            data = [], [], [], [], []
            np.random.shuffle(train_data_indices)
            num_im = 0
            actual_num_im = 0
            # Fill a 32-sample batch, choosing positive vs negative crops ~50/50.
            while len(data[0]) < 32:
                prob = np.random.random(1)[0]
                if prob > 0.49:
                    # Positive sample: read boxes following the header line.
                    i = train_data_indices[num_im]
                    im_fn = "Data/cropped_img/" + annotation_file[i][
                        14:].split(".tiff", 1)[0] + ".tiff"
                    im = cv2.imread(im_fn)
                    if im is not None:
                        r, c, _ = im.shape
                        text_polys = []
                        text_tags = []
                        # Line i+1 holds the box count; boxes follow as "x y w h".
                        if int(annotation_file[i + 1]) > 0:
                            for idx in range(
                                    i + 2, i + 2 + int(annotation_file[i + 1])):
                                annotation_data = annotation_file[idx]
                                annotation_data = annotation_data.split(" ")
                                x, y = float(annotation_data[0]), float(
                                    annotation_data[1])
                                w, h = float(annotation_data[2]), float(
                                    annotation_data[3])
                                # (x, y) is the bottom-left corner here; build a
                                # clockwise quad starting at the top-left.
                                text_polys.append([
                                    list([int(x), int(y - h)]),
                                    list([int(x + w), int(y - h)]),
                                    list([int(x + w), int(y)]),
                                    list([int(x), int(y)])
                                ])
                                text_tags.append(False)
                        score_map, geo_map, training_mask = icdar.generate_rbox(
                            (int(r), int(c)), np.array(text_polys),
                            np.array(text_tags))
                        data[0].append(im[:, :, ::-1].astype(np.float32))
                        data[1].append(im_fn)
                        # Targets are produced at 1/4 resolution of the input.
                        data[2].append(score_map[::4, ::4, np.newaxis].astype(
                            np.float32))
                        data[3].append(geo_map[::4, ::4, :].astype(np.float32))
                        data[4].append(training_mask[::4, ::4, np.newaxis].astype(
                            np.float32))
                        actual_num_im += 1
                    num_im += 1
                else:
                    # Negative sample: an image with no annotated boxes.
                    im_fn = np.random.choice(list_of_img_neg)
                    im = cv2.imread("Data/cropped_img/" + im_fn)
                    #if prob > 0.25:
                    #    im_fn = np.random.choice(list_of_img_neg_char)
                    #    im_mini = cv2.imread("Data/j/" + im_fn)
                    #    r0, c0, _ = im_mini.shape
                    #    im = np.zeros((512, 512, 3), dtype=np.uint8)
                    #    ra, rb, ca, cb = 256-r0/2, 256+(r0+1)/2, 256-c0/2, 256+(c0+1)/2
                    #    im[ra:rb, ca:cb, :] = im_mini.copy()
                    if im is not None:
                        r, c, _ = im.shape
                        score_map, geo_map, training_mask = icdar.generate_rbox(
                            (int(r), int(c)), np.array([]), np.array([]))
                        data[0].append(im[:, :, ::-1].astype(np.float32))
                        data[1].append(im_fn)
                        data[2].append(score_map[::4, ::4, np.newaxis].astype(
                            np.float32))
                        data[3].append(geo_map[::4, ::4, :].astype(np.float32))
                        data[4].append(training_mask[::4, ::4, np.newaxis].astype(
                            np.float32))
            ### Run model ###
            ml, tl, _ = sess.run(
                [model_loss, total_loss, train_op],
                feed_dict={
                    input_images: data[0],
                    input_score_maps: data[2],
                    input_geo_maps: data[3],
                    input_training_masks: data[4]
                })
            epoch = step / epoche_size
            batch_num = step % epoche_size
            if step % (epoche_size / 3) == 0:
                print "Epoch no.: " + str(epoch) + " batch no.: " + str(
                    batch_num) + " loss: " + str(ml)
                print "Epoch no.: " + str(epoch) + " batch no.: " + str(
                    batch_num) + " loss: " + str(tl)
            if step % (epoche_size / 2) == 0:
                #print "Epoche: " + str(step / (epoche_size/2))
                saver.save(sess,
                           FLAGS.checkpoint_path + 'model.ckpt',
                           global_step=global_step)
                _, tl, summary_str = sess.run(
                    [train_op, total_loss, summary_op],
                    feed_dict={
                        input_images: data[0],
                        input_score_maps: data[2],
                        input_geo_maps: data[3],
                        input_training_masks: data[4]
                    })
                summary_writer.add_summary(summary_str, global_step=step)
        # ------------------------------------------------------------------
        # Dead evaluation block (sliding-window + rotation inference over one
        # map image, then IOU-based precision/recall).  Disabled via `if False`.
        # NOTE(review): if re-enabled this would fail as written: `count` is
        # never advanced in the while loop, and `labels` / `nms_thres` are
        # referenced before being defined — confirm against version history.
        # ------------------------------------------------------------------
        if False:
            count_right = 0
            count_wrong = 0
            count_posNotDetected = 0
            im0 = cv2.imread("Data/maps/D0117-5755036.tiff")[:, :, ::-1]
            w, h, _ = im0.shape
            slide_window = 300
            crop_size = 512
            crop_center = (256, 256)
            num_rows, num_cols = int(np.ceil(w / slide_window)), int(
                np.ceil(h / slide_window))
            print num_cols
            for rot in [-90.0, -60.0, -30.0, 0.0, 30.0, 60.0, 90.0]:
                im = cv2.imread("Data/maps/D0117-5755036.tiff")[:, :, ::-1]
                boxes_one_rot = []
                count = 0
                while count < num_rows * num_cols:
                    # Assemble a 16-crop batch of rotated sliding windows.
                    images, data2, data3, data4 = [], [], [], []
                    for k in range(16):
                        i = (count + k) / num_rows
                        j = (count + k) % num_cols
                        temp = im[slide_window*i:slide_window*i+crop_size, \
                            slide_window*j:slide_window*j+crop_size, ::-1]
                        w2, h2, _ = temp.shape
                        # Zero-pad edge crops up to the full crop size.
                        if w2 < crop_size or h2 < crop_size:
                            result = np.zeros((crop_size, crop_size, 3))
                            result[:w2, :h2] = temp
                            temp = result
                        M = cv2.getRotationMatrix2D(crop_center, rot, 1.0)
                        temp = cv2.warpAffine(temp, M, (crop_size, crop_size))
                        images.append(temp)
                        # Dummy targets: only f_score/f_geometry are consumed.
                        score_map, geo_map, training_mask = icdar.generate_rbox(
                            (int(crop_size), int(crop_size)), np.array([]),
                            np.array([]))
                        data2.append(score_map[::4, ::4, np.newaxis].astype(
                            np.float32))
                        data3.append(geo_map[::4, ::4, :].astype(
                            np.float32))
                        data4.append(training_mask[::4, ::4, np.newaxis].astype(
                            np.float32))
                    score, geometry = sess.run(
                        [f_score, f_geometry],
                        feed_dict={
                            input_images: images,
                            input_score_maps: data2,
                            input_geo_maps: data3,
                            input_training_masks: data4
                        })
                    for k in range(16):
                        i = (count + k) / num_rows
                        j = (count + k) % num_cols
                        boxes = detect(score_map=score[j],
                                       geo_map=geometry[j],
                                       score_map_thresh=0.01,
                                       box_thresh=0.01,
                                       nms_thres=0.01)
                        if boxes is not None:
                            boxes = boxes[:, :8].reshape((-1, 4, 2))
                            for box in boxes:
                                # Undo the crop rotation, then shift the box back
                                # into whole-image coordinates.
                                M_inv = cv2.getRotationMatrix2D(
                                    crop_center, -1 * rot, 1)
                                box[0] = M_inv.dot(
                                    np.array((box[0, 0], box[0, 1]) + (1, )))
                                box[1] = M_inv.dot(
                                    np.array((box[1, 0], box[1, 1]) + (1, )))
                                box[2] = M_inv.dot(
                                    np.array((box[2, 0], box[2, 1]) + (1, )))
                                box[3] = M_inv.dot(
                                    np.array((box[3, 0], box[3, 1]) + (1, )))
                                box = sort_poly(box.astype(np.int32))
                                box[0, 0] = box[0, 0] + j * slide_window
                                box[0, 1] = box[0, 1] + i * slide_window
                                box[1, 0] = box[1, 0] + j * slide_window
                                box[1, 1] = box[1, 1] + i * slide_window
                                box[2, 0] = box[2, 0] + j * slide_window
                                box[2, 1] = box[2, 1] + i * slide_window
                                box[3, 0] = box[3, 0] + j * slide_window
                                box[3, 1] = box[3, 1] + i * slide_window
                                boxes_one_rot.append(box)
                # Pack this rotation's boxes as rows of [8 coords, score=1].
                boxes_single_rot = np.zeros((len(boxes_one_rot), 9))
                boxes_single_rot[:, :8] = np.array(boxes_one_rot).reshape(
                    (-1, 8))
                boxes_single_rot[:, 8] = 1
                labels += boxes_single_rot.tolist()
            # Merge detections from all rotations with locality-aware NMS.
            boxes = lanms.merge_quadrangle_n9(np.array(labels), nms_thres)
            annotation = np.load(
                "/mnt/nfs/work1/elm/ray/new_char_anots_ncs/" + "j" + "/" +
                "D0117-5755036" + ".npy").item()
            ### Compute the TP, FP, FN info for each image
            count_right_cache = 0
            boxes = boxes[:, :8].reshape((-1, 4, 2))
            num_true_pos = len(annotation)
            for box in boxes:
                box = sort_poly(box.astype(np.int32))
                # Skip degenerate (nearly collapsed) quads.
                if np.linalg.norm(box[0] - box[1]) < 5 or np.linalg.norm(
                        box[3] - box[0]) < 5:
                    continue
                k = 0
                idx = 0
                # Assume FP until an IOU match is found among the ground truth.
                count_wrong += 1
                while (idx < num_true_pos):
                    if k in annotation:
                        proposed_label = annotation[k]['vertices']
                        if len(proposed_label) == 4:
                            x3, y3, x2, y2, x1, y1, x0, y0 = proposed_label[0][0], proposed_label[0][1], proposed_label[1][0], proposed_label[1][1], \
                                proposed_label[2][0], proposed_label[2][1], proposed_label[3][0], proposed_label[3][1]
                            if (checkIOU(box, [[x0, y0], [x1, y1], [x2, y2],
                                               [x3, y3]]) == True):
                                count_right_cache += 1
                                count_wrong -= 1
                                break
                        idx += 1
                    k += 1
            # Ground-truth boxes never matched count as false negatives.
            count_posNotDetected += num_true_pos - count_right_cache
            count_right += count_right_cache
            precision = (float)(count_right) / (float)(
                count_right + count_wrong)  # TP / TP + FP
            recall = (float)(count_right) / (float)(
                count_right + count_posNotDetected)  # TP / TP + FN
            fscore = 2 * (precision * recall) / (precision + recall)
            print "Precision, recall, fscore: " + str(
                precision) + ", " + str(recall) + ", " + str(fscore)
def main(args):
    """Train a face-embedding network (sphere_network or resnet_v2) with
    softmax or cosface loss across ``args.num_gpus`` GPU towers.

    Builds a tf.data pipeline that samples identities via a py_func, splits
    each batch across towers, averages the per-tower losses, and runs the
    epoch loop through the module-level ``train`` helper.

    NOTE(review): relies on several module-level names defined elsewhere
    (``utils``, ``network``, ``resnet_v2``, ``tf_data``, ``_from_tensor_slices``,
    ``softmax_ind``, ``debug``, ``tower_th``) — confirm against the full file.
    ``indices = range(...)`` is later shuffled in place, which only works on
    Python 2 where ``range`` returns a list.
    """
    #network = importlib.import_module(args.model_def)
    # Timestamped run directory name shared by logs and checkpoints.
    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    if not os.path.isdir(log_dir):  # Create the log directory if it doesn't exist
        os.makedirs(log_dir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)
    if not os.path.isdir(model_dir):  # Create the model directory if it doesn't exist
        os.makedirs(model_dir)

    # Write arguments to a text file
    utils.write_arguments_to_file(args, os.path.join(log_dir, 'arguments.txt'))

    # Store some git revision info in a text file in the log directory
    src_path, _ = os.path.split(os.path.realpath(__file__))
    utils.store_revision_info(src_path, log_dir, ' '.join(sys.argv))

    np.random.seed(seed=args.seed)

    train_set = utils.get_dataset(args.data_dir)
    nrof_classes = len(train_set)
    print('nrof_classes: ', nrof_classes)
    image_list, label_list = utils.get_image_paths_and_labels(train_set)
    image_list = np.array(image_list)
    label_list = np.array(label_list, dtype=np.int32)
    dataset_size = len(image_list)
    single_batch_size = args.people_per_batch * args.images_per_person
    indices = range(dataset_size)
    np.random.shuffle(indices)

    def _sample_people_softmax(x):
        # Sequentially walks the shuffled index list, reshuffling each time the
        # epoch is exhausted; position is kept in the module-global softmax_ind.
        global softmax_ind
        if softmax_ind >= dataset_size:
            np.random.shuffle(indices)
            softmax_ind = 0
        true_num_batch = min(single_batch_size, dataset_size - softmax_ind)

        sample_paths = image_list[indices[softmax_ind:softmax_ind + true_num_batch]]
        sample_labels = label_list[indices[softmax_ind:softmax_ind + true_num_batch]]

        softmax_ind += true_num_batch

        return (np.array(sample_paths), np.array(sample_labels, dtype=np.int32))

    def _sample_people(x):
        '''We sample people based on tf.data, where we can use transform and prefetch.
        '''
        image_paths, num_per_class = sample_people(train_set, args.people_per_batch * (args.num_gpus - 1), args.images_per_person)
        labels = []
        for i in range(len(num_per_class)):
            labels.extend([i] * num_per_class[i])
        return (np.array(image_paths), np.array(labels, dtype=np.int32))

    def _parse_function(filename, label):
        # Decode, crop-or-resize to (image_height, image_width), optionally flip,
        # then standardize (skipped in debug mode so raw pixels are visible).
        file_contents = tf.read_file(filename)
        image = tf.image.decode_image(file_contents, channels=3)
        #image = tf.image.decode_jpeg(file_contents, channels=3)
        print(image.shape)
        if args.random_crop:
            print('use random crop')
            image = tf.random_crop(image, [args.image_size, args.image_size, 3])
        else:
            print('Not use random crop')
            #image.set_shape((args.image_size, args.image_size, 3))
            image.set_shape((None, None, 3))
            image = tf.image.resize_images(image, size=(args.image_height, args.image_width))
            #print(image.shape)
        if args.random_flip:
            image = tf.image.random_flip_left_right(image)
        #pylint: disable=no-member
        #image.set_shape((args.image_size, args.image_size, 3))
        image.set_shape((args.image_height, args.image_width, 3))
        if debug:
            image = tf.cast(image, tf.float32)
        else:
            image = tf.image.per_image_standardization(image)
        return image, label

    print('Model directory: %s' % model_dir)
    print('Log directory: %s' % log_dir)
    if args.pretrained_model:
        print('Pre-trained model: %s' % os.path.expanduser(args.pretrained_model))

    with tf.Graph().as_default():
        tf.set_random_seed(args.seed)
        global_step = tf.Variable(0, trainable=False, name='global_step')

        # Placeholder for the learning rate
        learning_rate_placeholder = tf.placeholder(tf.float32, name='learning_rate')

        phase_train_placeholder = tf.placeholder(tf.bool, name='phase_train')

        #the image is generated by sequence
        with tf.device("/cpu:0"):
            softmax_dataset = tf_data.Dataset.range(args.epoch_size * args.max_nrof_epochs * 100)
            softmax_dataset = softmax_dataset.map(lambda x: tf.py_func(_sample_people_softmax, [x], [tf.string, tf.int32]))
            softmax_dataset = softmax_dataset.flat_map(_from_tensor_slices)
            softmax_dataset = softmax_dataset.map(_parse_function, num_threads=8, output_buffer_size=2000)
            softmax_dataset = softmax_dataset.batch(args.num_gpus * single_batch_size)
            softmax_iterator = softmax_dataset.make_initializable_iterator()
            softmax_next_element = softmax_iterator.get_next()
            # Pin static shapes so tf.split below can infer per-tower sizes.
            softmax_next_element[0].set_shape((args.num_gpus * single_batch_size, args.image_height, args.image_width, 3))
            softmax_next_element[1].set_shape(args.num_gpus * single_batch_size)
            batch_image_split = tf.split(softmax_next_element[0], args.num_gpus)
            batch_label_split = tf.split(softmax_next_element[1], args.num_gpus)

        learning_rate = tf.train.exponential_decay(learning_rate_placeholder, global_step,
                                                   args.learning_rate_decay_epochs * args.epoch_size,
                                                   args.learning_rate_decay_factor, staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)

        print('Using optimizer: {}'.format(args.optimizer))
        if args.optimizer == 'ADAGRAD':
            opt = tf.train.AdagradOptimizer(learning_rate)
        elif args.optimizer == 'MOM':
            opt = tf.train.MomentumOptimizer(learning_rate, 0.9)
        tower_losses = []
        tower_cross = []
        tower_dist = []
        tower_reg = []
        for i in range(args.num_gpus):
            with tf.device("/gpu:" + str(i)):
                with tf.name_scope("tower_" + str(i)) as scope:
                    with slim.arg_scope([slim.model_variable, slim.variable], device="/cpu:0"):
                        with tf.variable_scope(tf.get_variable_scope()) as var_scope:
                            # First tower creates variables; the rest reuse them.
                            reuse = False if i == 0 else True
                            #with slim.arg_scope(resnet_v2.resnet_arg_scope(args.weight_decay)):
                            #prelogits, end_points = resnet_v2.resnet_v2_50(batch_image_split[i],is_training=True,
                            #        output_stride=16,num_classes=args.embedding_size,reuse=reuse)
                            #prelogits, end_points = network.inference(batch_image_split[i], args.keep_probability,
                            #    phase_train=phase_train_placeholder, bottleneck_layer_size=args.embedding_size,
                            #    weight_decay=args.weight_decay, reuse=reuse)
                            if args.network == 'sphere_network':
                                prelogits = network.infer(batch_image_split[i])
                            elif args.network == 'resnet_v2':
                                with slim.arg_scope(resnet_v2.resnet_arg_scope(args.weight_decay)):
                                    prelogits, end_points = resnet_v2.resnet_v2_50(batch_image_split[i], is_training=True,
                                                                                   output_stride=16, num_classes=args.embedding_size, reuse=reuse)
                                prelogits = tf.squeeze(prelogits, axis=[1, 2])
                            #prelogits = slim.batch_norm(prelogits, is_training=True, decay=0.997,epsilon=1e-5,scale=True,updates_collections=tf.GraphKeys.UPDATE_OPS,reuse=reuse,scope='softmax_bn')
                            if args.loss_type == 'softmax':
                                cross_entropy_mean = utils.softmax_loss(prelogits, batch_label_split[i], len(train_set), args.weight_decay, reuse)
                                regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
                                tower_cross.append(cross_entropy_mean)
                                #loss = cross_entropy_mean + args.weight_decay*tf.add_n(regularization_losses)
                                loss = cross_entropy_mean + tf.add_n(regularization_losses)
                                tower_dist.append(0)
                                # NOTE(review): cross_entropy_mean is appended to
                                # tower_cross twice, and tower_th is not defined in
                                # this function — confirm against the full file.
                                tower_cross.append(cross_entropy_mean)
                                tower_th.append(0)
                                tower_losses.append(loss)
                                tower_reg.append(regularization_losses)
                            elif args.loss_type == 'cosface':
                                label_reshape = tf.reshape(batch_label_split[i], [single_batch_size])
                                label_reshape = tf.cast(label_reshape, tf.int64)
                                coco_loss = utils.cos_loss(prelogits, label_reshape, len(train_set), reuse, alpha=args.alpha, scale=args.scale)
                                regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
                                reg_loss = args.weight_decay * tf.add_n(regularization_losses)
                                loss = coco_loss + reg_loss
                                tower_losses.append(loss)
                                tower_reg.append(reg_loss)
                            #loss = tf.add_n([cross_entropy_mean] + regularization_losses, name='total_loss')
                            tf.get_variable_scope().reuse_variables()
        total_loss = tf.reduce_mean(tower_losses)
        total_reg = tf.reduce_mean(tower_reg)
        losses = {}
        losses['total_loss'] = total_loss
        losses['total_reg'] = total_reg

        grads = opt.compute_gradients(total_loss, tf.trainable_variables(), colocate_gradients_with_ops=True)
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = tf.group(apply_gradient_op)

        # Exclude optimizer slots and the step counter from checkpoints.
        save_vars = [var for var in tf.global_variables()
                     if 'Adagrad' not in var.name and 'global_step' not in var.name]

        #saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=3)
        saver = tf.train.Saver(save_vars, max_to_keep=3)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Start running operations on the Graph.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True))

        # Initialize variables
        sess.run(tf.global_variables_initializer(), feed_dict={phase_train_placeholder: True})
        sess.run(tf.local_variables_initializer(), feed_dict={phase_train_placeholder: True})

        #sess.run(iterator.initializer)
        sess.run(softmax_iterator.initializer)

        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)
        with sess.as_default():
            #pdb.set_trace()

            if args.pretrained_model:
                print('Restoring pretrained model: %s' % args.pretrained_model)
                saver.restore(sess, os.path.expanduser(args.pretrained_model))

            # Training and validation loop
            epoch = 0
            while epoch < args.max_nrof_epochs:
                step = sess.run(global_step, feed_dict=None)
                epoch = step // args.epoch_size
                if debug:
                    debug_train(args, sess, train_set, epoch, image_batch_gather, enqueue_op, batch_size_placeholder, image_batch_split, image_paths_split, num_per_class_split,
                                image_paths_placeholder, image_paths_split_placeholder, labels_placeholder, labels_batch, num_per_class_placeholder, num_per_class_split_placeholder, len(gpus))
                # Train for one epoch
                train(args, sess, epoch, learning_rate_placeholder, phase_train_placeholder, global_step,
                      losses, train_op, summary_op, summary_writer, args.learning_rate_schedule_file)

                # Save variables and the metagraph if it doesn't exist already
                save_variables_and_metagraph(sess, saver, summary_writer, model_dir, subdir, step)

                # Evaluate on LFW
    return model_dir
max_sequence_length_ = data_helpers.load_data(FLAGS.dev_data_path, checkpoint_dir=checkpoint_dir) #the labels arent used here print("Labels: %d: %s" % (len(onehot_label), ','.join(onehot_label.values()))) print("Vocabulary Size: {:d}".format(len(vocabulary))) print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) if FLAGS.use_word2vec: FLAGS.embedding_dim = 300 # Training # ================================================== with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN(sequence_length=x_train.shape[1], num_classes=len(onehot_label), vocab_size=len(vocabulary), embedding_size=FLAGS.embedding_dim, filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), num_filters=list(map(int, FLAGS.num_filters.split(","))), l2_reg_lambda=FLAGS.l2_reg_lambda) # Define Training procedure global_step = 0 train_op = tf.train.AdamOptimizer(0.001).minimize(cnn.loss)
def halfGPU():
    """Register a Keras/TF session whose GPU memory use is capped.

    Builds a session restricted to the module-level ``GPU_FRACTION`` share of
    GPU memory and installs it via ``set_session`` so Keras uses it globally.
    """
    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.per_process_gpu_memory_fraction = GPU_FRACTION
    capped_session = tf.Session(config=gpu_config)
    set_session(capped_session)
def test_different_backends_load_openai(self):
    """Compare the released OpenAI transformer run under raw TensorFlow
    against the Keras port on every available backend.

    Builds the reference TF graph, loads the published weights (with position
    embeddings rearranged to make room for special tokens), runs one forward
    pass, then checks each Keras backend's output: exact equality when the
    backend is TensorFlow, tolerance-based otherwise.
    """
    try:
        import tensorflow as tf
    except ImportError:
        raise SkipTest('tensorflow is not installed, so we can not compare results with the released model')
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    from openai.train import dropout, embed, block, find_trainable_variables
    # Hyperparameters of the released 117M-param model (12 layers, 768 dims);
    # n_ctx is kept tiny to make the forward pass cheap.
    n_vocab = 40478
    n_ctx = 7
    n_embd = 768
    embd_pdrop = 0.1
    n_layer = 12
    n_transfer = 1 + 12 * 12  # embedding matrix + 12 tensors per layer

    n_batch_train = 2

    def model(X, train=False, reuse=False):
        # Reference forward pass using the original OpenAI ops.
        with tf.variable_scope('model', reuse=reuse):
            # Embedding table covers vocab + special tokens + positions.
            we = tf.get_variable("we", [n_vocab + TextEncoder.SPECIAL_COUNT + n_ctx, n_embd],
                                 initializer=tf.random_normal_initializer(stddev=0.02))
            we = dropout(we, embd_pdrop, train)
            h = embed(X, we)
            for layer in range(n_layer):
                h = block(h, 'h%d' % layer, train=train, scale=True)
            return h

    X_train = tf.placeholder(tf.int32, [n_batch_train, n_ctx, 2])
    res = model(X_train)
    params = find_trainable_variables('model')
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(tf.global_variables_initializer())

    # Reassemble the released weights from the 10 shard files, split them back
    # into per-variable tensors by the recorded shapes.
    with open('openai/model/params_shapes.json') as f:
        shapes = json.load(f)
    offsets = np.cumsum([np.prod(shape) for shape in shapes])
    init_params = [np.load('openai/model/params_{}.npy'.format(n)) for n in range(10)]
    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
    # Rebuild the embedding table: token embeddings, freshly initialized
    # special-token embeddings, then the first n_ctx position embeddings.
    init_params[0] = init_params[0][:n_ctx]
    init_params[0] = np.concatenate(
        [init_params[1],
         (np.random.randn(TextEncoder.SPECIAL_COUNT, n_embd) * 0.02).astype(np.float32),
         init_params[0]], 0)
    del init_params[1]
    sess.run([p.assign(ip) for p, ip in zip(params[:n_transfer], init_params[:n_transfer])])

    # Random token ids in channel 0; position indices in channel 1.  The TF
    # copy offsets positions into the combined embedding table's index space.
    xmb = np.random.randint(0, n_vocab, (n_batch_train, n_ctx, 2))
    xmb[:, :, 1] = np.random.randint(0, n_ctx, (n_batch_train, n_ctx))
    xmb_tf = xmb.copy()
    xmb_tf[:, :, 1] += n_vocab + TextEncoder.SPECIAL_COUNT
    tf_result = sess.run(res, {X_train: xmb_tf})

    for backend in self.list_backends():
        try:
            set_keras_backend(backend)
        except ModuleNotFoundError:
            # Backend not installed; skip it rather than fail the test.
            continue
        K.set_learning_phase(0)
        keras_model = load_openai_transformer(use_attn_mask=True,
                                              use_one_embedding_dropout=False,
                                              max_len=n_ctx)
        mask = create_attention_mask(None, True, n_batch_train, n_ctx)
        k_result = keras_model.predict(
            [xmb[:, :, 0],
             np.zeros((n_batch_train, n_ctx), dtype=np.int64),
             xmb[:, :, 1],
             mask],
            batch_size=n_batch_train)
        if K.backend() != 'tensorflow':
            # Different backends accumulate different float error; compare
            # with tolerances.
            assert np.allclose(tf_result, k_result, atol=1.e-4, rtol=1.e-4)
        else:
            # Same backend as the reference graph: results must match exactly.
            assert (tf_result == k_result).all()
def test(model_path='model_save/model-40000'):
    """Run the VQA answer generator over the whole test split and dump
    per-question best (target, probability) pairs to ``data.json``.

    Python 2 code (``xrange``).  NOTE(review): the padding and the two shape
    asserts hard-code 500 — this only works if the module-level ``batch_size``
    is exactly 500; confirm before changing either.
    """
    print ('loading dataset...')
    dataset, img_feature, test_data = get_data_test()
    num_test = test_data['question'].shape[0]
    print('numtest: ' + str(num_test))
    vocabulary_size = len(dataset['ix_to_word'].keys())
    print ('vocabulary_size : ' + str(vocabulary_size))

    # drop_out_rate=0 disables dropout at inference time.
    model = Answer_Generator(
        rnn_size = rnn_size,
        rnn_layer = rnn_layer,
        batch_size = batch_size,
        input_embedding_size = input_embedding_size,
        dim_image = dim_image,
        dim_hidden = dim_hidden,
        max_words_q = max_words_q,
        vocabulary_size = vocabulary_size,
        drop_out_rate = 0)

    tf_proba, tf_image, tf_question, tf_answer = model.build_generator()

    #sess = tf.InteractiveSession()
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    tStart_total = time.time()
    result = {}
    for current_batch_start_idx in xrange(0, num_test-1, batch_size):
        #for current_batch_start_idx in xrange(0,3,batch_size):
        tStart = time.time()
        # set data into current*
        if current_batch_start_idx + batch_size < num_test:
            current_batch_file_idx = range(current_batch_start_idx, current_batch_start_idx + batch_size)
        else:
            current_batch_file_idx = range(current_batch_start_idx, num_test)

        current_question = test_data['question'][current_batch_file_idx,:]
        current_length_q = test_data['length_q'][current_batch_file_idx]
        current_img_list = test_data['img_list'][current_batch_file_idx]
        current_answer = test_data['answer'][current_batch_file_idx,:]
        current_length_a = test_data['length_a'][current_batch_file_idx]
        current_ques_id = test_data['ques_id'][current_batch_file_idx]
        current_target = test_data['target'][current_batch_file_idx]
        current_img = img_feature[current_img_list,:] # (batch_size, dim_image)

        # deal with the last batch: zero-pad every array up to 500 rows so the
        # fixed-size graph can consume it.
        if(len(current_img)<500):
            pad_img = np.zeros((500-len(current_img),dim_image),dtype=np.int)
            pad_q = np.zeros((500-len(current_img),max_words_q),dtype=np.int)
            pad_q_len = np.zeros(500-len(current_length_q),dtype=np.int)
            pad_q_id = np.zeros(500-len(current_length_q),dtype=np.int)
            pad_img_list = np.zeros(500-len(current_length_q),dtype=np.int)
            pad_a = np.zeros((500-len(current_img),max_words_q),dtype=np.int)
            pad_a_len = np.zeros(500-len(current_length_a),dtype=np.int)
            pad_target = np.zeros((500-len(current_target), 2),dtype=np.int)
            current_img = np.concatenate((current_img, pad_img))
            current_question = np.concatenate((current_question, pad_q))
            current_length_q = np.concatenate((current_length_q, pad_q_len))
            current_ques_id = np.concatenate((current_ques_id, pad_q_id))
            current_img_list = np.concatenate((current_img_list, pad_img_list))
            current_answer = np.concatenate((current_answer, pad_a))
            current_length_a = np.concatenate((current_length_a, pad_a_len))
            current_target = np.concatenate((current_target, pad_target))

        pred_proba = sess.run(
            tf_proba,
            feed_dict={
                tf_image: current_img,
                tf_question: current_question,
                tf_answer: current_answer
            })

        # initialize json list
        pred_proba = np.transpose(pred_proba)
        assert(current_target.shape == (500,2))
        assert(pred_proba.shape == (500,2))
        target, prob = getMaximumLikelihood(current_target, pred_proba)
        # Keep, per question id, the answer with the highest probability seen
        # so far (padded rows share id "0" and overwrite each other).
        for i in list(range(0, 500)):
            if str(current_ques_id[i]) not in result:
                result[str(current_ques_id[i])] = [target[i], prob[i]]
            else:
                if result[str(current_ques_id[i])][1] < prob[i]:
                    result[str(current_ques_id[i])] = [target[i], prob[i]]

        tStop = time.time()
        print ("Testing batch: ", current_batch_file_idx[0])
        print ("Time Cost:", round(tStop - tStart,2), "s")
    print ("Testing done.")
    tStop_total = time.time()
    print ("Total Time Cost:", round(tStop_total - tStart_total,2), "s")
    # Save to JSON
    print ('Saving result...')
    # Mean of the stored targets (interpreted as accuracy) over all questions.
    acc = 0
    for k,v in result.items():
        acc += v[0]
    print(str(acc*1.0/len(result)))
    dd = json.dump(result,open('data.json','w'))
def get_session():
    """Create a TF session with GPU memory growth enabled, install it as the
    Keras backend session, and return it.

    Returns:
        tf.Session: the session that was registered with Keras.

    Bug fix: the original returned the result of ``set_session(...)`` —
    ``keras.backend.tensorflow_backend.set_session`` returns ``None``, so
    callers of ``get_session()`` received ``None`` instead of the session.
    """
    config = tf.ConfigProto()
    # Allocate GPU memory on demand instead of grabbing the whole card.
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    keras.backend.tensorflow_backend.set_session(session)
    return session
def __init__(self, dropout=1.0):
    """Build the question/item matching graph and its training ops.

    The question (one-hot word vectors, time-major, fed both forward and
    reversed) is projected, encoded by a stacked bi-GRU, mask-pooled into a
    single vector, concatenated with a projected item vector, and passed
    through two dense layers to a softmax over ``self._output_size`` classes.
    Training uses weighted binary cross-entropy with Adam and gradient
    clipping.

    Args:
        dropout: keep probability for the first hidden layer (1.0 = no
            dropout, suitable for inference).

    NOTE(review): the ``self._*`` size attributes, ``NAMESPACE``, ``_rw``,
    ``TINY`` and ``_weight_for_positive_matches`` are defined elsewhere in
    the class/module — confirm against the full file.
    """
    tf.reset_default_graph()
    with tf.variable_scope(NAMESPACE):
        config = tf.ConfigProto(allow_soft_placement=True)
        self.sess = tf.Session(config=config)

        # Input variables
        self.item = tf.placeholder(tf.float32, shape=(None, self._nodes_vocab_size), name='item')
        self.question_vectors_fw = tf.placeholder(tf.float32, shape=(None, None, self._question_vocab_size),
                                                  name='question_vectors_inp_fw')
        # NOTE(review): the name string 'question_vectors_inp_nw' looks like a
        # typo for '..._bw', but it is kept — saved graphs reference it.
        self.question_vectors_bw = tf.placeholder(tf.float32, shape=(None, None, self._question_vocab_size),
                                                  name='question_vectors_inp_nw')
        self.question_mask = tf.placeholder(tf.float32, shape=(None, None, self._mask_size),
                                            name='question_mask')

        # The question is pre-processed by a bi-GRU
        # Word projection applied timestep-wise to both directions (shared weights).
        self.Wq = tf.Variable(tf.random_uniform([self._question_vocab_size, self._word_proj_size_for_rnn], -_rw, _rw))
        self.bq = tf.Variable(tf.random_uniform([self._word_proj_size_for_rnn], -_rw, _rw))
        self.internal_projection = lambda x: tf.nn.relu(tf.matmul(x, self.Wq) + self.bq)
        self.question_int_fw = tf.map_fn(self.internal_projection, self.question_vectors_fw)
        self.question_int_bw = tf.map_fn(self.internal_projection, self.question_vectors_bw)

        self.rnn_cell_fw = rnn.MultiRNNCell([rnn.GRUCell(self._memory_dim) for _ in range(self._stack_dimension)],
                                            state_is_tuple=True)
        self.rnn_cell_bw = rnn.MultiRNNCell([rnn.GRUCell(self._memory_dim) for _ in range(self._stack_dimension)],
                                            state_is_tuple=True)
        with tf.variable_scope('fw'):
            output_fw, state_fw = tf.nn.dynamic_rnn(self.rnn_cell_fw, self.question_int_fw,
                                                    time_major=True, dtype=tf.float32)
        with tf.variable_scope('bw'):
            output_bw, state_bw = tf.nn.dynamic_rnn(self.rnn_cell_bw, self.question_int_bw,
                                                    time_major=True, dtype=tf.float32)

        # Re-align the backward pass (time axis 0) and concatenate features.
        self.states = tf.concat(values=[output_fw, tf.reverse(output_bw, [0])], axis=2)
        # Masked mean over time -> one vector per batch element.
        self.question_vector_pre = tf.reduce_mean(tf.multiply(self.question_mask, self.states), axis=0)
        self.Wqa = tf.Variable(
            tf.random_uniform([2 * self._memory_dim, self._question_vector_size], -_rw, _rw),
            name='Wqa')
        self.bqa = tf.Variable(tf.random_uniform([self._question_vector_size], -_rw, _rw), name='bqa')
        self.question_vector = tf.nn.relu(tf.matmul(self.question_vector_pre, self.Wqa) + self.bqa)

        # Item
        self.Wit = tf.Variable(tf.random_uniform([self._nodes_vocab_size, self._word_proj_size_for_item], -_rw, _rw))
        self.bit = tf.Variable(tf.random_uniform([self._word_proj_size_for_item], -_rw, _rw))
        self.item_proj = tf.nn.relu(tf.matmul(self.item, self.Wit) + self.bit)

        # Concatenate
        self.concatenated = tf.concat(values=[self.question_vector, self.item_proj], axis=1)

        # Final feedforward layers
        self.Ws1 = tf.Variable(
            tf.random_uniform([self._question_vector_size + self._word_proj_size_for_item,
                               self._hidden_layer2_size], -_rw, _rw),
            name='Ws1')
        self.bs1 = tf.Variable(tf.random_uniform([self._hidden_layer2_size], -_rw, _rw), name='bs1')
        self.first_hidden = tf.nn.relu(tf.matmul(self.concatenated, self.Ws1) + self.bs1)
        self.first_hidden_dropout = tf.nn.dropout(self.first_hidden, dropout)

        self.Wf = tf.Variable(
            tf.random_uniform([self._hidden_layer2_size, self._output_size], -_rw, _rw),
            name='Wf')
        self.bf = tf.Variable(tf.random_uniform([self._output_size], -_rw, _rw), name='bf')
        self.outputs = tf.nn.softmax(tf.matmul(self.first_hidden_dropout, self.Wf) + self.bf)

        # Loss function and training
        self.y_ = tf.placeholder(tf.float32, shape=(None, self._output_size), name='y_')
        self.outputs2 = tf.squeeze(self.outputs)
        self.y2_ = tf.squeeze(self.y_)
        self.one = tf.ones_like(self.outputs)
        self.tiny = self.one * TINY  # numeric floor so log() never sees 0
        # Weighted binary cross-entropy: positive matches are up-weighted.
        self.cross_entropy = (tf.reduce_mean(
            -tf.reduce_sum(self.y_ * tf.log(self.outputs + self.tiny) * _weight_for_positive_matches
                           + (self.one - self.y_) * tf.log(self.one - self.outputs + self.tiny))
        ))

        # Clipping the gradient (only variables belonging to this namespace).
        optimizer = tf.train.AdamOptimizer(1e-4)
        gvs = optimizer.compute_gradients(self.cross_entropy)
        capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs
                      if var.name.find(NAMESPACE) != -1]
        self.train_step = optimizer.apply_gradients(capped_gvs)
        self.sess.run(tf.global_variables_initializer())

        # Adding the summaries
        tf.summary.scalar('cross_entropy', self.cross_entropy)
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter('./train', self.sess.graph)
def main(_):
    """Entry point: trains / evaluates / tests a multi-hop QA model.

    Driven entirely by module-level FLAGS. Selects the dataset class,
    eval function and model-builder from ``FLAGS.data_type`` /
    ``FLAGS.model_type``, builds a TPUEstimator, then runs any of the
    ``do_train`` / ``do_predict`` / ``do_test`` phases that are enabled.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Map data_type -> (dataset class, eval function).
    # NOTE(review): if FLAGS.data_type matches none of these branches,
    # dataset_class / eval_fn are never bound and a NameError follows —
    # presumably validate_flags_or_throw() below guards this; confirm.
    if FLAGS.data_type == "onehop":
        dataset_class = input_fns.OneHopDataset
        eval_fn = evaluate.multihop_eval_fn
    elif FLAGS.data_type == "twohop":
        dataset_class = input_fns.TwoHopDataset
        eval_fn = evaluate.multihop_eval_fn
    elif FLAGS.data_type == "threehop":
        dataset_class = input_fns.ThreeHopDataset
        eval_fn = evaluate.multihop_eval_fn
    elif (FLAGS.data_type == "wikimovie" or
          FLAGS.data_type == "wikimovie-2hop" or
          FLAGS.data_type == "wikimovie-3hop"):
        dataset_class = input_fns.WikiMovieDataset
        eval_fn = evaluate.wikimovie_eval_fn
    elif FLAGS.data_type == "hotpotqa":
        dataset_class = input_fns.HotpotQADataset
        eval_fn = evaluate.hotpot_eval_fn

    # Map model_type -> model-builder callable; the *-cascaded / *-hop
    # variants reuse a base builder with num_hops fixed via functools.partial.
    if FLAGS.model_type == "onehop":
        create_model_fn = model_fns.create_onehop_model
    elif FLAGS.model_type == "twohop":
        create_model_fn = model_fns.create_twohop_model
    elif FLAGS.model_type == "twohop-cascaded":
        create_model_fn = model_fns.create_twohopcascade_model
    elif FLAGS.model_type == "threehop":
        create_model_fn = functools.partial(
            model_fns.create_twohop_model, num_hops=3)
    elif FLAGS.model_type == "threehop-cascaded":
        create_model_fn = functools.partial(
            model_fns.create_twohopcascade_model, num_hops=3)
    elif FLAGS.model_type == "wikimovie":
        create_model_fn = model_fns.create_wikimovie_model
    elif FLAGS.model_type == "wikimovie-2hop":
        create_model_fn = functools.partial(
            model_fns.create_wikimovie_model, num_hops=2)
    elif FLAGS.model_type == "wikimovie-3hop":
        create_model_fn = functools.partial(
            model_fns.create_wikimovie_model, num_hops=3)
    elif FLAGS.model_type == "hotpotqa":
        create_model_fn = functools.partial(
            model_fns.create_hotpotqa_model, num_hops=FLAGS.num_hops)

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # Load mention and entity files.
    mention2text = json.load(
        tf.gfile.Open(os.path.join(FLAGS.train_data_dir, "mention2text.json")))
    tf.logging.info("Loading metadata about entities and mentions...")
    entity2id, entity2name = json.load(
        tf.gfile.Open(os.path.join(FLAGS.train_data_dir, "entities.json")))
    # Reverse lookup from stringified entity id to human-readable name.
    entityid2name = {str(i): entity2name[e] for e, i in entity2id.items()}
    # all_paragraphs = json.load(tf.gfile.Open(os.path.join(
    #     FLAGS.train_data_dir, "subparas.json")))
    # all_mentions = np.load(tf.gfile.Open(os.path.join(
    #     FLAGS.train_data_dir, "mentions.npy")))
    all_paragraphs = None
    all_mentions = None

    # Bundle all model / QA hyperparameters from flags.
    qa_config = QAConfig(
        qry_layers_to_use=FLAGS.qry_layers_to_use,
        qry_aggregation_fn=FLAGS.qry_aggregation_fn,
        dropout=FLAGS.question_dropout,
        qry_num_layers=FLAGS.question_num_layers,
        projection_dim=FLAGS.projection_dim,
        load_only_bert=FLAGS.load_only_bert,
        num_entities=len(entity2id),
        max_entity_len=FLAGS.max_entity_len,
        ensure_answer_sparse=FLAGS.ensure_answer_sparse,
        ensure_answer_dense=FLAGS.ensure_answer_dense,
        train_with_sparse=FLAGS.train_with_sparse,
        predict_with_sparse=FLAGS.predict_with_sparse,
        fix_sparse_to_one=FLAGS.fix_sparse_to_one,
        supervision=FLAGS.supervision,
        l2_normalize_db=FLAGS.l2_normalize_db,
        entity_score_aggregation_fn=FLAGS.entity_score_aggregation_fn,
        entity_score_threshold=FLAGS.entity_score_threshold,
        softmax_temperature=FLAGS.softmax_temperature,
        sparse_reduce_fn=FLAGS.sparse_reduce_fn,
        intermediate_loss=FLAGS.intermediate_loss,
        light=FLAGS.light,
        sparse_strategy=FLAGS.sparse_strategy,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    # MIPS (maximum inner product search) index over precomputed mention
    # embeddings; emb_size is 2x projection_dim per the checkpoint layout.
    mips_config = MIPSConfig(
        ckpt_path=os.path.join(FLAGS.train_data_dir, "mention_feats"),
        ckpt_var_name="db_emb",
        num_mentions=len(mention2text),
        emb_size=FLAGS.projection_dim * 2,
        num_neighbors=FLAGS.num_mips_neighbors)
    validate_flags_or_throw()

    tf.gfile.MakeDirs(FLAGS.output_dir)

    if FLAGS.do_train:
        # Persist the full flag set for reproducibility.
        # NOTE(review): the gfile handle is never closed/flushed explicitly;
        # relies on GC — consider a `with` block.
        json.dump(tf.app.flags.FLAGS.flag_values_dict(),
                  tf.gfile.Open(os.path.join(FLAGS.output_dir,
                                             "flags.json"), "w"))

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=8,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        session_config=tf.ConfigProto(log_device_placement=False))

    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_dataset = dataset_class(
            in_file=FLAGS.train_file,
            tokenizer=tokenizer,
            subject_mention_probability=FLAGS.subject_mention_probability,
            max_qry_length=FLAGS.max_query_length,
            is_training=True,
            entity2id=entity2id,
            tfrecord_filename=os.path.join(FLAGS.output_dir, "train.tf_record"))
        # Total steps = examples/batch * epochs; warmup is a fraction of that.
        num_train_steps = int(train_dataset.num_examples /
                              FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    summary_obj = None
    model_fn = model_fn_builder(
        bert_config=bert_config,
        qa_config=qa_config,
        mips_config=mips_config,
        init_checkpoint=FLAGS.init_checkpoint,
        e2m_checkpoint=os.path.join(FLAGS.train_data_dir, "ent2ment.npz"),
        m2e_checkpoint=os.path.join(FLAGS.train_data_dir, "coref.npz"),
        entity_id_checkpoint=os.path.join(FLAGS.train_data_dir, "entity_ids"),
        entity_mask_checkpoint=os.path.join(FLAGS.train_data_dir,
                                            "entity_mask"),
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        create_model_fn=create_model_fn,
        summary_obj=summary_obj)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num orig examples = %d", train_dataset.num_examples)
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        train(train_dataset, estimator, num_train_steps)

    if FLAGS.do_predict:
        eval_dataset = dataset_class(
            in_file=FLAGS.predict_file,
            tokenizer=tokenizer,
            subject_mention_probability=0.0,  # never mask subjects at eval time
            max_qry_length=FLAGS.max_query_length,
            is_training=False,
            entity2id=entity2id,
            tfrecord_filename=os.path.join(FLAGS.output_dir, "eval.tf_record"))
        continuous_eval(
            eval_dataset,
            estimator,
            mention2text,
            entityid2name,
            qa_config.supervision,
            eval_fn,
            paragraphs=all_paragraphs,
            mentions=all_mentions)

    if FLAGS.do_test:
        # Load mention and entity files — reloaded from test_data_dir because
        # the test corpus may index a different entity/mention set.
        mention2text = json.load(
            tf.gfile.Open(os.path.join(FLAGS.test_data_dir,
                                       "mention2text.json")))
        entity2id, entity2name = json.load(
            tf.gfile.Open(os.path.join(FLAGS.test_data_dir, "entities.json")))
        entityid2name = {str(i): entity2name[e] for e, i in entity2id.items()}
        all_paragraphs = json.load(
            tf.gfile.Open(os.path.join(FLAGS.test_data_dir, "subparas.json")))
        all_mentions = np.load(
            tf.gfile.Open(os.path.join(FLAGS.test_data_dir, "mentions.npy")))

        # Keep the shared qa_config in sync with the test entity vocabulary.
        qa_config.num_entities = len(entity2id)
        mips_config = MIPSConfig(
            ckpt_path=os.path.join(FLAGS.test_data_dir, "mention_feats"),
            ckpt_var_name="db_emb",
            num_mentions=len(mention2text),
            emb_size=FLAGS.projection_dim * 2,
            num_neighbors=FLAGS.num_mips_neighbors)
        # Rebuild model_fn / estimator against the test-side index files.
        model_fn = model_fn_builder(
            bert_config=bert_config,
            qa_config=qa_config,
            mips_config=mips_config,
            init_checkpoint=FLAGS.init_checkpoint,
            e2m_checkpoint=os.path.join(FLAGS.test_data_dir, "ent2ment.npz"),
            m2e_checkpoint=os.path.join(FLAGS.test_data_dir, "coref.npz"),
            entity_id_checkpoint=os.path.join(FLAGS.test_data_dir,
                                              "entity_ids"),
            entity_mask_checkpoint=os.path.join(FLAGS.test_data_dir,
                                                "entity_mask"),
            learning_rate=FLAGS.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=FLAGS.use_tpu,
            use_one_hot_embeddings=FLAGS.use_tpu,
            create_model_fn=create_model_fn,
            summary_obj=summary_obj)
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)
        eval_dataset = dataset_class(
            in_file=FLAGS.test_file,
            tokenizer=tokenizer,
            subject_mention_probability=0.0,
            max_qry_length=FLAGS.max_query_length,
            is_training=False,
            entity2id=entity2id,
            tfrecord_filename=os.path.join(FLAGS.output_dir, "test.tf_record"))

        # Prefer the "best_model" checkpoint when one was exported; otherwise
        # single_eval falls back on the latest checkpoint (ckpt_path=None).
        if tf.gfile.Exists(os.path.join(FLAGS.output_dir, "best_model.meta")):
            ckpt_path = os.path.join(FLAGS.output_dir, "best_model")
        else:
            ckpt_path = None
        output_prediction_file = os.path.join(FLAGS.output_dir,
                                              "test_predictions.json")
        metrics = single_eval(
            eval_dataset,
            estimator,
            ckpt_path,
            mention2text,
            entityid2name,
            qa_config.supervision,
            output_prediction_file,
            eval_fn,
            paragraphs=all_paragraphs,
            mentions=all_mentions)
        # Log and persist final test metrics.
        with tf.gfile.Open(os.path.join(FLAGS.output_dir, "test_metrics.txt"),
                           "w") as fo:
            for metric, value in metrics.items():
                tf.logging.info("%s: %.4f", metric, value)
                fo.write("%s %.4f\n" % (metric, value))
def main(_):
    """Entry point: trains a TextCNN text classifier.

    Uses module-level paths (ckpt_path, summary_path, embedding_path,
    model_path, data_*_path) and FLAGS. Restores from the latest checkpoint
    when one exists, otherwise initializes from scratch; saves a new
    checkpoint at the end only if the final validation F1 improves on the
    best F1 seen so far (module-level ``last_f1``).
    """
    global ckpt_path
    global last_f1
    if not os.path.exists(ckpt_path):
        os.makedirs(ckpt_path)
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    elif not FLAGS.is_retrain:  # fresh (non-resumed) run: drop old summaries
        shutil.rmtree(summary_path)
        os.makedirs(summary_path)
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)

    print('1.Loading data...')
    W_embedding = np.load(embedding_path)  # pretrained word-embedding matrix
    print('training sample_num = %d' % n_tr_batches)
    print('valid sample_num = %d' % n_va_batches)

    # Initial or restore the model
    print('2.Building model...')
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = network.TextCNN(W_embedding, settings)
        with tf.variable_scope('training_ops') as vs:
            learning_rate = tf.train.exponential_decay(FLAGS.lr,
                                                       model.global_step,
                                                       FLAGS.decay_step,
                                                       FLAGS.decay_rate,
                                                       staircase=True)
            # two optimizer: op1, update embedding; op2, do not update embedding.
            with tf.variable_scope('Optimizer1'):
                tvars1 = tf.trainable_variables()
                grads1 = tf.gradients(model.loss, tvars1)
                optimizer1 = tf.train.AdamOptimizer(
                    learning_rate=learning_rate)
                train_op1 = optimizer1.apply_gradients(
                    zip(grads1, tvars1), global_step=model.global_step)
            with tf.variable_scope('Optimizer2'):
                # Same optimizer but with embedding variables excluded.
                tvars2 = [
                    tvar for tvar in tvars1 if 'embedding' not in tvar.name
                ]
                grads2 = tf.gradients(model.loss, tvars2)
                optimizer2 = tf.train.AdamOptimizer(
                    learning_rate=learning_rate)
                train_op2 = optimizer2.apply_gradients(
                    zip(grads2, tvars2), global_step=model.global_step)
            update_op = tf.group(*model.update_emas)
            merged = tf.summary.merge_all()  # summary
            train_writer = tf.summary.FileWriter(summary_path + 'train',
                                                 sess.graph)
            test_writer = tf.summary.FileWriter(summary_path + 'test')
            # All variables created inside this scope (optimizer slots etc.);
            # re-initialized separately after a checkpoint restore below.
            training_ops = [
                v for v in tf.global_variables()
                if v.name.startswith(vs.name + '/')
            ]

        # If a model checkpoint was saved previously, restore the latest one.
        if os.path.exists(ckpt_path + "checkpoint"):
            print("Restoring Variables from Checkpoint...")
            model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
            last_valid_cost, precision, recall, last_f1 = valid_epoch(
                data_valid_path, sess, model)
            print(' valid cost=%g; p=%g, r=%g, f1=%g' %
                  (last_valid_cost, precision, recall, last_f1))
            # Optimizer slot variables are not in the checkpoint: init them.
            sess.run(tf.variables_initializer(training_ops))
            # After a restore, always fine-tune embeddings too.
            train_op2 = train_op1
        else:
            print('Initializing Variables...')
            sess.run(tf.global_variables_initializer())

        print('3.Begin training...')
        print('max_epoch=%d, max_max_epoch=%d' %
              (FLAGS.max_epoch, FLAGS.max_max_epoch))
        for epoch in xrange(FLAGS.max_max_epoch):
            global_step = sess.run(model.global_step)
            print('Global step %d, lr=%g' %
                  (global_step, sess.run(learning_rate)))
            if epoch == FLAGS.max_epoch:  # from here on, update the embedding
                train_op = train_op1
            else:
                train_op = train_op2
            train_fetches = [merged, model.loss, train_op, update_op]
            valid_fetches = [merged, model.loss]
            train_epoch(data_train_path, sess, model, train_fetches,
                        valid_fetches, train_writer, test_writer)

        # Run one final validation pass.
        valid_cost, precision, recall, f1 = valid_epoch(
            data_valid_path, sess, model)
        print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' %
              (sess.run(model.global_step), valid_cost, precision, recall, f1))
        if f1 > last_f1:  # save the better model
            saving_path = model.saver.save(sess, model_path,
                                           sess.run(model.global_step) + 1)
            print('saved new model to %s ' % saving_path)
def train_lanenet_multi_gpu(dataset_dir, weights_path=None, net_flag='vgg'):
    """Train LaneNet across multiple GPUs with synchronous gradient averaging.

    Builds one training tower per GPU (CFG.TRAIN.GPU_NUM), averages the
    per-tower gradients, applies them with momentum SGD under a polynomial
    learning-rate decay, and maintains exponential moving averages of all
    trainable variables. Summaries go to tboard/..., checkpoints to model/...

    :param dataset_dir: root directory of the train/val dataset
    :param weights_path: optional checkpoint to restore; None trains from scratch
    :param net_flag: backbone selector passed to lanenet.LaneNet (default 'vgg')
    :return: None
    """
    # set lanenet dataset — training and validation feeders
    train_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        dataset_dir=dataset_dir, flags='train'
    )
    val_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        dataset_dir=dataset_dir, flags='val'
    )

    # set lanenet — val net reuses the train net's variables (reuse=True)
    train_net = lanenet.LaneNet(net_flag=net_flag, phase='train', reuse=False)
    val_net = lanenet.LaneNet(net_flag=net_flag, phase='val', reuse=True)

    # set compute graph node
    train_images, train_binary_labels, train_instance_labels = train_dataset.inputs(
        CFG.TRAIN.BATCH_SIZE, 1
    )
    val_images, val_binary_labels, val_instance_labels = val_dataset.inputs(
        CFG.TRAIN.VAL_BATCH_SIZE, 1
    )

    # set average container — filled per GPU tower below
    tower_grads = []
    train_tower_loss = []
    val_tower_loss = []
    batchnorm_updates = None
    train_summary_op_updates = None

    # set lr
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.polynomial_decay(
        learning_rate=CFG.TRAIN.LEARNING_RATE,  # e.g. 0.0005
        global_step=global_step,
        decay_steps=CFG.TRAIN.EPOCHS,  # e.g. 80010
        power=0.9
    )
    """
    训练神经网络时,控制学习率对训练的速度和准确度都有很大作用.逐渐减小学习率在实践中被证明对训练的收敛有正向效果
    函数使用多项式衰减,在给定的decay_steps将初始学习率衰减到指定的学习率
    输入:
    learning_rate:初始值
    global_step:全局step数
    decay_steps:学习率衰减的步数,也代表学习率每次更新相隔的步数
    end_learning_rate:衰减最终值
    power:多项式衰减系数
    cycle:step超出decay_steps之后是否继续循环
    name:操作的名称,默认为PolynomialDecay。
    参数cycle目的:防止神经网络训练后期学习率过小导致网络一直在某个局部最小值中振荡;这样,通过增大学习率可以跳出局部极小值.
    """

    # set optimizer
    optimizer = tf.train.MomentumOptimizer(
        learning_rate=learning_rate,
        momentum=CFG.TRAIN.MOMENTUM  # momentum: 要在多大程度上保留原来的更新方向 (0.9)
    )
    """
    动量梯度下降算法
    """

    # set distributed train op — one tower per GPU, shared variables
    with tf.variable_scope(tf.get_variable_scope()):
        for i in range(CFG.TRAIN.GPU_NUM):
            with tf.device('/gpu:{:d}'.format(i)):
                """
                Returns: A context manager that specifies the default device to use for newly created ops.
                """
                with tf.name_scope('tower_{:d}'.format(i)) as _:
                    train_loss, grads = compute_net_gradients(
                        train_images, train_binary_labels, train_instance_labels,
                        train_net, optimizer
                    )

                    # Only use the mean and var in the first gpu tower to update the parameter
                    if i == 0:
                        batchnorm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                        train_summary_op_updates = tf.get_collection(tf.GraphKeys.SUMMARIES)

                    tower_grads.append(grads)
                    train_tower_loss.append(train_loss)
                with tf.name_scope('validation_{:d}'.format(i)) as _:
                    val_loss, _ = compute_net_gradients(
                        val_images, val_binary_labels, val_instance_labels,
                        val_net, optimizer)
                    val_tower_loss.append(val_loss)

    # Synchronous update: average gradients and losses over all towers.
    grads = average_gradients(tower_grads)
    avg_train_loss = tf.reduce_mean(train_tower_loss)
    avg_val_loss = tf.reduce_mean(val_tower_loss)

    # Track the moving averages of all trainable variables
    variable_averages = tf.train.ExponentialMovingAverage(
        CFG.TRAIN.MOVING_AVERAGE_DECAY, num_updates=global_step)
    variables_to_average = tf.trainable_variables() + tf.moving_average_variables()
    variables_averages_op = variable_averages.apply(variables_to_average)

    # Group all the op needed for training
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)
    train_op = tf.group(apply_gradient_op, variables_averages_op,
                        batchnorm_updates_op)

    # Set tf summary save path
    tboard_save_path = 'tboard/tusimple_lanenet_multi_gpu_{:s}'.format(net_flag)
    os.makedirs(tboard_save_path, exist_ok=True)

    summary_writer = tf.summary.FileWriter(tboard_save_path)

    avg_train_loss_scalar = tf.summary.scalar(
        name='average_train_loss', tensor=avg_train_loss
    )
    avg_val_loss_scalar = tf.summary.scalar(
        name='average_val_loss', tensor=avg_val_loss
    )
    learning_rate_scalar = tf.summary.scalar(
        name='learning_rate_scalar', tensor=learning_rate
    )
    train_merge_summary_op = tf.summary.merge(
        [avg_train_loss_scalar, learning_rate_scalar] + train_summary_op_updates
    )
    val_merge_summary_op = tf.summary.merge([avg_val_loss_scalar])

    # set tensorflow saver
    saver = tf.train.Saver()
    model_save_dir = 'model/tusimple_lanenet_multi_gpu_{:s}'.format(net_flag)
    os.makedirs(model_save_dir, exist_ok=True)
    train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S',
                                     time.localtime(time.time()))
    model_name = 'tusimple_lanenet_{:s}_{:s}.ckpt'.format(net_flag,
                                                          str(train_start_time))
    model_save_path = ops.join(model_save_dir, model_name)

    # set sess config
    sess_config = tf.ConfigProto(device_count={'GPU': CFG.TRAIN.GPU_NUM},
                                 allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TRAIN.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.TRAIN.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'

    # Set the training parameters
    # NOTE(review): each "epoch" below is a single sess.run step; EPOCHS is
    # used both as decay_steps above and as the step count here — confirm.
    train_epochs = CFG.TRAIN.EPOCHS

    log.info('Global configuration is as follows:')
    log.info(CFG)

    sess = tf.Session(config=sess_config)

    summary_writer.add_graph(sess.graph)

    with sess.as_default():
        tf.train.write_graph(
            graph_or_graph_def=sess.graph, logdir='',
            name='{:s}/lanenet_model.pb'.format(model_save_dir))

        if weights_path is None:
            log.info('Training from scratch')
            init = tf.global_variables_initializer()
            sess.run(init)
        else:
            log.info('Restore model from last model checkpoint {:s}'.format(weights_path))
            saver.restore(sess=sess, save_path=weights_path)

        train_cost_time_mean = []
        val_cost_time_mean = []

        for epoch in range(train_epochs):
            # training part
            t_start = time.time()

            _, train_loss_value, train_summary, lr = \
                sess.run(fetches=[train_op, avg_train_loss,
                                  train_merge_summary_op, learning_rate])

            if math.isnan(train_loss_value):
                log.error('Train loss is nan')
                return

            cost_time = time.time() - t_start
            train_cost_time_mean.append(cost_time)

            summary_writer.add_summary(summary=train_summary,
                                       global_step=epoch)

            # validation part
            t_start_val = time.time()

            val_loss_value, val_summary = \
                sess.run(fetches=[avg_val_loss, val_merge_summary_op])

            summary_writer.add_summary(val_summary, global_step=epoch)

            cost_time_val = time.time() - t_start_val
            val_cost_time_mean.append(cost_time_val)

            if epoch % CFG.TRAIN.DISPLAY_STEP == 0:
                log.info('Epoch_Train: {:d} total_loss= {:6f} '
                         'lr= {:6f} mean_cost_time= {:5f}s '.
                         format(epoch + 1,
                                train_loss_value,
                                lr,
                                np.mean(train_cost_time_mean))
                         )
                train_cost_time_mean.clear()

            if epoch % CFG.TRAIN.VAL_DISPLAY_STEP == 0:
                log.info('Epoch_Val: {:d} total_loss= {:6f}'
                         ' mean_cost_time= {:5f}s '.
                         format(epoch + 1,
                                val_loss_value,
                                np.mean(val_cost_time_mean))
                         )
                val_cost_time_mean.clear()

            if epoch % 2000 == 0:
                saver.save(sess=sess, save_path=model_save_path,
                           global_step=epoch)
    return
import tensorflow as tf

# Build a tiny graph explicitly pinned to the first GPU: a 2x3 by 3x2
# matrix multiplication.
with tf.device('/device:GPU:0'):
    lhs = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
    rhs = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
    product = tf.matmul(lhs, rhs)

###
# Viewing the graph in TensorBoard:
#   python -m tensorboard.main --logdir=[PATH_TO_LOGDIR]
# [PATH_TO_LOGDIR] - without ' or ", + add '/'
# FOR EXAMPLE: "python -m tensorboard.main --logdir=logs/"
# where 'logs/' - is your directory where your log file placed from writer or smth else
# ERRORS:
# 1) tensorboard shows no graph -> try to insert writer code somewhere else near the session part
#    or after initialization of all variables
###
graph_writer = tf.summary.FileWriter('logs')
graph_writer.add_graph(tf.get_default_graph())

# Run with device-placement logging enabled so TF prints which device each
# op landed on.
session = tf.Session(config=tf.ConfigProto(log_device_placement=True))
variable = session.run([product])
def main(args=None):
    """Entry point: trains, decodes, evaluates or aligns a translation model.

    Merges command-line args over the experiment YAML config over
    config/default.yaml (command line has highest precedence), snapshots
    config + code into the model directory on training runs, builds the
    model under the selected device, then dispatches to the requested
    action(s).

    :param args: optional argv list for parser.parse_args (defaults to sys.argv)
    """
    args = parser.parse_args(args)

    # read config file and default config
    with open('config/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))
    with open(args.config) as f:
        config = utils.AttrDict(yaml.safe_load(f))

    if args.learning_rate is not None:
        args.reset_learning_rate = True

    # command-line parameters have higher precedence than config file
    for k, v in vars(args).items():
        if v is not None:
            config[k] = v

    # set default values for parameters that are not defined
    for k, v in default_config.items():
        config.setdefault(k, v)

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # disable TensorFlow's debugging logs
    decoding_mode = any(arg is not None for arg in (args.decode, args.eval,
                                                    args.align))

    # enforce parameter constraints
    assert config.steps_per_eval % config.steps_per_checkpoint == 0, (
        'steps-per-eval should be a multiple of steps-per-checkpoint')
    assert decoding_mode or args.train, (
        'you need to specify at least one action (decode, eval, align, or train)')
    assert not (args.average and args.ensemble)

    if args.train and args.purge:
        utils.log('deleting previous model')
        shutil.rmtree(config.model_dir, ignore_errors=True)

    os.makedirs(config.model_dir, exist_ok=True)

    # copy config file to model directory, rewriting model_dir inside it
    config_path = os.path.join(config.model_dir, 'config.yaml')
    if args.train and not os.path.exists(config_path):
        with open(args.config) as config_file, open(config_path, 'w') as dest_file:
            content = config_file.read()
            content = re.sub(r'model_dir:.*?\n',
                             'model_dir: {}\n'.format(config.model_dir),
                             content, flags=re.MULTILINE)
            dest_file.write(content)

    # also copy default config
    config_path = os.path.join(config.model_dir, 'default.yaml')
    if args.train and not os.path.exists(config_path):
        shutil.copy('config/default.yaml', config_path)

    # copy source code to model directory (reproducibility snapshot)
    tar_path = os.path.join(config.model_dir, 'code.tar.gz')
    if args.train and not os.path.exists(tar_path):
        with tarfile.open(tar_path, "w:gz") as tar:
            for filename in os.listdir('translate'):
                if filename.endswith('.py'):
                    tar.add(os.path.join('translate', filename),
                            arcname=filename)

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    # always log to stdout in decoding and eval modes (to avoid overwriting
    # precious train logs)
    log_path = os.path.join(config.model_dir, config.log_file)
    logger = utils.create_logger(log_path if args.train else None)
    logger.setLevel(logging_level)

    utils.log('label: {}'.format(config.label))
    utils.log('description:\n {}'.format(
        '\n '.join(config.description.strip().split('\n'))))

    utils.log(' '.join(sys.argv))  # print command line
    try:  # print git hash
        commit_hash = subprocess.check_output(['git', 'rev-parse',
                                               'HEAD']).decode().strip()
        utils.log('commit hash {}'.format(commit_hash))
    except:
        # best effort only: not a git checkout, or git not installed
        pass

    utils.log('tensorflow version: {}'.format(tf.__version__))

    # log parameters
    utils.debug('program arguments')
    for k, v in sorted(config.items(), key=itemgetter(0)):
        utils.debug(' {:<20} {}'.format(k, pformat(v)))

    if isinstance(config.dev_prefix, str):
        config.dev_prefix = [config.dev_prefix]

    # multi-task setup: each task inherits unset parameters from the
    # top-level config, and each encoder/decoder from its task
    if config.tasks is not None:
        config.tasks = [utils.AttrDict(task) for task in config.tasks]
        tasks = config.tasks
    else:
        tasks = [config]

    for task in tasks:
        for parameter, value in config.items():
            task.setdefault(parameter, value)

        task.encoders = [utils.AttrDict(encoder) for encoder in task.encoders]
        task.decoders = [utils.AttrDict(decoder) for decoder in task.decoders]

        for encoder_or_decoder in task.encoders + task.decoders:
            for parameter, value in task.items():
                encoder_or_decoder.setdefault(parameter, value)

    device = None
    if config.no_gpu:
        device = '/cpu:0'
        device_id = None
    elif config.gpu_id is not None:
        device = '/gpu:{}'.format(config.gpu_id)
        device_id = config.gpu_id
    else:
        device_id = 0

    # hide other GPUs so that TensorFlow won't use memory on them
    os.environ['CUDA_VISIBLE_DEVICES'] = '' if device_id is None else str(device_id)

    utils.log('creating model')
    utils.log('using device: {}'.format(device))

    with tf.device(device):
        config.checkpoint_dir = os.path.join(config.model_dir, 'checkpoints')

        if config.weight_scale:
            if config.initializer == 'uniform':
                initializer = tf.random_uniform_initializer(
                    minval=-config.weight_scale, maxval=config.weight_scale)
            else:
                initializer = tf.random_normal_initializer(
                    stddev=config.weight_scale)
        else:
            initializer = None

        tf.get_variable_scope().set_initializer(initializer)
        # exempt from creating gradient ops
        config.decode_only = decoding_mode

        if config.tasks is not None:
            model = MultiTaskModel(**config)
        else:
            model = TranslationModel(**config)

    # count parameters
    utils.log('model parameters ({})'.format(len(tf.global_variables())))
    parameter_count = 0
    for var in tf.global_variables():
        utils.log(' {} {}'.format(var.name, var.get_shape()))
        if not var.name.startswith('gradients'):
            # not counting parameters created by training algorithm (e.g. Adam)
            v = 1
            for d in var.get_shape():
                v *= d.value
            parameter_count += v
    utils.log('number of parameters: {:.2f}M'.format(parameter_count / 1e6))

    tf_config = tf.ConfigProto(log_device_placement=False,
                               allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = config.allow_growth
    tf_config.gpu_options.per_process_gpu_memory_fraction = config.mem_fraction

    def average_checkpoints(main_sess, sessions):
        # Element-wise average of every variable across the given sessions,
        # written into main_sess.
        for var in tf.global_variables():
            avg_value = sum(sess.run(var) for sess in sessions) / len(sessions)
            main_sess.run(var.assign(avg_value))

    with tf.Session(config=tf_config) as sess:
        best_checkpoint = os.path.join(config.checkpoint_dir, 'best')

        if config.ensemble and len(config.checkpoints) > 1:
            # create one session for each model in the ensemble
            model.initialize(config.checkpoints)
        elif config.average and len(config.checkpoints) > 1:
            # checkpoint averaging: load each checkpoint into its own
            # session, then average the weights into the main session
            model.initialize(reset=True)
            sessions = [tf.Session(config=tf_config)
                        for _ in config.checkpoints]
            for sess_, checkpoint in zip(sessions, config.checkpoints):
                model.initialize(sess=sess_, checkpoints=[checkpoint])
            average_checkpoints(sess, sessions)
        elif (not config.checkpoints and decoding_mode and
              (os.path.isfile(best_checkpoint + '.index') or
               os.path.isfile(best_checkpoint + '.index'))):
            # NOTE(review): both operands of this `or` are identical; the
            # second was presumably meant to test a different file (e.g. the
            # V1-format checkpoint `best_checkpoint` itself, or '.meta') —
            # confirm intent before changing.
            # in decoding and evaluation mode, unless specified otherwise
            # (by `checkpoints`), try to load the best checkpoint
            model.initialize([best_checkpoint])
        else:
            # loads last checkpoint, unless `reset` is true
            model.initialize(**config)

        if args.decode is not None:
            model.decode(**config)
        elif args.eval is not None:
            model.evaluate(on_dev=False, **config)
        elif args.align is not None:
            model.align(**config)
        elif args.train:
            try:
                model.train(**config)
            except KeyboardInterrupt:
                sys.exit()
def main():
    """Entry point: samples a video-prediction model and saves outputs.

    Loads a dataset and model (hyperparameters recovered from the
    checkpoint directory when --checkpoint is given), draws
    --num_stochastic_samples predicted futures per batch element, and
    writes them as GIFs and per-frame PNGs under the output directories.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir", type=str, required=True,
                        help="either a directory containing subdirectories "
                             "train, val, test, etc, or a directory containing "
                             "the tfrecords")
    parser.add_argument("--results_dir", type=str, default='results',
                        help="ignored if output_gif_dir is specified")
    parser.add_argument("--results_gif_dir", type=str,
                        help="default is results_dir. ignored if output_gif_dir is specified")
    parser.add_argument("--results_png_dir", type=str,
                        help="default is results_dir. ignored if output_png_dir is specified")
    parser.add_argument("--output_gif_dir",
                        help="output directory where samples are saved as gifs. default is "
                             "results_gif_dir/model_fname")
    parser.add_argument("--output_png_dir",
                        help="output directory where samples are saved as pngs. default is "
                             "results_png_dir/model_fname")
    parser.add_argument("--checkpoint",
                        help="directory with checkpoint or checkpoint name (e.g. checkpoint_dir/model-200000)")
    parser.add_argument("--mode", type=str, choices=['val', 'test'],
                        default='val', help='mode for dataset, val or test.')
    parser.add_argument("--dataset", type=str, help="dataset class name")
    parser.add_argument("--dataset_hparams", type=str,
                        help="a string of comma separated list of dataset hyperparameters")
    parser.add_argument("--model", type=str, help="model class name")
    parser.add_argument("--model_hparams", type=str,
                        help="a string of comma separated list of model hyperparameters")
    parser.add_argument("--batch_size", type=int, default=8,
                        help="number of samples in batch")
    parser.add_argument("--num_samples", type=int,
                        help="number of samples in total (all of them by default)")
    parser.add_argument("--num_epochs", type=int, default=1)
    parser.add_argument("--num_stochastic_samples", type=int, default=5)
    parser.add_argument("--gif_length", type=int,
                        help="default is sequence_length")
    parser.add_argument("--fps", type=int, default=4)
    parser.add_argument("--gpu_mem_frac", type=float, default=0,
                        help="fraction of gpu memory to use")
    parser.add_argument("--seed", type=int, default=7)
    args = parser.parse_args()

    # Seed every RNG in play for reproducible sampling.
    if args.seed is not None:
        tf.set_random_seed(args.seed)
        np.random.seed(args.seed)
        random.seed(args.seed)

    args.results_gif_dir = args.results_gif_dir or args.results_dir
    args.results_png_dir = args.results_png_dir or args.results_dir

    dataset_hparams_dict = {}
    model_hparams_dict = {}
    if args.checkpoint:
        # Recover dataset/model names and hparams recorded next to the
        # checkpoint; explicit CLI values still win.
        checkpoint_dir = os.path.normpath(args.checkpoint)
        if not os.path.exists(checkpoint_dir):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    checkpoint_dir)
        if not os.path.isdir(args.checkpoint):
            # a specific checkpoint file was given; use its directory
            checkpoint_dir, _ = os.path.split(checkpoint_dir)
        with open(os.path.join(checkpoint_dir, "options.json")) as f:
            print("loading options from checkpoint %s" % args.checkpoint)
            options = json.loads(f.read())
            args.dataset = args.dataset or options['dataset']
            args.model = args.model or options['model']
        try:
            with open(os.path.join(checkpoint_dir, "dataset_hparams.json")) as f:
                dataset_hparams_dict = json.loads(f.read())
        except FileNotFoundError:
            print("dataset_hparams.json was not loaded because it does not exist")
        try:
            with open(os.path.join(checkpoint_dir, "model_hparams.json")) as f:
                model_hparams_dict = json.loads(f.read())
                model_hparams_dict.pop('num_gpus', None)  # backwards-compatibility
        except FileNotFoundError:
            print("model_hparams.json was not loaded because it does not exist")
        args.output_gif_dir = args.output_gif_dir or os.path.join(
            args.results_gif_dir, os.path.split(checkpoint_dir)[1])
        args.output_png_dir = args.output_png_dir or os.path.join(
            args.results_png_dir, os.path.split(checkpoint_dir)[1])
    else:
        if not args.dataset:
            raise ValueError('dataset is required when checkpoint is not specified')
        if not args.model:
            raise ValueError('model is required when checkpoint is not specified')
        args.output_gif_dir = args.output_gif_dir or os.path.join(
            args.results_gif_dir, 'model.%s' % args.model)
        args.output_png_dir = args.output_png_dir or os.path.join(
            args.results_png_dir, 'model.%s' % args.model)

    print('----------------------------------- Options ------------------------------------')
    for k, v in args._get_kwargs():
        print(k, "=", v)
    print('------------------------------------- End --------------------------------------')

    VideoDataset = datasets.get_dataset_class(args.dataset)
    dataset = VideoDataset(args.input_dir, mode=args.mode,
                           num_epochs=args.num_epochs, seed=args.seed,
                           hparams_dict=dataset_hparams_dict,
                           hparams=args.dataset_hparams)

    def override_hparams_dict(dataset):
        # Propagate dataset-side sequence settings into the model hparams.
        hparams_dict = dict(model_hparams_dict)
        hparams_dict['context_frames'] = dataset.hparams.context_frames
        hparams_dict['sequence_length'] = dataset.hparams.sequence_length
        # NOTE(review): 'repeat' is set from the dataset's time_shift —
        # the naming mismatch looks deliberate but verify against the
        # model's hparams definition.
        hparams_dict['repeat'] = dataset.hparams.time_shift
        return hparams_dict

    VideoPredictionModel = models.get_model_class(args.model)
    model = VideoPredictionModel(mode='test',
                                 hparams_dict=override_hparams_dict(dataset),
                                 hparams=args.model_hparams)

    if args.num_samples:
        if args.num_samples > dataset.num_examples_per_epoch():
            raise ValueError('num_samples cannot be larger than the dataset')
        num_examples_per_epoch = args.num_samples
    else:
        num_examples_per_epoch = dataset.num_examples_per_epoch()
    if num_examples_per_epoch % args.batch_size != 0:
        raise ValueError('batch_size should evenly divide the dataset')

    inputs, target = dataset.make_batch(args.batch_size)
    if not isinstance(model, models.GroundTruthVideoPredictionModel):
        # remove ground truth data past context_frames to prevent
        # accidentally using it
        for k, v in inputs.items():
            if k != 'actions':
                inputs[k] = v[:, :model.hparams.context_frames]

    # Placeholders mirror the batch tensors so sampled numpy batches can be
    # re-fed multiple times (once per stochastic sample).
    input_phs = {k: tf.placeholder(v.dtype, v.shape, '%s_ph' % k)
                 for k, v in inputs.items()}
    target_ph = tf.placeholder(target.dtype, target.shape, 'targets_ph')

    with tf.variable_scope(''):
        model.build_graph(input_phs, target_ph)

    # Record the exact options/hparams next to the outputs.
    for output_dir in (args.output_gif_dir, args.output_png_dir):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        with open(os.path.join(output_dir, "options.json"), "w") as f:
            f.write(json.dumps(vars(args), sort_keys=True, indent=4))
        with open(os.path.join(output_dir, "dataset_hparams.json"), "w") as f:
            f.write(json.dumps(dataset.hparams.values(), sort_keys=True, indent=4))
        with open(os.path.join(output_dir, "model_hparams.json"), "w") as f:
            f.write(json.dumps(model.hparams.values(), sort_keys=True, indent=4))

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem_frac)
    config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
    sess = tf.Session(config=config)

    model.restore(sess, args.checkpoint)

    sample_ind = 0
    while True:
        if args.num_samples and sample_ind >= args.num_samples:
            break
        try:
            input_results, target_result = sess.run([inputs, target])
        except tf.errors.OutOfRangeError:
            # dataset iterator exhausted
            break
        print("evaluation samples from %d to %d" %
              (sample_ind, sample_ind + args.batch_size))

        feed_dict = {input_ph: input_results[name]
                     for name, input_ph in input_phs.items()}
        for stochastic_sample_ind in range(args.num_stochastic_samples):
            gen_images = sess.run(model.outputs['gen_images'],
                                  feed_dict=feed_dict)
            for i, gen_images_ in enumerate(gen_images):
                # NOTE(review): assumes gen_images_ is float in [0, 1];
                # scaled to uint8 for image output — confirm model contract.
                gen_images_ = (gen_images_ * 255.0).astype(np.uint8)
                gen_images_fname = 'gen_image_%05d_%02d.gif' % (
                    sample_ind + i, stochastic_sample_ind)
                save_gif(os.path.join(args.output_gif_dir, gen_images_fname),
                         gen_images_[:args.gif_length] if args.gif_length
                         else gen_images_,
                         fps=args.fps)
                for t, gen_image in enumerate(gen_images_):
                    gen_image_fname = 'gen_image_%05d_%02d_%02d.png' % (
                        sample_ind + i, stochastic_sample_ind, t)
                    # OpenCV writes BGR; frames are RGB
                    gen_image = cv2.cvtColor(gen_image, cv2.COLOR_RGB2BGR)
                    cv2.imwrite(os.path.join(args.output_png_dir,
                                             gen_image_fname), gen_image)
        sample_ind += args.batch_size
def __init__(self, custom_db=None):
    """Build the VGG-Face network from a MatConvNet weight file and open a
    persistent TensorFlow session.

    Loads 'vggface/weight.mat' (MatConvNet export) from next to this source
    file, reconstructs the conv/relu/pool/softmax stack as TF ops, loads the
    face-embedding database (pickle), and warms the session up with one
    dummy batch.

    Args:
        custom_db: optional path to a pickled embedding database; when not
            given, the path is taken from DeepFaceConfs and resolved relative
            to the vggface directory.

    Raises:
        FileNotFoundError: if the weight file is missing.
    """
    self.batch_size = 4
    dir_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'vggface')
    filename = 'weight.mat'
    filepath = os.path.join(dir_path, filename)
    if not os.path.exists(filepath):
        raise FileNotFoundError('Weight file not found, path=%s' % filepath)
    data = loadmat(filepath)

    # read meta info
    # NOTE(review): the nested [0][0]... indexing below follows scipy's
    # loadmat representation of MatConvNet structs — confirm against the
    # exact .mat export version before touching it.
    meta = data['meta']
    classes = meta['classes']
    normalization = meta['normalization']
    # Mean image broadcast over (batch, height, width, channel); assumes 3
    # channels — TODO confirm against the weight file.
    self.average_image = np.squeeze(normalization[0][0]['averageImage'][0][0][0][0]).reshape(1, 1, 1, 3)
    self.input_hw = tuple(np.squeeze(normalization[0][0]['imageSize'][0][0])[:2])
    self.input_node = tf.placeholder(tf.float32, shape=(None, self.input_hw[0], self.input_hw[1], 3), name='image')
    self.class_names = [str(x[0][0]) for x in classes[0][0]['description'][0][0]]
    # Mean-subtract the input before feeding the conv stack.
    input_norm = tf.subtract(self.input_node, self.average_image, name='normalized_image')

    # read layer info: replay the MatConvNet layer list into TF ops.
    layers = data['layers']
    current = input_norm
    network = {}
    for layer in layers[0]:
        name = layer[0]['name'][0][0]
        layer_type = layer[0]['type'][0][0]
        if layer_type == 'conv':
            # Fully-connected layers are stored as 'fc*' convolutions; VALID
            # padding collapses the spatial dims.
            if name[:2] == 'fc':
                padding = 'VALID'
            else:
                padding = 'SAME'
            stride = layer[0]['stride'][0][0]
            kernel, bias = layer[0]['weights'][0][0]
            # kernel = np.transpose(kernel, (1, 0, 2, 3))
            bias = np.squeeze(bias).reshape(-1)
            conv = tf.nn.conv2d(current, tf.constant(kernel), strides=(1, stride[0], stride[0], 1), padding=padding)
            current = tf.nn.bias_add(conv, bias)
        elif layer_type == 'relu':
            current = tf.nn.relu(current)
        elif layer_type == 'pool':
            stride = layer[0]['stride'][0][0]
            pool = layer[0]['pool'][0][0]
            current = tf.nn.max_pool(current, ksize=(1, pool[0], pool[1], 1), strides=(1, stride[0], stride[0], 1), padding='SAME')
        elif layer_type == 'softmax':
            current = tf.nn.softmax(tf.reshape(current, [-1, len(self.class_names)]))
        # Every layer's output tensor is addressable by name (e.g. 'prob',
        # 'fc7' used below).
        network[name] = current
    self.network = network
    self.graph = tf.get_default_graph()
    config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
    # Long-lived session reused across inference calls.
    self.persistent_sess = tf.Session(graph=self.graph, config=config)

    self.db = None
    if custom_db:
        db_path = custom_db
    else:
        db_path = DeepFaceConfs.get()['recognizer']['vgg'].get('db', '')
        # NOTE(review): join reconstructed inside the else branch (config
        # paths look relative to the vggface dir, custom paths absolute) —
        # original indentation was lost in this source; verify.
        db_path = os.path.join(dir_path, db_path)
    with open(db_path, 'rb') as f:
        # NOTE(review): pickle.load on a user-supplied custom_db path runs
        # arbitrary code — only load trusted databases.
        self.db = pickle.load(f)

    # warm-up: run one dummy batch so later calls don't pay first-run cost.
    self.persistent_sess.run([self.network['prob'], self.network['fc7']], feed_dict={
        self.input_node: np.zeros((self.batch_size, 224, 224, 3), dtype=np.uint8)
    })
def main():
    """Command-line entry point.

    Parses the run type plus configuration flags, optionally publishes a
    TF_CONFIG cluster spec for distributed training, loads/merges the run
    configuration and model, builds the TF session config, and dispatches to
    the matching Runner method (train_and_eval / train / eval / infer /
    export / score).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("run",
                        choices=["train_and_eval", "train", "eval", "infer", "export", "score"],
                        help="Run type.")
    parser.add_argument("--config", required=True, nargs="+",
                        help="List of configuration files.")
    parser.add_argument("--auto_config", default=False, action="store_true",
                        help="Enable automatic configuration values.")
    parser.add_argument("--model_type", default="",
                        choices=list(classes_in_module(catalog, public_only=True)),
                        help="Model type from the catalog.")
    parser.add_argument("--model", default="",
                        help="Custom model configuration file.")
    parser.add_argument(
        "--run_dir", default="",
        help="If set, model_dir will be created relative to this location.")
    parser.add_argument(
        "--data_dir", default="",
        help="If set, data files are expected to be relative to this location.")
    parser.add_argument("--features_file", default=[], nargs="+",
                        help="Run inference on this file.")
    parser.add_argument(
        "--predictions_file", default="",
        help=("File used to save predictions. If not set, predictions are printed "
              "on the standard output."))
    parser.add_argument("--log_prediction_time", default=False, action="store_true",
                        help="Logs some prediction time metrics.")
    parser.add_argument(
        "--checkpoint_path", default=None,
        help=("Checkpoint or directory to use for inference or export "
              "(when a directory is set, the latest checkpoint is used)."))
    parser.add_argument("--export_dir_base", default=None,
                        help="The base directory of the exported model.")
    parser.add_argument("--num_gpus", type=int, default=1,
                        help="Number of GPUs to use for in-graph replication.")
    parser.add_argument(
        "--chief_host", default="",
        help="hostname:port of the chief worker (for distributed training).")
    parser.add_argument(
        "--worker_hosts", default="",
        help=("Comma-separated list of hostname:port of workers "
              "(for distributed training)."))
    parser.add_argument(
        "--ps_hosts", default="",
        help=("Comma-separated list of hostname:port of parameter servers "
              "(for distributed training)."))
    parser.add_argument(
        "--task_type", default="chief",
        choices=["chief", "worker", "ps", "evaluator"],
        help="Type of the task to run (for distributed training).")
    parser.add_argument("--task_index", type=int, default=0,
                        help="ID of the task (for distributed training).")
    parser.add_argument("--log_level", default="INFO",
                        choices=["DEBUG", "ERROR", "FATAL", "INFO", "WARN"],
                        help="Logs verbosity.")
    parser.add_argument("--seed", type=int, default=None, help="Random seed.")
    parser.add_argument("--gpu_allow_growth", default=False, action="store_true",
                        help="Allocate GPU memory dynamically.")
    parser.add_argument(
        "--intra_op_parallelism_threads", type=int, default=0,
        help=("Number of intra op threads (0 means the system picks "
              "an appropriate number)."))
    parser.add_argument(
        "--inter_op_parallelism_threads", type=int, default=0,
        help=("Number of inter op threads (0 means the system picks "
              "an appropriate number)."))
    parser.add_argument(
        "--session_config", default=None,
        help=("Path to a file containing a tf.ConfigProto message in text format "
              "and used to create the TensorFlow sessions."))
    args = parser.parse_args()

    tf.logging.set_verbosity(getattr(tf.logging, args.log_level))

    # Setup cluster if defined.
    # The estimator API reads the cluster spec from the TF_CONFIG env var.
    if args.chief_host:
        os.environ["TF_CONFIG"] = json.dumps({
            "cluster": {
                "chief": [args.chief_host],
                "worker": args.worker_hosts.split(","),
                "ps": args.ps_hosts.split(",")
            },
            "task": {
                "type": args.task_type,
                "index": args.task_index
            }
        })

    # Load and merge run configurations.
    config = load_config(args.config)
    if args.run_dir:
        config["model_dir"] = os.path.join(args.run_dir, config["model_dir"])
    if args.data_dir:
        config["data"] = _prefix_paths(args.data_dir, config["data"])

    # Only the chief creates the model directory (and serializes the model
    # below) to avoid concurrent writes in distributed runs.
    is_chief = args.task_type == "chief"
    if is_chief and not tf.gfile.Exists(config["model_dir"]):
        tf.logging.info("Creating model directory %s", config["model_dir"])
        tf.gfile.MakeDirs(config["model_dir"])

    model = load_model(config["model_dir"],
                       model_file=args.model,
                       model_name=args.model_type,
                       serialize_model=is_chief)

    session_config = tf.ConfigProto(
        intra_op_parallelism_threads=args.intra_op_parallelism_threads,
        inter_op_parallelism_threads=args.inter_op_parallelism_threads,
        gpu_options=tf.GPUOptions(allow_growth=args.gpu_allow_growth))
    if args.session_config is not None:
        # Merge a user-provided textproto over the defaults built above.
        # NOTE(review): the file is opened in binary mode and passed to
        # text_format.Merge — presumably relies on protobuf accepting bytes;
        # confirm with the pinned protobuf version.
        with open(args.session_config, "rb") as session_config_file:
            text_format.Merge(session_config_file.read(), session_config)

    runner = Runner(model,
                    config,
                    seed=args.seed,
                    num_devices=args.num_gpus,
                    session_config=session_config,
                    auto_config=args.auto_config)

    # Dispatch to the requested run type.
    if args.run == "train_and_eval":
        runner.train_and_evaluate()
    elif args.run == "train":
        runner.train()
    elif args.run == "eval":
        runner.evaluate(checkpoint_path=args.checkpoint_path)
    elif args.run == "infer":
        if not args.features_file:
            parser.error("--features_file is required for inference.")
        elif len(args.features_file) == 1:
            # A single features file is passed as a scalar, not a list.
            args.features_file = args.features_file[0]
        runner.infer(args.features_file,
                     predictions_file=args.predictions_file,
                     checkpoint_path=args.checkpoint_path,
                     log_time=args.log_prediction_time)
    elif args.run == "export":
        runner.export(checkpoint_path=args.checkpoint_path,
                      export_dir_base=args.export_dir_base)
    elif args.run == "score":
        if not args.features_file:
            parser.error("--features_file is required for scoring.")
        if not args.predictions_file:
            parser.error("--predictions_file is required for scoring.")
        runner.score(args.features_file,
                     args.predictions_file,
                     checkpoint_path=args.checkpoint_path)
def main(args):
    """Train and evaluate an MNIST CNN classifier with Horovod data parallelism.

    Args:
        args: parsed command-line namespace providing `use_only_cpu` (bool),
            `model_dir` (str) and `num_steps` (int).

    Side effects: creates the Keras dataset cache directory, downloads MNIST,
    writes checkpoints under `args.model_dir` (rank 0 only) and prints the
    final evaluation metrics.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras caches downloaded datasets under ~/.keras/datasets. Creating it
    # with makedirs(exist_ok=True) both builds any missing parent directory
    # (the previous os.mkdir raised ENOENT on a fresh machine where ~/.keras
    # did not exist yet) and tolerates the race where another worker sharing
    # the same filesystem creates the directory first.
    cache_dir = os.path.join(os.path.expanduser("~"), ".keras", "datasets")
    os.makedirs(cache_dir, exist_ok=True)

    # Download and load the MNIST dataset. The per-rank filename prevents
    # workers on a shared filesystem from clobbering each other's cache file.
    (train_data, train_labels), (eval_data, eval_labels) = keras.datasets.mnist.load_data(
        "MNIST-data-%d" % hvd.rank()
    )

    # The downloaded data has shape (-1, 28, 28); reshape to (-1, 784) for
    # the network input and normalize features into [0, 1].
    train_data = np.reshape(train_data, (-1, 784)) / 255.0
    eval_data = np.reshape(eval_data, (-1, 784)) / 255.0

    # Horovod: pin GPU to be used to process local rank (one GPU per process).
    if not args.use_only_cpu:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        estimator_config = tf.estimator.RunConfig(session_config=config)
    else:
        estimator_config = None

    # Horovod: save checkpoints only on worker 0 to prevent other workers
    # from corrupting them.
    model_dir = args.model_dir if hvd.rank() == 0 else None

    # Create the Estimator.
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir, config=estimator_config
    )

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
    # states from rank 0 to all other processes, ensuring consistent
    # initialization whether training starts from random weights or a
    # restored checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model.
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data}, y=train_labels, batch_size=100, num_epochs=None, shuffle=True
    )
    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn, steps=args.num_steps // hvd.size(), hooks=[bcast_hook]
    )

    # Evaluate the model and print results.
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False
    )
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
print ("Error! model_parallelism should be one of [1, 2, 4, 8, 16, 32]") exit(0) if model_parallelism * data_parallelism > 32: print ("Error! model_parallellism * data_parallelism should less than 32.") exit(0) if data_parallelism > 1: if batch_size < data_parallelism: print ("Error! batch_size must >= data_parallelism") exit(0) if batch_size % data_parallelism != 0: print ("Error! batch_size must be multiple of data_parallelism") exit(0) config = tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) config.mlu_options.save_offline_model = False config.mlu_options.data_parallelism = data_parallelism config.mlu_options.model_parallelism = model_parallelism config.mlu_options.core_num = core_num config.mlu_options.fusion = True config.mlu_options.core_version = core_version config.mlu_options.precision = precision #config.graph_options.rewrite_options.remapping = 2 #config.graph_options.rewrite_options.constant_folding = 2 #config.graph_options.rewrite_options.arithmetic_optimization = 2 config.mlu_options.optype_black_list ="StridedSlice"
def get_session(): config = tf.ConfigProto() config.gpu_options.allow_growth = True return tf.Session(config=config)
def __init__(self, net_class: AbstractNetClass, dataset_loader: AbstractDatasetLoader,
             optimizer: AbstractOptimizerClass, num_gpus: int = 1, seed=1337,
             train_data_size: int = 45000, batch_size: int = 100,
             dataset_path: str = "./Datasets/cifarDataset.npy", work_path: str = "../",
             experiment_name: str = "model0", is_calc_angle=False):
    """Set up the full training harness for one experiment.

    Creates the experiment/model/plot/log directories (wiping old plots and
    logs), seeds numpy and TF, opens a session, builds the dataset iterator
    and the network graph via `net_class.get_model`, wires optional
    gradient-angle bookkeeping, initializes the optimizer and all variables,
    and creates a checkpoint saver.

    Args:
        net_class: network factory providing `get_model` and `get_name`.
        dataset_loader: provides `get_iterator` over train/eval/test splits.
        optimizer: optimizer wrapper with an `initialize` method.
        num_gpus: number of GPUs the iterator/model are replicated over.
        seed: random seed applied to numpy and TF.
        train_data_size: number of training examples handed to the loader.
        batch_size: minibatch size.
        dataset_path: path to the serialized dataset.
        work_path: root under which the models directory is created.
        experiment_name: used in directory names and logging.
        is_calc_angle: when True, builds extra ops to measure the angle
            between the gradient and the optimizer's step direction.
    """
    self.optimizer = optimizer
    self.dataset_path = dataset_path
    # Each instance gets its own numbered directory via the class-level
    # counter, so repeated runs do not overwrite each other.
    self.model_dir = work_path + "./models/" + experiment_name + "_" + str(NetFrame.static_instance_counter) + "/"
    self.plot_dir = self.model_dir + "plots/" + experiment_name + "/"
    self.default_checkpoints_path = self.model_dir + "checkpoints/convNet.ckp"
    self.default_log_path = self.model_dir + "log/"
    self.experimentName = experiment_name
    self.batch_size = batch_size
    self.is_calc_angle = is_calc_angle
    u.check_and_create_path(self.model_dir)
    u.check_and_create_path(self.default_log_path)
    u.check_and_create_path(self.plot_dir)
    # Delete all existing plots and logs
    if u.check_and_create_path(self.plot_dir):
        for files in os.listdir(self.plot_dir):
            os.remove(os.path.join(self.plot_dir, files))
    if u.check_and_create_path(self.default_log_path):
        for files in os.listdir(self.default_log_path):
            os.remove(os.path.join(self.default_log_path, files))
    self.static_instance_counter += 1

    # Set random seeds
    np.random.seed(seed)
    tf.set_random_seed(seed)

    self.__sess = tf.Session(graph=tf.get_default_graph(),
                             config=tf.ConfigProto(allow_soft_placement=True,
                                                   gpu_options=tf.GPUOptions(allow_growth=True),
                                                   log_device_placement=False))
    # TODO change to is debug
    self.__writer = tf.summary.FileWriter(self.default_log_path, filename_suffix=".event", flush_secs=10)

    self.__iterator, self.inference_mode_var, train_size, eval_size, test_size = dataset_loader.get_iterator(
        self.__sess, self.dataset_path, train_data_size, self.batch_size,
        num_gpus)  # ok has to be the same iterator for all gpus

    # Iterations per epoch: ceil(size / batch_size), written as floor + 1 on
    # a remainder.
    self.__num_train_it_per_epoch = train_size // self.batch_size  # floor division
    self.__num_train_it_per_epoch += 1 if train_size % self.batch_size != 0 else 0
    self.__num_eval_it_per_epoch = eval_size // self.batch_size  # floor division
    self.__num_eval_it_per_epoch += 1 if eval_size % self.batch_size != 0 else 0
    self.__num_test_it_per_epoch = test_size // self.batch_size  # floor division
    self.__num_test_it_per_epoch += 1 if test_size % self.batch_size != 0 else 0

    # with tf.device('/cpu:0'):
    print("loading Network: " + net_class.get_name())
    # self.__grad_op, self.__loss_reg_op, self.__loss_op,self.__acc_op, self.__acc_update_op, self.batch_assign_ops, self.reuse_binary_tensor = net_class.get_model(
    #     self.__iterator, self.inference_mode_var, batch_size, num_gpus)
    self.__grad_op, self.__loss_reg_op, _, self.__acc_op, self.__acc_update_op, self.batch_assign_ops, self.reuse_binary_tensor = net_class.get_model(
        self.__iterator, self.inference_mode_var, batch_size, num_gpus)

    # Make the loss tensor depend on UPDATE_OPS (e.g. batch-norm moving
    # averages) so fetching the loss also runs the updates.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        self._loss_tensor_update_ops = tf.identity(self.__loss_reg_op)

    # get gradient, calc mean gradient, update gradient
    # build grad vars for angle determination
    if self.is_calc_angle:
        with tf.variable_scope("grad_vars"):
            # Shadow variables that snapshot each gradient before it is
            # consumed, so the step-direction angle can be computed later.
            grad_vars = []
            train_vars = [e[1] for e in self.__grad_op]
            gradient_tensors = [e[0] for e in self.__grad_op]
            for var in train_vars:
                # var.name ends with ':0'; strip it for the new variable name.
                new_var = tf.Variable(tf.zeros(var.shape), trainable=False, name=var.name[0:-2])
                grad_vars.append(new_var)
            # ass_old_step ops
            ass_grads = []
            for grad_var, grad in zip(grad_vars, gradient_tensors):
                assign = tf.assign(grad_var, grad)
                ass_grads.append(assign)
            # Force the snapshot assigns to run whenever a gradient tensor is
            # evaluated.
            with tf.control_dependencies(ass_grads):
                gradient_tensors = [tf.identity(g) for g in gradient_tensors]
            self.__grad_op = list(zip(gradient_tensors, train_vars))

    self.optimizer.initialize(self.__sess, self.__grad_op, self.__loss_reg_op, None, self.plot_dir,
                              self.reuse_binary_tensor)  # ,batch_assign_ops=self.batch_assign_ops)

    if self.is_calc_angle:
        # NOTE(review): `vars` (shadows the builtin) is only bound for
        # PAOptimizerSuper or a TfOptimizer wrapping MomentumOptimizer; any
        # other optimizer reaches the lines below with `vars` unbound and the
        # builtin `vars` function is passed instead — verify intended types.
        if isinstance(self.optimizer, PAOptimizerSuper):
            vars = self.optimizer.step_direction_variables
        elif isinstance(self.optimizer, TfOptimizer):
            if isinstance(self.optimizer.optimizer, tf.train.MomentumOptimizer):
                vars = [self.optimizer.optimizer.get_slot(t_var, "momentum") for t_var in tf.trainable_variables()]
        self.step_direction_norm_op = u.get_calc_norm_op(vars)
        self.step_direction_angle_op = u.get_calc_angel_op(vars, self.__grad_op)

    self.__sess.run(tf.global_variables_initializer())
    # since parameter (weight) variables are added before
    # optimizer variables all weights get the same g._last_id with different optimizers.
    # -> same weight initialization
    self.metric_variables_initializer = [x.initializer for x in tf.get_collection(tf.GraphKeys.METRIC_VARIABLES)]

    # get number of parameters (product of each trainable variable's shape)
    sum_ = 0
    for train_var in tf.trainable_variables():
        prod = 1
        for e in train_var.get_shape():
            prod = e * prod
        sum_ += prod
    print("amount parameters: ", sum_)

    # saver has to be initialized after model is built and variables are defined
    self.__saver = tf.train.Saver()

    # save graph for tensorboard
    # self.__writer.add_graph(self.__sess.graph)
    # self.__writer.flush()
    sys.stdout.flush()
    return