def test_w_is_selected_rows(self):
    place = core.CUDAPlace(0)
    if core.is_float16_supported(place):
        for inplace in [True, False]:
            self.check_with_place(place, inplace)

def test_check_output(self):
    if core.is_compiled_with_cuda():
        place = core.CUDAPlace(0)
        if core.is_float16_supported(place):
            self.check_output_with_place(place, atol=1e-1)

def test_check_output(self):
    place = core.CUDAPlace(0)
    if core.is_float16_supported(place):
        self.check_output_with_place(place)

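# A minimal hedged sketch (the helper name `_fp16_places` is assumed, not
# from the original tests) of the guard pattern the FP16 tests above share:
# only yield a CUDA place when Paddle was built with CUDA and the device
# actually supports float16.
def _fp16_places():
    places = []
    if core.is_compiled_with_cuda():
        place = core.CUDAPlace(0)
        if core.is_float16_supported(place):
            places.append(place)
    return places
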
def test_check_output(self):
    place = core.CUDAPlace(0)
    self.check_output_with_place(place, check_eager=False)

def test_check_grad_ignore_y(self):
    place = core.CUDAPlace(0)
    self.check_grad_with_place(
        place, ['X'], 'Out', no_grad_set=set('Y'), check_eager=False)

def main():
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
            data_shape = [3, 32, 32]
        else:
            data_shape = [32, 32, 3]
    else:
        classdim = 102
        if args.data_format == 'NCHW':
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]

    # Input data
    data_file = fluid.layers.open_recordio_file(
        filename='./train.recordio',
        shapes=[[-1, 3, 224, 224], [-1, 1]],
        lod_levels=[0, 0],
        dtypes=['float32', 'int64'])
    data_file = fluid.layers.create_double_buffer_reader(
        reader=data_file, place='CUDA:0')
    images, label = fluid.layers.read_file(data_file)

    # Train program
    net = vgg16_bn_drop(images)
    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # Inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    opts = optimizer.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    iters = 0
    accuracy = fluid.average.WeightedAverage()
    for pass_id in range(args.num_passes):
        # train
        start_time = time.time()
        num_samples = 0
        accuracy.reset()
        while not data_file.eof():
            loss, acc, weight = exe.run(
                fluid.default_main_program(),
                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
            accuracy.add(value=acc, weight=weight)
            iters += 1
            num_samples += args.batch_size
            print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
                  (pass_id, iters, loss, acc))
            # The accuracy is accumulated over batches, not just the
            # current batch.
        pass_elapsed = time.time() - start_time
        pass_train_acc = accuracy.eval()
        print(
            "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f\n"
            % (pass_id, num_samples / pass_elapsed, pass_train_acc))

def test_run(self):
    x = layers.data(
        name='x',
        shape=[-1, self.batch_size, self.hidden_size],
        dtype='float32')
    sequence_length = layers.data(
        name="sequence_length", shape=[-1], dtype='int64')

    rnn_out, last_hidden = basic_gru(
        x,
        None,
        self.hidden_size,
        num_layers=self.num_layers,
        batch_first=self.batch_first,
        bidirectional=self.is_bidirect,
        sequence_length=sequence_length)

    last_hidden.persistable = True
    rnn_out.persistable = True

    if core.is_compiled_with_cuda():
        place = core.CUDAPlace(0)
    else:
        place = core.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())

    param_list = fluid.default_main_program().block(0).all_parameters()

    # Randomize the gate/candidate weights and biases in place, keeping
    # NumPy copies for the reference implementation below.
    gate_weight = []
    gate_bias = []
    candidate_weight = []
    candidate_bias = []

    for i in range(self.num_layers):
        gate_w_name = "basic_gru_layers_" + str(i) + "/BasicGRUUnit_0.w_0"
        gate_b_name = "basic_gru_layers_" + str(i) + "/BasicGRUUnit_0.b_0"
        candidate_w_name = "basic_gru_layers_" + str(
            i) + "/BasicGRUUnit_0.w_1"
        candidate_b_name = "basic_gru_layers_" + str(
            i) + "/BasicGRUUnit_0.b_1"

        gate_w = np.array(
            fluid.global_scope().find_var(gate_w_name).get_tensor())
        gate_w = np.random.uniform(
            -0.1, 0.1, size=gate_w.shape).astype('float32')
        fluid.global_scope().find_var(gate_w_name).get_tensor().set(
            gate_w, place)

        gate_b = np.array(
            fluid.global_scope().find_var(gate_b_name).get_tensor())
        gate_b = np.random.uniform(
            -0.1, 0.1, size=gate_b.shape).astype('float32')
        fluid.global_scope().find_var(gate_b_name).get_tensor().set(
            gate_b, place)

        candidate_w = np.array(
            fluid.global_scope().find_var(candidate_w_name).get_tensor())
        candidate_w = np.random.uniform(
            -0.1, 0.1, size=candidate_w.shape).astype('float32')
        fluid.global_scope().find_var(candidate_w_name).get_tensor().set(
            candidate_w, place)

        candidate_b = np.array(
            fluid.global_scope().find_var(candidate_b_name).get_tensor())
        candidate_b = np.random.uniform(
            -0.1, 0.1, size=candidate_b.shape).astype('float32')
        fluid.global_scope().find_var(candidate_b_name).get_tensor().set(
            candidate_b, place)

        gate_weight.append(gate_w)
        gate_bias.append(gate_b)
        candidate_weight.append(candidate_w)
        candidate_bias.append(candidate_b)

    if self.is_bidirect:
        for i in range(self.num_layers):
            gate_w_name = "basic_gru_reverse_layers_" + str(
                i) + "/BasicGRUUnit_0.w_0"
            gate_b_name = "basic_gru_reverse_layers_" + str(
                i) + "/BasicGRUUnit_0.b_0"
            candidate_w_name = "basic_gru_reverse_layers_" + str(
                i) + "/BasicGRUUnit_0.w_1"
            candidate_b_name = "basic_gru_reverse_layers_" + str(
                i) + "/BasicGRUUnit_0.b_1"

            gate_w = np.array(
                fluid.global_scope().find_var(gate_w_name).get_tensor())
            gate_w = np.random.uniform(
                -0.1, 0.1, size=gate_w.shape).astype('float32')
            fluid.global_scope().find_var(gate_w_name).get_tensor().set(
                gate_w, place)

            gate_b = np.array(
                fluid.global_scope().find_var(gate_b_name).get_tensor())
            gate_b = np.random.uniform(
                -0.1, 0.1, size=gate_b.shape).astype('float32')
            fluid.global_scope().find_var(gate_b_name).get_tensor().set(
                gate_b, place)

            candidate_w = np.array(
                fluid.global_scope().find_var(candidate_w_name).get_tensor())
            candidate_w = np.random.uniform(
                -0.1, 0.1, size=candidate_w.shape).astype('float32')
            fluid.global_scope().find_var(candidate_w_name).get_tensor().set(
                candidate_w, place)

            candidate_b = np.array(
                fluid.global_scope().find_var(candidate_b_name).get_tensor())
            candidate_b = np.random.uniform(
                -0.1, 0.1, size=candidate_b.shape).astype('float32')
            fluid.global_scope().find_var(candidate_b_name).get_tensor().set(
                candidate_b, place)

            gate_weight.append(gate_w)
            gate_bias.append(gate_b)
            candidate_weight.append(candidate_w)
            candidate_bias.append(candidate_b)

    step_input_np = np.random.uniform(
        -0.1, 0.1,
        (self.seq_len, self.batch_size, self.hidden_size)).astype('float32')
    sequence_length_np = np.random.randint(
        self.seq_len // 2, self.seq_len,
        size=(self.batch_size)).astype('int64')

    out = exe.run(
        feed={'x': step_input_np,
              'sequence_length': sequence_length_np},
        fetch_list=[rnn_out, last_hidden])

    api_rnn_out = out[0]
    api_last_hidden = out[1]

    np_out = gru_np(
        step_input_np,
        None,
        self.hidden_size,
        gate_weight,
        gate_bias,
        candidate_weight,
        candidate_bias,
        num_layers=self.num_layers,
        batch_first=self.batch_first,
        is_bidirect=self.is_bidirect,
        sequence_length=sequence_length_np)

    self.assertTrue(np.allclose(api_rnn_out, np_out[0], rtol=1e-4, atol=0))
    self.assertTrue(
        np.allclose(api_last_hidden, np_out[1], rtol=1e-4, atol=0))

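# Hedged sketch of a helper factoring out the pattern the GRU test above
# repeats inline: read a parameter tensor from the global scope, overwrite it
# with uniform noise, and return the NumPy copy for the reference
# implementation. `randomize_param` is a hypothetical name, not in the
# original test.
def randomize_param(name, place, low=-0.1, high=0.1):
    tensor = fluid.global_scope().find_var(name).get_tensor()
    param = np.random.uniform(
        low, high, size=np.array(tensor).shape).astype('float32')
    tensor.set(param, place)
    return param
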
def train(logger, args):
    """Train a model."""
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        if six.PY2:
            vocab = pickle.load(fin)
        else:
            vocab = pickle.load(fin, encoding='bytes')
    logger.info('vocab size is {} and embed dim is {}'.format(
        vocab.size(), vocab.embed_dim))
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.trainset, args.devset)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')

    if not args.use_gpu:
        place = fluid.CPUPlace()
        dev_count = int(
            os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    # build model
    main_program = fluid.Program()
    startup_prog = fluid.Program()
    if args.enable_ce:
        main_program.random_seed = args.random_seed
        startup_prog.random_seed = args.random_seed
    with fluid.program_guard(main_program, startup_prog):
        with fluid.unique_name.guard():
            avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model(
                args.hidden_size, vocab, args)
            # clone from default main program and use it as the validation program
            inference_program = main_program.clone(for_test=True)

            # build optimizer
            if args.optim == 'sgd':
                optimizer = fluid.optimizer.SGD(
                    learning_rate=args.learning_rate)
            elif args.optim == 'adam':
                optimizer = fluid.optimizer.Adam(
                    learning_rate=args.learning_rate)
            elif args.optim == 'rprop':
                optimizer = fluid.optimizer.RMSPropOptimizer(
                    learning_rate=args.learning_rate)
            else:
                logger.error('Unsupported optimizer: {}'.format(args.optim))
                exit(-1)
            if args.weight_decay > 0.0:
                obj_func = avg_cost + args.weight_decay * l2_loss(
                    main_program)
                optimizer.minimize(obj_func)
            else:
                obj_func = avg_cost
                optimizer.minimize(obj_func)

            # initialize parameters
            place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
            exe = Executor(place)
            if args.load_dir:
                logger.info('load from {}'.format(args.load_dir))
                fluid.io.load_persistables(
                    exe, args.load_dir, main_program=main_program)
            else:
                exe.run(startup_prog)
                embedding_para = fluid.global_scope().find_var(
                    'embedding_para').get_tensor()
                embedding_para.set(vocab.embeddings.astype(np.float32), place)

            # prepare data
            feed_list = [
                main_program.global_block().var(var_name)
                for var_name in feed_order
            ]
            feeder = fluid.DataFeeder(feed_list, place)

            logger.info('Training the model...')
            parallel_executor = fluid.ParallelExecutor(
                main_program=main_program,
                use_cuda=bool(args.use_gpu),
                loss_name=avg_cost.name)
            print_para(main_program, parallel_executor, logger, args)

            for pass_id in range(1, args.pass_num + 1):
                pass_start_time = time.time()
                pad_id = vocab.get_id(vocab.pad_token)
                if args.enable_ce:
                    # key step: initialize train_reader (no shuffling for CE runs)
                    train_reader = lambda: brc_data.gen_mini_batches(
                        'train', args.batch_size, pad_id, shuffle=False)
                else:
                    train_reader = lambda: brc_data.gen_mini_batches(
                        'train', args.batch_size, pad_id, shuffle=True)
                train_reader = read_multiple(train_reader, dev_count)
                log_every_n_batch, n_batch_loss = args.log_interval, 0
                total_num, total_loss = 0, 0  # initialize counters
                for batch_id, batch_list in enumerate(train_reader(), 1):
                    feed_data = batch_reader(batch_list, args)
                    fetch_outs = parallel_executor.run(
                        feed=list(feeder.feed_parallel(feed_data, dev_count)),
                        fetch_list=[obj_func.name],
                        return_numpy=False)
                    cost_train = np.array(fetch_outs[0]).mean()
                    total_num += args.batch_size * dev_count
                    n_batch_loss += cost_train
                    total_loss += cost_train * args.batch_size * dev_count
                    if args.enable_ce and batch_id >= 100:
                        break
                    if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0:
                        print_para(main_program, parallel_executor, logger,
                                   args)
                        logger.info(
                            'Average loss from batch {} to {} is {}'.format(
                                batch_id - log_every_n_batch + 1, batch_id,
                                "%.10f" % (n_batch_loss / log_every_n_batch)))
                        n_batch_loss = 0
                    if args.dev_interval > 0 and batch_id % args.dev_interval == 0:
                        if brc_data.dev_set is not None:
                            eval_loss, bleu_rouge = validation(
                                inference_program, avg_cost, s_probs, e_probs,
                                match, feed_order, place, dev_count, vocab,
                                brc_data, logger, args)
                            logger.info(
                                'Dev eval result: {}'.format(bleu_rouge))

                pass_end_time = time.time()
                time_consumed = pass_end_time - pass_start_time
                logger.info('epoch: {0}, epoch_time_cost: {1:.2f}'.format(
                    pass_id, time_consumed))
                logger.info(
                    'Evaluating the model after epoch {}'.format(pass_id))
                if brc_data.dev_set is not None:
                    eval_loss, bleu_rouge = validation(
                        inference_program, avg_cost, s_probs, e_probs, match,
                        feed_order, place, dev_count, vocab, brc_data, logger,
                        args)
                    logger.info('Dev eval result: {}'.format(bleu_rouge))
                else:
                    logger.warning(
                        'No dev set is loaded for evaluation in the dataset!')
                logger.info('total_num = %s' % total_num)
                logger.info('Average train loss for epoch {} is {}'.format(
                    pass_id, "%.10f" % (1.0 * total_loss / total_num)))

                if pass_id % args.save_interval == 0:
                    model_path = os.path.join(args.save_dir, str(pass_id))
                    if not os.path.isdir(model_path):
                        os.makedirs(model_path)
                    fluid.io.save_persistables(
                        executor=exe,
                        dirname=model_path,
                        main_program=main_program)

            if args.enable_ce:  # For CE
                print("kpis\ttrain_cost_card%d\t%f" %
                      (dev_count, total_loss / total_num))
                if brc_data.dev_set is not None:
                    print("kpis\ttest_cost_card%d\t%f" %
                          (dev_count, eval_loss))
                print("kpis\ttrain_duration_card%d\t%f" %
                      (dev_count, time_consumed))

def test_check_grad(self):
    place = core.CUDAPlace(0)
    if core.is_float16_supported(place):
        self.check_grad_with_place(
            place, ['X'], 'Out', max_relative_error=0.8)

def train(args, data_reader=ctc_reader):
    """OCR CTC training"""
    num_classes = None
    train_images = None
    train_list = None
    test_images = None
    test_list = None
    num_classes = data_reader.num_classes(
    ) if num_classes is None else num_classes
    data_shape = data_reader.data_shape()

    # define network
    images = fluid.layers.data(
        name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(
        name='label', shape=[1], dtype='int32', lod_level=1)
    sum_cost, error_evaluator, inference_program, model_average = ctc_train_net(
        images, label, args, num_classes)

    # data reader
    train_reader = data_reader.train(
        args.batch_size,
        train_images_dir=train_images,
        train_list_file=train_list)
    test_reader = data_reader.test(
        args.batch_size,
        test_images_dir=test_images,
        test_list_file=test_list)

    # prepare environment
    place = fluid.CPUPlace()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # load init model
    if args.init_model is not None:
        model_dir = args.init_model
        model_file_name = None
        if not os.path.isdir(args.init_model):
            model_dir = os.path.dirname(args.init_model)
            model_file_name = os.path.basename(args.init_model)
        fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
        print("Init model from: %s." % args.init_model)

    fetch_vars = [sum_cost]
    fetch_vars.extend([e for e in error_evaluator])

    def test_parallel(exe, pass_id, batch_id):
        distance_evaluator = fluid.metrics.EditDistance(None)
        test_fetch = [v.name for v in error_evaluator]
        distance_evaluator.reset()
        for idx, data in enumerate(test_reader()):
            test_ret = exe.run(test_fetch, feed=get_feeder_data(data, place))
            distance_evaluator.update(
                distances=test_ret[0], seq_num=np.mean(test_ret[1]))
        return distance_evaluator.eval()

    def test(exe, pass_id):
        distance_evaluator = fluid.metrics.EditDistance(None)
        test_fetch = [v.name for v in error_evaluator]
        distance_evaluator.reset()
        for idx, data in enumerate(test_reader()):
            test_ret = exe.run(
                inference_program,
                feed=get_feeder_data(data, place),
                fetch_list=test_fetch)
            distance_evaluator.update(
                distances=test_ret[0], seq_num=np.mean(test_ret[1]))
        return distance_evaluator.eval()

    def train_parallel(train_exe):
        var_names = [var.name for var in fetch_vars]
        #test_exe = fluid.ParallelExecutor(
        #    use_cuda=True, main_program=inference_program,
        #    share_vars_from=train_exe)
        place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
        test_exe = fluid.Executor(place)
        for pass_id in range(args.pass_num):
            batch_id = 1
            total_loss = 0.0
            total_seq_error = 0.0
            # train a pass
            num_samples, start_time = 0, time.time()
            for idx, data in enumerate(train_reader()):
                batch_start_time = time.time()
                results = train_exe.run(
                    var_names, feed=get_feeder_data(data, place))
                results = [np.array(result).sum() for result in results]
                total_loss += results[0]
                total_seq_error += results[1]
                # training log
                if batch_id % args.log_period == 0:
                    print(
                        "Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; "
                        "Avg seq err: %s; Speed: %.5f samples/sec" %
                        (pass_id, batch_id,
                         total_loss / (batch_id * args.batch_size),
                         total_seq_error / (batch_id * args.batch_size),
                         len(data) / (time.time() - batch_start_time)))
                batch_id += 1
                num_samples += len(data)
            print_train_time(start_time, time.time(), num_samples)

            # run test
            if model_average:
                with model_average.apply(test_exe):
                    #test_ret = test_parallel(test_exe, pass_id, batch_id)
                    test_ret = test(test_exe, pass_id)
            else:
                #test_ret = test_parallel(test_exe, pass_id, batch_id)
                test_ret = test(test_exe, pass_id)
            print("Pass[%d]; Test avg seq error: %s\n" %
                  (pass_id, test_ret[1]))

    if args.local:
        place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())
        exec_strategy = ExecutionStrategy()
        exec_strategy.use_cuda = args.use_gpu
        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_gpu,
            main_program=fluid.default_main_program(),
            loss_name=sum_cost.name,
            exec_strategy=exec_strategy)
        train_parallel(train_exe)
    else:
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)
        # the IP of the local machine, needed by pserver only
        current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
        # the role, should be either PSERVER or TRAINER
        training_role = os.getenv("PADDLE_TRAINING_ROLE")

        t = distribute_transpiler.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
        if training_role == "PSERVER":
            pserver_program = t.get_pserver_program(current_endpoint)
            pserver_startup_program = t.get_startup_program(
                current_endpoint, pserver_program)
            exe = fluid.Executor(core.CPUPlace())
            exe.run(pserver_startup_program)
            exe.run(pserver_program)
        elif training_role == "TRAINER":
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())
            trainer_program = t.get_trainer_program()
            exec_strategy = ExecutionStrategy()
            exec_strategy.use_cuda = args.use_gpu
            exec_strategy.num_threads = 1
            train_exe = fluid.ParallelExecutor(
                use_cuda=args.use_gpu,
                main_program=trainer_program,
                loss_name=sum_cost.name,
                exec_strategy=exec_strategy)
            train_parallel(train_exe)
        else:
            raise ValueError(
                "env PADDLE_TRAINING_ROLE should be in [PSERVER, TRAINER]")

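# A hedged usage sketch (not part of the original script): the distributed
# branch above reads its cluster layout from environment variables. The
# values below are illustrative placeholders only.
#
#   export PADDLE_TRAINING_ROLE=TRAINER   # or PSERVER
#   export PADDLE_PSERVER_PORT=6174
#   export PADDLE_PSERVER_IPS=192.168.0.2,192.168.0.3
#   export PADDLE_TRAINERS=2
#   export PADDLE_TRAINER_ID=0
#   export PADDLE_CURRENT_IP=192.168.0.2  # needed by pserver only
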
def test_check_output(self):
    if core.is_compiled_with_cuda() and core.op_support_gpu("dropout"):
        self.check_output_with_place(core.CUDAPlace(0), atol=1e-3)

def test_cuda_place(self):
    if not core.is_compiled_with_cuda():
        return
    place = core.CUDAPlace(0)
    self.check_momentum_step(place)
    self.check_sgd_step(place)

def test_sparse_sgd(self):
    places = [core.CPUPlace()]
    if core.is_compiled_with_cuda():
        places.append(core.CUDAPlace(0))
    for place in places:
        self.check_with_place(place)

def setUp(self):
    self.scope = core.Scope()
    self.place = core.CUDAPlace(0)

def test_check_output(self):
    # TODO(wangzhongpu): support mkldnn op in dygraph mode
    place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
    self.check_output_with_place(
        place, atol=1e-5, check_dygraph=not self.use_mkldnn)

def test_check_output(self):
    if self.has_cuda():
        place = core.CUDAPlace(0)
        self.check_output_with_place(place, atol=1e-5)

def test_check_output(self):
    place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
    self.check_output_with_place(place, atol=1e-5)

def test_run(self):
    inputs_basic_lstm = fluid.data(
        name='inputs_basic_lstm',
        shape=[None, None, self.input_size],
        dtype='float32')
    sequence_length = fluid.data(
        name="sequence_length", shape=[None], dtype='int64')

    inputs_dynamic_rnn = layers.transpose(inputs_basic_lstm, perm=[1, 0, 2])
    cell = LSTMCell(self.hidden_size, name="LSTMCell_for_rnn")
    output, final_state = dynamic_rnn(
        cell=cell,
        inputs=inputs_dynamic_rnn,
        sequence_length=sequence_length,
        is_reverse=False)
    output_new = layers.transpose(output, perm=[1, 0, 2])

    rnn_out, last_hidden, last_cell = basic_lstm(
        inputs_basic_lstm,
        None,
        None,
        self.hidden_size,
        num_layers=1,
        batch_first=False,
        bidirectional=False,
        sequence_length=sequence_length,
        forget_bias=1.0)

    if core.is_compiled_with_cuda():
        place = core.CUDAPlace(0)
    else:
        place = core.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())

    inputs_basic_lstm_np = np.random.uniform(
        -0.1, 0.1,
        (self.seq_len, self.batch_size, self.input_size)).astype('float32')
    sequence_length_np = np.ones(
        self.batch_size, dtype='int64') * self.seq_len

    inputs_np = np.random.uniform(
        -0.1, 0.1, (self.batch_size, self.input_size)).astype('float32')
    pre_hidden_np = np.random.uniform(
        -0.1, 0.1, (self.batch_size, self.hidden_size)).astype('float32')
    pre_cell_np = np.random.uniform(
        -0.1, 0.1, (self.batch_size, self.hidden_size)).astype('float32')

    # Write the same randomized parameters into both the dynamic_rnn cell
    # and the basic_lstm layer, so the two implementations are comparable.
    param_names = [[
        "LSTMCell_for_rnn/BasicLSTMUnit_0.w_0",
        "basic_lstm_layers_0/BasicLSTMUnit_0.w_0"
    ], [
        "LSTMCell_for_rnn/BasicLSTMUnit_0.b_0",
        "basic_lstm_layers_0/BasicLSTMUnit_0.b_0"
    ]]

    for names in param_names:
        param = np.array(
            fluid.global_scope().find_var(names[0]).get_tensor())
        param = np.random.uniform(
            -0.1, 0.1, size=param.shape).astype('float32')
        fluid.global_scope().find_var(names[0]).get_tensor().set(param, place)
        fluid.global_scope().find_var(names[1]).get_tensor().set(param, place)

    out = exe.run(
        feed={
            'inputs_basic_lstm': inputs_basic_lstm_np,
            'sequence_length': sequence_length_np,
            'inputs': inputs_np,
            'pre_hidden': pre_hidden_np,
            'pre_cell': pre_cell_np
        },
        fetch_list=[output_new, rnn_out])

    self.assertTrue(np.allclose(out[0], out[1], rtol=1e-4))

def train_parallel(train_args, test_args, args, train_prog, test_prog,
                   startup_prog, nccl_id_var, num_trainers, trainer_id):
    over_all_start = time.time()
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    feeder = None
    if not args.use_reader_op:
        feed_var_list = [
            var for var in train_prog.global_block().vars.values()
            if var.is_data
        ]
        feeder = fluid.DataFeeder(feed_var_list, place)

        # generate fake data:
        if args.use_fake_data:
            for var in feed_var_list:
                v = startup_prog.global_block()._clone_variable(var)
                var.persistable = True
                v.persistable = True

                real_shape = list(var.shape)
                real_shape[0] = args.batch_size // args.gpus
                startup_prog.global_block().append_op(
                    outputs={"Out": v},
                    type="fill_constant",
                    attrs={
                        "shape": real_shape,
                        "value": 1.0,
                        "dtype": var.dtype
                    })

    if nccl_id_var and trainer_id == 0:
        #FIXME(wuyi): wait other trainer to start listening
        time.sleep(30)

    startup_exe = fluid.Executor(place)
    startup_exe.run(startup_prog)
    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = args.cpus
    strategy.allow_op_delay = False
    build_strategy = fluid.BuildStrategy()
    if args.reduce_strategy == "reduce":
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.Reduce
    else:
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.AllReduce
    build_strategy.fuse_broadcast_op = args.fuse_broadcast_op

    avg_loss = train_args[0]

    if args.update_method == "pserver":
        # parameter server mode distributed training, merge
        # gradients on local server, do not initialize
        # ParallelExecutor with multi server all-reduce mode.
        num_trainers = 1
        trainer_id = 0

    exe = fluid.ParallelExecutor(
        True,
        avg_loss.name,
        main_program=train_prog,
        exec_strategy=strategy,
        build_strategy=build_strategy,
        num_trainers=num_trainers,
        trainer_id=trainer_id)

    if not args.no_test:
        if args.update_method == "pserver":
            test_scope = None
        else:
            # NOTE: use an empty scope to avoid test exe using NCCLID
            test_scope = fluid.Scope()
        test_exe = fluid.ParallelExecutor(
            True, main_program=test_prog, share_vars_from=exe)

    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        start_time = time.time()
        if not args.use_reader_op:
            reader_generator = train_args[3]()  # train_reader
        batch_id = 0
        data = None
        if args.use_reader_op:
            train_args[4].start()
        while True:
            if not args.use_reader_op:
                data = next(reader_generator, None)
                if data is None:
                    break
            if args.profile and batch_id == 5:
                profiler.start_profiler("All")
                profiler.reset_profiler()
            elif args.profile and batch_id == 10:
                print("profiling total time: ", time.time() - start_time)
                profiler.stop_profiler(
                    "total", "/tmp/profile_%d_pass%d" % (trainer_id, pass_id))
            if iters == args.iterations:
                reader_generator.close()
                break

            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            fetch_list = [avg_loss.name]
            acc_name_list = [v.name for v in train_args[2]]
            fetch_list.extend(acc_name_list)

            if args.use_fake_data or args.use_reader_op:
                try:
                    fetch_ret = exe.run(fetch_list)
                except fluid.core.EOFException:
                    break
                except fluid.core.EnforceNotMet:
                    traceback.print_exc()
                    break
            else:
                fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
            if args.use_reader_op:
                num_samples += args.batch_size * args.gpus
            else:
                num_samples += len(data)

            iters += 1
            if batch_id % 1 == 0:
                fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
                print("Pass %d, batch %d, loss %s, accuracies: %s" %
                      (pass_id, batch_id, fetched_data[0], fetched_data[1:]))
            batch_id += 1

        print_train_time(start_time, time.time(), num_samples)
        if args.use_reader_op:
            train_args[4].reset()  # reset reader handle
        else:
            del reader_generator

        if not args.no_test and test_args[2]:
            test_feeder = None
            if not args.use_reader_op:
                test_feed_var_list = [
                    var for var in test_prog.global_block().vars.values()
                    if var.is_data
                ]
                test_feeder = fluid.DataFeeder(test_feed_var_list, place)
            test_ret = test_parallel(test_exe, test_args, args, test_prog,
                                     test_feeder)
            print("Pass: %d, Test Accuracy: %s\n" %
                  (pass_id, [np.mean(np.array(v)) for v in test_ret]))

    print("total train time: ", time.time() - over_all_start)

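# Hedged note on the `train_args` tuple unpacked above (inferred from usage
# in this function, not from a definition shown here): index 0 is the average
# loss variable, index 2 a list of accuracy variables, index 3 a reader
# factory, and index 4 a py_reader-style handle with start()/reset().
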
def test_check_output(self):
    if core.is_compiled_with_cuda():
        place = core.CUDAPlace(0)
        self.check_output_with_place(place, atol=1e-3)

def check_forward_backward(self,
                           shape,
                           begin_norm_axis,
                           has_scale=True,
                           has_bias=True,
                           y_grad_scale=1.0,
                           use_mkldnn=False):
    def test_with_place(place, shape, begin_norm_axis,
                        use_mkldnn=use_mkldnn):
        # attr
        epsilon = 0.00001
        x_shape = shape
        D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
        scale_shape = [D]

        np.random.seed(123)
        x = np.random.random_sample(x_shape).astype(np.float32)
        scale = np.random.random_sample(scale_shape).astype(
            np.float32) if has_scale else None
        bias = np.random.random_sample(scale_shape).astype(
            np.float32) if has_bias else None
        y_grad = (np.random.random_sample(x_shape) * y_grad_scale).astype(
            np.float32)

        # reference forward & backward
        y, mean, variance = _reference_layer_norm_naive(
            x, scale, bias, epsilon, begin_norm_axis)
        x_grad, scale_grad, bias_grad = _reference_layer_norm_grad(
            x, y_grad, scale, bias, mean, variance, begin_norm_axis)

        var_dict = locals()
        var_dict['y@GRAD'] = y_grad
        var_names = ['x', 'mean', 'variance', 'y', 'y@GRAD']
        if has_scale:
            var_names += ['scale']
        if has_bias:
            var_names += ['bias']
        ground_truth = {name: var_dict[name] for name in var_names}

        program = fluid.Program()
        with fluid.program_guard(program):
            block = program.global_block()
            for name in ground_truth:
                block.create_var(
                    name=name,
                    dtype='float32',
                    shape=ground_truth[name].shape)
            inputs = {"X": block.var('x')}
            fetch_list = [
                'y',
                'mean',
                'variance',
                'x@GRAD',
            ]
            if has_scale:
                inputs["Scale"] = block.var('scale')
                fetch_list += ['scale@GRAD']
            if has_bias:
                inputs["Bias"] = block.var('bias')
                fetch_list += ['bias@GRAD']
            layer_norm_op = block.append_op(
                type="layer_norm",
                inputs=inputs,
                outputs={
                    "Y": block.var('y'),
                    "Mean": block.var('mean'),  # share the same memory
                    "Variance": block.var('variance'),  # share the same memory
                },
                attrs={
                    "epsilon": epsilon,
                    "begin_norm_axis": begin_norm_axis,
                    "use_mkldnn": use_mkldnn
                })
            # generate backward op_desc
            grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
                layer_norm_op.desc, set(), [])
            grad_op_desc = grad_op_desc_list[0]
            new_op_desc = block.desc.append_op()
            new_op_desc.copy_from(grad_op_desc)
            for var_name in grad_op_desc.output_arg_names():
                block.desc.var(var_name.encode("ascii"))
            grad_op_desc.infer_var_type(block.desc)
            grad_op_desc.infer_shape(block.desc)
            for arg in grad_op_desc.output_arg_names():
                grad_var = block.desc.find_var(arg.encode("ascii"))
                grad_var.set_dtype(core.VarDesc.VarType.FP32)

            program._sync_with_cpp()
            exe = fluid.Executor(place)
            out = exe.run(
                program,
                feed={
                    name: var_dict[name]
                    for name in ['x', 'scale', 'bias', 'y@GRAD']
                },
                fetch_list=fetch_list)
            self.__assert_close(y, out[0], "y")
            self.__assert_close(mean, out[1], "mean")
            self.__assert_close(variance, out[2], "variance", 1e-3)
            self.__assert_close(x_grad, out[3], "x_grad")
            if has_scale:
                self.__assert_close(scale_grad,
                                    out[fetch_list.index('scale@GRAD')],
                                    "scale_grad", 1e-3)
            if has_bias:
                self.__assert_close(bias_grad,
                                    out[fetch_list.index('bias@GRAD')],
                                    "bias_grad")

    places = [core.CPUPlace()]
    if core.is_compiled_with_cuda() and core.op_support_gpu(
            "layer_norm") and self.use_cudnn:
        places.append(core.CUDAPlace(0))

    for place in places:
        test_with_place(place, shape, begin_norm_axis)

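# Hedged NumPy sketch of what the reference forward used above could compute;
# the actual `_reference_layer_norm_naive` body is not shown in this file, so
# this is an assumed illustration, not the original implementation.
def _layer_norm_naive_sketch(x, scale, bias, epsilon, begin_norm_axis):
    # Flatten to [N, D]: the N rows are normalized independently over D
    # features, matching the `begin_norm_axis` attribute of the op.
    N = int(np.prod(x.shape[:begin_norm_axis]))
    D = int(np.prod(x.shape[begin_norm_axis:]))
    x2 = x.reshape([N, D])
    mean = np.mean(x2, axis=1)
    variance = np.var(x2, axis=1)
    y = (x2 - mean[:, None]) / np.sqrt(variance[:, None] + epsilon)
    if scale is not None:
        y = y * scale.reshape([1, D])
    if bias is not None:
        y = y + bias.reshape([1, D])
    return y.reshape(x.shape), mean, variance
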
def test_check_grad(self):
    if core.is_compiled_with_cuda():
        place = core.CUDAPlace(0)
        self.check_grad_with_place(
            place, ['Updates'], 'Out', in_place=True)

def test_check_grad_normal(self):
    place = core.CUDAPlace(0)
    self.check_grad_with_place(place, ['X', 'Y'], 'Out', check_eager=False)

def test_check_grad(self):
    self.check_grad_with_place(core.CUDAPlace(0), ["Logits"], "Loss")

def test_check_output(self):
    if self.use_cudnn:
        place = core.CUDAPlace(0)
        self.check_output_with_place(place, atol=1e-5)
    else:
        self.check_output()

def test_check_output(self):
    self.check_output_with_place(core.CUDAPlace(0), atol=5e-2)

def test_slice(self):
    place = fluid.CPUPlace()
    self._test_slice(place)

    if core.is_compiled_with_cuda():
        self._test_slice(core.CUDAPlace(0))

def test_check_grad(self): self.check_grad_with_place(core.CUDAPlace(0), ["Logits"], "Loss", numeric_grad_delta=6e-1, max_relative_error=6e-1)
def test_check_grad_ignore_uv(self):
    place = core.CUDAPlace(0)
    if core.is_float16_supported(place):
        self.check_grad_with_place(place, ['X'], 'Out')

def test_check_grad(self):
    place = core.CUDAPlace(0)
    if core.is_float16_supported(place):
        self.check_grad(['x0'], 'Out', max_relative_error=0.15)