def nmt_test(model_path_pruned, DATA_PATH, GPU_ID, translate_param1_path,
             translate_param2_path, group_dict):
    """Evaluate a pruned NMT checkpoint on validation data and by translation.

    :param model_path_pruned: path of the checkpoint passed to ``torch.load``.
    :param DATA_PATH: prefix used to build the dataset / vocabulary paths.
    :param GPU_ID: CUDA device index the model and iterator are placed on.
    :param translate_param1_path: forwarded to ``translate_opt_initialize``.
    :param translate_param2_path: forwarded to ``translate_opt_initialize``.
    :param group_dict: parameter grouping handed to ``MaskedModel``.
    :return: ``(total_param, sparsity, tmp_fit1, tmp_fit2)`` where
        ``tmp_fit1`` is the result of ``evaluate`` on the masked model and
        ``tmp_fit2`` the result of ``evaluate_trans`` on the translator.
    """
    cuda.set_device(GPU_ID)

    valid_data = torch.load(DATA_PATH + '')
    fields = onmt.IO.load_fields(torch.load(DATA_PATH + ''))
    valid_data.fields = fields

    # map_location returns the storage unchanged, i.e. tensors are not
    # remapped to the device they were saved from.
    checkpoint = torch.load(model_path_pruned,
                            map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']

    with cuda.device(GPU_ID):
        ref_model = onmt.ModelConstructor.make_base_model(
            model_opt, fields, True, checkpoint)
        ref_model.eval()
        ref_model.generator.eval()
        # ref_model is at current_device, no copy will happen
        masked_model = MaskedModel(ref_model, group_dict,
                                   cuda.current_device(),
                                   cuda.current_device())

    translate_opt, translate_dummy_opt = translate_opt_initialize(
        translate_param1_path, translate_param2_path, DATA_PATH,
        model_path_pruned, GPU_ID)
    translator = init_translate_model(translate_opt, translate_dummy_opt)
    # Swap the translator's own model for the one built above.
    del translator.model
    translator.model = ref_model

    # Read the reference translations and close the file right away; the
    # original left the handle open for the lifetime of the process.
    with open(translate_opt.tgt, 'r') as tt:
        references = [[t] for t in tt]

    translate_data = onmt.IO.ONMTDataset(
        translate_opt.src, translate_opt.tgt, fields,
        use_filter_pred=False)
    prune_data = onmt.IO.OrderedIterator(
        dataset=translate_data, device=GPU_ID, batch_size=1,
        train=False, sort=False, shuffle=False)

    sparsity = masked_model.get_sparsity()
    total_param = masked_model.total_parameters_of_pretrain()
    tmp_fit1 = evaluate(masked_model, valid_data, fields)
    tmp_fit2 = evaluate_trans(translator, references, prune_data,
                              translate_data)
    return total_param, sparsity, tmp_fit1, tmp_fit2
# main(): loads validation data and fields from TRAIN_DATA, builds the base
# model from a checkpoint under '/fl/deepModels/tmp/', wraps it in MaskedModel,
# applies a uniform pruning rate p = 0.3 to every group via
# change_mask/apply_mask, runs evaluate_trans and logs BLEU and perplexity.
# NOTE(review): the body is collapsed onto a single physical line, so the
# extent of the `if GPU_ID == 0 or GPU_ID == 1:` and `with cuda.device(...)`
# bodies cannot be determined here - restore the indentation from upstream.
# NOTE(review): the file opened for `references` is never closed.
def main(): valid_data = torch.load(TRAIN_DATA + '') fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + '')) valid_data.fields = fields # we need to clear this assignment relationg if we want to transfere valid among threads if GPU_ID == 0 or GPU_ID == 1: cuda.set_device(GPU_ID) with cuda.device(GPU_ID): # '/fl/deepModels/tmp/','/fl/deepModels/tmp/' checkpoint_path = '/fl/deepModels/tmp/' checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) model_opt = checkpoint['opt'] ref_model = onmt.ModelConstructor.make_base_model( model_opt, fields, True, checkpoint) ref_model.eval() ref_model.generator.eval() masked_model = MaskedModel( ref_model, group_dict, cuda.current_device(), cuda.current_device( )) # ref_model is at current_device, no copy will happen # train data loading translate_opt, translate_dummy_opt = translate_opt_initialize( '/fl/NMTSWPO/workspace/', '/fl/NMTSWPO/workspace/') translator = init_translate_model(translate_opt, translate_dummy_opt) del translator.model translator.model = masked_model tt = open(translate_opt.tgt, 'r', encoding='utf-8') references = [[t] for t in tt] p = 0.3 translate_data = onmt.IO.ONMTDataset(translate_opt.src, translate_opt.tgt, fields, use_filter_pred=False) prune_data = onmt.IO.OrderedIterator(dataset=translate_data, device=GPU_ID, batch_size=1, train=False, sort=False, shuffle=False) tmp_crate = len(masked_model.group_name_list) * [p] masked_model.change_mask(tmp_crate, apply_MP_on_mask) masked_model.apply_mask() tmp_fit = evaluate_trans(translator, references, prune_data, translate_data) logger.scalar_summary('test_bleu', tmp_fit[1] * 100, int(p * 100)) logger.scalar_summary('test_ppl', tmp_fit[0], int(p * 100)) print('percentage %s => bleu (%.4f), ppl (%.4f)' % (p * 100, tmp_fit[1] * 100, tmp_fit[0]))
def train(net, loader, ep, scheduler=None, writer=None):
    """Run one training epoch over ``loader`` and print the mean loss.

    Uses the module-level ``criterion`` and ``optimizer`` and increments the
    global step counter ``n_iter`` once per batch.

    :param net: model, called as ``net(images)``.
    :param loader: iterable of ``(images, labels)`` batches; both are moved
        with ``.cuda()``.
    :param ep: epoch number, used only in the progress and summary text.
    :param scheduler: optional; ``step()`` is called once before the epoch.
    :param writer: optional; receives
        ``add_scalar('loss/train', loss, n_iter)`` for every batch.
    """
    global n_iter
    if scheduler:
        scheduler.step()
    net.train()

    loss_all = []
    train_iter = tqdm(loader)
    for images, labels in train_iter:
        n_iter += 1
        images, labels = images.cuda(), labels.cuda()

        embedding = net(images)
        loss = criterion(embedding, labels)

        # Pull the scalar out of the tensor once and reuse it below instead
        # of calling .item() for every consumer.
        loss_value = loss.item()
        loss_all.append(loss_value)
        if writer:
            writer.add_scalar('loss/train', loss_value, n_iter)
        # NOTE(review): assumed to run for every batch, not only when a
        # writer is given - the collapsed source does not show the nesting.
        print(cuda.memory_allocated(cuda.current_device()))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_iter.set_description(
            "[Train][Epoch %d] Loss: %.5f" % (ep, loss_value))

    print('[Epoch %d] Loss: %.5f\n' % (ep, torch.Tensor(loss_all).mean()))
def check_devices():
    """Print each visible CUDA device, then the current one.

    When no device is visible a single "No GPU device found" line is
    printed instead.
    """
    n_devices = device_count()
    for idx in range(n_devices):
        print("Found device {}:".format(idx), get_device_name(idx))

    if n_devices:
        print("Current cuda device is", get_device_name(current_device()))
        return
    print("No GPU device found")
# eval(net, loader, ep): puts `net` in eval mode, runs it over `loader` under
# torch.no_grad(), then calls recall(embeddings_all, labels_all, K=K) with
# K = [1, 10, 100, 1000], prints one line per K and returns rec[0].
# NOTE(review): this copy is damaged - both `.append(` calls and the
# `embeddings_all =` / `labels_all =` assignments have lost their operands,
# and the body is collapsed onto one line. Restore from upstream before use.
# NOTE(review): the per-K print formats `k+1`, so K=1 is reported as
# "Recall@2" - confirm whether that offset is intended.
# NOTE(review): the function name shadows the builtin `eval`.
def eval(net, loader, ep): K = [1, 10, 100, 1000] net.eval() test_iter = tqdm(loader) embeddings_all, labels_all = [], [] test_iter.set_description("[Eval][Epoch %d]" % ep) with torch.no_grad(): for images, labels in test_iter: images, labels = images.cuda(), labels.cuda() embedding = net(images) embeddings_all.append( labels_all.append( print(cuda.memory_allocated(cuda.current_device())) embeddings_all = labels_all = rec = recall(embeddings_all, labels_all, K=K) print("Embedding Size: %d" % len(embeddings_all)) print(labels_all.sum()) for k, r in zip(K, rec): print('[Epoch %d] Recall@%d: [%.4f]\n' % (ep, k+1, 100 * r)) return rec[0]
def which_processor():
    """Print whether torch can use a CUDA device, naming it if so."""
    if not is_available():
        print("Cuda is not available. is using CPU")
        return

    gpu_name = get_device_name(current_device())
    print(f" (Torch) is using GPU: {gpu_name}")
# Test method: skipped when cuda.is_compiled() is false; otherwise checks that
# the layer's analog tile reports device('cuda', current_device()) and that
# the tile instance is of the class mapped in `tile_classes`.
# NOTE(review): this copy is damaged - the statement after
# `AnalogSequential(layer)` and the first argument of several
# `self.assertEqual(...)` calls are missing, and the body is collapsed onto
# one line. Restore from upstream before running.
def test_sequential_move_to_cuda_via_to(self): """Test moving AnalogSequential to cuda (from CPU), using ``.to()``.""" if not cuda.is_compiled(): raise SkipTest('not compiled with CUDA support') # Map the original tile classes to the expected ones after `cuda()`. tile_classes = { tiles.AnalogTile: tiles.CudaAnalogTile, tiles.CudaAnalogTile: tiles.CudaAnalogTile } layer = self.get_layer() expected_class = tile_classes[layer.analog_tile.tile.__class__] expected_device = device('cuda', current_device()) # Create a container and move to cuda. model = AnalogSequential(layer)'cuda')) analog_tile = layer.analog_tile self.assertEqual(analog_tile.device, expected_device) self.assertEqual(analog_tile.get_analog_ctx().data.device, expected_device) if analog_tile.shared_weights is not None: self.assertEqual(, expected_device) self.assertEqual([0], analog_tile.tile.get_x_size()) self.assertEqual([1], analog_tile.tile.get_d_size()) # Assert the tile has been moved to cuda. self.assertIsInstance(layer.analog_tile.tile, expected_class)
def get_device_id(self) -> Union[str, torch.device]:
    """Return the device to run on.

    :return: ``torch.device('cuda', idx)`` when CUDA is available, where
        ``idx`` is ``self.non_default_device_to_use`` if set and the current
        CUDA device otherwise; the string ``"cpu"`` when CUDA is unavailable
        and ``self.fallback_to_cpu`` is truthy.
    :raises CudaNotAvailable: CUDA is unavailable and CPU fallback is off.
    """
    if cuda.is_available():
        # Test for None explicitly: device index 0 is a valid request and
        # must not be mistaken for "unset", which `x or default` would do.
        device_id = self.non_default_device_to_use
        if device_id is None:
            device_id = cuda.current_device()
        return torch.device('cuda', device_id)
    if self.fallback_to_cpu:
        return "cpu"
    raise CudaNotAvailable("Cuda not available")
def get_gpu_statistics(self):
    """Print peak and current allocated/cached memory of the current GPU.

    Four lines are written to stdout, all for ``cuda.current_device()``.
    """
    # `device_idx` rather than `id`, which would shadow the builtin.
    device_idx = cuda.current_device()

    # memory_cached / max_memory_cached are deprecated aliases of
    # memory_reserved / max_memory_reserved; prefer the new names and fall
    # back only on PyTorch versions that predate them.
    max_reserved = getattr(cuda, 'max_memory_reserved', None)
    if max_reserved is None:
        max_reserved = cuda.max_memory_cached
    reserved = getattr(cuda, 'memory_reserved', None)
    if reserved is None:
        reserved = cuda.memory_cached

    print("Max memory allocated on GPU %d: %d bytes"
          % (device_idx, cuda.max_memory_allocated(device_idx)))
    print("Max memory cached on GPU %d: %d bytes"
          % (device_idx, max_reserved(device_idx)))
    print("Current memory allocated on GPU %d: %d bytes"
          % (device_idx, cuda.memory_allocated(device_idx)))
    print("Current memory cached on GPU %d: %d bytes"
          % (device_idx, reserved(device_idx)))
def get_device(force_cpu: bool) \
        -> Tuple[str, str]:
    """Pick the compute device and report its name.

    :param force_cpu: When true, CUDA is not used even if available.
    :type force_cpu: bool
    :return: ``('cuda', <GPU name>)`` or ``('cpu', <processor name>)``.
    :rtype: str, str
    """
    if cuda.is_available() and not force_cpu:
        gpu_name = cuda.get_device_name(cuda.current_device())
        return 'cuda', gpu_name
    return 'cpu', processor()
def enable_GPU(self):
    """Check that TensorFlow sees GPU 0 and return torch's current device.

    :return: the value of ``cuda.current_device()``.
    :raises SystemError: TensorFlow's GPU device name is not
        ``'/device:GPU:0'``.
    """
    # Ask TensorFlow which GPU device it sees.
    device_name = tf.test.gpu_device_name()

    # Anything other than the first GPU's canonical name counts as missing.
    if device_name != '/device:GPU:0':
        raise SystemError('GPU device not found')
    print('Found GPU at: {}'.format(device_name))

    # The returned index comes from torch, not TensorFlow.
    return cuda.current_device()
def device_info(the_device: str) \
        -> None:
    """Report the name of the device in use through ``cmd_msg``.

    :param the_device: Device string; values starting with ``'cuda'``
        select the GPU name, anything else the processor name.
    :type the_device: str
    """
    from platform import processor
    from torch.cuda import current_device, get_device_name

    if the_device.startswith('cuda'):
        actual_device = get_device_name(current_device())
    else:
        actual_device = processor()

    cmd_msg(f'Using device: `{actual_device}`.')
def __init__(self, source_tile: AnalogTile):
    """Build this tile from ``source_tile`` and move it to the current GPU.

    :param source_tile: tile whose sizes, rpu config, bias and transpose
        flags are reused.
    :raises CudaError: ``cuda.is_compiled()`` reports no CUDA support.
    """
    if not cuda.is_compiled():
        raise CudaError('aihwkit has not been compiled with CUDA support')

    # Work on a private copy of the rpu config rather than sharing it.
    rpu_config_copy = deepcopy(source_tile.rpu_config)

    # Initialise the parent with the source tile's geometry and flags.
    parent_args = (
        source_tile.out_size,
        source_tile.in_size,
        rpu_config_copy,
        source_tile.bias,
        source_tile.in_trans,
        source_tile.out_trans,
    )
    super().__init__(*parent_args)

    self.cuda(current_device())
# __init__(source_tile): raises CudaError when cuda.is_compiled() is false,
# deep-copies source_tile.rpu_config, initialises the parent with the source
# tile's sizes/bias/transpose flags, replaces self.tile with
# tiles.CudaFloatingPointTile(source_tile.tile) and sets self.device.
# NOTE(review): this copy is damaged - the assignment target in front of
# `= current_stream()` is missing and the body is collapsed onto one line.
# Restore from upstream before use.
def __init__(self, source_tile: FloatingPointTile): if not cuda.is_compiled(): raise CudaError('aihwkit has not been compiled with CUDA support') # Create a new instance of the rpu config. new_rpu_config = deepcopy(source_tile.rpu_config) # Create the tile, replacing the simulator tile. super().__init__(source_tile.out_size, source_tile.in_size, new_rpu_config, source_tile.bias, source_tile.in_trans, source_tile.out_trans) self.tile = tiles.CudaFloatingPointTile(source_tile.tile) # Set the cuda properties = current_stream() self.device = torch_device(current_device())
# __init__(out_size, in_size, ...): raises RuntimeError when
# cuda.is_compiled() is false, initialises the parent with the given
# arguments, wraps self.tile in tiles.CudaAnalogTile and sets self.device.
# NOTE(review): this copy is damaged - the assignment target in front of
# `= current_stream()` is missing and the body is collapsed onto one line.
# Restore from upstream before use.
def __init__(self, out_size: int, in_size: int, resistive_device: Optional[BaseResistiveDevice] = None, bias: bool = False, in_trans: bool = False, out_trans: bool = False): if not cuda.is_compiled(): raise RuntimeError( 'aihwkit has not been compiled with CUDA support') super().__init__(out_size, in_size, resistive_device, bias, in_trans, out_trans) self.tile = tiles.CudaAnalogTile(self.tile) = current_stream() self.device = torch_device(current_device())
def make_loss_compute(model, tgt_vocab, dataset, gpu_id=None,
                      copy_attn=False, copy_attn_force=False):
    """
    This returns user-defined LossCompute object, which is used to
    compute loss in train/validate process. You can implement your
    own *LossCompute class, by subclassing LossComputeBase.

    :param model: model whose ``generator`` is handed to the loss object.
    :param tgt_vocab: target vocabulary passed to the loss constructor.
    :param dataset: only used by the copy-attention loss.
    :param gpu_id: CUDA device index for the loss object; ``None`` selects
        the current device.
    :param copy_attn: build ``CopyGeneratorLossCompute`` instead of
        ``NMTLossCompute``.
    :param copy_attn_force: forwarded to ``CopyGeneratorLossCompute``.
    :return: the loss object, after ``.cuda(gpu_id)`` has been called on it.
    """
    if copy_attn:
        compute = onmt.modules.CopyGeneratorLossCompute(
            model.generator, tgt_vocab, dataset, copy_attn_force)
    else:
        compute = onmt.Loss.NMTLossCompute(model.generator, tgt_vocab)

    # Identity test: `== None` would dispatch to a custom __eq__.
    if gpu_id is None:
        gpu_id = cuda.current_device()
    compute.cuda(gpu_id)

    return compute
# __init__(source_tile): raises RuntimeError when cuda.is_compiled() is false,
# deep-copies source_tile.resistive_device, initialises the parent with the
# source tile's sizes/bias/transpose flags, replaces self.tile with
# tiles.CudaAnalogTile(source_tile.tile) and sets self.device.
# NOTE(review): this copy is damaged - the assignment target in front of
# `= current_stream()` is missing and the body is collapsed onto one line.
# Restore from upstream before use.
def __init__(self, source_tile: AnalogTile): if not cuda.is_compiled(): raise RuntimeError( 'aihwkit has not been compiled with CUDA support') # Create a new instance of the resistive device. new_resistive_device = deepcopy(source_tile.resistive_device) # Create the tile, replacing the simulator tile. super().__init__(source_tile.out_size, source_tile.in_size, new_resistive_device, source_tile.bias, source_tile.in_trans, source_tile.out_trans) self.tile = tiles.CudaAnalogTile(source_tile.tile) # Set the cuda properties = current_stream() self.device = torch_device(current_device())
def config_cuda(use_cuda):
    """Choose between CPU and CUDA and print what was chosen.

    For CUDA, device 0 is made current and its index, name and properties
    are printed.

    :param use_cuda: falsy to force CPU.
    :return: ``'cpu'`` when ``use_cuda`` is falsy or CUDA is unavailable,
        otherwise ``'cuda'``.
    """
    # The original also evaluated bare `torch.device(...)` expressions here.
    # Those only construct and discard a device object, so they are dropped
    # to avoid suggesting that they configure anything.
    if not use_cuda:
        print('Using cpu')
        return 'cpu'
    if not cuda.is_available():
        print('Cuda not found, using cpu')
        return 'cpu'

    print('Configuring cuda...')
    cuda.set_device(0)
    current_dev = cuda.current_device()
    current_dev_name = cuda.get_device_name(current_dev)
    current_dev_specs = cuda.get_device_properties(current_dev)
    print(f'Current Device: {current_dev}')
    print(f'Current Device Name: {current_dev_name}')
    print(f'Current Device Specs: {current_dev_specs}')
    print()
    return 'cuda'
# Test method: skipped when cuda.is_compiled() is false; otherwise moves an
# AnalogSequential model to cuda, saves its state dict to a TemporaryFile,
# loads it back and checks the tile's device and class.
# NOTE(review): this copy is damaged - the first argument of several
# `self.assertEqual(...)` calls is missing and the body is collapsed onto one
# line. Restore from upstream before running.
# NOTE(review): no `file.seek(0)` is visible between save() and load();
# confirm against upstream whether one was lost with the other text.
def test_save_with_cuda(self): """Whether model is correctly reconstructed after saving""" if not cuda.is_compiled(): raise SkipTest('not compiled with CUDA support') # Map the original tile classes to the expected ones after `cuda()`. tile_classes = { tiles.AnalogTile: tiles.CudaAnalogTile, tiles.CudaAnalogTile: tiles.CudaAnalogTile } layer = self.get_layer() model = AnalogSequential(layer) model.cuda() with TemporaryFile() as file: save(model.state_dict(), file) # Create a new model and load its state dict. checkpoint = load(file) model.load_state_dict(checkpoint) expected_device = device('cuda', current_device()) expected_class = tile_classes[layer.analog_tile.tile.__class__] analog_tile = model[0].analog_tile self.assertEqual(analog_tile.device, expected_device) self.assertEqual(analog_tile.get_analog_ctx().data.device, expected_device) if analog_tile.shared_weights is not None: self.assertEqual(, expected_device) self.assertEqual([0], analog_tile.tile.get_x_size()) self.assertEqual([1], analog_tile.tile.get_d_size()) # Assert the tile has been moved to cuda. self.assertIsInstance(layer.analog_tile.tile, expected_class)
def __init__(self, model, use_cuda=None, model_name='nnTrainer_model'):
    """Set up the trainer: results folder, device selection, loss history.

    :param model: stored as ``self.model``.
    :param use_cuda: honoured only when it is not ``None`` and CUDA is
        available; otherwise ``self.use_cuda`` is ``cuda.is_available()``.
    :param model_name: the part before the first ``'.'`` is kept as
        ``self.model_name``.
    """
    # Basics
    super(nnTrainer, self).__init__()
    self.model = model
    self.model_name = model_name.split('.')[0]
    self.results_path = 'results'
    if not os.path.exists(self.results_path):
        os.makedirs(self.results_path)

    # Use CUDA?  Query availability once, and test the argument against
    # None by identity rather than with `!=`.
    cuda_available = cuda.is_available()
    if use_cuda is not None and cuda_available:
        self.use_cuda = use_cuda
    else:
        self.use_cuda = cuda_available

    if self.use_cuda:
        device_name = 'cuda:' + str(cuda.current_device())
    else:
        device_name = 'cpu'
    self.device = torch.device(device_name)
    clog('Model CUDA:', self.use_cuda, '| Device:', self.device)

    # Current loss and loss history
    self.train_loss = 0
    self.valid_loss = 0
    self.train_loss_hist = []
    self.valid_loss_hist = []
# make_sequence_video(cfg): reads a JSON spec from the path `cfg`, loads a
# model, builds train/val/test frames with split_data, concatenates them into
# one DataFrame, wraps it in PixelLevelDs and calls make_non_repeating_video.
# NOTE(review): this copy is damaged - the `load_model(...)` call has lost
# arguments (`,, #, "model_params.json")`) and the body is collapsed onto one
# line. Restore from upstream before use.
# NOTE(review): `device` is computed but not used in the visible text.
def make_sequence_video(cfg): with open(cfg) as fd: data_specs = json.load(fd) temp_size = data_specs['temp_size'] num_of_temp_features = data_specs['temp_features'] m = load_model(data_specs['model_path'],, #, "model_params.json") use_gpu = torch_cuda.is_available() device = torch_device(torch_cuda.current_device()) if use_gpu else torch_device("cpu") r = get_images_classes( data_specs['images_path'], data_specs['info_dict'], data_specs['class_of_interest'] ) train_d, val_d, test_d = split_data( data_specs['positive_ev_path'], r, window_size=temp_size, future_size=0, shuffle=False ) full_df = pd.concat([train_d, val_d, test_d], ignore_index=True) info_path = data_specs["info_path"] ds = PixelLevelDs( full_df, info_path=info_path, add_polarity="neg" if num_of_temp_features > 1 else "", ) # make_video(m, ds) make_non_repeating_video(m, ds)
# main(): iterative prune-and-retrain driver for an NMT model.
# For up to total_times (100) rounds it: evaluates the masked model, sweeps a
# uniform pruning rate over np.arange(0., 1, 0.01), stops at the first rate
# whose accuracy falls more than acc_percent_prune below the unpruned accuracy
# and keeps the previous rate, applies that rate, saves a checkpoint via
# update_checkpoint, retrains with train_model (calling exit() if accuracy is
# not recovered), then saves again and reports accuracy, sparsity and BLEU.
# NOTE(review): this copy is damaged - both `torch.load( + '')` calls have
# lost their first operand, and the body is collapsed onto three physical
# lines. Restore from upstream before use.
# NOTE(review): the file opened for `references` is never closed.
def main(): total_times = 100 run_times = 0 init_threshold = ... start_t = time.time() valid_data = torch.load(TRAIN_DATA + '') fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + '')) # fields = onmt.IO.load_fields_from_vocab(torch.load(TRAIN_DATA + '')) valid_data.fields = fields # we need to clear this assignment relationg if we want to transfere valid among threads checkpoint = torch.load(weights, map_location=lambda storage, loc: storage) model_opt = checkpoint['opt'] # masked_models = [] with cuda.device(GPU_ID): ref_model = onmt.ModelConstructor.make_base_model( model_opt, fields, True, checkpoint) ref_model.eval() ref_model.generator.eval() masked_model = MaskedModel( ref_model, group_dict, cuda.current_device(), cuda.current_device( )) # ref_model is at current_device, no copy will happen # masked_models.append(masked_model) train_opt, _, _ = opt_initialize(checkpoint, '', '') if GPU_ID: cuda.set_device(GPU_ID) # only the original accuracy is needed acc_of_no_prune = 0 get_acc_of_no_prune = False print(time_now(), "start while") while run_times < total_times: print("-----------------------------------------") print("-----------------------------------------") print("start Iteration ", run_times) print("----------------test model for masked_model------") masked_model.make_evaluable() tmp_fit = evaluate(masked_model, valid_data, fields) print('acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0])) model_sparsity = masked_model.get_sparsity() print('Sparsity: {}'.format(model_sparsity)) # init threshold best_threshold = 0 itr_time = time.time() xxx = np.arange(0., 1, 0.01) print(time_now(), "start testing pruning") masked_model.make_evaluable() for i in range(len(xxx)): # best_threshold = 0.2 # break tmp_crate = len(masked_model.group_name_list) * [xxx[i]] masked_model.change_mask(tmp_crate, apply_MP_on_mask) masked_model.apply_mask() tmp_fit = evaluate(masked_model, valid_data, fields) print('percentage %s => acc (%.4f), ppl (%.4f)' % (xxx[i] * 100, tmp_fit[1], tmp_fit[0])) if i == 
0 and not get_acc_of_no_prune: acc_of_no_prune = tmp_fit[1] acc_of_no_prune = int(acc_of_no_prune * 10) / 10 get_acc_of_no_prune = True elif acc_of_no_prune - tmp_fit[1] > acc_percent_prune: best_threshold = xxx[i] - 0.01 break # ------------------------------------------------- # Start writing # prune again print(time_now(), " init accuracy of model:", acc_of_no_prune) print("accuracy constraint:", acc_percent_prune) print("-------test------------:", get_acc_of_no_prune) print(time_now(), " apply pruning with threshold:", best_threshold) tmp_crate = len(masked_model.group_name_list) * [best_threshold] masked_model.change_mask(tmp_crate, apply_MP_on_mask) masked_model.apply_mask() # print information tmp_fit = evaluate(masked_model, valid_data, fields) print('percentage %s => acc (%.4f), ppl (%.4f)' % (best_threshold * 100, tmp_fit[1], tmp_fit[0])) model_sparsity = masked_model.get_sparsity() print('Sparsity: {}'.format(model_sparsity)) #--------------- start retraining -------------- # first store model print(time_now(), "start saving model") _, saved_model = update_checkpoint(checkpoint, masked_model, run_times, acc_percent_prune) print(time_now(), "finish saving model:", saved_model) model_for_train = masked_model pretrained_leaf_dict = model_for_train.make_trainable() optim = build_optim(model_for_train.masked_model, checkpoint, train_opt, pretrained_leaf_dict) print("finish building optim") print(time_now(), "start loading data for retraining") train = torch.load( + '') valid = torch.load( + '') train_fields = load_fields(train, valid, checkpoint, train_opt) print(time_now(), "finish data loading") recovered = train_model(model_for_train, train, valid, train_fields, optim, train_opt, run_times, acc_of_no_prune) print(time_now(), "finish retraining ") if not recovered: exit() else: print("------------Accuracy recorverd!--------------------") print("recovered accuracy:", acc_of_no_prune) masked_model = MaskedModel(model_for_train.masked_model, group_dict, 
cuda.current_device(), cuda.current_device()) #------------------------------------------------- print('------------- save checkpoint ---------------') _, saved_model = update_checkpoint(checkpoint, model_for_train, run_times, acc_percent_prune, t=True) print(time_now(), ' saving model:', saved_model) print("-------------print evaluation info ---------------") model_for_train.make_evaluable() tmp_fit = evaluate(model_for_train, valid_data, fields) print('acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0])) model_sparsity = model_for_train.get_sparsity() print('Sparsity: {}'.format(model_sparsity)) print("----------------test model for masked_model------") masked_model.make_evaluable() tmp_fit = evaluate(masked_model, valid_data, fields) print('acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0])) model_sparsity = masked_model.get_sparsity() print('Sparsity: {}'.format(model_sparsity)) #-------------------------------------------------- print("BLEU evaluation:") translate_opt, translate_dummy_opt = translate_opt_initialize( '', '') translator = init_translate_model(translate_opt, translate_dummy_opt) del translator.model translator.model = model_for_train.masked_model tt = open(translate_opt.tgt, 'r') references = [[t] for t in tt] translate_data = onmt.IO.ONMTDataset(translate_opt.src, translate_opt.tgt, fields, use_filter_pred=False) prune_data = onmt.IO.OrderedIterator(dataset=translate_data, device=GPU_ID, batch_size=1, train=False, sort=False, shuffle=False) tmp_fit2 = evaluate_trans(translator, references, prune_data, translate_data) print('Finsished => bleu (%.4f), ppl (%.4f)' % (tmp_fit2[1] * 100, tmp_fit2[0])) #-------------------------------------------------- run_times += 1
# Script header: imports, prints CUDA availability / device count / current
# device, then builds an argparse parser from opts.model_opts and
# opts.train_opts and copies word_vec_size onto the src/tgt sizes when it is
# not -1.
# NOTE(review): this copy is damaged - one `import` has lost its module name
# and all statements are collapsed onto one line.
# NOTE(review): `print x` is Python 2 statement syntax, and `torch` is used
# without a visible `import torch`.
from torch import cuda import onmt import import onmt.Models import onmt.ModelConstructor import onmt.modules from onmt.Utils import use_gpu import opts import argparse import glob print torch.cuda.is_available() print cuda.device_count() print cuda.current_device() parser = argparse.ArgumentParser( description='', formatter_class=argparse.ArgumentDefaultsHelpFormatter) # opts.add_md_help_argument(parser) opts.model_opts(parser) opts.train_opts(parser) opt = parser.parse_args() if opt.word_vec_size != -1: opt.src_word_vec_size = opt.word_vec_size opt.tgt_word_vec_size = opt.word_vec_size
# Environment smoke test: reports what TensorFlow, PyTorch, Keras, the NVIDIA
# command-line tools and Ray each see in terms of GPUs.
import tensorflow as tf

hello = tf.constant("hello TensorFlow!")
sess = tf.Session()

# List installed distributions whose description mentions 'pytorch'.
import pkg_resources

l = list(filter(lambda d: 'pytorch' in str(d), pkg_resources.working_set))
print(l)

# Check that PyTorch can see the GPU.
from torch import cuda
import torch

print('PyTorch version {}'.format(torch.__version__))
print('PyTorch cuda available {}'.format(cuda.is_available()))
print('PyTorch device count {}'.format(cuda.device_count()))
print('PyTorch device {}'.format(cuda.get_device_name(cuda.current_device())))

# Check that Keras can see the GPU.
from keras import backend

print('keras GPUs: {}'.format(backend.tensorflow_backend._get_available_gpus()))

# Driver and toolkit information straight from the NVIDIA tools.
import os

for cmd in ('nvidia-smi', 'nvcc --version'):
    os.system(cmd)

# Check which GPU ids Ray hands out when one GPU is requested.
import ray

ray.init(num_gpus=1)
print('ray GPU IDs {}'.format(ray.get_gpu_ids()))
# main(): iterative prune-and-retrain driver for an NMT model (variant that
# reloads the saved checkpoint before retraining).
# For up to total_times (100) rounds it: sweeps a uniform pruning rate over
# np.arange(0., 1, 0.01), stops at the first rate whose accuracy falls more
# than acc_percent_prune (1) below the unpruned accuracy and keeps the
# previous rate, applies it, saves via update_checkpoint, reloads that
# checkpoint from SAVE_MODEL_TMP_FOLDER, rebuilds the model with
# init_train_model, retrains with train_model (calling exit() if accuracy is
# not recovered) and evaluates again.
# NOTE(review): this copy is damaged - both `torch.load( + '')` calls have
# lost their first operand, and the body is collapsed onto three physical
# lines. Restore from upstream before use.
# NOTE(review): the file opened for `references` is reopened every round and
# never closed.
def main(): total_times = 100 run_times = 0 init_threshold = ... start_t = time.time() valid_data = torch.load(TRAIN_DATA + '') fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + '')) # fields = onmt.IO.load_fields_from_vocab(torch.load(TRAIN_DATA + '')) valid_data.fields = fields # we need to clear this assignment relationg if we want to transfere valid among threads checkpoint = torch.load(weights, map_location=lambda storage, loc: storage) model_opt = checkpoint['opt'] masked_models = [] with cuda.device(GPU_ID): ref_model = onmt.ModelConstructor.make_base_model( model_opt, fields, True, checkpoint) ref_model.eval() ref_model.generator.eval() masked_model = MaskedModel( ref_model, group_dict, cuda.current_device(), cuda.current_device( )) # ref_model is at current_device, no copy will happen masked_models.append(masked_model) if GPU_ID: cuda.set_device(GPU_ID) # 1 means 1% acc acc_percent_prune = 1 # only the original accuracy is needed acc_of_no_prune = 0 get_acc_of_no_prune = False print(time_now(), "start while") while run_times < total_times: print("-----------------------------------------") print("start Iteration ", run_times) # init threshold best_threshold = 0 itr_time = time.time() ''' display all the names of parameters ''' ''' aa=ref_model.named_parameters aa_namelist = [ak[0] for ak in aa] ''' ''' test MP ''' translate_opt, translate_dummy_opt = translate_opt_initialize( '', '') translator = init_translate_model(translate_opt, translate_dummy_opt) del translator.model translator.model = masked_model tt = open(translate_opt.tgt, 'r') references = [[t] for t in tt] xxx = np.arange(0., 1, 0.01) #for i in range(len(masked_model.group_name_list)): # tmp_crate = len(masked_model.group_name_list)*[0.] 
print(time_now(), "start testing pruning") masked_model.make_evaluable() for i in range(len(xxx)): # best_threshold = 0.55 # break translate_data = onmt.IO.ONMTDataset(translate_opt.src, translate_opt.tgt, fields, use_filter_pred=False) prune_data = onmt.IO.OrderedIterator(dataset=translate_data, device=GPU_ID, batch_size=1, train=False, sort=False, shuffle=False) tmp_crate = len(masked_model.group_name_list) * [xxx[i]] #tmp_crate[i] = 0.01 masked_model.change_mask(tmp_crate, apply_MP_on_mask) masked_model.apply_mask() tmp_fit = evaluate(masked_model, valid_data, fields) #tmp_fit = evaluate_trans(translator, references, prune_data, translate_data) #logger.scalar_summary('test_bleu', tmp_fit[1]*100, int(xxx[i]*100)) #logger.scalar_summary('acc', tmp_fit[1], int(xxx[i]*100)) #logger.scalar_summary('ppl', tmp_fit[0], int(xxx[i]*100)) #logger.scalar_summary('test_ppl', tmp_fit[0], int(xxx[i]*100)) #print('group %s => acc (%.4f), ppl (%.4f)' % (masked_model.group_name_list[i], tmp_fit[1], tmp_fit[0])) #print('percentage %s => bleu (%.4f), ppl (%.4f)' % (xxx[i]*100, tmp_fit[1]*100, tmp_fit[0])) # print('percentage %s => acc (%.4f), ppl (%.4f)' % (xxx[i]*100, tmp_fit[1], tmp_fit[0])) if i == 0 and not get_acc_of_no_prune: acc_of_no_prune = tmp_fit[1] acc_of_no_prune = int(acc_of_no_prune * 100) / 100 get_acc_of_no_prune = True elif acc_of_no_prune - tmp_fit[1] > acc_percent_prune: best_threshold = xxx[i] - 0.01 break # ------------------------------------------------- # Start writing # prune again print(time_now(), " start accuracy:", acc_of_no_prune) print("-------test------------:", get_acc_of_no_prune) print(time_now(), " apply pruning with threshold:", best_threshold) tmp_crate = len(masked_model.group_name_list) * [best_threshold] masked_model.change_mask(tmp_crate, apply_MP_on_mask) masked_model.apply_mask() # print information tmp_fit = evaluate(masked_model, valid_data, fields) print('percentage %s => acc (%.4f), ppl (%.4f)' % (best_threshold * 100, tmp_fit[1], 
tmp_fit[0])) model_sparsity = masked_model.get_sparsity() print('Sparsity: {}'.format(model_sparsity)) #--------------- start retraining -------------- # first store model print(time_now(), "start saving model") _, saved_model = update_checkpoint(checkpoint, masked_model, run_times) print(time_now(), "finish saving model") print(time_now(), "start loading model") checkpoint = torch.load(SAVE_MODEL_TMP_FOLDER + saved_model, map_location=lambda storage, loc: storage) train_opt, _, _ = opt_initialize(checkpoint, '', '') # train data loading print(time_now(), "start loading data for retraining") train = torch.load( + '') valid = torch.load( + '') print(time_now(), "finish data loading") train_fields = load_fields(train, valid, checkpoint, train_opt) model_for_train = init_train_model(checkpoint, train_opt, train_fields) masked_model = MaskedModel(model_for_train, group_dict, cuda.current_device(), cuda.current_device()) masked_model.make_trainable() print(time_now(), "building optm") optim = build_optim(model_for_train, checkpoint, train_opt) print(time_now(), "start restraining") recovered = train_model(model_for_train, train, valid, train_fields, optim, train_opt, run_times, acc_of_no_prune) print(time_now(), "finish retraining ") if not recovered: exit() else: print("------------Accuracy recorverd!--------------------") print("recovered accuracy:", acc_of_no_prune) run_times += 1 masked_model.make_evaluable() tmp_fit = evaluate(masked_model, valid_data, fields) print("------------------for test-------------------") print('percentage %s => acc (%.4f), ppl (%.4f)' % (best_threshold * 100, tmp_fit[1], tmp_fit[0]))
# main(): iterative prune-and-retrain driver for a language model.
# Builds the corpus from "<PROJECT_ROOT>/data/penn", loads the model with
# torch.load and wraps it in MaskedModel. For up to total_times (20) rounds it
# sweeps a uniform pruning rate of 1%..99%, keeps the last rate before
# validation accuracy drops more than acc_percent_prune below the original,
# calls exit() when no rate qualifies or the rate equals the previous round's,
# applies the rate, retrains for up to RETRAIN_EPOCHS epochs (stopping early
# once validation accuracy reaches the original) and calls exit() if accuracy
# was not recovered.
# NOTE(review): this copy is damaged - the statement that saves the model
# after `saved_model_name = '' % (...)`, and the statement inside the
# `with open(...) as f:` block, have lost text; the body is collapsed onto
# three physical lines. Restore from upstream before use.
# NOTE(review): `orginal_acc = 0` looks like a misspelling of `original_acc`,
# which is assigned separately later.
# NOTE(review): LR is divided by 4.0 locally, but the visible train(...) call
# does not receive it - confirm whether the annealing has any effect.
def main(): data_path = "{}/data/penn".format(cfg.PROJECT_ROOT) model_path = "{}/model/original_model/language_model/{}".format( cfg.PROJECT_ROOT, '') total_times = 20 run_times = 0 orginal_acc = 0 init_threshold = ... start_t = time.time() # get data corpus = data.Corpus(data_path) ntokens = len(corpus.dictionary) eval_batch_size = TEST_BATCH_SIZE train_data = batchify(corpus.train, TRAIN_BATCH_SIZE) val_data = batchify(corpus.valid, TEST_BATCH_SIZE) valid_data = val_data test_data = batchify(corpus.test, TEST_BATCH_SIZE) ref_model = None # Load the best saved model. with cuda.device(GPU_ID): ff = open(model_path, 'rb') ref_model = torch.load(ff) ref_model.eval() masked_model = MaskedModel( ref_model, group_dict, cuda.current_device(), cuda.current_device( )) # ref_model is at current_device, no copy will happen #pdb.set_trace() ff.close() if GPU_ID: cuda.set_device(GPU_ID) print(time_now(), "get accuray of no pruning model") masked_model.make_evaluable() tmp_crate = len(masked_model.group_name_list) * [0] masked_model.change_mask(tmp_crate, apply_MP_on_mask) masked_model.apply_mask() tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus, TEST_BATCH_SIZE) # only the original accuracy is needed acc_of_no_prune = tmp_fit[1] fit_of_no_prune = tmp_fit original_acc = acc_of_no_prune pruning_arr = [] ppl_arr = [] #acc_of_no_prune = int(acc_of_no_prune*10)/10 print("init accuracy of model:", acc_of_no_prune) print("accuracy constraint:", acc_percent_prune) init_threshold = [0] while run_times < total_times: print("-----------------------------------------") print("-----------------------------------------") print("-----------------------------------------") print("start Iteration ", run_times) print("test model---------------") LR = LR_INIT previous_pr = None previous_fit = None best_pr = None best_fit = None for prune_rate in range(1, 100): tmp_crate = len(masked_model.group_name_list) * [0.01 * prune_rate] masked_model.change_mask(tmp_crate, apply_MP_on_mask) 
masked_model.apply_mask() tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus, TEST_BATCH_SIZE) print( "each layer {} \% | {} % in total => validation acc {}\%, validation ppl {}" .format(prune_rate, masked_model.get_sparsity() * 100, tmp_fit[1] * 100., tmp_fit[0])) if (not best_pr) and (tmp_fit[1] + acc_percent_prune) < original_acc: best_pr = previous_pr best_fit = previous_fit previous_pr = tmp_crate previous_fit = tmp_fit print('==============================') print("The best pruning rates are: {}".format(best_pr)) if (not best_pr) or (best_pr[0] == init_threshold[0]): print( "Not better than last iteration of pruning, stop the process.") exit() masked_model.change_mask(best_pr, apply_MP_on_mask) masked_model.apply_mask() test_fit = evaluate_lm(masked_model.masked_model, test_data, corpus, TEST_BATCH_SIZE) print("{} \% => validation acc {}\%, validation ppl {}".format( best_pr[0], best_fit[1] * 100., best_fit[0])) print("{} \% => test acc {}\%, test ppl {}".format( best_pr[0], test_fit[1] * 100., test_fit[0])) print('==============================') init_threshold = best_pr saved_model_name = '' % ( name_mark, run_times, Model_type, layer_group_type, str(acc_percent_prune)), cfg.LM_MODEL_TMP_FOLDER + saved_model_name) #--------------- start retraining -------------- model_for_train = masked_model with open(cfg.LM_MODEL_TMP_FOLDER + saved_model_name, 'rb') as f: model_tmp_load = torch.load(f) model_for_train.masked_model = model_tmp_load model_for_train.change_mask(init_threshold, apply_MP_on_mask) model_for_train.apply_mask() model_for_train.make_trainable() recovered = False best_val_loss = None try: for epoch in range(1, RETRAIN_EPOCHS + 1): epoch_start_time = time.time() train(model_for_train, ntokens, train_data, TRAIN_BATCH_SIZE, SEQ_LEN, corpus, GRAD_CLIP, TRAIN_LOG_INTERVAL, epoch) val_eval = evaluate_lm(model_for_train.masked_model, val_data, corpus, TEST_BATCH_SIZE) print('-' * 89) print( '| end of epoch {:3d} | time: {:5.2f}s | valid 
acc {:5.2f} | ' 'valid ppl {:8.2f}'.format( epoch, (time.time() - epoch_start_time), val_eval[1], val_eval[0])) val_loss = val_eval[2] print('-' * 89) # Save the model if the validation loss is the best we've seen so far. if not best_val_loss or val_loss < best_val_loss: with open( "{}/{}{}_iterative_retrain_model_runtime{}_epoch_{}.pt" .format(cfg.LM_MODEL_PATH, name_mark, acc_percent_prune, run_times, epoch), 'wb') as f:, f) best_val_loss = val_loss else: # Anneal the learning rate if no improvement has been seen in the validation dataset. LR /= 4.0 if val_eval[1] >= original_acc: recovered = True break except KeyboardInterrupt: print('-' * 89) print('Exiting from training early') print(time_now(), "finish retraining ") if not recovered: exit() else: print("------------Accuracy recorverd!--------------------") print("recovered accuracy (>= {})".format(acc_of_no_prune)) model_for_train.make_evaluable() model_for_train.apply_mask() ref_model = model_for_train.masked_model print("validate acc of the model---------------") tmp_fit = evaluate_lm(ref_model, valid_data, corpus, TEST_BATCH_SIZE) print('ref_model', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0])) print("-------------print TEST evaluation info ---------------") tmp_fit = evaluate_lm(ref_model, test_data, corpus, TEST_BATCH_SIZE) print('percentage %s => acc (%.4f), ppl (%.4f)' % (init_threshold[0] * 100, tmp_fit[1], tmp_fit[0])) masked_model = model_for_train run_times += 1
def main():
    """Drive iterative pruning + retraining of an OpenNMT translation model.

    Flow, as written:
      1. Load the validation data/fields and the checkpoint named by the
         module-level ``weights``; build the reference model on ``GPU_ID``
         and wrap it in a ``MaskedModel``.
      2. Run a BLEU evaluation of the (unpruned) wrapped model and print it.
      3. Call ``exit()``.

    Everything after step 3 (the NCS search / retrain loop) is currently
    unreachable; it is kept because it documents the intended pipeline.

    Relies on module-level names not visible in this block: ``TRAIN_DATA``,
    ``weights``, ``GPU_ID``, ``other_GPU_IDs``, ``group_dict``,
    ``START_THRESHOLD``, ``acc_percent_prune``, ``apply_MP_on_mask`` and the
    helper functions it calls.
    """
    total_times = 100
    run_times = 0
    init_threshold = ...

    valid_data = torch.load(TRAIN_DATA + '')
    fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + ''))
    # fields = onmt.IO.load_fields_from_vocab(torch.load(TRAIN_DATA + ''))
    # This assignment has to be cleared if valid_data is to be transferred among threads.
    valid_data.fields = fields

    # Load onto CPU storage first; the model is placed on the GPU below.
    checkpoint = torch.load(weights, map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']
    masked_models = []
    with cuda.device(GPU_ID):
        ref_model = onmt.ModelConstructor.make_base_model(model_opt, fields, True, checkpoint)
        ref_model.eval()
        ref_model.generator.eval()
        # ref_model is at current_device, no copy will happen
        masked_model = MaskedModel(ref_model, group_dict, cuda.current_device(), cuda.current_device())
        masked_models.append(masked_model)
    train_opt, _, _ = opt_initialize(checkpoint, '', '')

    # GPU 0 is falsy, so set_device is skipped for it (device 0 is the default).
    if GPU_ID:
        cuda.set_device(GPU_ID)

    print("BLEU evaluation:")
    translate_opt, translate_dummy_opt = translate_opt_initialize('', '')
    translator = init_translate_model(translate_opt, translate_dummy_opt)
    del translator.model
    translator.model = masked_model.masked_model
    # Read the references inside a context manager so the handle is closed.
    with open(translate_opt.tgt, 'r') as tt:
        references = [[t] for t in tt]
    translate_data = onmt.IO.ONMTDataset(translate_opt.src, translate_opt.tgt, fields, use_filter_pred=False)
    prune_data = onmt.IO.OrderedIterator(dataset=translate_data, device=GPU_ID, batch_size=1,
                                         train=False, sort=False, shuffle=False)
    tmp_fit2 = evaluate_trans(translator, references, prune_data, translate_data)
    print('Finsished => bleu (%.4f), ppl (%.4f)' % (tmp_fit2[1]*100, tmp_fit2[0]))
    # NOTE(review): this exit() makes the rest of the function unreachable.
    # If it is removed, `acc_of_no_prune` below is undefined because the
    # baseline evaluation that computed it is commented out.
    exit()

    # print(time_now(), "get accuray of no pruning model")
    # masked_model.make_evaluable()
    # tmp_crate = len(masked_model.group_name_list)*[0]
    # masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    # masked_model.apply_mask()
    # tmp_fit = evaluate(masked_model, valid_data, fields)
    # # only the original accuracy is needed
    # acc_of_no_prune = tmp_fit[1]
    # acc_of_no_prune = int(acc_of_no_prune*10)/10
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)

    while run_times < total_times:
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("-----------------------------------------")
        print(time_now(), "start Iteration ", run_times)

        print("test model---------------")
        ref_model.eval()
        ref_model.generator.eval()
        tmp_fit = evaluate(ref_model, valid_data, fields)
        print('ref_model', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        print("test model---------------")
        masked_models[0].make_evaluable()
        tmp_fit = evaluate(masked_models[0], valid_data, fields)
        print('masked_models[0]', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))
        model_sparsity = masked_models[0].get_sparsity()
        print('masked_models[0] Sparsity: {}'.format(model_sparsity))

        itr_time = time.time()
        for gpu_candidate in other_GPU_IDs:
            with cuda.device(gpu_candidate):
                # if the gpu_candidate is the same as ref_model, it will return the ref_model
                masked_models.append(MaskedModel(ref_model, group_dict, GPU_ID, gpu_candidate))
        #------------- Here -------------------------
        # del ref_model

        # do pruning
        ncs_start = time.time()
        print('Itration %d, model loading: %d sec' % (run_times, ncs_start - itr_time))
        if run_times == 0:
            if START_THRESHOLD is not None:
                init_threshold = START_THRESHOLD
            else:
                init_threshold = len(masked_models[0].group_name_list)*[0.25]
        # if run_times == 0:
        #     init_threshold = len(masked_models[0].group_name_list)*[0.25]
        print("init threshold:", init_threshold)
        # Accuracy budget for this round: the global allowance adjusted by the
        # gap between the current accuracy and the unpruned baseline.
        prune_acc_now = acc_percent_prune+tmp_fit[1]-acc_of_no_prune
        print('pruning acc now:', prune_acc_now)
        best_found, saved_model, best_masked_model = NCS_MP(
            init_threshold, 0.05, fields, masked_models, valid_data,
            prune_acc_now, run_times, checkpoint)
        init_threshold = best_found
        #best_found, saved_model, best_masked_model = NCS_MP(init_threshold, 0.05, fields, masked_models, valid_data, 0.01, run_times, checkpoint)
        end_t = time.time()
        print('NCS Time: {} min'.format((end_t - itr_time)/60.))
        print('Best found thresholds:')
        for i, group_name in enumerate(masked_models[0].group_name_list):
            print("layer {}: {}%".format(group_name, 100*best_found[i]))

        print("BLEU evaluation:")
        translate_opt, translate_dummy_opt = translate_opt_initialize('', '')
        translator = init_translate_model(translate_opt, translate_dummy_opt)
        del translator.model
        translator.model = best_masked_model
        with open(translate_opt.tgt, 'r') as tt:
            references = [[t] for t in tt]
        translate_data = onmt.IO.ONMTDataset(translate_opt.src, translate_opt.tgt, fields, use_filter_pred=False)
        prune_data = onmt.IO.OrderedIterator(dataset=translate_data, device=GPU_ID, batch_size=1,
                                             train=False, sort=False, shuffle=False)
        tmp_fit = evaluate_trans(translator, references, prune_data, translate_data)
        print('Finsished => bleu (%.4f), ppl (%.4f)' % (tmp_fit[1]*100, tmp_fit[0]))

        # NOTE(review): the former "clear no used models" loop
        # (`for gpu_model in masked_models: del gpu_model`) only unbound the
        # loop variable and released nothing, so it was dropped. The list is
        # rebound after a successful retrain below.

        #--------------- start retraining --------------
        model_for_train = best_masked_model
        pretrained_leaf_dict = model_for_train.make_trainable()
        optim = build_optim(model_for_train.masked_model, checkpoint, train_opt, pretrained_leaf_dict)
        print(time_now(), "start loading data for retraining")
        # NOTE(review): both dataset paths are blank in the source; the unary
        # `+ ''` raises TypeError at runtime. Restore the real paths before
        # re-enabling this section.
        train = torch.load( '')
        valid = torch.load( + '')
        train_fields = load_fields(train, valid, checkpoint, train_opt)
        print(time_now(), "finish data loading")

        model_for_train.change_mask(init_threshold, apply_MP_on_mask)
        model_for_train.apply_mask()
        model_for_train.make_trainable()
        recovered = train_model(model_for_train, train, valid, train_fields, optim,
                                train_opt, run_times, acc_of_no_prune)
        print(time_now(), "finish retraining ")

        if not recovered:
            exit()
        else:
            print("------------Accuracy recorverd!--------------------")
            print("recovered accuracy:", acc_of_no_prune)

        # The retrained network becomes the reference model for the next round.
        model_for_train.make_evaluable()
        ref_model = model_for_train.masked_model
        masked_models = [MaskedModel(ref_model, group_dict, cuda.current_device(),
                                     cuda.current_device())]
        print("test model---------------")
        tmp_fit = evaluate(ref_model, valid_data, fields)
        print('ref_model', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))
        print("test model---------------")
        tmp_fit = evaluate(masked_models[0], valid_data, fields)
        print('masked_models[0]', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))
        model_sparsity = masked_models[0].get_sparsity()
        print('masked_models[0] Sparsity: {}'.format(model_sparsity))

        print('------------- save checkpoint ---------------')
        _, saved_model = update_checkpoint(checkpoint, model_for_train, run_times, acc_percent_prune, t=True)
        print(time_now(), ' saving model:', saved_model)

        print("-------------print evaluation info ---------------")
        tmp_fit = evaluate(model_for_train, valid_data, fields)
        # NOTE(review): if best_found is a plain list, `best_found*100`
        # repeats the list instead of scaling it — confirm NCS_MP's return type.
        print('percentage %s => acc (%.4f), ppl (%.4f)' % (best_found*100, tmp_fit[1], tmp_fit[0]))
        #--------------------------------------------------
        print("BLEU evaluation:")
        translate_opt, translate_dummy_opt = translate_opt_initialize('', '')
        translator = init_translate_model(translate_opt, translate_dummy_opt)
        del translator.model
        translator.model = model_for_train.masked_model
        with open(translate_opt.tgt, 'r') as tt:
            references = [[t] for t in tt]
        translate_data = onmt.IO.ONMTDataset(translate_opt.src, translate_opt.tgt, fields, use_filter_pred=False)
        prune_data = onmt.IO.OrderedIterator(dataset=translate_data, device=GPU_ID, batch_size=1,
                                             train=False, sort=False, shuffle=False)
        tmp_fit2 = evaluate_trans(translator, references, prune_data, translate_data)
        print('Finsished => bleu (%.4f), ppl (%.4f)' % (tmp_fit2[1]*100, tmp_fit2[0]))
        #--------------------------------------------------
        run_times += 1
def main():
    """Drive iterative pruning + retraining of a Penn Treebank language model.

    Flow, as written:
      1. Build the corpus and batchified train/valid/test splits.
      2. Load the saved model on ``GPU_ID``, wrap it in a ``MaskedModel`` and
         measure the unpruned validation accuracy (the recovery target).
      3. For each of ``total_times`` rounds: run ``NCS_MP`` to search pruning
         thresholds, evaluate on the test split, adapt ``ncs_std`` according
         to the sparsity trend, reload the model stored by the search,
         retrain for up to ``RETRAIN_EPOCHS`` epochs, and stop the process if
         the baseline accuracy was not recovered.

    Relies on module-level names not visible in this block (``DATA_PATH``,
    ``MODEL_PATH``, ``GPU_ID``, ``other_GPU_IDs``, ``group_dict``,
    ``acc_percent_prune``, ``LR_INIT``, ``SAVE_MODEL_FOLDER``,
    ``SAVE_MODEL_TMP_FOLDER``, ``name_mark`` and the helpers it calls).
    """
    data_path = "{}/data/penn".format(DATA_PATH)
    model_path = "{}/deepModels/torch_models/language-model/{}".format(MODEL_PATH, '')
    #model_path = "{}/deepModels/torch_models/language-model/{}".format(MODEL_PATH, '')
    total_times = 1
    run_times = 0
    init_threshold = ...

    # get data
    corpus = data.Corpus(data_path)
    ntokens = len(corpus.dictionary)
    train_data = batchify(corpus.train, TRAIN_BATCH_SIZE)
    val_data = batchify(corpus.valid, TEST_BATCH_SIZE)
    valid_data = val_data
    test_data = batchify(corpus.test, TEST_BATCH_SIZE)

    # Load the best saved model.
    masked_models = []
    with cuda.device(GPU_ID):
        # Context manager closes the file even if torch.load raises.
        with open(model_path, 'rb') as ff:
            ref_model = torch.load(ff)
        ref_model.eval()
        # ref_model is at current_device, no copy will happen
        masked_model = MaskedModel(ref_model, group_dict, cuda.current_device(), cuda.current_device())
        #pdb.set_trace()
        masked_models.append(masked_model)

    # GPU 0 is falsy, so set_device is skipped for it (device 0 is the default).
    if GPU_ID:
        cuda.set_device(GPU_ID)

    print(time_now(), "get accuray of no pruning model")
    masked_model.make_evaluable()
    # An all-zero rate per group is used to measure the unpruned baseline.
    tmp_crate = len(masked_model.group_name_list)*[0]
    masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    masked_model.apply_mask()
    tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus, TEST_BATCH_SIZE)
    # only the original accuracy is needed
    acc_of_no_prune = tmp_fit[1]
    fit_of_no_prune = tmp_fit
    original_acc = acc_of_no_prune
    #acc_of_no_prune = int(acc_of_no_prune*10)/10

    print("=============TiPO start========================")
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)
    best_pr = None
    ncs_std = 0.05

    while run_times < total_times:
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("start Iteration ", run_times)
        print("test model---------------")
        # NOTE(review): LR is local to main() and is never passed to train();
        # if train() reads a module-level LR, the annealing below has no
        # effect on it — confirm.
        LR = LR_INIT
        #ref_model.generator.eval()
        print("test model---------------")
        masked_models[0].make_evaluable()
        tmp_fit = evaluate_lm(masked_models[0].masked_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('masked_models[0]', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        if run_times == 0:
            init_threshold = len(masked_models[0].group_name_list) * [0.6]

        itr_time = time.time()
        for gpu_candidate in other_GPU_IDs:
            with cuda.device(gpu_candidate):
                # if the gpu_candidate is the same as ref_model, it will return the ref_model
                masked_models.append(MaskedModel(ref_model, group_dict, GPU_ID, gpu_candidate))
        #------------- Here -------------------------
        # del ref_model

        # do pruning
        ncs_start = time.time()
        print('Itration %d, model loading: %d sec' % (run_times, ncs_start - itr_time))
        print("init threshold:", init_threshold)
        best_found, saved_model, best_masked_model = NCS_MP(
            init_threshold, ncs_std, masked_models, valid_data, corpus,
            acc_percent_prune, fit_of_no_prune, run_times)
        #best_found, saved_model, best_masked_model = init_threshold, '/raid/lab_tk/liguiying/deepModels/torch_models/language-model/prune_tmp/', masked_models[0]
        init_threshold = best_found
        #best_found, saved_model, best_masked_model = NCS_MP(init_threshold, 0.05, fields, masked_models, valid_data, 0.01, run_times, checkpoint)
        end_t = time.time()
        print('NCS Time: {} min'.format((end_t - itr_time)/60.))
        print('Best found thresholds:')
        for i, group_name in enumerate(masked_models[0].group_name_list):
            print("layer {}: {}%".format(group_name, 100*best_found[i]))

        print("TEST PPL evaluation:")
        tmp_fit = evaluate_lm(best_masked_model.masked_model, test_data, corpus, TEST_BATCH_SIZE)
        print('Finsished => acc (%.4f percent), ppl (%.4f)' % (tmp_fit[1]*100, tmp_fit[0]))

        # NOTE(review): the former "clear no used models" loop
        # (`for gpu_model in masked_models: del gpu_model`) only unbound the
        # loop variable and released nothing, so it was dropped. The list is
        # rebound after a successful retrain below.

        # Track the best sparsity seen. `is None` (not truthiness) so that a
        # legitimate sparsity of 0.0 is not mistaken for "no value yet".
        if best_pr is None:
            best_pr = best_masked_model.get_sparsity()
        else:
            tmp_pr = best_masked_model.get_sparsity()
            if best_pr > tmp_pr:
                print("No improvement! Stop the PROCESS.")
                exit()
            elif best_pr == tmp_pr:
                # Same sparsity as before: shrink the search step if accuracy
                # is below the baseline, otherwise enlarge it.
                if tmp_fit[1] < fit_of_no_prune[1]:
                    ncs_std /= 10
                else:
                    ncs_std *= 10
            else:
                best_pr = tmp_pr
        #if run_times % 5 == 0:
        #    ncs_std /= 10

        #--------------- start retraining --------------
        model_for_train = best_masked_model
        #pretrained_leaf_dict = model_for_train.make_trainable()
        #print(model_for_train.map_dict.keys())
        #pdb.set_trace()
        #fix_no_leaf(model_for_train, pretrained_leaf_dict)
        #pdb.set_trace()
        # Reload the network stored by the search and re-apply the chosen mask.
        with open(SAVE_MODEL_TMP_FOLDER + saved_model, 'rb') as f:
            model_tmp_load = torch.load(f)
        model_for_train.masked_model = model_tmp_load.masked_model
        model_for_train.change_mask(init_threshold, apply_MP_on_mask)
        model_for_train.apply_mask()
        model_for_train.make_trainable()

        recovered = False
        best_val_loss = None
        try:
            for epoch in range(1, RETRAIN_EPOCHS + 1):
                epoch_start_time = time.time()
                train(model_for_train, ntokens, train_data, TRAIN_BATCH_SIZE, SEQ_LEN,
                      corpus, GRAD_CLIP, TRAIN_LOG_INTERVAL, epoch)
                val_eval = evaluate_lm(model_for_train.masked_model, val_data, corpus, TEST_BATCH_SIZE)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid acc {:5.2f} | '
                      'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                 val_eval[1], val_eval[0]))
                val_loss = val_eval[2]
                print('-' * 89)
                # Save the model if the validation loss is the best we've seen so far.
                if best_val_loss is None or val_loss < best_val_loss:
                    checkpoint_path = "{}/{}{}_iterative_retrain_model_runtime{}_epoch_{}.pt".format(
                        SAVE_MODEL_FOLDER, name_mark, acc_percent_prune, run_times, epoch)
                    with open(checkpoint_path, 'wb') as f:
                        # NOTE(review): the object being saved was truncated in
                        # the source (`as f:, f)`); saving the underlying
                        # network is an assumption — confirm.
                        torch.save(model_for_train.masked_model, f)
                    best_val_loss = val_loss
                else:
                    # Anneal the learning rate if no improvement has been seen in the validation dataset.
                    LR /= 4.0
                # No break here: training continues for the remaining epochs
                # even after the baseline accuracy has been reached.
                if val_eval[1] >= original_acc:
                    recovered = True
        except KeyboardInterrupt:
            print('-' * 89)
            print('Exiting from training early')
        print(time_now(), "finish retraining ")

        if not recovered:
            print("NOT RECORVER!")
            exit()
        else:
            print("------------Accuracy recorverd!--------------------")
            print("recovered accuracy (>= {})".format(acc_of_no_prune))

        # The retrained network becomes the reference model for the next round.
        model_for_train.make_evaluable()
        model_for_train.apply_mask()
        ref_model = model_for_train.masked_model
        masked_models = [MaskedModel(ref_model, group_dict, cuda.current_device(), cuda.current_device())]
        print("validate acc of the model---------------")
        tmp_fit = evaluate_lm(ref_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('ref_model', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))
        tmp_fit = evaluate_lm(masked_models[0].masked_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('masked_models[0]', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        print('------------- save checkpoint ---------------')
        saved_model = update_checkpoint(model_for_train, run_times, acc_percent_prune, t=True)
        print(time_now(), ' saving model:', saved_model)

        print("-------------print TEST evaluation info ---------------")
        tmp_fit = evaluate_lm(model_for_train.masked_model, test_data, corpus, TEST_BATCH_SIZE)
        print('percentage %s => acc (%.4f), ppl (%.4f)' % (model_for_train.get_sparsity()*100, tmp_fit[1], tmp_fit[0]))
        run_times += 1
def get_memory_use():
    """Build a short text report of CUDA memory usage for the current device.

    The report has three newline-terminated lines: the device name followed
    by a colon, ``allocated:<current>/<max>`` and ``cached:<current>/<max>``.
    The "current" figures are queried for the current device explicitly; the
    "max" figures are queried without a device argument.

    Returns:
        str: the assembled report.
    """
    dev = cuda.current_device()
    # Fragments are listed in output order; the calls run in the same
    # sequence as they appear in the report.
    fragments = [
        cuda.get_device_name(dev), ':\n',
        'allocated:', str(cuda.memory_allocated(dev)),
        '/', str(cuda.max_memory_allocated()), '\n',
        'cached:', str(cuda.memory_cached(dev)),
        '/', str(cuda.max_memory_cached()), '\n',
    ]
    return ''.join(fragments)
import tensorflow as tf #%% Check that gpu is available from tensorflow.python.client import device_lib assert 'GPU' in str(device_lib.list_local_devices()) # confirm Keras sees the GPU from keras import backend assert len(backend.tensorflow_backend._get_available_gpus()) > 0 # confirm PyTorch sees the GPU from torch import cuda assert cuda.is_available() assert cuda.device_count() > 0 print(cuda.get_device_name(cuda.current_device())) #%% cb = [ModelCheckpoint("model.hdf5", save_best_only=True, period=3)] model = Sequential() model.add(CuDNNGRU(48, input_shape=(None, n_features))) model.add(Dense(10, activation='relu')) model.add(Dense(1)) model.summary() #%% # Compile and fit model