def _async_copy(inputs, device_ids):
    """Copy every input to its paired device and return the copies as a tuple.

    ``inputs`` must be a plain ``tuple`` or ``list`` holding exactly one
    entry per id in ``device_ids``.  The k-th input is handed to
    ``async_copy_to`` while ``cuda.device(device_ids[k])`` is active.
    """
    expected = len(device_ids)
    assert type(inputs) in (tuple, list)
    assert len(inputs) == expected

    def _copy_on(item, dev):
        # Make `dev` the active device for the duration of this one copy.
        with cuda.device(dev):
            return async_copy_to(item, dev)

    return tuple(_copy_on(item, dev) for item, dev in zip(inputs, device_ids))
def _async_copy_stream(inputs, device_ids):
    """Copy every input to its paired device on a per-device side stream.

    ``inputs`` must be a plain ``tuple`` or ``list`` holding exactly one
    entry per id in ``device_ids``.  For each pair the copy is issued inside
    ``cuda.stream(<side stream>)`` and the stream that was current on the
    device is then told to wait for that side stream.

    Returns the copies as a ``list``.
    """
    expected = len(device_ids)
    assert type(inputs) in (tuple, list)
    assert len(inputs) == expected

    # All side streams are looked up before the first copy is issued.
    side_streams = [_get_stream(dev) for dev in device_ids]

    copies = []
    for item, dev, side in zip(inputs, device_ids, side_streams):
        with cuda.device(dev):
            caller_stream = cuda.current_stream()
            with cuda.stream(side):
                copied = async_copy_to(item, dev, main_stream=caller_stream)
                copies.append(copied)
            # Work queued later on the caller's stream must see the copy.
            caller_stream.wait_stream(side)
    return copies
def _generic_fmm(proc_idx, queue, device_id):
    """Fill ``a.out`` block by block with kernel evaluations of X1 against X2.

    The work description (an ``ArgsFmm``) is taken from ``queue.get()``.
    Rows of ``X1`` are processed in chunks of ``n`` and rows of ``X2`` in
    chunks of ``m``; both sizes come from ``select_dim_over_nm`` under the
    ``max_mem`` budget.  Inner iterations are spread round-robin over
    ``num_streams`` CUDA streams, each with its own X2/output buffers.

    ``proc_idx`` is not used in this function.  Returns ``out``.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source; the nesting level of the final stream-sync loop should be
    confirmed against the upstream file.
    """
    # Unpack the function arguments
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    cuda_inputs = X1.is_cuda
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    num_streams = a.num_streams

    # flags and local variables
    change_dtype = gpu_dtype != X1.dtype
    X1_equal_X2 = _gpu_tns_same_memory(X1, X2)
    # Staging buffers on the GPU are needed when data must be converted or
    # is not already on the GPU.
    use_gpu_bufs = change_dtype or not cuda_inputs
    stride = "F" if is_f_contig(out, strict=True) else "C"
    j_iter = 0  # global inner-iteration counter, selects the stream
    dts = sizeof_dtype(gpu_dtype)
    tc_device = torch.device('cuda:%d' % (int(device_id)))
    avail_mem = max_mem / dts  # budget expressed in elements of gpu_dtype

    # Choose block sizes n, m such that we won't run out of GPU memory
    ntot, d = X1.shape
    mtot = X2.shape[0]
    extra_mem = kernel.extra_mem()
    if cuda_inputs and not change_dtype:
        # No allocation will be performed by us. Only in-kernel stuff.
        n, m = select_dim_over_nm(max_n=ntot, max_m=mtot, d=d,
                                  coef_nd=extra_mem.get('nd', 0),
                                  coef_md=extra_mem.get('md', 0),
                                  coef_nm=extra_mem.get('nm', 0),
                                  coef_n=extra_mem.get('n', 0),
                                  coef_m=extra_mem.get('m', 0),
                                  rest=extra_mem.get('d', 0),
                                  max_mem=avail_mem)
    else:
        # The "+ 1" terms account for our own buffers; the nd/md/nm
        # coefficients are scaled by the number of streams.
        n, m = select_dim_over_nm(
            max_n=ntot, max_m=mtot, d=d,
            coef_nd=num_streams * (extra_mem.get('nd', 0) + 1),
            coef_md=num_streams * (extra_mem.get('md', 0) + 1),
            coef_nm=num_streams * (extra_mem.get('nm', 0) + 1),
            coef_n=extra_mem.get('n', 0),
            coef_m=extra_mem.get('m', 0),
            rest=extra_mem.get('d', 0),
            max_mem=avail_mem)

    # Create streams
    streams = [tcd.Stream(device=tc_device) for _ in range(num_streams)]

    # Create buffers
    if use_gpu_bufs:
        # A single X1 buffer is shared by all streams.
        gX1 = create_same_stride((n, d), X1, gpu_dtype, tc_device)
        gX2_list = [
            create_same_stride((m, d), X2, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
        gout_list = [
            create_same_stride((n, m), out, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
    if not cuda_inputs:
        # Pinned host buffers, one per stream, receive the GPU results.
        cpu_buf_list = [
            create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)
            for _ in range(num_streams)
        ]

    # Define helpers for the copy-back operations (from cpu_buf to output)
    copy_ops = [None] * num_streams

    def wrap_copy_op(stream_idx):
        # Run (at most once) the pending host-side copy for this stream.
        if copy_ops[stream_idx] is not None:
            copy_ops[stream_idx]()
            copy_ops[stream_idx] = None

    def do_copy_op(output, buf, i_, ic_, j_, jc_):
        # This function will also do the type conversion
        output[i_:i_ + ic_, j_:j_ + jc_].copy_(buf[:ic_, :jc_])

    # Kernel computation begin
    with tcd.device(tc_device):
        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            with tcd.stream(streams[j_iter % len(streams)]):
                X1_chunk = X1.narrow(0, i, ic)
                if use_gpu_bufs:
                    cur_gX1 = gX1.narrow(0, 0, ic)
                    cur_gX1.copy_(X1_chunk, non_blocking=True)
                else:
                    cur_gX1 = X1_chunk

            for j in range(0, mtot, m):
                jc = min(m, mtot - j)
                # Choose the buffers for this inner iteration
                stream_id = j_iter % len(streams)
                stream = streams[stream_id]
                if use_gpu_bufs:
                    gX2 = gX2_list[stream_id]
                    gout = gout_list[stream_id]
                if not cuda_inputs:
                    cpu_buf = cpu_buf_list[stream_id]

                # Sync for buffers we must use now (e.g. 2 previous iters)
                with tcd.stream(stream):  # Inner-loop
                    stream.synchronize()
                    wrap_copy_op(stream_id)

                    if X1_equal_X2 and j < i:  # Shortcut for symmetric kernels
                        # Reuse the transposed, already computed block.
                        jc = min(m, mtot - j)
                        out[i:i + ic, j:j + jc].copy_(out[j:j + jc, i:i + ic].T, non_blocking=True)
                        j_iter += 1
                        continue

                    # Copy (CPU->GPU)
                    X2_chunk = X2.narrow(0, j, jc)
                    if use_gpu_bufs:
                        cur_gX2 = gX2.narrow(0, 0, jc)
                        cur_gX2.copy_(X2_chunk, non_blocking=True)
                    else:
                        cur_gX2 = X2_chunk

                    if use_gpu_bufs:
                        cur_gout = gout[:ic, :jc]
                    else:
                        # Write straight into the caller's output.
                        cur_gout = out[i:i + ic, j:j + jc]
                    cur_gout.fill_(0.0)

                    # Compute
                    ddd = kernel._prepare(cur_gX1, cur_gX2)
                    kernel._apply(cur_gX1, cur_gX2.T, cur_gout)
                    cur_gout = kernel._finalize(cur_gout, ddd)

                    # Copy Back (GPU->CPU)
                    if not cuda_inputs:
                        # copy_ does not care about the contiguity of copies, as long as it's consistent
                        # however, in case of C-contiguous inputs it will create an intermediate array
                        # which is undesired. We use cuda_memcpy2d_async which works well with C-contiguous
                        # arrays.
                        if stride == "F":
                            copy_to_host(ic, jc, cur_gout, 0, 0, cpu_buf, 0, 0, s=stream)
                        else:
                            cuda_memcpy2d_async(dst=cpu_buf.data_ptr(), dpitch=cpu_buf.stride(0) * dts,
                                                src=cur_gout.data_ptr(), spitch=cur_gout.stride(0) * dts,
                                                width=jc * dts, height=ic, stream=stream._as_parameter_)
                        # Defer the cpu_buf -> out copy until this stream is
                        # synchronized again (see wrap_copy_op calls).
                        copy_ops[stream_id] = partial(do_copy_op, out, cpu_buf, i, ic, j, jc)
                    elif change_dtype:
                        out.narrow(0, i, ic).narrow(1, j, jc).copy_(cur_gout, non_blocking=True)
                    j_iter += 1

            # Drain every stream and flush its pending host copy.
            # NOTE(review): nesting reconstructed -- confirm whether this loop
            # runs after every row block (as written here) or only once after
            # the outer loop.  It rebinds `i`, which the enclosing `for`
            # reassigns on its next iteration.
            for i in range(num_streams):
                streams[i].synchronize()
                wrap_copy_op(i)

    return out
def prune_and_eval(rank, size, orig_fit, acc_constraint, valid, es, ref_model, num_runs, final_results):
    """Per-process loop: evaluate one pruning candidate and, on rank 0, run the search.

    Each iteration rank 0 asks ``es`` for a new population and broadcasts it.
    Every rank applies row ``rank`` of that population as a pruning mask to
    ``ref_model``, evaluates it, and packs the result into a flat tensor that
    is gathered on rank 0.  Rank 0 then converts the gathered results into
    fitness values and feeds them back with ``es.tell``.

    Layout of the communicated tensor (length ``es.n + 4``):
    ``[fitness[0], fitness[1], rank, sparsity, x_0, ..., x_{n-1}]``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source.  In particular confirm (a) whether the
    ``final_results[...]`` update belongs inside the ``rank == 0`` branch and
    (b) whether ``ref_model.clear_cache()`` sits inside the device context.
    """
    _valid = valid
    gpu_id = GPU_ID
    # True division: the iteration budget may be a non-integer float.
    total_iterations = es.Tmax / es.popsize
    individual_iter_count = 0
    #ref_model = masked_models[rank]
    X = torch.Tensor(copy.deepcopy(es.pop))
    communicate_size = es.n + 4  # the size of tensors transferred across computers
    communicate_tensor = torch.FloatTensor(communicate_size * [0.])
    fitness_list = []
    itr_best_remain = 0

    if rank == 0:  # rank 0 is the main process that collects fitnesses
        X.share_memory_()
        #fitness_list = [torch.FloatTensor([0.0,0.1,0.2,0.3]).share_memory_() for i in range(size)]
        fitness_list = [
            torch.FloatTensor(communicate_size * [0.]).share_memory_()
            for i in range(size)
        ]

    if rank >= 1 and rank < size:  # split tasks to different GPUs
        gpu_id = other_GPU_IDs[rank - 1]

    with cuda.device(gpu_id):
        local_fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + '.vocab.pt'))
        _valid.fields = local_fields  # fields can not be packed, so reconstruct them in each process

        while (individual_iter_count < total_iterations):
            if rank == 0:  # master node
                itr_X = torch.Tensor(es.ask())
                # broadcast the parents
                X.copy_(itr_X)
                dist.broadcast(itr_X, 0)
            else:
                # receive parents from the source process
                dist.broadcast(X, 0)

            # apply MP on model
            x = X.numpy()[rank]
            ref_model.change_mask(x, apply_MP_on_mask)
            ref_model.apply_mask()

            # evaluate pruned network
            fitness = evaluate(ref_model, _valid, local_fields)
            communicate_tensor[0] = fitness[0]
            communicate_tensor[1] = fitness[1]
            communicate_tensor[2] = rank
            communicate_tensor[3] = ref_model.get_sparsity()
            for i in range(x.size):
                communicate_tensor[i + 4] = X[rank, i]  #x[i]

            # sync fitness
            if rank == 0:  # collect fitness across processes
                dist.gather(communicate_tensor, gather_list=fitness_list)
            else:
                dist.gather(communicate_tensor, dst=0)

            # judge new solutions
            if rank == 0:  # negatively correlated search in master node
                fit = []
                X_ = []
                for i in range(es.popsize):
                    # Default used when no gathered entry reports rank i.
                    the_fitness = 100
                    for j in range(len(
                            fitness_list)):  # results of fitness evaluation
                        if int(fitness_list[j]
                               [2]) == i:  # 0:ppl, 1:acc, 2:rank of individual
                            X_.append(fitness_list[j].numpy()[4:])
                            if orig_fit[1] - fitness_list[j][
                                    1] <= acc_constraint:
                                # Constraint met: fitness is negated slot 3
                                # (the value written from get_sparsity()).
                                the_fitness = -fitness_list[j][3]
                            else:
                                # Constraint violated: fitness is the accuracy
                                # drop relative to the allowed drop.
                                the_fitness = (orig_fit[1] - fitness_list[j][1]
                                               ) / acc_constraint
                            continue
                    fit.append(the_fitness)

                es.tell(X_, fit)

                itr_best_remain = min(fit)

                final_results['result_NCS'].copy_(torch.Tensor(es.result()[0]))
            individual_iter_count += 1

            if rank == 0:  # record status
                logger.scalar_summary(
                    'ncs_%s_fitness' % num_runs,
                    es.result()[1],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'ncs_%s_best_itr_remain' % num_runs, itr_best_remain,
                    num_runs * total_iterations + individual_iter_count)
                logger.histo_summary(
                    'ncs_%s_pop' % num_runs,
                    es.result()[0],
                    num_runs * total_iterations + individual_iter_count)
                # The entries below index X_[0..1] and fitness_list[0..1], so
                # at least two gathered results are required here.
                logger.histo_summary(
                    'pop of 1', X_[0],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'sp of 1', -fitness_list[0][3],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'rank of 1', fitness_list[0][2],
                    num_runs * total_iterations + individual_iter_count)
                logger.histo_summary(
                    'pop of 2', X_[1],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'sp of 2', -fitness_list[1][3],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'rank of 2', fitness_list[1][2],
                    num_runs * total_iterations + individual_iter_count)
                #logger.histo_summary('pop of 3', X_[2], num_runs*total_iterations + individual_iter_count)
                #logger.scalar_summary('sp of 3', -fitness_list[2][3], num_runs*total_iterations + individual_iter_count)
                #logger.scalar_summary('rank of 3', fitness_list[2][2], num_runs*total_iterations + individual_iter_count)

    ref_model.clear_cache()
def main():
    """Iteratively prune a language model with a uniform rate, then retrain it.

    Each outer iteration sweeps uniform per-group pruning rates 1%..99%,
    keeps the rate evaluated just before validation accuracy first drops more
    than ``acc_percent_prune`` below the unpruned accuracy, saves the pruned
    model, and retrains it for up to ``RETRAIN_EPOCHS`` epochs.  The process
    exits when no better rate is found or accuracy is not recovered.

    NOTE(review): nesting reconstructed from a whitespace-collapsed source.
    """
    data_path = "{}/data/penn".format(cfg.PROJECT_ROOT)
    model_path = "{}/model/original_model/language_model/{}".format(
        cfg.PROJECT_ROOT, 'lm_model_orignal.pt')
    total_times = 20
    run_times = 0
    orginal_acc = 0  # NOTE(review): misspelled and never read; see original_acc below
    init_threshold = ...
    start_t = time.time()

    # get data
    corpus = data.Corpus(data_path)
    ntokens = len(corpus.dictionary)
    eval_batch_size = TEST_BATCH_SIZE
    train_data = batchify(corpus.train, TRAIN_BATCH_SIZE)
    val_data = batchify(corpus.valid, TEST_BATCH_SIZE)
    valid_data = val_data
    test_data = batchify(corpus.test, TEST_BATCH_SIZE)

    ref_model = None

    # Load the best saved model.
    with cuda.device(GPU_ID):
        # NOTE(review): the handle is not closed if torch.load raises.
        ff = open(model_path, 'rb')
        ref_model = torch.load(ff)
        ref_model.eval()
        masked_model = MaskedModel(
            ref_model, group_dict, cuda.current_device(), cuda.current_device(
            ))  # ref_model is at current_device, no copy will happen
        #pdb.set_trace()
        ff.close()
    if GPU_ID:
        cuda.set_device(GPU_ID)

    print(time_now(), "get accuray of no pruning model")
    masked_model.make_evaluable()
    # A zero rate for every group gives the unpruned baseline.
    tmp_crate = len(masked_model.group_name_list) * [0]
    masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    masked_model.apply_mask()
    tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus,
                          TEST_BATCH_SIZE)
    # only the original accuracy is needed
    acc_of_no_prune = tmp_fit[1]
    fit_of_no_prune = tmp_fit
    original_acc = acc_of_no_prune
    pruning_arr = []
    ppl_arr = []
    #acc_of_no_prune = int(acc_of_no_prune*10)/10
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)

    init_threshold = [0]
    while run_times < total_times:
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("start Iteration ", run_times)
        print("test model---------------")
        LR = LR_INIT
        previous_pr = None
        previous_fit = None
        best_pr = None
        best_fit = None

        # Sweep uniform rates; remember the rate evaluated just before the
        # first one that violates the accuracy constraint.
        for prune_rate in range(1, 100):
            tmp_crate = len(masked_model.group_name_list) * [0.01 * prune_rate]
            masked_model.change_mask(tmp_crate, apply_MP_on_mask)
            masked_model.apply_mask()
            tmp_fit = evaluate_lm(masked_model.masked_model, valid_data,
                                  corpus, TEST_BATCH_SIZE)
            print(
                "each layer {} \% | {} % in total => validation acc {}\%, validation ppl {}"
                .format(prune_rate,
                        masked_model.get_sparsity() * 100, tmp_fit[1] * 100.,
                        tmp_fit[0]))
            if (not best_pr) and (tmp_fit[1] +
                                  acc_percent_prune) < original_acc:
                best_pr = previous_pr
                best_fit = previous_fit
            previous_pr = tmp_crate
            previous_fit = tmp_fit
        print('==============================')
        print("The best pruning rates are: {}".format(best_pr))
        # Stop when nothing was selected or the rate equals the last round's.
        if (not best_pr) or (best_pr[0] == init_threshold[0]):
            print(
                "Not better than last iteration of pruning, stop the process.")
            exit()
        masked_model.change_mask(best_pr, apply_MP_on_mask)
        masked_model.apply_mask()
        test_fit = evaluate_lm(masked_model.masked_model, test_data, corpus,
                               TEST_BATCH_SIZE)
        print("{} \% => validation acc {}\%, validation ppl {}".format(
            best_pr[0], best_fit[1] * 100., best_fit[0]))
        print("{} \% => test acc {}\%, test ppl {}".format(
            best_pr[0], test_fit[1] * 100., test_fit[0]))
        print('==============================')

        init_threshold = best_pr
        saved_model_name = 'ncs_pruned_model_%s_iteration%s_%s_%s_acc_cons_%s.pt' % (
            name_mark, run_times, Model_type, layer_group_type,
            str(acc_percent_prune))
        torch.save(masked_model.masked_model,
                   cfg.LM_MODEL_TMP_FOLDER + saved_model_name)

        #--------------- start retraining --------------
        model_for_train = masked_model

        # Reload the model that was just saved and re-apply the mask.
        with open(cfg.LM_MODEL_TMP_FOLDER + saved_model_name, 'rb') as f:
            model_tmp_load = torch.load(f)
            model_for_train.masked_model = model_tmp_load

        model_for_train.change_mask(init_threshold, apply_MP_on_mask)
        model_for_train.apply_mask()
        model_for_train.make_trainable()
        recovered = False
        best_val_loss = None

        try:
            for epoch in range(1, RETRAIN_EPOCHS + 1):
                epoch_start_time = time.time()
                train(model_for_train, ntokens, train_data, TRAIN_BATCH_SIZE,
                      SEQ_LEN, corpus, GRAD_CLIP, TRAIN_LOG_INTERVAL, epoch)
                val_eval = evaluate_lm(model_for_train.masked_model, val_data,
                                       corpus, TEST_BATCH_SIZE)
                print('-' * 89)
                print(
                    '| end of epoch {:3d} | time: {:5.2f}s | valid acc {:5.2f} | '
                    'valid ppl {:8.2f}'.format(
                        epoch, (time.time() - epoch_start_time), val_eval[1],
                        val_eval[0]))
                val_loss = val_eval[2]
                print('-' * 89)
                # Save the model if the validation loss is the best we've seen so far.
                if not best_val_loss or val_loss < best_val_loss:
                    with open(
                            "{}/{}{}_iterative_retrain_model_runtime{}_epoch_{}.pt"
                            .format(cfg.LM_MODEL_PATH, name_mark,
                                    acc_percent_prune, run_times, epoch),
                            'wb') as f:
                        torch.save(model_for_train, f)
                    best_val_loss = val_loss
                else:
                    # Anneal the learning rate if no improvement has been seen in the validation dataset.
                    # NOTE(review): LR is a local that is not passed to
                    # train() in this block -- confirm the anneal has effect.
                    LR /= 4.0
                # Stop retraining as soon as the unpruned accuracy is reached.
                if val_eval[1] >= original_acc:
                    recovered = True
                    break
        except KeyboardInterrupt:
            print('-' * 89)
            print('Exiting from training early')

        print(time_now(), "finish retraining ")
        if not recovered:
            exit()
        else:
            print("------------Accuracy recorverd!--------------------")
            print("recovered accuracy (>= {})".format(acc_of_no_prune))

        model_for_train.make_evaluable()
        model_for_train.apply_mask()
        ref_model = model_for_train.masked_model

        print("validate acc of the model---------------")
        tmp_fit = evaluate_lm(ref_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('ref_model', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        print("-------------print TEST evaluation info ---------------")
        tmp_fit = evaluate_lm(ref_model, test_data, corpus, TEST_BATCH_SIZE)
        print('percentage %s => acc (%.4f), ppl (%.4f)' %
              (init_threshold[0] * 100, tmp_fit[1], tmp_fit[0]))
        # The retrained model is the starting point of the next iteration.
        masked_model = model_for_train
        run_times += 1
def _work(process_id, model, dataset, args):
    """Compute and save class activation maps for one shard of the dataset.

    Args:
        process_id: index of this worker; selects ``dataset[process_id]`` and
            is used as the CUDA device index.
        model: network moved to the GPU with ``model.cuda()`` and called on
            every image in ``pack['img']``.
        dataset: indexable collection of per-worker data shards.
        args: namespace providing ``num_workers`` and ``cam_out_dir``.

    For every sample, ``<cam_out_dir>/<name>.npy`` is written with the keys
    ``"keys"`` (indices of the non-zero labels), ``"cam"`` (strided CAM, CPU
    tensor) and ``"high_res"`` (high-resolution CAM, numpy array).  Only the
    worker on the last GPU prints progress, roughly 20 times per shard.
    """
    databin = dataset[process_id]
    n_gpus = torch.cuda.device_count()
    data_loader = DataLoader(databin, shuffle=False,
                             num_workers=args.num_workers // n_gpus,
                             pin_memory=False)

    # Progress is reported every 1/20th of the shard.  Clamp to 1 so shards
    # with fewer than 20 samples do not cause a modulo/division by zero.
    progress_step = max(len(databin) // 20, 1)

    with torch.no_grad(), cuda.device(process_id):
        model.cuda()

        for step, pack in enumerate(data_loader):
            img_name = pack['name'][0]
            label = pack['label'][0]
            size = pack['size']

            strided_size = imutils.get_strided_size(size, 4)
            strided_up_size = imutils.get_strided_up_size(size, 16)

            # One forward pass per entry of pack['img'].
            outputs = [
                model(img[0].cuda(non_blocking=True)) for img in pack['img']
            ]

            # Resize every output to the strided size and sum them.
            strided_cam = torch.sum(
                torch.stack([
                    F.interpolate(torch.unsqueeze(o, 0), strided_size,
                                  mode='bilinear', align_corners=False)[0]
                    for o in outputs
                ]), 0)

            # Same at the upsampled size, then crop to the original size.
            highres_cam = [
                F.interpolate(torch.unsqueeze(o, 1), strided_up_size,
                              mode='bilinear', align_corners=False)
                for o in outputs
            ]
            highres_cam = torch.sum(torch.stack(highres_cam, 0),
                                    0)[:, 0, :size[0], :size[1]]

            # Keep only classes present in the label and normalise each map
            # by its maximum (epsilon avoids division by zero).
            valid_cat = torch.nonzero(label)[:, 0]

            strided_cam = strided_cam[valid_cat]
            strided_cam /= F.adaptive_max_pool2d(strided_cam, (1, 1)) + 1e-5

            highres_cam = highres_cam[valid_cat]
            highres_cam /= F.adaptive_max_pool2d(highres_cam, (1, 1)) + 1e-5

            # save cams
            np.save(
                os.path.join(args.cam_out_dir, img_name + '.npy'), {
                    "keys": valid_cat,
                    "cam": strided_cam.cpu(),
                    "high_res": highres_cam.cpu().numpy()
                })

            if process_id == n_gpus - 1 and step % progress_step == 0:
                print("%d " % ((5 * step + 1) // progress_step), end='')
def main():
    """Search per-group pruning thresholds with ``NCS_MP``, then retrain.

    Each outer iteration builds one ``MaskedModel`` per id in
    ``other_GPU_IDs``, runs ``NCS_MP`` to find pruning thresholds, compares
    the resulting sparsity with the best seen so far (adapting ``ncs_std``
    or exiting), retrains the pruned model for ``RETRAIN_EPOCHS`` epochs and
    saves a checkpoint.  Exits if the unpruned accuracy is not recovered.

    NOTE(review): nesting reconstructed from a whitespace-collapsed source.
    """
    data_path = "{}/data/penn".format(DATA_PATH)
    model_path = "{}/deepModels/torch_models/language-model/{}".format(MODEL_PATH, 'model.pt')
    #model_path = "{}/deepModels/torch_models/language-model/{}".format(MODEL_PATH, 'lstm_3layer.pt')
    total_times = 1
    run_times = 0
    orginal_acc = 0  # NOTE(review): misspelled and never read; see original_acc below
    init_threshold = ...
    start_t = time.time()

    # get data
    corpus = data.Corpus(data_path)
    ntokens = len(corpus.dictionary)
    eval_batch_size = TEST_BATCH_SIZE
    train_data = batchify(corpus.train, TRAIN_BATCH_SIZE)
    val_data = batchify(corpus.valid, TEST_BATCH_SIZE)
    valid_data = val_data
    test_data = batchify(corpus.test, TEST_BATCH_SIZE)

    ref_model = None

    # Load the best saved model.
    masked_models = []
    with cuda.device(GPU_ID):
        # NOTE(review): the handle is not closed if torch.load raises.
        ff = open(model_path, 'rb')
        ref_model = torch.load(ff)
        ref_model.eval()
        masked_model = MaskedModel(ref_model, group_dict, cuda.current_device(), cuda.current_device()) # ref_model is at current_device, no copy will happen
        #pdb.set_trace()
        masked_models.append(masked_model)
        ff.close()
    if GPU_ID:
        cuda.set_device(GPU_ID)

    print(time_now(), "get accuray of no pruning model")
    masked_model.make_evaluable()
    # A zero rate for every group gives the unpruned baseline.
    tmp_crate = len(masked_model.group_name_list)*[0]
    masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    masked_model.apply_mask()
    tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus, TEST_BATCH_SIZE)
    # only the original accuracy is needed
    acc_of_no_prune = tmp_fit[1]
    fit_of_no_prune = tmp_fit
    original_acc = acc_of_no_prune
    #acc_of_no_prune = int(acc_of_no_prune*10)/10
    print("=============TiPO start========================")
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)

    previous_pr = None
    best_pr = None  # best sparsity seen so far (from get_sparsity())
    ncs_std = 0.05  # passed to NCS_MP; adapted below
    while run_times < total_times:
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("start Iteration ", run_times)
        print("test model---------------")
        LR = LR_INIT
        #ref_model.generator.eval()
        print("test model---------------")
        masked_models[0].make_evaluable()
        tmp_fit = evaluate_lm(masked_models[0].masked_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('masked_models[0]','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        if run_times == 0:
            # First round starts from 0.6 for every group.
            init_threshold = len(masked_models[0].group_name_list) * [0.6]

        itr_time = time.time()
        # One additional masked model per extra GPU.
        for gpu_candidate in other_GPU_IDs:
            with cuda.device(gpu_candidate):
                masked_models.append(MaskedModel(ref_model, group_dict, GPU_ID, gpu_candidate)) # if the gpu_candidate is the same as ref_model, it will return the ref_model
        #------------- Here -------------------------
        # del ref_model

        # do pruning
        ncs_start = time.time()
        print('Itration %d, model loading: %d sec' % (run_times, ncs_start - itr_time))
        print("init threshold:", init_threshold)
        best_found, saved_model, best_masked_model = NCS_MP(init_threshold, ncs_std, masked_models, valid_data, corpus, acc_percent_prune, fit_of_no_prune, run_times)
        #best_found, saved_model, best_masked_model = init_threshold, '/raid/lab_tk/liguiying/deepModels/torch_models/language-model/prune_tmp/ncs_pruned_model_test_iteration0_LM_time_acc_cons_0.01.pt', masked_models[0]
        init_threshold = best_found
        #best_found, saved_model, best_masked_model = NCS_MP(init_threshold, 0.05, fields, masked_models, valid_data, 0.01, run_times, checkpoint)
        end_t = time.time()
        print('NCS Time: {} min'.format((end_t - itr_time)/60.))
        print('Best found thresholds:')
        for i in range(len(masked_models[0].group_name_list)):
            print("layer {}: {}%".format(masked_models[0].group_name_list[i], 100*best_found[i]))

        print("TEST PPL evaluation:")
        tmp_fit = evaluate_lm(best_masked_model.masked_model, test_data, corpus, TEST_BATCH_SIZE)
        print('Finsished => acc (%.4f percent), ppl (%.4f)' % (tmp_fit[1]*100, tmp_fit[0]))

        # clear no used models
        # NOTE(review): `del gpu_model` only unbinds the loop variable; the
        # list still references every model.
        for gpu_model in masked_models:
            del gpu_model

        if not best_pr :
            best_pr = best_masked_model.get_sparsity()
        else:
            tmp_pr = best_masked_model.get_sparsity()
            if best_pr > tmp_pr:
                print("No improvement! Stop the PROCESS.")
                exit()
            elif best_pr == tmp_pr:
                # Same sparsity as before: change the search std by 10x,
                # down if test accuracy is below the unpruned validation
                # accuracy, up otherwise.
                if tmp_fit[1] <fit_of_no_prune[1]:
                    ncs_std /= 10
                else:
                    ncs_std *= 10
            else:
                best_pr = tmp_pr

        #if run_times % 5 == 0:
        #    ncs_std /= 10

        #--------------- start retraining --------------
        model_for_train = best_masked_model
        #pretrained_leaf_dict = model_for_train.make_trainable()
        #print(model_for_train.map_dict.keys())
        #pdb.set_trace()
        #fix_no_leaf(model_for_train, pretrained_leaf_dict)
        #pdb.set_trace()

        # Reload the model saved by NCS_MP and re-apply the found thresholds.
        with open(SAVE_MODEL_TMP_FOLDER + saved_model, 'rb') as f:
            model_tmp_load = torch.load(f)
            model_for_train.masked_model = model_tmp_load.masked_model

        model_for_train.change_mask(init_threshold, apply_MP_on_mask)
        model_for_train.apply_mask()
        model_for_train.make_trainable()
        recovered = False
        best_val_loss = None

        try:
            for epoch in range(1, RETRAIN_EPOCHS + 1):
                epoch_start_time = time.time()
                train(model_for_train, ntokens, train_data, TRAIN_BATCH_SIZE, SEQ_LEN, corpus, GRAD_CLIP, TRAIN_LOG_INTERVAL, epoch)
                val_eval = evaluate_lm(model_for_train.masked_model, val_data, corpus, TEST_BATCH_SIZE)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid acc {:5.2f} | '
                      'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), val_eval[1], val_eval[0]))
                val_loss = val_eval[2]
                print('-' * 89)
                # Save the model if the validation loss is the best we've seen so far.
                if not best_val_loss or val_loss < best_val_loss:
                    with open("{}/{}{}_iterative_retrain_model_runtime{}_epoch_{}.pt".format(SAVE_MODEL_FOLDER, name_mark, acc_percent_prune, run_times, epoch), 'wb') as f:
                        torch.save(model_for_train, f)
                    best_val_loss = val_loss
                else:
                    # Anneal the learning rate if no improvement has been seen in the validation dataset.
                    # NOTE(review): LR is a local that is not passed to
                    # train() in this block -- confirm the anneal has effect.
                    LR /= 4.0
                # Training continues after recovery (no break here).
                if val_eval[1] >= original_acc:
                    recovered = True
        except KeyboardInterrupt:
            print('-' * 89)
            print('Exiting from training early')

        print(time_now(), "finish retraining ")
        if not recovered:
            print("NOT RECORVER!")
            exit()
        else:
            print("------------Accuracy recorverd!--------------------")
            print("recovered accuracy (>= {})".format(acc_of_no_prune))

        model_for_train.make_evaluable()
        model_for_train.apply_mask()
        ref_model = model_for_train.masked_model
        # Start the next round from the retrained model only.
        masked_models = [MaskedModel(ref_model, group_dict, cuda.current_device(), cuda.current_device())]

        print("validate acc of the model---------------")
        tmp_fit = evaluate_lm(ref_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('ref_model','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))
        tmp_fit = evaluate_lm(masked_models[0].masked_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('masked_models[0]','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        print('------------- save checkpoint ---------------')
        saved_model = update_checkpoint(model_for_train, run_times, acc_percent_prune, t=True)
        print(time_now(), ' saving model:', saved_model)

        print("-------------print TEST evaluation info ---------------")
        tmp_fit = evaluate_lm(model_for_train.masked_model, test_data, corpus, TEST_BATCH_SIZE)
        print('percentage %s => acc (%.4f), ppl (%.4f)' % (model_for_train.get_sparsity()*100, tmp_fit[1], tmp_fit[0]))
        run_times += 1
def distk_fdmmv(proc_idx, queue, device_id):
    """Accumulate ``K.T @ (K @ v + w)`` for a squared-distance based kernel on one GPU.

    The work description (an ``ArgsFdmmv``) is taken from ``queue.get()``.
    ``K`` is built in row blocks of size ``n`` (and column blocks ``d`` over
    the feature dimension) as ``clamp(|x1|^2 + |x2|^2 - 2 x1.x2, min=0)``
    followed by ``kernel._transform``.  For each row block ``K @ v`` (plus
    the matching rows of ``w`` when given) is formed and ``K.T`` times that
    is added into the output.

    At least one of ``v`` and ``w`` must be given.  ``proc_idx`` is not used.
    Returns ``out``.
    """
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem

    N, D = X1.size()
    M = X2.size(0)
    T = v.size(1) if v is not None else w.size(1)
    dtype = X1.dtype

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we let avail_mem like it is
    #        for 32-bit data-types some copy fails. In such case we need
    #        to free up some more memory and then everything runs fine.
    # NOTE(review): no such reduction is applied in this function.
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D, maxN=N, coef_nd=1, coef_n=M + T + 1,
                             coef_d=M, rest=rest_coef + M, tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    # Create both streams explicitly on the target device.  A stream created
    # without a device belongs to whichever device is current, and entering
    # `tcd.stream(...)` with a stream from another device switches the
    # current device away from `ddev`.
    s1 = tcd.Stream(device=ddev)
    s2 = tcd.Stream(device=ddev)

    with tcd.device(ddev), tcd.stream(s1):
        if v is not None:
            v_gpu = create_same_stride((M, T), v, dtype, ddev)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        K_gpu = create_same_stride((n, M), X1, dtype, ddev)
        X1ss_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        Kv_gpu = create_same_stride((n, T), X1, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        # Results are accumulated with addmm_, so start from zero.
        out_gpu.fill_(0.0)
        sq1_gpu = create_same_stride((n, ), X1, dtype, ddev)
        sq2_gpu = create_same_stride((M, ), X1, dtype, ddev)

        #if (d == D):
        #    with torch.cuda.stream(s2):
        #        cur_X2s_gpu = copy_to_device_noorder(M, d, X2, 0, 0, X2s_gpu, 0, 0, s=s2)
        #        torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)

        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu.narrow(0, 0, nb)  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                # Parallelize two matrix transfers (probably pointless)
                #if d < D:
                with torch.cuda.stream(s2):
                    cur_X2s_gpu = copy_to_device_noorder(M, db, X2, 0, j, X2s_gpu, 0, 0, s=s2)
                    torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)
                cur_X1ss_gpu = copy_to_device_noorder(nb, db, X1, i, j, X1ss_gpu, 0, 0, s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True, out=sq1_gpu).pow_(2)

                # Both transfers and both norms must be done before mixing
                # results produced on the two streams.
                s2.synchronize()
                s1.synchronize()
                # K += |x1|^2 + |x2|^2 - 2 * x1 @ x2.T, clamped at zero.
                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu, mat2=cur_X2s_gpu.T, alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)
                cur_K_gpu.clamp_min_(0)

            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                # Copy split w to GPU into cur_Kv_gpu,
                cur_Kv_gpu = copy_to_device_noorder(nb, T, w, i, 0, Kv_gpu, 0, 0, s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T
            s1.synchronize()
        s1.synchronize()

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
def generic_fdmmv(proc_idx, queue, device_id):
    """Accumulate ``K.T @ (K @ v + w)`` for a generic kernel on one GPU.

    The work description (an ``ArgsFdmmv``) is taken from ``queue.get()``.
    The kernel matrix is built in row blocks of ``n`` rows of ``X1``; within
    a row block the feature dimension is traversed in slices of width ``d``
    through ``kernel._apply`` and completed with ``kernel._finalize``.

    At least one of ``v`` and ``w`` must be given.  ``proc_idx`` is not used.
    Returns ``out``.
    """
    args: ArgsFdmmv = queue.get()
    X1, X2 = args.X1, args.X2
    v, w = args.v, args.w
    out = args.out
    kernel = args.kernel
    max_mem = args.max_mem
    dtype = X1.dtype

    N, D = X1.size()
    M = X2.size(0)
    T = w.size(1) if v is None else v.size(1)

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1d  : n x d
    # X2d  : M x d
    # Kv   : n x T
    # out2 : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we let avail_mem like it is
    #        for 32-bit data-types some copy fails. In such case we need
    #        to free up some more memory and then everything runs fine.
    if sizeof_dtype(dtype) == 4:
        avail_mem /= 2

    rest_coef = M * T if v is None else 2 * M * T
    n, d = select_dim_over_d(
        maxD=D,
        maxN=N,
        coef_nd=1,
        coef_n=M + T + 1,
        coef_d=M,
        rest=rest_coef + M,
        tot=avail_mem,
    )

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        ker_gpu = create_same_stride((n, M), out, dtype=dtype, device=ddev)
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        w_gpu = create_same_stride((n, T), ker_gpu, dtype, ddev)
        out_gpu = out if out.is_cuda else create_same_stride((M, T), out, dtype, ddev)
        # The result is accumulated with addmm_, so start from zero.
        out_gpu.fill_(0.0)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        for row0 in range(0, N, n):
            n_rows = min(n, N - row0)
            prep = kernel._prepare(X1.narrow(0, row0, n_rows), X2)

            ker_blk = ker_gpu.narrow(0, 0, n_rows)
            ker_blk.fill_(0.0)

            # Walk the feature dimension in slices of width d.
            for col0 in range(0, D, d):
                n_cols = min(d, D - col0)
                x1_blk = copy_to_device_noorder(n_rows, n_cols, X1, row0, col0, X1s_gpu, 0, 0)
                x2_blk = copy_to_device_noorder(M, n_cols, X2, 0, col0, X2s_gpu, 0, 0)
                kernel._apply(x1_blk, x2_blk.T, ker_blk)
            kernel._finalize(ker_blk, prep)

            if w is None:
                # No additive term: start the right-hand side from zero.
                rhs_blk = w_gpu.narrow(0, 0, n_rows)
                rhs_blk.fill_(0.0)
            else:
                rhs_blk = copy_to_device_noorder(n_rows, T, w, row0, 0, w_gpu, 0, 0)

            if v is not None:
                rhs_blk.addmm_(ker_blk, v_gpu)
            out_gpu.addmm_(ker_blk.T, rhs_blk)

        if not out.is_cuda:
            copy_to_device_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
def main():
    """Prune with increasing uniform rates, retraining once after every rate.

    For each rate 1%..99% the model is masked, evaluated, saved to
    ``SAVE_MODEL_FOLDER``, reloaded and passed through one ``train`` call
    (the rate is passed in the position where the sibling scripts pass the
    epoch number).  After the sweep the remembered ``best_pr`` is applied
    and evaluated on the test set.

    NOTE(review): nesting reconstructed from a whitespace-collapsed source;
    confirm that the save/retrain section belongs inside the rate loop.
    """
    data_path = "{}/data/penn".format(DATA_PATH)
    model_path = "{}/deepModels/SiPO/original_model/language_model/{}".format(MODEL_PATH, 'lm_model_orignal.pt')
    total_times = 12
    run_times = 0  # never incremented in this script
    orginal_acc = 0  # NOTE(review): misspelled and never read; see original_acc below
    init_threshold = ...
    start_t = time.time()

    # get data
    corpus = data.Corpus(data_path)
    ntokens = len(corpus.dictionary)
    eval_batch_size = TEST_BATCH_SIZE
    train_data = batchify(corpus.train, TRAIN_BATCH_SIZE)
    val_data = batchify(corpus.valid, TEST_BATCH_SIZE)
    valid_data = val_data
    test_data = batchify(corpus.test, TEST_BATCH_SIZE)

    ref_model = None

    # Load the best saved model.
    with cuda.device(GPU_ID):
        # NOTE(review): the handle is not closed if torch.load raises.
        ff = open(model_path, 'rb')
        ref_model = torch.load(ff)
        ref_model.eval()
        masked_model = MaskedModel(ref_model, group_dict, cuda.current_device(), cuda.current_device()) # ref_model is at current_device, no copy will happen
        #pdb.set_trace()
        ff.close()
    if GPU_ID:
        cuda.set_device(GPU_ID)

    print(time_now(), "get accuray of no pruning model")
    masked_model.make_evaluable()
    # A zero rate for every group gives the unpruned baseline.
    tmp_crate = len(masked_model.group_name_list)*[0]
    masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    masked_model.apply_mask()
    tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus, TEST_BATCH_SIZE)
    # only the original accuracy is needed
    acc_of_no_prune = tmp_fit[1]
    fit_of_no_prune = tmp_fit
    original_acc = acc_of_no_prune
    pruning_arr = []
    ppl_arr = []
    #acc_of_no_prune = int(acc_of_no_prune*10)/10
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)

    init_threshold = [0]
    print("-----------------------------------------")
    print("-----------------------------------------")
    print("-----------------------------------------")
    SAVE_MODEL_FOLDER = '/fl/checkpoint/language-mode/prune/'
    print("test model---------------")
    LR = LR_INIT
    # Placeholder starting values, used until the first rate is evaluated.
    previous_pr = [0.01] * 4
    previous_fit = [0.01, 12.3]
    best_pr = [0.01] * 4
    best_fit = [0.01, 12.3]

    for prune_rate in range(1, 100):
        tmp_crate = len(masked_model.group_name_list)*[0.01 * prune_rate]
        masked_model.change_mask(tmp_crate, apply_MP_on_mask)
        masked_model.apply_mask()
        tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus, TEST_BATCH_SIZE)
        print("each layer {} \% | {} % in total => validation acc {}\%, validation ppl {}".format(prune_rate, masked_model.get_sparsity()*100, tmp_fit[1]*100., tmp_fit[0]))
        # While the constraint still holds, remember the PREVIOUS rate/fit.
        if (tmp_fit[1] + acc_percent_prune) > original_acc:
            best_pr = previous_pr
            best_fit = previous_fit
        previous_pr = tmp_crate
        previous_fit = tmp_fit

        test_fit = evaluate_lm(masked_model.masked_model, test_data, corpus, TEST_BATCH_SIZE)
        print("{} \% => validation acc {}\%, validation ppl {}".format(best_pr[0], best_fit[1]*100., best_fit[0]))
        print("{} \% => test acc {}\%, test ppl {}".format(best_pr[0], test_fit[1]*100., test_fit[0]))
        print('==============================')
        masked_model.apply_mask_init()

        init_threshold = best_pr
        saved_model_name = 'ncs_pruned_model_%s_iteration%s_%s_%s_acc_cons_%s.pt' % (name_mark, run_times, Model_type, layer_group_type, str(acc_percent_prune))
        torch.save(masked_model.masked_model, SAVE_MODEL_FOLDER+saved_model_name)

        #--------------- start retraining --------------
        model_for_train = masked_model

        # Reload the model that was just saved and re-apply the mask.
        with open(SAVE_MODEL_FOLDER + saved_model_name, 'rb') as f:
            model_tmp_load = torch.load(f)
            model_for_train.masked_model = model_tmp_load

        model_for_train.change_mask(init_threshold, apply_MP_on_mask)
        model_for_train.apply_mask()
        model_for_train.make_trainable()
        recovered = False
        best_val_loss = None

        # Single training pass; prune_rate is passed as the last argument.
        train(model_for_train, ntokens, train_data, TRAIN_BATCH_SIZE, SEQ_LEN, corpus, GRAD_CLIP, TRAIN_LOG_INTERVAL, prune_rate)

        print(time_now(), "finish retraining ")
        # NOTE(review): these messages are printed unconditionally; nothing in
        # this block checks that accuracy was actually recovered.
        print("------------Accuracy recorverd!--------------------")
        print("recovered accuracy (>= {})".format(acc_of_no_prune))

        model_for_train.make_evaluable()
        model_for_train.apply_mask()
        ref_model = model_for_train.masked_model

        print("validate acc of the model---------------")
        tmp_fit = evaluate_lm(ref_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('ref_model','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        print("-------------print TEST evaluation info ---------------")
        tmp_fit = evaluate_lm(ref_model, test_data, corpus, TEST_BATCH_SIZE)
        print('percentage %s => acc (%.4f), ppl (%.4f)' % (init_threshold[0]*100, tmp_fit[1], tmp_fit[0]))
        masked_model = model_for_train

    print('==============================')
    print("The best pruning rates are: {}".format(best_pr))
    masked_model.change_mask(best_pr, apply_MP_on_mask)
    masked_model.apply_mask()
    test_fit = evaluate_lm(masked_model.masked_model, test_data, corpus, TEST_BATCH_SIZE)
    print("{} \% => validation acc {}\%, validation ppl {}".format(best_pr[0], best_fit[1]*100., best_fit[0]))
    print("{} \% => test acc {}\%, test ppl {}".format(best_pr[0], test_fit[1]*100., test_fit[0]))
    print('==============================')
def main():
    """Sweep uniform pruning rates and report the largest one within the accuracy constraint.

    Loads the saved language model, measures the validation accuracy of the
    unpruned model, then evaluates every uniform pruning rate from 1% to 99%.
    The selected rate is the last one evaluated before validation accuracy
    first falls more than ``acc_percent_prune`` below the unpruned accuracy.
    The model is finally masked with that rate and evaluated on the test set.

    Fallbacks (the original crashed with ``None`` in both cases):
      * the very first rate already violates the constraint -> no pruning
        (all-zero rates, with the unpruned validation fit);
      * no rate violates the constraint -> the last evaluated rate.
    """
    data_path = "{}/data/penn".format(DATA_PATH)
    model_path = "{}/deepModels/torch_models/language-model/{}".format(
        MODEL_PATH, 'model.pt')

    # get data
    corpus = data.Corpus(data_path)
    valid_data = batchify(corpus.valid, TEST_BATCH_SIZE)
    test_data = batchify(corpus.test, TEST_BATCH_SIZE)

    # Load the best saved model.
    with cuda.device(GPU_ID):
        # Context manager so the handle is closed even if loading fails.
        with open(model_path, 'rb') as ff:
            ref_model = torch.load(ff)
        ref_model.eval()
        # ref_model is at current_device, no copy will happen
        masked_model = MaskedModel(ref_model, group_dict,
                                   cuda.current_device(),
                                   cuda.current_device())
    if GPU_ID:
        cuda.set_device(GPU_ID)

    if layer_group_type == 'simple':
        print("MP start for LM")
    elif layer_group_type == 'layer':
        print("LMP start for LM")

    print(time_now(), "get accuray of no pruning model")
    masked_model.make_evaluable()
    n_groups = len(masked_model.group_name_list)
    no_prune_rates = n_groups * [0]
    masked_model.change_mask(no_prune_rates, apply_MP_on_mask)
    masked_model.apply_mask()
    fit_of_no_prune = evaluate_lm(masked_model.masked_model, valid_data,
                                  corpus, TEST_BATCH_SIZE)
    # Only the accuracy of the original (unpruned) model is needed.
    original_acc = fit_of_no_prune[1]
    print("init accuracy of model:", original_acc)
    print("accuracy constraint:", acc_percent_prune)

    # Seed "previous" with the unpruned baseline so a violation at the first
    # rate selects "no pruning" rather than None.
    previous_pr = no_prune_rates
    previous_fit = fit_of_no_prune
    best_pr = None
    best_fit = None
    for prune_rate in range(1, 100):
        tmp_crate = n_groups * [0.01 * prune_rate]
        masked_model.change_mask(tmp_crate, apply_MP_on_mask)
        masked_model.apply_mask()
        tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus,
                              TEST_BATCH_SIZE)
        # "\\%" yields the same backslash-percent text the original "\%"
        # produced, without relying on an invalid escape sequence.
        print("{} \\% => validation acc {}\\%, validation ppl {}".format(
            prune_rate, tmp_fit[1] * 100., tmp_fit[0]))
        # Record only the first violation; the sweep continues so that every
        # rate is still evaluated and printed.
        if best_pr is None and (tmp_fit[1] + acc_percent_prune) < original_acc:
            best_pr = previous_pr
            best_fit = previous_fit
        previous_pr = tmp_crate
        previous_fit = tmp_fit

    if best_pr is None:
        # The constraint was never violated: keep the last evaluated rate.
        best_pr = previous_pr
        best_fit = previous_fit

    print('==============================')
    print("The best pruning rates are: {}".format(best_pr))
    masked_model.change_mask(best_pr, apply_MP_on_mask)
    masked_model.apply_mask()
    test_fit = evaluate_lm(masked_model.masked_model, test_data, corpus,
                           TEST_BATCH_SIZE)
    print("{} \\% => validation acc {}\\%, validation ppl {}".format(
        best_pr[0], best_fit[1] * 100., best_fit[0]))
    print("{} \\% => test acc {}\\%, test ppl {}".format(
        best_pr[0], test_fit[1] * 100., test_fit[0]))
    print('==============================')
def main():
    """Run ``total_times`` rounds of NCS-based pruning on the language model.

    Loads the saved model, measures the validation fit of the unpruned model,
    then for each round: re-evaluates the reference and primary masked model,
    builds one ``MaskedModel`` replica per entry of ``other_GPU_IDs``, runs
    ``NCS_MP`` to search per-group pruning thresholds, evaluates the best
    masked model on the test set and records sparsity and perplexity.
    Mean/std of both are printed at the end.
    """
    data_path = "{}/data/penn".format(cfg.PROJECT_ROOT)
    model_path = "{}/model/original_model/language_model/{}".format(
        cfg.PROJECT_ROOT, 'lm_model_orignal.pt')
    total_times = 2
    run_times = 0
    init_threshold = ...

    # get data
    corpus = data.Corpus(data_path)
    valid_data = batchify(corpus.valid, TEST_BATCH_SIZE)
    test_data = batchify(corpus.test, TEST_BATCH_SIZE)

    # Load the best saved model.
    masked_models = []
    with cuda.device(GPU_ID):
        # Context manager so the handle is closed even if loading fails.
        with open(model_path, 'rb') as ff:
            ref_model = torch.load(ff)
        ref_model.eval()
        # ref_model is at current_device, no copy will happen
        masked_model = MaskedModel(ref_model, group_dict,
                                   cuda.current_device(),
                                   cuda.current_device())
        masked_models.append(masked_model)
    if GPU_ID:
        cuda.set_device(GPU_ID)

    print(time_now(), "get accuray of no pruning model")
    masked_model.make_evaluable()
    tmp_crate = len(masked_model.group_name_list) * [0]
    masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    masked_model.apply_mask()
    tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus,
                          TEST_BATCH_SIZE)
    # Only the accuracy of the original (unpruned) model is needed.
    acc_of_no_prune = tmp_fit[1]
    fit_of_no_prune = tmp_fit
    pruning_arr = []
    ppl_arr = []
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)

    while run_times < total_times:
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("start Iteration ", run_times)
        print("test model---------------")
        ref_model.eval()
        tmp_fit = evaluate_lm(ref_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('ref_model', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))
        print("test model---------------")
        masked_models[0].make_evaluable()
        tmp_fit = evaluate_lm(masked_models[0].masked_model, valid_data,
                              corpus, TEST_BATCH_SIZE)
        print('masked_models[0]',
              'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        itr_time = time.time()
        for gpu_candidate in other_GPU_IDs:
            with cuda.device(gpu_candidate):
                # if the gpu_candidate is the same as ref_model, it will
                # return the ref_model
                masked_models.append(
                    MaskedModel(ref_model, group_dict, GPU_ID, gpu_candidate))

        # do pruning
        ncs_start = time.time()
        print('Itration %d, model loading: %d sec' %
              (run_times, ncs_start - itr_time))
        if run_times == 0:
            init_threshold = len(masked_models[0].group_name_list) * [0.10]
        print("init threshold:", init_threshold)
        best_found, _saved_model, best_masked_model = NCS_MP(
            init_threshold, 0.05, masked_models, valid_data, corpus,
            acc_percent_prune, fit_of_no_prune, run_times)
        end_t = time.time()
        print('NCS Time: {} min'.format((end_t - itr_time) / 60.))

        print('Best found thresholds:')
        for i, group_name in enumerate(masked_models[0].group_name_list):
            print("layer {}: {}%".format(group_name, 100 * best_found[i]))

        print("TEST PPL evaluation:")
        tmp_fit = evaluate_lm(best_masked_model.masked_model, test_data,
                              corpus, TEST_BATCH_SIZE)
        print('Finsished => acc (%.4f percent), ppl (%.4f)' %
              (tmp_fit[1] * 100, tmp_fit[0]))
        pruning_arr.append(masked_models[0].get_sparsity())
        ppl_arr.append(tmp_fit[0])

        # Clear the per-GPU replicas built for this round. The original
        # `for m in masked_models: del m` only unbound the loop variable, so
        # the list kept growing with every iteration. The primary model at
        # index 0 is kept because the next round evaluates it again.
        del masked_models[1:]
        run_times += 1

    pruning_arr = np.array(pruning_arr)
    ppl_arr = np.array(ppl_arr)
    print("Prunig rate : mean({}) std({})".format(pruning_arr.mean(),
                                                  pruning_arr.std()))
    print("PPL : mean({}) std({})".format(ppl_arr.mean(), ppl_arr.std()))
# Preprocessing pipeline applied to every sample: convert to a PIL image,
# apply `img_resize`, convert to a tensor, then apply `img_norm`.
proc_pipe = transf.Compose(
    [transf.ToPILImage(), img_resize, transf.ToTensor(), img_norm])
train_dir = 'train_images'
val_dir = 'test_images'
# Both loaders iterate in a fixed order (shuffle=False); the DataFeed datasets
# are built with nat_sort=True.
train_loader = DataLoader(DataFeed(train_dir, nat_sort=True, transform=proc_pipe), batch_size=batch_size, shuffle=False)
val_loader = DataLoader(DataFeed(val_dir, nat_sort=True, transform=proc_pipe), batch_size=batch_size, shuffle=False)
# Network training:
with cuda.device(0):
    # One column per entry of `train_size` for each top-k metric.
    top_1 = np.zeros((1, len(train_size)))
    top_2 = np.zeros((1, len(train_size)))
    top_3 = np.zeros((1, len(train_size)))
    acc_loss = 0
    itr = []
    # NOTE(review): `train_size` is presumably a sequence of training-set
    # sizes (see the message printed below) — confirm against its definition.
    for idx, n in enumerate(train_size):
        print('```````````````````````````````````````````````````````')
        print('Training size is {}'.format(n))
        # Build the network: a fresh model is created for every training size.
        net = resnet18_mod(pretrained=True, progress=True, num_classes=64)
        net = net.cuda()
        layers = list(net.children())
        # Optimization parameters:
        criterion = nn.CrossEntropyLoss()
def _work(process_id, model, dataset, args):
    """Produce semantic-segmentation label images for one dataset shard.

    For every sample of ``dataset[process_id]`` the worker runs ``model`` to
    get an edge map, loads the sample's saved CAM dictionary from
    ``args.cam_out_dir``, propagates the CAM along the edges with
    ``indexing.propagate_to_edge``, upsamples the result and takes the argmax
    as the predicted class map. Three images are written per sample: the
    class-index map (``args.sem_seg_out_dir``), a colour-coded map and a
    colour overlay on the input image (both in ``args.sem_seg_clr_out_dir``).

    :param process_id: shard index; also used as the CUDA device index
    :param model: network returning ``(edge, dp)`` for an image batch
    :param dataset: indexable collection of per-process datasets
    :param args: options (``dataset``, ``dev_root``, ``split``, ``cam_out_dir``,
        ``beta``, ``exp_times``, ``sem_seg_bg_thres``, ``class_colours``,
        ``overlay_r``, ``sem_seg_out_dir``, ``sem_seg_clr_out_dir``)
    :raises KeyError: if ``args.dataset`` is not one of the handled datasets
    """
    deepglobe_sets = ['deepglobe', 'deepglobe_balanced']
    adp_sets = ['adp_morph', 'adp_func']

    databin = dataset[process_id]
    data_loader = DataLoader(databin, shuffle=False, pin_memory=False)

    with torch.no_grad(), cuda.device(process_id):
        model.cuda()
        for pack in tqdm(data_loader):
            img_name = pack['name'][0]
            orig_img_size = np.asarray(pack['size'])

            if args.dataset == 'voc12':
                img_path = voc12.dataloader.get_img_path(
                    img_name, args.dev_root)
            elif args.dataset in adp_sets:
                img_path = adp.dataloader.get_img_path(
                    img_name, args.dev_root, args.split == 'evaluation')
            elif args.dataset in deepglobe_sets:
                img_path = deepglobe.dataloader.get_img_path(
                    img_name, args.dev_root)
            else:
                raise KeyError('Dataset %s not yet implemented' % args.dataset)

            # The second model output (`dp`) is not used by this step.
            edge, _ = model(pack['img'][0].cuda(non_blocking=True))

            cam_dict = np.load(args.cam_out_dir + '/' + img_name + '.npy',
                               allow_pickle=True).item()
            cam_downsized_values = torch.from_numpy(cam_dict['cam']).cuda()

            if args.dataset == 'voc12':
                if len(cam_dict['keys']) > 0:
                    # Shift class keys by one and prepend 0 for background.
                    keys = np.pad(cam_dict['keys'] + 1, (1, 0),
                                  mode='constant')
                    if edge.shape[1:] != cam_downsized_values.shape[1:]:
                        edge = F.interpolate(
                            edge.unsqueeze(0),
                            size=(cam_downsized_values.shape[1:]),
                            mode='bilinear', align_corners=False)
                    rw = indexing.propagate_to_edge(
                        cam_downsized_values, edge, beta=args.beta,
                        exp_times=args.exp_times, radius=5)
                    rw_up = F.interpolate(
                        rw, size=tuple(orig_img_size), mode='bilinear',
                        align_corners=False)[
                            ..., 0, :orig_img_size[0], :orig_img_size[1]]
                    rw_up = rw_up / torch.max(rw_up)
                    # Prepend a constant background channel at the threshold.
                    rw_up_bg = F.pad(rw_up, (0, 0, 0, 0, 1, 0),
                                     value=args.sem_seg_bg_thres)
                    rw_pred = torch.argmax(rw_up_bg, dim=0).cpu().numpy()
                    rw_pred = keys[rw_pred]
                else:
                    rw_pred = np.zeros(orig_img_size, dtype='uint8')
            elif args.dataset in adp_sets:
                keys = cam_dict['keys']
                if edge.shape[1:] != cam_downsized_values.shape[1:]:
                    edge = F.interpolate(
                        edge.unsqueeze(0),
                        size=(cam_downsized_values.shape[1:]),
                        mode='bilinear', align_corners=False)
                rw = indexing.propagate_to_edge(
                    cam_downsized_values, edge, beta=args.beta,
                    exp_times=args.exp_times, radius=5)
                rw_up = F.interpolate(
                    rw, size=tuple(orig_img_size), mode='bilinear',
                    align_corners=False)[
                        ..., 0, :orig_img_size[0], :orig_img_size[1]]
                rw_up = rw_up / torch.max(rw_up)
                rw_pred = torch.argmax(rw_up, dim=0).cpu().numpy()
                rw_pred = keys[rw_pred]
            elif args.dataset in deepglobe_sets:
                if len(cam_dict['keys']) > 0:
                    keys = cam_dict['keys']
                    # CAM and edge map are shrunk by this factor before the
                    # propagation step; output is produced at 1/4 resolution.
                    down_fac = 6
                    cam_downsized_values = F.interpolate(
                        cam_downsized_values.unsqueeze(0),
                        size=[x // down_fac
                              for x in cam_downsized_values.shape[1:]],
                        mode='bilinear', align_corners=False)[0]
                    edge = F.interpolate(
                        edge.unsqueeze(0),
                        size=(cam_downsized_values.shape[1:]),
                        mode='bilinear', align_corners=False)
                    rw = indexing.propagate_to_edge(
                        cam_downsized_values, edge, beta=args.beta,
                        exp_times=args.exp_times, radius=5)
                    rw_up = F.interpolate(
                        rw, size=tuple(orig_img_size // 4), mode='bilinear',
                        align_corners=False)[
                            ..., 0,
                            :orig_img_size[0] // 4, :orig_img_size[1] // 4]
                    rw_up = rw_up / torch.max(rw_up)
                    rw_pred = torch.argmax(rw_up, dim=0).cpu().numpy()
                    rw_pred = keys[rw_pred]
                else:
                    rw_pred = 5 * np.ones(tuple(orig_img_size // 4))
            else:
                raise KeyError('Dataset %s not yet implemented' % args.dataset)

            imageio.imsave(
                os.path.join(args.sem_seg_out_dir, img_name + '.png'),
                rw_pred.astype(np.uint8))

            # Save with colour: class indices run over the 'bg' colours
            # first, then the 'fg' colours offset by the number of 'bg' ones.
            rw_pred_clr = np.zeros(list(rw_pred.shape) + [3], dtype=np.uint8)
            off = 0
            for t in ['bg', 'fg']:
                for i, c in enumerate(args.class_colours[t]):
                    for ch in range(3):
                        rw_pred_clr[:, :, ch] += c[ch] * np.uint8(
                            rw_pred == (i + off))
                off += len(args.class_colours[t])
            imageio.imsave(
                os.path.join(args.sem_seg_clr_out_dir, img_name + '.png'),
                rw_pred_clr)

            # Save with colour, overlaid on original image.
            # cv2.resize takes dsize as (width, height) whereas ndarray.shape
            # is (rows, cols, ...); shape[1::-1] yields (cols, rows). Passing
            # shape[:2] only worked for square images.
            orig_img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
            if args.dataset in deepglobe_sets:
                orig_img = cv2.resize(orig_img, rw_pred_clr.shape[1::-1])
            if args.dataset in adp_sets:
                rw_pred_clr = cv2.resize(rw_pred_clr, orig_img.shape[1::-1])
            rw_pred_clr_over = np.uint8(
                (1 - args.overlay_r) * np.float32(orig_img) +
                args.overlay_r * np.float32(rw_pred_clr))
            imageio.imsave(
                os.path.join(args.sem_seg_clr_out_dir,
                             img_name + '_overlay.png'),
                rw_pred_clr_over)
def _work(process_id, model, dataset, args):
    """Produce semantic-segmentation label images for one dataset shard.

    For every sample of ``dataset[process_id]`` the worker runs ``model`` to
    get an edge map, turns it into a dense affinity / transition matrix,
    applies it to the saved CAMs loaded from ``args.cam_dir``, upsamples the
    result by a factor of 4, and writes the argmax class map as a PNG into
    ``args.segm_out_dir``. The worker with ``process_id == n_gpus - 1``
    prints progress roughly every 1/20th of its shard.

    :param process_id: shard index; also used as the CUDA device index
    :param model: network returning ``(edge, dp)`` for an image and settings
    :param dataset: indexable collection of per-process datasets
    :param args: options (``num_workers``, ``cam_dir``, ``beta``, ``t``,
        ``sem_seg_bg_thres``, ``segm_out_dir``)
    """
    n_gpus = torch.cuda.device_count()
    databin = dataset[process_id]
    data_loader = DataLoader(databin, shuffle=False,
                             num_workers=args.num_workers // n_gpus,
                             pin_memory=False)
    # Progress interval. Clamped to 1 so shards with fewer than 20 samples no
    # longer raise ZeroDivisionError; identical for larger shards.
    progress_step = max(1, len(databin) // 20)

    with torch.no_grad(), cuda.device(process_id):
        model.cuda()
        for idx, pack in enumerate(data_loader):
            img_name = voc12.data.decode_int_filename(pack['name'][0])
            orig_img_size = np.asarray(pack['size'])
            strided_size = imutils.get_strided_size(orig_img_size, 4)
            out_setting = {
                "upsize": strided_size,
                "flip": True
            }

            img_o = pack['img'][0][0]
            # The second model output (`dp`) is not used by this step.
            edge, _ = model(img_o.cuda(non_blocking=True), out_setting)
            edge_avg = torch.sigmoid(edge)

            cam_dict = np.load(args.cam_dir + '/' + img_name + '.npy',
                               allow_pickle=True).item()
            cams = cam_dict['cam']
            # Shift class keys by one and prepend 0 for background.
            keys = np.pad(cam_dict['keys'] + 1, (1, 0), mode='constant')
            cam_downsized_values = cams.cuda()

            # Pad the edge map by the path radius (5) with edge value 1.0:
            # left, right and bottom only — the crop below mirrors this.
            edge_padded = F.pad(edge_avg, (5, 5, 0, 5), mode='constant',
                                value=1.0)
            padded_h = strided_size[0] + 5
            padded_w = strided_size[1] + 10
            path_index = adv_indexing.PathIndex(
                radius=5, default_size=(padded_h, padded_w))
            sparse_aff = adv_indexing.edge_to_affinity(
                torch.unsqueeze(edge_padded, 0),
                path_index.default_path_indices)
            dense_aff = affinity_sparse2dense(
                sparse_aff, path_index.default_src_indices,
                path_index.default_dst_indices, padded_h * padded_w)
            # Drop the padded rows/columns on both the source and destination
            # axes, then flatten to (pixels, pixels).
            dense_aff = dense_aff.view(padded_h, padded_w, padded_h, -1)[
                :-5, 5:-5, :-5, 5:-5]
            dense_aff = dense_aff.reshape(strided_size[0] * strided_size[1],
                                          -1)
            trans_mat = to_transition_matrix(dense_aff, beta=args.beta,
                                             times=args.t)

            cam_expanded = cam_downsized_values * (1 - edge_avg)
            rw = torch.matmul(cam_expanded.view(cam_expanded.size(0), -1),
                              trans_mat)
            rw = rw.view(rw.size(0), 1, strided_size[0], strided_size[1])
            rw_up = F.interpolate(
                rw, scale_factor=4, mode='bilinear', align_corners=False)[
                    ..., :orig_img_size[0], :orig_img_size[1]]
            rw_up = rw_up[:, 0]
            rw_up_norm = rw_up / torch.max(rw_up)
            # Prepend a constant background channel at the threshold.
            rw_up_norm_bg = F.pad(rw_up_norm, (0, 0, 0, 0, 1, 0),
                                  value=args.sem_seg_bg_thres)
            rw_pred = torch.argmax(rw_up_norm_bg, dim=0).cpu().numpy()
            rw_pred = keys[rw_pred]

            imageio.imsave(os.path.join(args.segm_out_dir, img_name + '.png'),
                           rw_pred.astype(np.uint8))

            if process_id == n_gpus - 1 and idx % progress_step == 0:
                print("%d " % ((5 * idx + 1) // progress_step), end='')
def main():
    """Prune a pretrained NMT model with LDS masks and retrain it from a teacher copy.

    Steps, in order: load the validation set and vocabulary, build the
    reference model from the checkpoint and wrap it in ``LDSMaskedNMT``,
    compute the LDS statistics, evaluate, snapshot the model as teacher,
    prune (RNN and other weights), evaluate again, snapshot the pruned model
    as student, load the training set, run ``train_ts`` for one epoch and
    evaluate the student before and after re-applying the mask.
    """
    vocab_file = os.path.join(data_path, 'len50_pywmt14.vocab.pt')

    def _load_split(file_name):
        # Select the GPU, load one dataset file plus a fresh copy of the
        # vocabulary fields, attach the fields and report the elapsed time.
        cuda.set_device(GPU_ID)
        print('loading data...')
        began = time.time()
        dataset = torch.load(os.path.join(data_path, file_name))
        dataset_fields = onmt.IO.load_fields(torch.load(vocab_file))
        dataset.fields = dataset_fields
        print('data loaded. time {} seconds.'.format(time.time() - began))
        return dataset, dataset_fields

    def _report(model, with_sparsity=True):
        # Print the validation fit, run the test metrics and optionally the
        # sparsity report for `model`.
        print(evaluate(model, valid_data, fields))
        test_metrics(model, fields)
        if with_sparsity:
            get_sparity(model)

    # load valid data
    valid_data, fields = _load_split('len50_pywmt14.valid.pt')

    # load model
    print('load pretrained model...')
    began = time.time()
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
    with cuda.device(GPU_ID):
        ref_model = onmt.ModelConstructor.make_base_model(
            checkpoint['opt'], fields, True, checkpoint)
        ref_model.eval()
        ref_model.generator.eval()
        lds_model = LDSMaskedNMT(ref_model, MODEL_TYPE)
        lds_model.mask_pruned_model()
    print('model loaded. time {} seconds.'.format(time.time() - began))

    lds_model.cal_LDS_for_all_matrices()
    lds_model.cal_dis_temp()
    lds_model.cal_sorted_params()

    # Baseline evaluation, then keep an untouched copy as the teacher.
    _report(lds_model.pruned_model)
    tea_model = copy.deepcopy(lds_model.pruned_model)

    # Prune, evaluate, and keep the pruned copy as the student.
    lds_model.prun_rnn(1.0, 1.0)
    lds_model.prun_other(0.6)
    lds_model.mask_pruned_model()
    _report(lds_model.pruned_model, with_sparsity=False)
    stu_model = copy.deepcopy(lds_model.pruned_model)

    # load train data
    train_data, _ = _load_split('len50_pywmt14.train.pt')

    # retrain
    train_ts(tea_model, stu_model, train_data, epoch=1)
    _report(stu_model)
    lds_model.mask_pruned_model()
    _report(stu_model)
def sparse_fdmmv(proc_idx, queue, device_id):
    """GPU worker accumulating ``K.T @ (K @ v + w)`` over row blocks of sparse ``X1``.

    One ``ArgsFdmmv`` is taken from ``queue``. ``X1`` is processed in blocks
    of ``n`` rows (sized from ``max_mem``); for each block the kernel matrix
    against all of ``X2`` is built on the GPU, the matching rows of ``w`` (or
    zeros when ``w`` is None) are loaded, ``K_block @ v`` is added when ``v``
    is given, and ``K_block.T @ (...)`` is accumulated into the output. When
    ``out`` is not a CUDA tensor the result is copied back into it.

    :param proc_idx: worker index (not used in the body)
    :param queue: queue yielding a single ``ArgsFdmmv``
    :param device_id: CUDA device index to run on
    :return: ``out``
    :raises MemoryError: if not even one row of ``X1`` fits in ``max_mem``
    """
    args: ArgsFdmmv = queue.get()
    X1: SparseTensor = args.X1
    X2: SparseTensor = args.X2
    v, w, out = args.v, args.w, args.out
    kernel, max_mem = args.kernel, args.max_mem
    dtype = X1.dtype
    N, D = X1.shape
    M = X2.size(0)
    T = w.size(1) if v is None else v.size(1)

    # Memory needs:
    # X1_chunk   : ntot + 2 * D * ntot * density
    # X2         : dtot + 2 * D * M * density (because is transposed)
    # sparse_out : ntot + 2 * ntot * M * density (assume here density = 1)
    # ker_gpu    : M * ntot
    # w_gpu      : ntot * T
    # v_gpu      : M * T
    # out_gpu    : M * T
    avail_mem = max_mem / sizeof_dtype(dtype)
    cost_per_row = 2 * D * X1.density + 2 + 3 * M + T
    fixed_cost = D + 2 * D * M * X2.density + M * T
    if v is not None:
        fixed_cost += M * T
    n = min(int((avail_mem - fixed_cost) / cost_per_row), N)
    if n < 1:
        raise MemoryError("Not enough memory to run sparse dfmmv")

    gpu = torch.device('cuda:%d' % int(device_id))
    with tcd.device(gpu):
        # Initialize GPU data
        w_gpu = create_same_stride((n, T), out, dtype, gpu)
        out_gpu = out if out.is_cuda else create_same_stride(
            (M, T), out, dtype, gpu)
        out_gpu.fill_(0.0)
        ker_gpu = create_fortran((n, M), dtype, gpu)
        if v is not None:
            v_gpu = v.to(device=gpu)  # M x T
        # X2 is transposed and converted to CSR once, then moved to the GPU.
        X2_csr = X2.transpose_csc().to_scipy().tocsr(copy=False)
        X2_gpu = SparseTensor.from_scipy(X2_csr).index_to_int().to(device=gpu)

        for row0 in range(0, N, n):
            rows = min(n, N - row0)
            X1_block = X1.narrow_rows(row0, rows)
            X1_block_gpu = X1_block.index_to_int().to(device=gpu)

            ker_block = ker_gpu[:rows]
            ker_block.fill_(0.0)
            # TODO: This is wasteful (X2 will be prepared many times over)
            prepared = kernel._prepare_sparse(X1_block, X2)
            ker_block = kernel._apply_sparse(X1_block_gpu, X2_gpu, ker_block)
            ker_block = kernel._finalize(ker_block, prepared)

            if w is None:
                w_block = w_gpu.narrow(0, 0, rows)
                w_block.fill_(0.0)
            else:
                w_block = copy_to_device_noorder(
                    rows, T, w, row0, 0, w_gpu, 0, 0)

            if v is not None:
                w_block.addmm_(ker_block, v_gpu)
            out_gpu.addmm_(ker_block.T, w_block)
            del prepared, X1_block, X1_block_gpu

        if not out.is_cuda:
            copy_to_device_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
def main():
    """Iteratively prune the NMT model with a uniform magnitude threshold and retrain it.

    Each of up to ``total_times`` rounds:
      1. sweeps uniform pruning rates 0%..99% and stops at the first rate
         whose validation accuracy drops more than ``acc_percent_prune``
         below the unpruned accuracy (measured once, in the first round, and
         truncated to one decimal); the rate one step before is selected;
      2. applies that rate, saves a checkpoint and retrains with
         ``train_model``; the process exits if accuracy is not recovered;
      3. saves the retrained checkpoint, reports validation accuracy,
         sparsity and BLEU of the retrained model.
    """
    total_times = 100
    run_times = 0

    valid_data = torch.load(TRAIN_DATA + '.valid.pt')
    fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + '.vocab.pt'))
    # This assignment has to be cleared if valid_data is ever transferred
    # between threads.
    valid_data.fields = fields

    checkpoint = torch.load(weights,
                            map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']
    with cuda.device(GPU_ID):
        ref_model = onmt.ModelConstructor.make_base_model(
            model_opt, fields, True, checkpoint)
        ref_model.eval()
        ref_model.generator.eval()
        # ref_model is at current_device, no copy will happen
        masked_model = MaskedModel(ref_model, group_dict,
                                   cuda.current_device(),
                                   cuda.current_device())

    train_opt, _, _ = opt_initialize(checkpoint, 'opennmt_translate_opt.pt',
                                     'opennmt_translate_dummy_opt.pt')
    if GPU_ID:
        cuda.set_device(GPU_ID)

    # Only the accuracy of the original (unpruned) model is needed.
    acc_of_no_prune = 0
    get_acc_of_no_prune = False
    print(time_now(), "start while")
    while run_times < total_times:
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("start Iteration ", run_times)

        # init threshold
        best_threshold = 0
        xxx = np.arange(0., 1, 0.01)
        print(time_now(), "start testing pruning")
        masked_model.make_evaluable()
        n_groups = len(masked_model.group_name_list)
        for i, rate in enumerate(xxx):
            masked_model.change_mask(n_groups * [rate], apply_MP_on_mask)
            masked_model.apply_mask()
            tmp_fit = evaluate(masked_model, valid_data, fields)
            print('percentage %s => acc (%.4f), ppl (%.4f)' %
                  (rate * 100, tmp_fit[1], tmp_fit[0]))
            if i == 0 and not get_acc_of_no_prune:
                acc_of_no_prune = int(tmp_fit[1] * 10) / 10
                get_acc_of_no_prune = True
            elif acc_of_no_prune - tmp_fit[1] > acc_percent_prune:
                # Step back to the last rate that satisfied the constraint.
                # Clamped at 0: in later rounds the violation can happen at
                # rate 0, which previously produced a threshold of -0.01.
                best_threshold = max(0., rate - 0.01)
                break

        # -------------------------------------------------
        # prune again
        print(time_now(), " init accuracy of model:", acc_of_no_prune)
        print("accuracy constraint:", acc_percent_prune)
        print("-------test------------:", get_acc_of_no_prune)
        print(time_now(), " apply pruning with threshold:", best_threshold)
        masked_model.change_mask(n_groups * [best_threshold],
                                 apply_MP_on_mask)
        masked_model.apply_mask()

        # print information
        tmp_fit = evaluate(masked_model, valid_data, fields)
        print('percentage %s => acc (%.4f), ppl (%.4f)' %
              (best_threshold * 100, tmp_fit[1], tmp_fit[0]))
        model_sparsity = masked_model.get_sparsity()
        print('Sparsity: {}'.format(model_sparsity))

        # --------------- start retraining --------------
        # first store model
        print(time_now(), "start saving model")
        _, saved_model = update_checkpoint(checkpoint, masked_model,
                                           run_times, acc_percent_prune)
        print(time_now(), "finish saving model:", saved_model)

        model_for_train = masked_model
        pretrained_leaf_dict = model_for_train.make_trainable()
        optim = build_optim(model_for_train.masked_model, checkpoint,
                            train_opt, pretrained_leaf_dict)
        print("finish building optim")

        print(time_now(), "start loading data for retraining")
        train = torch.load(train_opt.data + '.train.pt')
        valid = torch.load(train_opt.data + '.valid.pt')
        train_fields = load_fields(train, valid, checkpoint, train_opt)
        print(time_now(), "finish data loading")

        recovered = train_model(model_for_train, train, valid, train_fields,
                                optim, train_opt, run_times, acc_of_no_prune)
        print(time_now(), "finish retraining ")
        if not recovered:
            exit()
        print("------------Accuracy recorverd!--------------------")
        print("recovered accuracy:", acc_of_no_prune)

        # -------------------------------------------------
        print('------------- save checkpoint ---------------')
        _, saved_model = update_checkpoint(checkpoint, model_for_train,
                                           run_times, t=True)
        print(time_now(), ' saving model:', saved_model)

        print("-------------print evaluation info ---------------")
        model_for_train.make_evaluable()
        tmp_fit = evaluate(model_for_train, valid_data, fields)
        # The original formatted the undefined name `best_found` here, which
        # raised NameError after every successful retraining; the rate in
        # effect is `best_threshold`.
        print('percentage %s => acc (%.4f), ppl (%.4f)' %
              (best_threshold * 100, tmp_fit[1], tmp_fit[0]))
        model_sparsity = model_for_train.get_sparsity()
        print('Sparsity: {}'.format(model_sparsity))

        # --------------------------------------------------
        print("BLEU evaluation:")
        translate_opt, translate_dummy_opt = translate_opt_initialize(
            'opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
        translator = init_translate_model(translate_opt, translate_dummy_opt)
        del translator.model
        translator.model = model_for_train.masked_model
        # Read the reference translations; the file is closed right away
        # instead of leaking one handle per round.
        with open(translate_opt.tgt, 'r') as tt:
            references = [[t] for t in tt]

        translate_data = onmt.IO.ONMTDataset(translate_opt.src,
                                             translate_opt.tgt,
                                             fields,
                                             use_filter_pred=False)
        prune_data = onmt.IO.OrderedIterator(dataset=translate_data,
                                             device=GPU_ID,
                                             batch_size=1,
                                             train=False,
                                             sort=False,
                                             shuffle=False)
        tmp_fit2 = evaluate_trans(translator, references, prune_data,
                                  translate_data)
        print('Finsished => bleu (%.4f), ppl (%.4f)' %
              (tmp_fit2[1] * 100, tmp_fit2[0]))
        # --------------------------------------------------
        run_times += 1
def distk_fmmv(proc_idx, queue, device_id):
    """GPU worker computing ``kernel(X1, X2) @ v`` block by block for a distance kernel.

    One ``ArgsFmmv`` is taken from ``queue``. ``X1`` is split into blocks of
    ``n`` rows and ``X2``/``v`` into blocks of ``m`` rows (sizes chosen by
    ``select_dim_over_m`` from ``max_mem``). For each pair of blocks the
    squared distances are formed as ``|x1|^2 - 2 x1.x2 + |x2|^2``, clamped at
    zero, passed through ``kernel._transform`` in place and multiplied by the
    matching rows of ``v``. Each finished row block is copied into ``out``.

    :param proc_idx: worker index (not used in the body)
    :param queue: queue yielding a single ``ArgsFmmv``
    :param device_id: CUDA device index to run on
    :return: ``out``
    """
    args: ArgsFmmv = queue.get()
    X1, X2, v, out = args.X1, args.X2, args.v, args.out
    kernel: L2DistanceKernel = args.kernel
    max_mem = args.max_mem

    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    dtype = X1.dtype

    # GPU memory usage:
    # X1s : n x D
    # X2s : m x D
    # vs  : m x T
    # nm  : n x m
    # out : n x T
    # -----------
    # total: n*m + n * (D + T) + m * (D + T) = R
    avail_mem = max_mem / sizeof_dtype(dtype)
    per_row = D + T
    n, m = select_dim_over_m(maxM=M, maxN=N, coef_nm=1.0, coef_n=per_row,
                             coef_m=per_row, tot=avail_mem)

    gpu = torch.device('cuda:%d' % int(device_id))
    with tcd.device(gpu):
        # Reusable device buffers, sliced down for the last (short) blocks.
        nm_gpu = create_same_stride((n, m), X1, dtype, gpu)
        out_gpu = create_same_stride((n, T), out, dtype, gpu)
        X1s_gpu = create_same_stride((n, D), X1, dtype, gpu)
        X2s_gpu = create_same_stride((m, D), X2, dtype, gpu)
        vs_gpu = create_same_stride((m, T), v, dtype, gpu)

        for row0 in range(0, N, n):
            rows = min(n, N - row0)
            x1_blk = copy_to_device_noorder(rows, D, X1, row0, 0,
                                            X1s_gpu, 0, 0)
            x1_sqnorm = torch.norm(x1_blk, p=2, dim=1, keepdim=True).pow_(2)
            out_blk = out_gpu.narrow(0, 0, rows)  # n x T
            out_blk.fill_(0.0)

            for col0 in range(0, M, m):
                cols = min(m, M - col0)
                x2_blk = copy_to_device_noorder(cols, D, X2, col0, 0,
                                                X2s_gpu, 0, 0)
                v_blk = copy_to_device_noorder(cols, T, v, col0, 0,
                                               vs_gpu, 0, 0)  # m x T
                dist_blk = nm_gpu[:rows, :cols]  # n x m
                x2_sqnorm = torch.norm(x2_blk, p=2, dim=1,
                                       keepdim=True).pow_(2)

                torch.mm(x1_blk, x2_blk.T, out=dist_blk)
                # In-place chain: -2 * x1.x2 + |x1|^2 + |x2|^2, clamped at 0.
                dist_blk.mul_(-2.0).add_(x1_sqnorm).add_(
                    x2_sqnorm.T).clamp_min_(0)
                kernel._transform(dist_blk)

                # Multiply by the vector v
                # FIXME: This is the cause of mapping errors in case of
                # float32 calculations.
                out_blk.addmm_(dist_blk, v_blk)  # n x T

            # send result to CPU
            copy_to_host_noorder(rows, T, out_gpu, 0, 0, out, row0, 0)
    return out
def main():
    # Driver for NCS-based pruning of the NMT model. As written, the first
    # pass through the loop rebuilds the model from `weights`, runs NCS_MP
    # once and then terminates the process via exit(); the retraining code
    # that follows the exit() call is never reached.
    total_times = 10
    run_times = 0
    init_threshold = ...
    start_t = time.time()
    valid_data = torch.load(TRAIN_DATA + '.valid.pt')
    fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + '.vocab.pt'))
    valid_data.fields = fields  # this assignment has to be cleared if valid_data is transferred between threads
    if GPU_ID:
        cuda.set_device(GPU_ID)
    while run_times < total_times:
        itr_time = time.time()
        # init pruning: reload the checkpoint and rebuild the reference model
        checkpoint = torch.load(weights, map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
        masked_models = []
        with cuda.device(GPU_ID):
            ref_model = onmt.ModelConstructor.make_base_model(
                model_opt, fields, True, checkpoint)
            ref_model.eval()
            ref_model.generator.eval()
            masked_model = MaskedModel(
                ref_model, group_dict, cuda.current_device(), cuda.current_device(
                ))  # ref_model is at current_device, no copy will happen
            masked_models.append(masked_model)
        # The four string literals below are bare expression statements with
        # no runtime effect; they hold disabled debugging code.
        ''' display all the names of parameters '''
        ''' aa=ref_model.named_parameters aa_namelist = [ak[0] for ak in aa] '''
        ''' test MP '''
        ''' translate_opt, translate_dummy_opt = translate_opt_initialize('opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt') translator = init_translate_model(translate_opt, translate_dummy_opt) del translator.model translator.model = masked_model tt=open(translate_opt.tgt, 'r') references = [[t] for t in tt] xxx=np.arange(0.,1, 0.01) #for i in range(len(masked_model.group_name_list)): # tmp_crate = len(masked_model.group_name_list)*[0.] 
for i in range(len(xxx)): translate_data = onmt.IO.ONMTDataset( translate_opt.src, translate_opt.tgt, fields, use_filter_pred=False) prune_data = onmt.IO.OrderedIterator( dataset=translate_data, device=GPU_ID, batch_size=1, train=False, sort=False, shuffle=False) tmp_crate = len(masked_model.group_name_list)*[xxx[i]] #tmp_crate[i] = 0.01 masked_model.change_mask(tmp_crate, apply_MP_on_mask) masked_model.apply_mask() tmp_fit = evaluate(masked_model, valid_data, fields) #tmp_fit = evaluate_trans(translator, references, prune_data, translate_data) #logger.scalar_summary('test_bleu', tmp_fit[1]*100, int(xxx[i]*100)) logger.scalar_summary('acc', tmp_fit[1], int(xxx[i]*100)) logger.scalar_summary('ppl', tmp_fit[0], int(xxx[i]*100)) #logger.scalar_summary('test_ppl', tmp_fit[0], int(xxx[i]*100)) #print('group %s => acc (%.4f), ppl (%.4f)' % (masked_model.group_name_list[i], tmp_fit[1], tmp_fit[0])) #print('percentage %s => bleu (%.4f), ppl (%.4f)' % (xxx[i]*100, tmp_fit[1]*100, tmp_fit[0])) print('percentage %s => acc (%.4f), ppl (%.4f)' % (xxx[i]*100, tmp_fit[1], tmp_fit[0])) exit() '''
        # NOTE(review): `gpu_candidate` is not assigned anywhere in this
        # function. Unless it is a module-level name this line raises
        # NameError; the disabled code further down iterates
        # `for gpu_candidate in other_GPU_IDs` — confirm the intended loop.
        with cuda.device(GPU_ID):
            masked_models.append(
                MaskedModel(ref_model, group_dict, GPU_ID, gpu_candidate)
            )  # if the gpu_candidate is the same as ref_model, it will return the ref_model
        del ref_model
        # do pruning
        ncs_start = time.time()
        print('Itration %d, model loading: %d sec' % (run_times, ncs_start - itr_time))
        init_threshold = len(masked_models[0].group_name_list) * [0.10]
        best_found, saved_model, best_masked_model = NCS_MP(
            init_threshold, 0.05, fields, masked_models, valid_data, 1.0, run_times, checkpoint)
        init_thresholds = best_found[0]  # best pop found
        end_t = time.time()
        print('NCS Time: {} min'.format((end_t - itr_time) / 60.))
        # clear unused models
        # NOTE(review): this loop only unbinds the loop variable; the list
        # `masked_models` still references every model afterwards.
        for gpu_model in masked_models:
            del gpu_model
        #-----------------------------------------------------------------
        # Another no-op string literal holding a disabled second NCS pass.
        ''' masked_models = [] with cuda.device(GPU_ID): ref_model = best_masked_model masked_model = MaskedModel(ref_model, 
group_dict2, cuda.current_device(), cuda.current_device()) # ref_model is at current_device, no copy will happen masked_models.append(masked_model) for gpu_candidate in other_GPU_IDs: with cuda.device(gpu_candidate): masked_models.append(MaskedModel(ref_model, group_dict2, GPU_ID, gpu_candidate)) # if the gpu_candidate is the same as ref_model, it will return the ref_model del ref_model # do pruning ncs_start = time.time() print('*Itration %d, model loading: %d sec' % (run_times, ncs_start - itr_time)) init_threshold = len(masked_models[0].group_name_list)*[0.10] best_found, saved_model, best_masked_model = NCS_MP(init_threshold, 0.05, fields, masked_models, valid_data, 0.5, run_times, checkpoint) init_thresholds = best_found[0] # best pop found end_t = time.time() print('NCS Time: {} min'.format((end_t - itr_time)/60.)) # clear no used models for gpu_model in masked_models: del gpu_model '''
        # Unconditional exit: everything below in this loop body is
        # unreachable.
        exit()
        # training
        # NOTE(review): the unreachable code below references `checkpoint1`
        # and `num_runs`, neither of which is assigned in this function —
        # resolve before re-enabling.
        checkpoint = torch.load(saved_model, map_location=lambda storage, loc: storage)
        train_opt, _, _ = opt_initialize(checkpoint, 'opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
        # train data loading
        train = torch.load(train_opt.data + '.train.pt')
        valid = torch.load(train_opt.data + '.valid.pt')
        train_fields = load_fields(train, valid, checkpoint, train_opt)
        model_for_train = init_train_model(checkpoint1, train_opt, train_fields)  # fields need data
        optim = build_optim(model_for_train, checkpoint, train_opt)
        model_sparsity = get_sparsity(model_for_train)
        logger.scalar_summary('model_sparsity_%s' % num_runs, model_sparsity, run_times)
        print('Sparsity: {}'.format(model_sparsity))
        train_model(model_for_train, train, valid, train_fields, optim, train_opt, run_times)
        # update global variable weights
        run_times += 1
def sparse_fmmv(proc_idx, queue, device_id):
    """GPU worker computing ``kernel(X1, X2) @ v`` block by block for sparse inputs.

    One ``ArgsFmmv`` is taken from ``queue``. ``X1`` is split into blocks of
    ``n`` rows and ``X2`` into blocks of ``m`` rows (sizes chosen by
    ``select_dim_over_m`` from ``max_mem``). For each pair of blocks the
    kernel block is built on the GPU and multiplied by the matching rows of
    ``v``; each finished row block is copied into ``out``.

    :param proc_idx: worker index (not used in the body)
    :param queue: queue yielding a single ``ArgsFmmv``
    :param device_id: CUDA device index to run on
    :return: ``out``
    """
    args: ArgsFmmv = queue.get()
    X1: SparseTensor = args.X1
    X2: SparseTensor = args.X2
    v, out = args.v, args.out
    kernel, max_mem = args.kernel, args.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk   : N + 2*D*N*density
    # X2_chunk   : D + 2*D*M*density (because is transposed)
    # sparse_out : N + 2*N*M*(density) (assume density = 1)
    # ker_gpu    : M*N
    # mmv_gpu    : N*T
    # v_gpu      : M*T
    # Other: GPU buffer
    n, m = select_dim_over_m(
        maxM=mtot,
        maxN=ntot,
        tot=avail_mem,
        coef_nm=3,
        coef_n=2 + 2 * dtot * X1.density + T,
        coef_m=2 * dtot * X2.density + T,
        rest=dtot,
    )

    gpu = torch.device('cuda:%d' % int(device_id))
    with tcd.device(gpu):
        v_gpu = v.to(device=gpu)  # M x T
        mmv_gpu = create_same_stride((n, T), out, dtype, gpu)
        # ker_gpu should be fortran-ordered due to cusparse csr2dense function
        ker_gpu = create_fortran((n, m), dtype=dtype, device=gpu)

        for row0 in range(0, ntot, n):
            rows = min(n, ntot - row0)
            mmv_blk = mmv_gpu[:rows]  # n x T
            mmv_blk.fill_(0.0)

            X1_blk = X1.narrow_rows(row0, rows)
            X1_blk_gpu = X1_blk.index_to_int().to(device=gpu)

            for col0 in range(0, mtot, m):
                cols = min(m, mtot - col0)
                X2_blk = X2.narrow_rows(col0, cols)

                # Prepare sparse on CPU
                prepared = kernel._prepare_sparse(X1_blk, X2_blk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of
                # RAM
                X2_blk_csr = X2_blk.transpose_csc().to_scipy().tocsr(
                    copy=False)
                X2_blk_gpu = SparseTensor.from_scipy(
                    X2_blk_csr).index_to_int().to(device=gpu)

                ker_blk = ker_gpu[:rows, :cols]
                ker_blk.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                ker_blk = kernel._apply_sparse(X1_blk_gpu, X2_blk_gpu,
                                               ker_blk)
                ker_blk = kernel._finalize(ker_blk, prepared)

                # Multiply by the vector v
                mmv_blk.addmm_(ker_blk, v_gpu.narrow(0, col0, cols))
                del prepared, X2_blk, X2_blk_gpu

            # send result to CPU
            copy_to_host_noorder(rows, T, mmv_blk, 0, 0, out, row0, 0)
            del X1_blk, X1_blk_gpu
    return out
def _align_nn(self,
              model_path,
              source_kb,
              target_kb,
              candidate_selector,
              cuda_device,
              batch_size=128):
    """
    Align using neural network model

    For every id in the ``s_ent_ids`` returned by ``self._align_string_equiv``
    the first ``constants.KEEP_TOP_K_CANDIDATES`` candidates are scored by the
    'ontoemma-predictor'. Per source entity, the best-scoring positive
    prediction is kept when its score is at least
    ``constants.NN_SCORE_THRESHOLD``.

    :param model_path: model archive path handed to ``load_archive``
    :param source_kb: source knowledge base
    :param target_kb: target knowledge base
    :param candidate_selector: object providing ``select_candidates(ent_id)``
    :param cuda_device: GPU device number
    :param batch_size: number of entity pairs per batched prediction call
    :return: list of (source entity id, target entity id, score) tuples
    """

    # returns json representation of entity
    def _form_json_entity(ent_to_json, kb):
        all_rels = [kb.relations[r_id] for r_id in ent_to_json.relation_ids]
        par_ents = [
            kb.get_entity_by_research_entity_id(r.entity_ids[1])
            for r in all_rels
            if r.relation_type in constants.UMLS_PARENT_REL_LABELS
        ]
        chd_ents = [
            kb.get_entity_by_research_entity_id(r.entity_ids[1])
            for r in all_rels
            if r.relation_type in constants.UMLS_CHILD_REL_LABELS
        ]
        return {
            'research_entity_id': ent_to_json.research_entity_id,
            'canonical_name': ent_to_json.canonical_name,
            'aliases': ent_to_json.aliases,
            'definition': ent_to_json.definition,
            'other_contexts': ent_to_json.other_contexts,
            'par_relations': [e.canonical_name for e in par_ents],
            'chd_relations': [e.canonical_name for e in chd_ents]
        }

    # builds the predictor input for one (source, target) entity pair
    def _pair_json(s_ent, t_ent):
        return {
            'source_ent': _form_json_entity(s_ent, source_kb),
            'target_ent': _form_json_entity(t_ent, target_kb),
            'label': 0
        }

    # records a prediction when the model labels the pair as a match
    def _keep_positive(model_input, output):
        if output['predicted_label'] == [1.0]:
            source_id = model_input['source_ent']['research_entity_id']
            target_id = model_input['target_ent']['research_entity_id']
            temp_alignments[source_id].append((target_id, output['score'][0]))

    # scores one batch and records its positive predictions
    def _score_batch(batch):
        results = predictor.predict_batch_json(batch, cuda_device)
        for model_input, output in zip(batch, results):
            _keep_positive(model_input, output)

    # NOTE(review): these names are not referenced below; presumably the
    # imports are needed for their registration side effects - confirm.
    from emma.allennlp_classes.ontoemma_dataset_reader import OntologyMatchingDatasetReader
    from emma.allennlp_classes.ontoemma_model import OntoEmmaNN
    from emma.allennlp_classes.ontoemma_predictor import OntoEmmaPredictor

    alignment, s_ent_ids, t_ent_ids = self._align_string_equiv(
        source_kb, target_kb)
    sys.stdout.write("%i alignments with string equivalence\n" %
                     len(alignment))

    if cuda_device > 0:
        with device(cuda_device):
            archive = load_archive(model_path, cuda_device=cuda_device)
    else:
        archive = load_archive(model_path, cuda_device=cuda_device)
    predictor = Predictor.from_archive(archive, 'ontoemma-predictor')

    sys.stdout.write("Making predictions...\n")
    s_ent_tqdm = tqdm.tqdm(s_ent_ids, total=len(s_ent_ids))
    temp_alignments = defaultdict(list)

    if cuda_device > 0:
        with device(cuda_device):
            batch_json_data = []
            for s_ent_id in s_ent_tqdm:
                s_ent = source_kb.get_entity_by_research_entity_id(s_ent_id)
                for t_ent_id in candidate_selector.select_candidates(
                        s_ent_id)[:constants.KEEP_TOP_K_CANDIDATES]:
                    t_ent = target_kb.get_entity_by_research_entity_id(
                        t_ent_id)
                    batch_json_data.append(_pair_json(s_ent, t_ent))
                    if len(batch_json_data) == batch_size:
                        _score_batch(batch_json_data)
                        batch_json_data = []
            # finish last batch: without this, the trailing pairs that do not
            # fill a whole batch would never be scored
            if batch_json_data:
                _score_batch(batch_json_data)
    else:
        for s_ent_id in s_ent_tqdm:
            s_ent = source_kb.get_entity_by_research_entity_id(s_ent_id)
            for t_ent_id in candidate_selector.select_candidates(
                    s_ent_id)[:constants.KEEP_TOP_K_CANDIDATES]:
                t_ent = target_kb.get_entity_by_research_entity_id(t_ent_id)
                json_data = _pair_json(s_ent, t_ent)
                output = predictor.predict_json(json_data, cuda_device)
                _keep_positive(json_data, output)

    # NOTE(review): the string-equivalence alignments computed above are
    # replaced here and are not part of the returned list - confirm intended.
    alignment = []
    for s_ent_id, matches in temp_alignments.items():
        if not matches:
            continue
        # max() returns the first best-scoring match, like a stable
        # descending sort followed by taking element 0
        best_target, best_score = max(matches, key=lambda p: p[1])
        if best_score >= constants.NN_SCORE_THRESHOLD:
            alignment.append((s_ent_id, best_target, best_score))
    return alignment
def _work(process_id, model, dataset, args):
    """Compute and save class activation maps for one worker's share of data.

    For each item of ``dataset[process_id]`` the model is run on every image
    in ``pack['img']``, the outputs are resized and summed into a strided and
    a high-resolution CAM, restricted to the labelled classes, normalised per
    channel and saved to ``<args.cam_out_dir>/<name>.npy``.

    :param process_id: worker index; also used as the CUDA device index.
    :param model: network to evaluate (moved to the GPU here).
    :param dataset: sequence of per-worker data bins.
    :param args: namespace providing ``num_workers`` and ``cam_out_dir``.
    """
    databin = dataset[process_id]
    n_gpus = torch.cuda.device_count()
    data_loader = DataLoader(databin,
                             shuffle=False,
                             num_workers=args.num_workers // n_gpus,
                             pin_memory=False)
    # Progress is printed about every 1/20th of the bin. Clamp to 1 so bins
    # with fewer than 20 items do not trigger a modulo/division by zero.
    report_every = max(len(databin) // 20, 1)

    with torch.no_grad(), cuda.device(process_id):
        model.cuda()
        for step, pack in enumerate(tqdm(data_loader)):
            img_name = pack['name'][0]
            label = pack['label'][0]
            size = pack['size']
            strided_size = imutils.get_strided_size(size, 4)
            strided_up_size = imutils.get_strided_up_size(size, 16)

            # Run through each scale of image
            outputs = [
                model(img[0].cuda(non_blocking=True)) for img in pack['img']
            ]

            # Each output is resized to strided_size (lower than original)
            # and summed
            strided_cam = torch.sum(
                torch.stack([
                    F.interpolate(torch.unsqueeze(o, 0),
                                  strided_size,
                                  mode='bilinear',
                                  align_corners=False)[0] for o in outputs
                ]), 0)

            # Each output is resized to strided_up_size, summed, then cropped
            # to size[0] x size[1]
            highres_cam = [
                F.interpolate(torch.unsqueeze(o, 1),
                              strided_up_size,
                              mode='bilinear',
                              align_corners=False) for o in outputs
            ]
            highres_cam = torch.sum(torch.stack(highres_cam, 0),
                                    0)[:, 0, :size[0], :size[1]]

            # Pick the cams corresponding to image-level labels
            # Normalize by max value across H x W dimension for each channel
            valid_cat = torch.nonzero(label, as_tuple=False)[:, 0]
            strided_cam = strided_cam[valid_cat]
            strided_cam /= F.adaptive_max_pool2d(strided_cam, (1, 1)) + 1e-5
            highres_cam = highres_cam[valid_cat]
            highres_cam /= F.adaptive_max_pool2d(highres_cam, (1, 1)) + 1e-5

            # save cams
            np.save(
                os.path.join(args.cam_out_dir, img_name + '.npy'), {
                    "keys": valid_cat,
                    "cam": strided_cam.cpu(),
                    "high_res": highres_cam.cpu().numpy()
                })

            # Only the last worker reports progress.
            if process_id == n_gpus - 1 and step % report_every == 0:
                print("%d " % ((5 * step + 1) // report_every), end='')
                sys.stdout.flush()
def __init__(self, pretrained, group_dict, source_device_id,
             target_device_id):
    """Wrap ``pretrained`` with per-group pruning bookkeeping on a target GPU.

    :param pretrained: model whose named parameters are grouped and masked.
    :param group_dict: mapping of group name -> iterable of parameter names.
    :param source_device_id: device id ``pretrained`` is replicated from.
    :param target_device_id: device id this instance works on.
    """
    super(MaskedModel, self).__init__()
    self.skip_mark = 'skip'  # skip the layer while pruning
    self.mask_dict = {}
    #self.sort_tensors = {}
    self.layer_element_num = []
    self.layer_num = 0
    self.layer_name_dict = {}
    self.sparsity = 1.0
    self.total_parameter_num = 0

    # used for group parameters
    # the indices of this list map to the thresholds list accepted at pruning
    self.group_name_list = list(group_dict.keys())
    self.map_dict = {}
    self.group_num_dict = {}
    self.group_parameter_dict = {}
    self.group_threshold_list = len(self.group_name_list) * [0.]

    # gpu related
    self.sgpu_id = source_device_id
    self.tgpu_id = target_device_id

    # settings for retraining
    self.pre_forward_fn = None
    self.forward_fn = None

    # init group dicts: every group starts with zero elements and each of
    # its layer names is mapped back to the group name
    for group_name, layer_names in group_dict.items():
        self.group_num_dict[group_name] = 0
        self.map_dict.update({name: group_name for name in layer_names})

    # for each retrieval, transfer the pretrained model to a dictionary
    if source_device_id == target_device_id:
        pretrained_model_on_device = pretrained
    else:
        # [source_device_id, target1, ...]
        pretrained_model_on_device = my_replicate(pretrained,
                                                  source_device_id,
                                                  target_device_id)

    with cuda.device(target_device_id):
        # 20191023 lgy
        self.pretrained_model_dict = dict(
            pretrained_model_on_device.named_parameters())
        #self.pretrained_model_dict = dict([(n,v) for n,v in pretrained_model_on_device.state_dict().items()])

        # count the elements of every grouped parameter
        for param_name, module_param in self.pretrained_model_dict.items():
            if param_name not in self.map_dict:  # ignore no-grouped layers
                continue
            owner = self.map_dict[param_name]
            self.group_num_dict[owner] += module_param.nelement()

        for group_name in self.group_name_list:
            self.group_parameter_dict[group_name] = torch.cuda.FloatTensor(
                1, self.group_num_dict[group_name])

        self.generate_mask(self.pretrained_model_dict)  # init self.mask_dict
        self.masked_model = my_replicate(pretrained, source_device_id,
                                         target_device_id)
        #self.generator = self.masked_model.generator
        self.encoder = self.masked_model.encoder
        self.decoder = self.masked_model.decoder
def _work(process_id, model, dataset, args):
    """Generate and save CAMs for one worker's data bin.

    The model returns an (output, label) pair per image scale. Classes are
    taken from ``pack['label']`` when ``'train'`` is in ``args.split`` and
    from the model's labels otherwise. One ``.npy`` file is written per
    image; it holds empty arrays when no class is selected.

    :param process_id: worker index; also used as the CUDA device index.
    :param model: network to evaluate (moved to the GPU here).
    :param dataset: sequence of per-worker data bins.
    :param args: namespace providing dataset, split, use_cls, class_names
        and cam_out_dir.
    """
    databin = dataset[process_id]
    n_gpus = torch.cuda.device_count()
    data_loader = DataLoader(databin, shuffle=False, pin_memory=False)

    with torch.no_grad(), cuda.device(process_id):
        model.cuda()
        with tqdm(total=len(data_loader)) as pbar:
            for pack in data_loader:
                img_name = pack['name'][0]
                size = pack['size']
                strided_size = imutils.get_strided_size(size, 4)
                strided_up_size = imutils.get_strided_up_size(size, 16)

                # ADP datasets feed the model the original image as well.
                is_adp = args.dataset in ['adp_morph', 'adp_func']
                if is_adp:
                    predictions = [
                        model(img.cuda(non_blocking=True),
                              orig_img.cuda(non_blocking=True))
                        for img, orig_img in zip(pack['img'],
                                                 pack['orig_img'])
                    ]
                else:
                    predictions = [
                        model(img.cuda(non_blocking=True))
                        for img in pack['img']
                    ]
                outputs, labels = zip(*predictions)

                if 'train' in args.split:
                    label = pack['label'][0]
                else:
                    label = labels[0][args.use_cls]
                valid_cat = torch.nonzero(label)[:, 0]

                if is_adp:
                    # Prepend all background indices and shift the selected
                    # classes past them.
                    n_bg = len(args.class_names['bg'])
                    bg_idx = torch.from_numpy(
                        np.array(range(n_bg), dtype=np.int64))
                    if torch.cuda.is_available():
                        valid_cat = torch.cat(
                            (bg_idx.cuda(), valid_cat.cuda() + n_bg))
                    else:
                        valid_cat = torch.cat((bg_idx, valid_cat + n_bg))

                if len(valid_cat) > 0:
                    resized = [
                        F.interpolate(torch.unsqueeze(o, 0),
                                      strided_size,
                                      mode='bilinear',
                                      align_corners=False)[0] for o in outputs
                    ]
                    strided_cam = torch.sum(torch.stack(resized), 0)

                    upsampled = [
                        F.interpolate(torch.unsqueeze(o, 1),
                                      strided_up_size,
                                      mode='bilinear',
                                      align_corners=False) for o in outputs
                    ]
                    highres_cam = torch.sum(
                        torch.stack(tuple(upsampled), 0),
                        0)[:, 0, :size[0], :size[1]]

                    strided_cam = strided_cam[valid_cat]
                    strided_cam /= F.adaptive_max_pool2d(
                        strided_cam, (1, 1)) + 1e-5
                    highres_cam = highres_cam[valid_cat]
                    highres_cam /= F.adaptive_max_pool2d(
                        highres_cam, (1, 1)) + 1e-5

                    # save cams; the deepglobe datasets skip "high_res"
                    payload = {
                        "keys": valid_cat.cpu().numpy(),
                        "cam": strided_cam.cpu().numpy()
                    }
                    if args.dataset not in ['deepglobe',
                                            'deepglobe_balanced']:
                        payload["high_res"] = highres_cam.cpu().numpy()
                else:
                    payload = {
                        "keys": np.empty(0),
                        "cam": np.empty(0),
                        "high_res": np.empty(0)
                    }

                np.save(os.path.join(args.cam_out_dir, img_name + '.npy'),
                        payload)
                pbar.update(1)
def main():
    """Evaluate BLEU of the loaded model, then run the prune/retrain loop.

    The script loads the validation data, vocabulary and checkpoint, builds
    the reference model and its ``MaskedModel`` on ``GPU_ID`` and reports
    BLEU. Each loop iteration searches pruning thresholds with ``NCS_MP``,
    evaluates BLEU, retrains the best masked model and saves a checkpoint.

    NOTE(review): the ``exit()`` right after the first BLEU evaluation makes
    everything below it unreachable; it is kept as found - confirm whether
    it is a debugging leftover.
    """
    total_times = 100
    run_times = 0
    init_threshold = ...
    start_t = time.time()

    valid_data = torch.load(TRAIN_DATA + '.valid.pt')
    fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + '.vocab.pt'))
    # fields = onmt.IO.load_fields_from_vocab(torch.load(TRAIN_DATA + '.vocab.pt'))
    # we need to clear this assignment relation if we want to transfer valid
    # among threads
    valid_data.fields = fields
    checkpoint = torch.load(weights,
                            map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']

    masked_models = []
    with cuda.device(GPU_ID):
        ref_model = onmt.ModelConstructor.make_base_model(
            model_opt, fields, True, checkpoint)
        ref_model.eval()
        ref_model.generator.eval()
        # ref_model is at current_device, no copy will happen
        masked_model = MaskedModel(ref_model, group_dict,
                                   cuda.current_device(),
                                   cuda.current_device())
        masked_models.append(masked_model)

    train_opt, _, _ = opt_initialize(checkpoint, 'opennmt_translate_opt.pt',
                                     'opennmt_translate_dummy_opt.pt')
    if GPU_ID:
        cuda.set_device(GPU_ID)

    print("BLEU evaluation:")
    translate_opt, translate_dummy_opt = translate_opt_initialize(
        'opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
    translator = init_translate_model(translate_opt, translate_dummy_opt)
    del translator.model
    translator.model = masked_model.masked_model
    # Context manager so the reference file is closed once it has been read.
    with open(translate_opt.tgt, 'r') as tt:
        references = [[t] for t in tt]
    translate_data = onmt.IO.ONMTDataset(translate_opt.src,
                                         translate_opt.tgt,
                                         fields,
                                         use_filter_pred=False)
    prune_data = onmt.IO.OrderedIterator(dataset=translate_data,
                                         device=GPU_ID,
                                         batch_size=1,
                                         train=False,
                                         sort=False,
                                         shuffle=False)
    tmp_fit2 = evaluate_trans(translator, references, prune_data,
                              translate_data)
    print('Finsished => bleu (%.4f), ppl (%.4f)' %
          (tmp_fit2[1] * 100, tmp_fit2[0]))
    exit()

    # print(time_now(), "get accuray of no pruning model")
    # masked_model.make_evaluable()
    # tmp_crate = len(masked_model.group_name_list)*[0]
    # masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    # masked_model.apply_mask()
    # tmp_fit = evaluate(masked_model, valid_data, fields)
    # # only the original accuracy is needed
    # acc_of_no_prune = tmp_fit[1]
    # acc_of_no_prune = int(acc_of_no_prune*10)/10
    # NOTE(review): acc_of_no_prune and acc_percent_prune are not assigned in
    # this function (the assignment above is commented out); presumably they
    # are module-level names - verify before removing the exit() above.
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)

    while run_times < total_times:
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("-----------------------------------------")
        print(time_now(), "start Iteration ", run_times)

        print("test model---------------")
        ref_model.eval()
        ref_model.generator.eval()
        tmp_fit = evaluate(ref_model, valid_data, fields)
        print('ref_model', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        print("test model---------------")
        masked_models[0].make_evaluable()
        tmp_fit = evaluate(masked_models[0], valid_data, fields)
        print('masked_models[0]',
              'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))
        model_sparsity = masked_models[0].get_sparsity()
        print('masked_models[0] Sparsity: {}'.format(model_sparsity))

        itr_time = time.time()
        for gpu_candidate in other_GPU_IDs:
            with cuda.device(gpu_candidate):
                # if the gpu_candidate is the same as ref_model, it will
                # return the ref_model
                masked_models.append(
                    MaskedModel(ref_model, group_dict, GPU_ID, gpu_candidate))
        #------------- Here -------------------------
        # del ref_model

        # do pruning
        ncs_start = time.time()
        print('Itration %d, model loading: %d sec' %
              (run_times, ncs_start - itr_time))
        if run_times == 0:
            if START_THRESHOLD is not None:
                init_threshold = START_THRESHOLD
            else:
                init_threshold = len(masked_models[0].group_name_list) * [0.25]
        # if run_times == 0:
        #     init_threshold = len(masked_models[0].group_name_list)*[0.25]
        print("init threshold:", init_threshold)
        prune_acc_now = acc_percent_prune + tmp_fit[1] - acc_of_no_prune
        print('pruning acc now:', prune_acc_now)
        best_found, saved_model, best_masked_model = NCS_MP(
            init_threshold, 0.05, fields, masked_models, valid_data,
            prune_acc_now, run_times, checkpoint)
        init_threshold = best_found
        #best_found, saved_model, best_masked_model = NCS_MP(init_threshold, 0.05, fields, masked_models, valid_data, 0.01, run_times, checkpoint)
        end_t = time.time()
        print('NCS Time: {} min'.format((end_t - itr_time) / 60.))

        print('Best found thresholds:')
        for i in range(len(masked_models[0].group_name_list)):
            print("layer {}: {}%".format(masked_models[0].group_name_list[i],
                                         100 * best_found[i]))

        print("BLEU evaluation:")
        translate_opt, translate_dummy_opt = translate_opt_initialize(
            'opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
        translator = init_translate_model(translate_opt, translate_dummy_opt)
        del translator.model
        translator.model = best_masked_model
        with open(translate_opt.tgt, 'r') as tt:
            references = [[t] for t in tt]
        translate_data = onmt.IO.ONMTDataset(translate_opt.src,
                                             translate_opt.tgt,
                                             fields,
                                             use_filter_pred=False)
        prune_data = onmt.IO.OrderedIterator(dataset=translate_data,
                                             device=GPU_ID,
                                             batch_size=1,
                                             train=False,
                                             sort=False,
                                             shuffle=False)
        tmp_fit = evaluate_trans(translator, references, prune_data,
                                 translate_data)
        print('Finsished => bleu (%.4f), ppl (%.4f)' %
              (tmp_fit[1] * 100, tmp_fit[0]))

        # clear no used models
        # NOTE(review): this only unbinds the loop variable; the list keeps
        # its references until masked_models is rebound further down.
        for gpu_model in masked_models:
            del gpu_model

        #--------------- start retraining --------------
        model_for_train = best_masked_model
        pretrained_leaf_dict = model_for_train.make_trainable()
        optim = build_optim(model_for_train.masked_model, checkpoint,
                            train_opt, pretrained_leaf_dict)
        print(time_now(), "start loading data for retraining")
        train = torch.load(train_opt.data + '.train.pt')
        valid = torch.load(train_opt.data + '.valid.pt')
        train_fields = load_fields(train, valid, checkpoint, train_opt)
        print(time_now(), "finish data loading")
        model_for_train.change_mask(init_threshold, apply_MP_on_mask)
        model_for_train.apply_mask()
        model_for_train.make_trainable()
        recovered = train_model(model_for_train, train, valid, train_fields,
                                optim, train_opt, run_times, acc_of_no_prune)
        print(time_now(), "finish retraining ")
        if not recovered:
            exit()
        else:
            print("------------Accuracy recorverd!--------------------")
            print("recovered accuracy:", acc_of_no_prune)

        # The retrained network becomes the reference for the next iteration.
        model_for_train.make_evaluable()
        ref_model = model_for_train.masked_model
        masked_models = [
            MaskedModel(ref_model, group_dict, cuda.current_device(),
                        cuda.current_device())
        ]
        print("test model---------------")
        tmp_fit = evaluate(ref_model, valid_data, fields)
        print('ref_model', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))
        print("test model---------------")
        tmp_fit = evaluate(masked_models[0], valid_data, fields)
        print('masked_models[0]',
              'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))
        model_sparsity = masked_models[0].get_sparsity()
        print('masked_models[0] Sparsity: {}'.format(model_sparsity))

        print('------------- save checkpoint ---------------')
        _, saved_model = update_checkpoint(checkpoint,
                                           model_for_train,
                                           run_times,
                                           acc_percent_prune,
                                           t=True)
        print(time_now(), ' saving model:', saved_model)

        print("-------------print evaluation info ---------------")
        tmp_fit = evaluate(model_for_train, valid_data, fields)
        # NOTE(review): if best_found is a plain list, `best_found*100`
        # repeats it instead of scaling it - confirm its type.
        print('percentage %s => acc (%.4f), ppl (%.4f)' %
              (best_found * 100, tmp_fit[1], tmp_fit[0]))

        #--------------------------------------------------
        print("BLEU evaluation:")
        translate_opt, translate_dummy_opt = translate_opt_initialize(
            'opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
        translator = init_translate_model(translate_opt, translate_dummy_opt)
        del translator.model
        translator.model = model_for_train.masked_model
        with open(translate_opt.tgt, 'r') as tt:
            references = [[t] for t in tt]
        translate_data = onmt.IO.ONMTDataset(translate_opt.src,
                                             translate_opt.tgt,
                                             fields,
                                             use_filter_pred=False)
        prune_data = onmt.IO.OrderedIterator(dataset=translate_data,
                                             device=GPU_ID,
                                             batch_size=1,
                                             train=False,
                                             sort=False,
                                             shuffle=False)
        tmp_fit2 = evaluate_trans(translator, references, prune_data,
                                  translate_data)
        print('Finsished => bleu (%.4f), ppl (%.4f)' %
              (tmp_fit2[1] * 100, tmp_fit2[0]))
        #--------------------------------------------------
        run_times += 1
def _work(process_id, model, dataset, args):
    """Build CAMs accumulated over iterative image perturbation and save them.

    For every image of ``dataset[process_id]`` that has no saved result yet,
    each scale in the order ``[1, 0, 2, 3]`` and each labelled class is
    processed: the image is repeatedly run through ``GradCAM``, the class
    regions are accumulated into ``cam_all_classes`` and the image is updated
    with ``adv_climb`` using the gradient of the loss. The per-scale CAMs are
    then resized, summed, normalised and written to
    ``<args.cam_out_dir>/<name>.npy``.

    :param process_id: worker index; also used as the CUDA device index.
    :param model: network wrapped by GradCAM (moved to the GPU here).
    :param dataset: sequence of per-worker data bins.
    :param args: namespace providing num_workers, target_layer, cam_out_dir,
        adv_iter, score_th, AD_coeff and AD_stepsize.
    """
    databin = dataset[process_id]
    n_gpus = torch.cuda.device_count()
    data_loader = DataLoader(databin, shuffle=False, num_workers=args.num_workers // n_gpus, pin_memory=True)
    print("dcpu", args.num_workers // n_gpus)
    cam_sizes = [[], [], [], []]  # scale 0,1,2,3 (not read anywhere in this function)
    with cuda.device(process_id):
        model.cuda()
        gcam = GradCAM(model=model, candidate_layers=[args.target_layer])
        for iter, pack in enumerate(data_loader):
            img_name = pack['name'][0]
            # Skip images whose result file already exists.
            if os.path.exists(os.path.join(args.cam_out_dir, img_name + '.npy')):
                continue
            size = pack['size']
            strided_size = imutils.get_strided_size(size, 4)
            strided_up_size = imutils.get_strided_up_size(size, 16)
            outputs_cam = []
            n_classes = len(list(torch.nonzero(pack['label'][0])[:, 0]))
            # NOTE(review): when the image has no positive label the class
            # loop below never runs, so cam_all_classes and valid_cat are not
            # assigned in this iteration before they are used further down.
            for s_count, size_idx in enumerate([1, 0, 2, 3]):
                orig_img = pack['img'][size_idx].clone()
                for c_idx, c in enumerate(list(torch.nonzero(pack['label'][0])[:, 0])):
                    # Every class starts again from the unperturbed image.
                    pack['img'][size_idx] = orig_img
                    img_single = pack['img'][size_idx].detach()[0]  # [:, 1]: flip
                    if size_idx != 1:
                        total_adv_iter = args.adv_iter
                    else:  # size_idx == 1
                        # mul_for_scale is assigned only in this branch; as
                        # scale 1 is processed first, the other scales reuse
                        # the value chosen here.
                        if args.adv_iter > 10:
                            total_adv_iter = args.adv_iter // 2
                            mul_for_scale = 2
                        elif args.adv_iter < 6:
                            total_adv_iter = args.adv_iter
                            mul_for_scale = 1
                        else:
                            total_adv_iter = 5
                            mul_for_scale = float(total_adv_iter) / 5
                    for it in range(total_adv_iter):
                        img_single.requires_grad = True
                        outputs = gcam.forward(img_single.cuda(non_blocking=True))
                        # Allocate the per-class accumulator once per scale,
                        # sized from the model output.
                        if c_idx == 0 and it == 0:
                            cam_all_classes = torch.zeros([n_classes, outputs.shape[2], outputs.shape[3]])
                        gcam.backward(ids=c)
                        regions = gcam.generate(target_layer=args.target_layer)
                        # Combine entry 0 with the last-axis-flipped entry 1.
                        regions = regions[0] + regions[1].flip(-1)
                        # Remember the regions of the unperturbed image.
                        if it == 0:
                            init_cam = regions.detach()
                        cam_all_classes[c_idx] += regions[0].data.cpu() * mul_for_scale
                        logit = outputs
                        logit = F.relu(logit)
                        logit = torchutils.gap2d(logit, keepdims=True)[:, :, 0, 0]
                        valid_cat = torch.nonzero(pack['label'][0])[:, 0]
                        # Equals sum(all logits) - 2 * logit of class c.
                        logit_loss = -2 * (logit[:, c]).sum() + torch.sum(logit)
                        expanded_mask = torch.zeros(regions.shape)
                        expanded_mask = add_discriminative(expanded_mask, regions, score_th=args.score_th)
                        # Masked L1 distance between current and initial regions.
                        L_AD = torch.sum((torch.abs(regions - init_cam)) * expanded_mask.cuda())
                        loss = -logit_loss - L_AD * args.AD_coeff
                        model.zero_grad()
                        img_single.grad.zero_()
                        loss.backward()
                        data_grad = img_single.grad.data
                        perturbed_data = adv_climb(img_single, args.AD_stepsize, data_grad)
                        img_single = perturbed_data.detach()
                outputs_cam.append(cam_all_classes)
            # Resize each scale's CAM to the strided size and sum over scales.
            strided_cam = torch.sum(
                torch.stack([
                    F.interpolate(torch.unsqueeze(o, 0), strided_size, mode='bilinear', align_corners=False)[0]
                    for o in outputs_cam
                ]), 0)
            highres_cam = [
                F.interpolate(torch.unsqueeze(o, 1), strided_up_size, mode='bilinear', align_corners=False)
                for o in outputs_cam
            ]
            # Sum over scales and crop to size[0] x size[1].
            highres_cam = torch.sum(torch.stack(highres_cam, 0), 0)[:, 0, :size[0], :size[1]]
            # Normalise each channel by its spatial maximum.
            strided_cam /= F.adaptive_max_pool2d(strided_cam, (1, 1)) + 1e-5
            highres_cam /= F.adaptive_max_pool2d(highres_cam, (1, 1)) + 1e-5
            np.save(
                os.path.join(args.cam_out_dir, img_name + '.npy'), {
                    "keys": valid_cat,
                    "cam": strided_cam.cpu(),
                    "high_res": highres_cam.cpu().numpy()
                })
def _align_nn(self,
              model_path,
              source_kb,
              target_kb,
              candidate_selector,
              cuda_device,
              batch_size=128):
    """
    Score candidate entity pairs with the neural network model

    :param model_path: model archive path handed to ``load_archive``
    :param source_kb: source knowledge base
    :param target_kb: target knowledge base
    :param candidate_selector: object providing ``select_candidates(ent_id)``
    :param cuda_device: GPU device number
    :param batch_size: number of entity pairs per batched prediction call
    :return: dict mapping (source entity id, target entity id) to the score
    """
    from emma.allennlp_classes.ontoemma_dataset_reader import OntologyMatchingDatasetReader
    from emma.allennlp_classes.ontoemma_model import OntoEmmaNN
    from emma.allennlp_classes.ontoemma_predictor import OntoEmmaPredictor

    def _remember(model_input, output):
        # key: (source id, target id) -> first element of the output score
        pair_key = (model_input['source_ent']['research_entity_id'],
                    model_input['target_ent']['research_entity_id'])
        sim_scores[pair_key] = output['score'][0]

    def _score_batch(batch):
        results = predictor.predict_batch_json(batch, cuda_device)
        for model_input, output in zip(batch, results):
            _remember(model_input, output)

    alignment, s_ent_ids, t_ent_ids = self._align_string_equiv(
        source_kb, target_kb, candidate_selector)
    sys.stdout.write("%i alignments with string equivalence\n" %
                     len(alignment))

    on_gpu = cuda_device > 0
    if on_gpu:
        with device(cuda_device):
            archive = load_archive(model_path, cuda_device=cuda_device)
    else:
        archive = load_archive(model_path, cuda_device=cuda_device)
    predictor = Predictor.from_archive(archive, 'ontoemma-predictor')

    sys.stdout.write("Making predictions...\n")
    s_ent_tqdm = tqdm.tqdm(s_ent_ids, total=len(s_ent_ids))
    sim_scores = dict()

    if on_gpu:
        # Batched prediction inside the device context.
        with device(cuda_device):
            pending = []
            for s_ent_id in s_ent_tqdm:
                s_ent = source_kb.get_entity_by_research_entity_id(s_ent_id)
                candidates = candidate_selector.select_candidates(s_ent_id)
                for t_ent_id in candidates[:constants.KEEP_TOP_K_CANDIDATES]:
                    t_ent = target_kb.get_entity_by_research_entity_id(
                        t_ent_id)
                    pending.append({
                        'source_ent': s_ent.form_json(),
                        'target_ent': t_ent.form_json(),
                        'label': 0
                    })
                    if len(pending) == batch_size:
                        _score_batch(pending)
                        pending = []
            # finish last batch
            if pending:
                _score_batch(pending)
        return sim_scores

    # One prediction call per pair.
    for s_ent_id in s_ent_tqdm:
        s_ent = source_kb.get_entity_by_research_entity_id(s_ent_id)
        candidates = candidate_selector.select_candidates(s_ent_id)
        for t_ent_id in candidates[:constants.KEEP_TOP_K_CANDIDATES]:
            t_ent = target_kb.get_entity_by_research_entity_id(t_ent_id)
            json_data = {
                'source_ent': s_ent.form_json(),
                'target_ent': t_ent.form_json(),
                'label': 0
            }
            _remember(json_data,
                      predictor.predict_json(json_data, cuda_device))
    return sim_scores
def main():
    """Iteratively prune with one global threshold and retrain the model.

    Each iteration sweeps a pruning rate from 0 to 0.99 in steps of 0.01 over
    all groups at once and stops at the first rate whose accuracy drop
    exceeds ``acc_percent_prune``. The previous rate is then applied, the
    model is saved and reloaded, and it is retrained with ``train_model``.
    The script exits when retraining does not recover the accuracy.
    """
    total_times = 100
    run_times = 0
    init_threshold = ...
    start_t = time.time()

    valid_data = torch.load(TRAIN_DATA + '.valid.pt')
    fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + '.vocab.pt'))
    # fields = onmt.IO.load_fields_from_vocab(torch.load(TRAIN_DATA + '.vocab.pt'))
    # we need to clear this assignment relation if we want to transfer valid
    # among threads
    valid_data.fields = fields
    checkpoint = torch.load(weights,
                            map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']

    masked_models = []
    with cuda.device(GPU_ID):
        ref_model = onmt.ModelConstructor.make_base_model(
            model_opt, fields, True, checkpoint)
        ref_model.eval()
        ref_model.generator.eval()
        # ref_model is at current_device, no copy will happen
        masked_model = MaskedModel(ref_model, group_dict,
                                   cuda.current_device(),
                                   cuda.current_device())
        masked_models.append(masked_model)
    if GPU_ID:
        cuda.set_device(GPU_ID)

    # 1 means 1% acc
    acc_percent_prune = 1
    # only the original accuracy is needed
    acc_of_no_prune = 0
    get_acc_of_no_prune = False

    print(time_now(), "start while")
    while run_times < total_times:
        print("-----------------------------------------")
        print("start Iteration ", run_times)
        # init threshold
        best_threshold = 0
        itr_time = time.time()

        # display all the names of parameters:
        #   aa=ref_model.named_parameters
        #   aa_namelist = [ak[0] for ak in aa]

        # test MP
        translate_opt, translate_dummy_opt = translate_opt_initialize(
            'opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
        translator = init_translate_model(translate_opt, translate_dummy_opt)
        del translator.model
        translator.model = masked_model
        # Context manager so the file is closed on every loop iteration
        # instead of leaking one handle per pass.
        with open(translate_opt.tgt, 'r') as tt:
            references = [[t] for t in tt]

        xxx = np.arange(0., 1, 0.01)
        #for i in range(len(masked_model.group_name_list)):
        #    tmp_crate = len(masked_model.group_name_list)*[0.]
        print(time_now(), "start testing pruning")
        masked_model.make_evaluable()
        for i in range(len(xxx)):
            # best_threshold = 0.55
            # break
            translate_data = onmt.IO.ONMTDataset(translate_opt.src,
                                                 translate_opt.tgt,
                                                 fields,
                                                 use_filter_pred=False)
            prune_data = onmt.IO.OrderedIterator(dataset=translate_data,
                                                 device=GPU_ID,
                                                 batch_size=1,
                                                 train=False,
                                                 sort=False,
                                                 shuffle=False)
            # Same pruning rate for every group.
            tmp_crate = len(masked_model.group_name_list) * [xxx[i]]
            #tmp_crate[i] = 0.01
            masked_model.change_mask(tmp_crate, apply_MP_on_mask)
            masked_model.apply_mask()
            tmp_fit = evaluate(masked_model, valid_data, fields)
            #tmp_fit = evaluate_trans(translator, references, prune_data, translate_data)
            #logger.scalar_summary('test_bleu', tmp_fit[1]*100, int(xxx[i]*100))
            #logger.scalar_summary('acc', tmp_fit[1], int(xxx[i]*100))
            #logger.scalar_summary('ppl', tmp_fit[0], int(xxx[i]*100))
            #logger.scalar_summary('test_ppl', tmp_fit[0], int(xxx[i]*100))
            #print('group %s => acc (%.4f), ppl (%.4f)' % (masked_model.group_name_list[i], tmp_fit[1], tmp_fit[0]))
            #print('percentage %s => bleu (%.4f), ppl (%.4f)' % (xxx[i]*100, tmp_fit[1]*100, tmp_fit[0]))
            # print('percentage %s => acc (%.4f), ppl (%.4f)' % (xxx[i]*100, tmp_fit[1], tmp_fit[0]))
            if i == 0 and not get_acc_of_no_prune:
                # The rate-0 pass of the very first iteration fixes the
                # baseline accuracy (truncated to two decimals).
                acc_of_no_prune = tmp_fit[1]
                acc_of_no_prune = int(acc_of_no_prune * 100) / 100
                get_acc_of_no_prune = True
            elif acc_of_no_prune - tmp_fit[1] > acc_percent_prune:
                # Accuracy dropped too far: step back to the previous rate.
                best_threshold = xxx[i] - 0.01
                break

        # -------------------------------------------------
        # Start writing
        # prune again
        print(time_now(), " start accuracy:", acc_of_no_prune)
        print("-------test------------:", get_acc_of_no_prune)
        print(time_now(), " apply pruning with threshold:", best_threshold)
        tmp_crate = len(masked_model.group_name_list) * [best_threshold]
        masked_model.change_mask(tmp_crate, apply_MP_on_mask)
        masked_model.apply_mask()

        # print information
        tmp_fit = evaluate(masked_model, valid_data, fields)
        print('percentage %s => acc (%.4f), ppl (%.4f)' %
              (best_threshold * 100, tmp_fit[1], tmp_fit[0]))
        model_sparsity = masked_model.get_sparsity()
        print('Sparsity: {}'.format(model_sparsity))

        #--------------- start retraining --------------
        # first store model
        print(time_now(), "start saving model")
        _, saved_model = update_checkpoint(checkpoint, masked_model,
                                           run_times)
        print(time_now(), "finish saving model")
        print(time_now(), "start loading model")
        checkpoint = torch.load(SAVE_MODEL_TMP_FOLDER + saved_model,
                                map_location=lambda storage, loc: storage)
        train_opt, _, _ = opt_initialize(checkpoint,
                                         'opennmt_translate_opt.pt',
                                         'opennmt_translate_dummy_opt.pt')

        # train data loading
        print(time_now(), "start loading data for retraining")
        train = torch.load(train_opt.data + '.train.pt')
        valid = torch.load(train_opt.data + '.valid.pt')
        print(time_now(), "finish data loading")
        train_fields = load_fields(train, valid, checkpoint, train_opt)
        model_for_train = init_train_model(checkpoint, train_opt,
                                           train_fields)
        masked_model = MaskedModel(model_for_train, group_dict,
                                   cuda.current_device(),
                                   cuda.current_device())
        masked_model.make_trainable()

        print(time_now(), "building optm")
        optim = build_optim(model_for_train, checkpoint, train_opt)
        print(time_now(), "start restraining")
        recovered = train_model(model_for_train, train, valid, train_fields,
                                optim, train_opt, run_times, acc_of_no_prune)
        print(time_now(), "finish retraining ")
        if not recovered:
            exit()
        else:
            print("------------Accuracy recorverd!--------------------")
            print("recovered accuracy:", acc_of_no_prune)

        run_times += 1
        masked_model.make_evaluable()
        tmp_fit = evaluate(masked_model, valid_data, fields)
        print("------------------for test-------------------")
        print('percentage %s => acc (%.4f), ppl (%.4f)' %
              (best_threshold * 100, tmp_fit[1], tmp_fit[0]))
def _work_gpu(process_id, model, dataset, args):
    """Produce instance segmentation results for one worker's data bin.

    For each image the model output (``edge``, ``dp``) is combined with the
    previously saved CAMs: centroids found on ``dp`` are clustered into an
    instance map, CAM scores are split per instance, propagated with
    ``indexing.propagate_to_edge``, upsampled and turned into detections,
    which are saved to ``<args.ins_seg_out_dir>/<name>.npy``.

    :param process_id: worker index; also used as the CUDA device index.
    :param model: network to evaluate (moved to the GPU here).
    :param dataset: sequence of per-worker data bins.
    :param args: namespace providing num_workers, cam_out_dir, beta,
        exp_times, ins_seg_bg_thres and ins_seg_out_dir.
    """
    n_gpus = torch.cuda.device_count()
    databin = dataset[process_id]
    data_loader = DataLoader(databin,
                             shuffle=False,
                             num_workers=args.num_workers // n_gpus,
                             pin_memory=False)
    # Progress is printed about every quarter of the bin. Clamp to 1 so bins
    # with fewer than 4 items do not trigger a modulo/division by zero.
    report_every = max(len(databin) // 4, 1)

    with torch.no_grad(), cuda.device(process_id):
        model.cuda()
        for step, pack in tqdm(enumerate(data_loader), total=len(databin)):
            img_name = pack['name'][0]
            size = np.asarray(pack['size'])

            edge, dp = model(pack['img'][0].cuda(non_blocking=True))
            dp = dp.cpu().numpy()

            # NOTE: allow_pickle=True unpickles the file content; only load
            # CAM files produced by this pipeline.
            cam_dict = np.load(args.cam_out_dir + '/' + img_name + '.npy',
                               allow_pickle=True).item()
            cams = cam_dict['cam'].cuda()
            keys = cam_dict['keys']

            centroids = find_centroids_with_refinement(dp)
            instance_map = cluster_centroids(centroids, dp)
            instance_cam = separte_score_by_mask(cams, instance_map)

            rw = indexing.propagate_to_edge(instance_cam,
                                            edge,
                                            beta=args.beta,
                                            exp_times=args.exp_times,
                                            radius=5)
            # Upsample by 4 and crop to size[0] x size[1].
            rw_up = F.interpolate(rw,
                                  scale_factor=4,
                                  mode='bilinear',
                                  align_corners=False)[:, 0, :size[0], :size[1]]
            rw_up = rw_up / torch.max(rw_up)
            # Prepend one constant plane holding the background threshold.
            rw_up_bg = F.pad(rw_up, (0, 0, 0, 0, 1, 0),
                             value=args.ins_seg_bg_thres)

            num_classes = len(keys)
            num_instances = instance_map.shape[0]

            instance_shape = torch.argmax(rw_up_bg, 0).cpu().numpy()
            # Drop index 0, i.e. the padded background plane.
            instance_shape = pyutils.to_one_hot(
                instance_shape,
                maximum_val=num_instances * num_classes + 1)[1:]
            instance_class_id = np.repeat(keys, num_instances)

            detected = detect_instance(rw_up.cpu().numpy(),
                                       instance_shape,
                                       instance_class_id,
                                       max_fragment_size=size[0] * size[1] *
                                       0.01)
            np.save(os.path.join(args.ins_seg_out_dir, img_name + '.npy'),
                    detected)

            # Only the last worker reports progress.
            if process_id == n_gpus - 1 and step % report_every == 0:
                print("%d " % ((5 * step + 1) // report_every), end='')
def clear_cache(self):
    """Call ``cuda.empty_cache()`` with ``self.tgpu_id`` as the active device."""
    target_gpu = self.tgpu_id
    with cuda.device(target_gpu):
        cuda.empty_cache()
def distk_fdmmv(proc_idx, queue, device_id):
    """Accumulate ``K.T @ (K @ v + w)`` on one GPU, in row blocks of ``X1``.

    ``K`` is never materialised in full. For each block of at most ``n`` rows
    of ``X1`` the block of ``K`` against all rows of ``X2`` is built by
    summing, over chunks of at most ``d`` feature columns, the terms
    ``-2 * X1 @ X2.T + |x1|^2 + |x2|^2`` and then applying
    ``kernel._transform``. The block is multiplied with ``v`` (adding the
    matching rows of ``w`` when given) and the product with the transposed
    block is added into the output. Either ``v`` or ``w`` may be ``None``,
    but not both.

    Args:
        proc_idx: Not referenced in this function's body.
        queue: Object whose ``get()`` returns the ``ArgsFdmmv`` bundle; the
            fields read are ``X1``, ``X2``, ``v``, ``w``, ``out``, ``kernel``
            and ``max_mem``.
        device_id: Index used to build the ``cuda:<device_id>`` device.

    Returns:
        The ``out`` tensor taken from the argument bundle.
    """
    # All real arguments arrive through the queue, not as call parameters.
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    # Number of output columns, taken from whichever of v / w is present.
    T = v.shape[1] if v is not None else w.shape[1]
    dtype = X1.dtype
    # Only X1 is inspected; X2 and v are treated the same way as X1 below.
    cuda_inputs = X1.is_cuda

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    # Budget expressed in number of elements rather than bytes.
    avail_mem = max_mem / sizeof_dtype(dtype)
    rest_coef = 2 * M * T if v is not None else M * T
    # Block sizes: n rows of X1 and d feature columns per step.
    n, d = select_dim_over_nd(max_n=N,
                              max_d=D,
                              coef_nd=1,
                              coef_n=M + T + 1,
                              coef_d=M,
                              rest=rest_coef + M,
                              max_mem=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    # s1 is the main stream; s2 handles the X2 chunk concurrently.
    s1 = tcd.Stream(ddev)
    s2 = tcd.Stream(ddev)

    with tcd.device(ddev), tcd.stream(s1):
        # First collect necessary memory
        # K block, Kv block, sq1 and sq2 are always needed.
        mem_needed = n * M + n * T + n + M
        if not cuda_inputs:
            # Staging buffers for the X1 / X2 chunks (and v) copied from host.
            mem_needed += n * d + M * d
            if v is not None:
                mem_needed += M * T
        if not out.is_cuda:
            mem_needed += M * T
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed, ),
                                  dtype=dtype,
                                  device=ddev)

        # Extract the sub-tensors
        # flat_offset tracks how much of flat_gpu_tn has been handed out;
        # presumably extract_same_stride returns a view into it (confirm).
        flat_offset = 0
        if v is not None:
            if not cuda_inputs:
                v_gpu = extract_same_stride(flat_gpu_tn,
                                            size=(M, T),
                                            other=v,
                                            offset=flat_offset)
                flat_offset += np.prod(v_gpu.shape)
                copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
            else:
                v_gpu = v
        K_gpu = extract_same_stride(flat_gpu_tn,
                                    size=(n, M),
                                    other=X1,
                                    offset=flat_offset)
        flat_offset += np.prod(K_gpu.shape)
        Kv_gpu = extract_same_stride(flat_gpu_tn,
                                     size=(n, T),
                                     other=X1,
                                     offset=flat_offset)
        flat_offset += np.prod(Kv_gpu.shape)
        if out.is_cuda:
            # Accumulate directly into the caller's tensor.
            out_gpu = out
        else:
            out_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(M, T),
                                          other=out,
                                          offset=flat_offset)
            flat_offset += np.prod(out_gpu.shape)
        # The result is built up with addmm_, so it must start from zero.
        out_gpu.fill_(0.0)
        if not cuda_inputs:
            X1ss_gpu = extract_same_stride(flat_gpu_tn,
                                           size=(n, d),
                                           other=X1,
                                           offset=flat_offset)
            flat_offset += np.prod(X1ss_gpu.shape)
            X2s_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(M, d),
                                          other=X2,
                                          offset=flat_offset)
            flat_offset += np.prod(X2s_gpu.shape)
        sq1_gpu = extract_same_stride(flat_gpu_tn,
                                      size=(n, ),
                                      other=X1,
                                      offset=flat_offset)
        flat_offset += np.prod(sq1_gpu.shape)
        sq2_gpu = extract_same_stride(flat_gpu_tn,
                                      size=(M, ),
                                      other=X1,
                                      offset=flat_offset)

        # Outer loop: blocks of rows of X1 (the last one may be shorter).
        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu[:nb]  # nb x M
            cur_K_gpu.fill_(0.0)

            # Inner loop: chunks of feature columns; each chunk adds its
            # share of -2 * X1 @ X2.T + |x1|^2 + |x2|^2 into cur_K_gpu.
            for j in range(0, D, d):
                db = min(D - j, d)
                # need that the add_(sq2_gpu.T) op is complete to avoid overwrite
                s1.synchronize()
                # Parallelize two matrix transfers
                with tcd.stream(s2):
                    if cuda_inputs:
                        cur_X2s_gpu = X2[:, j:j + db]
                    else:
                        cur_X2s_gpu = copy_to_device_noorder(M,
                                                             db,
                                                             X2,
                                                             0,
                                                             j,
                                                             X2s_gpu,
                                                             0,
                                                             0,
                                                             s=s2)
                    # Squared row norms of the X2 chunk, written into sq2_gpu.
                    torch.norm(cur_X2s_gpu,
                               p=2,
                               dim=1,
                               keepdim=True,
                               out=sq2_gpu).pow_(2)
                if cuda_inputs:
                    cur_X1ss_gpu = X1[i:i + nb, j:j + db]
                else:
                    cur_X1ss_gpu = copy_to_device_noorder(nb,
                                                          db,
                                                          X1,
                                                          i,
                                                          j,
                                                          X1ss_gpu,
                                                          0,
                                                          0,
                                                          s=s1)
                # Squared row norms of the X1 chunk, written into sq1_gpu.
                torch.norm(cur_X1ss_gpu,
                           p=2,
                           dim=1,
                           keepdim=True,
                           out=sq1_gpu).pow_(2)

                # need that cur_X2s_gpu and sq2_gpu are available.
                s2.synchronize()
                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu,
                                 mat2=cur_X2s_gpu.T,
                                 alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)
                # Presumably guards against small negatives from round-off.
                cur_K_gpu.clamp_min_(0)

            # Turn the accumulated values into kernel values.
            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                # Start from the matching rows of w, then add K @ v if present.
                cur_Kv_gpu = copy_to_device_noorder(nb,
                                                    T,
                                                    w,
                                                    i,
                                                    0,
                                                    Kv_gpu,
                                                    0,
                                                    0,
                                                    s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T

        if not out.is_cuda:
            # Move the accumulated result from the GPU staging buffer to out.
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
        # Wait for all work queued on s1 before returning.
        s1.synchronize()
    return out