Example #1
def _async_copy(inputs, device_ids):
    nr_devs = len(device_ids)
    assert type(inputs) in (tuple, list)
    assert len(inputs) == nr_devs

    outputs = []
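    # Copy each input to its corresponding device; async_copy_to may perform the
    # copy asynchronously (e.g. when the source tensor lives in pinned memory).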
    for i, dev in zip(inputs, device_ids):
        with cuda.device(dev):
            outputs.append(async_copy_to(i, dev))

    return tuple(outputs)
Example #2
def _async_copy_stream(inputs, device_ids):
    nr_devs = len(device_ids)
    assert type(inputs) in (tuple, list)
    assert len(inputs) == nr_devs

    outputs = []
    streams = [_get_stream(d) for d in device_ids]
    for i, dev, stream in zip(inputs, device_ids, streams):
        with cuda.device(dev):
            main_stream = cuda.current_stream()
            with cuda.stream(stream):
                outputs.append(async_copy_to(i, dev, main_stream=main_stream))
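            # Make the device's main stream wait on the copy stream, so that later work
            # ordered on the main stream sees the copied data.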
            main_stream.wait_stream(stream)

    return outputs
Example #3
def _generic_fmm(proc_idx, queue, device_id):
    # Unpack the function arguments
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    cuda_inputs = X1.is_cuda
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    num_streams = a.num_streams

    # flags and local variables
    change_dtype = gpu_dtype != X1.dtype
    X1_equal_X2 = _gpu_tns_same_memory(X1, X2)
    use_gpu_bufs = change_dtype or not cuda_inputs
    stride = "F" if is_f_contig(out, strict=True) else "C"
    j_iter = 0
    dts = sizeof_dtype(gpu_dtype)
    tc_device = torch.device('cuda:%d' % (int(device_id)))
    avail_mem = max_mem / dts

    # Choose block sizes n, m such that we won't run out of GPU memory
    ntot, d = X1.shape
    mtot = X2.shape[0]
    extra_mem = kernel.extra_mem()
    if cuda_inputs and not change_dtype:
        # No allocation will be performed by us. Only in-kernel stuff.
        n, m = select_dim_over_nm(max_n=ntot,
                                  max_m=mtot,
                                  d=d,
                                  coef_nd=extra_mem.get('nd', 0),
                                  coef_md=extra_mem.get('md', 0),
                                  coef_nm=extra_mem.get('nm', 0),
                                  coef_n=extra_mem.get('n', 0),
                                  coef_m=extra_mem.get('m', 0),
                                  rest=extra_mem.get('d', 0),
                                  max_mem=avail_mem)
    else:
        n, m = select_dim_over_nm(
            max_n=ntot,
            max_m=mtot,
            d=d,
            coef_nd=num_streams * (extra_mem.get('nd', 0) + 1),
            coef_md=num_streams * (extra_mem.get('md', 0) + 1),
            coef_nm=num_streams * (extra_mem.get('nm', 0) + 1),
            coef_n=extra_mem.get('n', 0),
            coef_m=extra_mem.get('m', 0),
            rest=extra_mem.get('d', 0),
            max_mem=avail_mem)

    # Create streams
    streams = [tcd.Stream(device=tc_device) for _ in range(num_streams)]

    # Create buffers
    if use_gpu_bufs:
        gX1 = create_same_stride((n, d), X1, gpu_dtype, tc_device)
        gX2_list = [
            create_same_stride((m, d), X2, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
        gout_list = [
            create_same_stride((n, m), out, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
    if not cuda_inputs:
        cpu_buf_list = [
            create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)
            for _ in range(num_streams)
        ]

    # Define helpers for the copy-back operations (from cpu_buf to output)
    copy_ops = [None] * num_streams
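    # copy_ops[i] holds a deferred host-side copy (pinned buffer -> out) for stream i.
    # It is only executed after that stream has been synchronized, so the host never
    # reads a buffer the GPU may still be writing.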

    def wrap_copy_op(stream_idx):
        if copy_ops[stream_idx] is not None:
            copy_ops[stream_idx]()
            copy_ops[stream_idx] = None

    def do_copy_op(output, buf, i_, ic_, j_, jc_):
        # This function will also do the type conversion
        output[i_:i_ + ic_, j_:j_ + jc_].copy_(buf[:ic_, :jc_])

    # Kernel computation begin
    with tcd.device(tc_device):
        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            with tcd.stream(streams[j_iter % len(streams)]):
                X1_chunk = X1.narrow(0, i, ic)
                if use_gpu_bufs:
                    cur_gX1 = gX1.narrow(0, 0, ic)
                    cur_gX1.copy_(X1_chunk, non_blocking=True)
                else:
                    cur_gX1 = X1_chunk

            for j in range(0, mtot, m):
                jc = min(m, mtot - j)
                # Choose the buffers for this inner iteration
                stream_id = j_iter % len(streams)
                stream = streams[stream_id]
                if use_gpu_bufs:
                    gX2 = gX2_list[stream_id]
                    gout = gout_list[stream_id]
                if not cuda_inputs:
                    cpu_buf = cpu_buf_list[stream_id]

                # Sync for buffers we must use now (e.g. 2 previous iters)
                with tcd.stream(stream):  # Inner-loop
                    stream.synchronize()
                    wrap_copy_op(stream_id)

                    if X1_equal_X2 and j < i:  # Shortcut for symmetric kernels
                        jc = min(m, mtot - j)
                        out[i:i + ic, j:j + jc].copy_(out[j:j + jc,
                                                          i:i + ic].T,
                                                      non_blocking=True)
                        j_iter += 1
                        continue

                    # Copy (CPU->GPU)
                    X2_chunk = X2.narrow(0, j, jc)
                    if use_gpu_bufs:
                        cur_gX2 = gX2.narrow(0, 0, jc)
                        cur_gX2.copy_(X2_chunk, non_blocking=True)
                    else:
                        cur_gX2 = X2_chunk

                    if use_gpu_bufs:
                        cur_gout = gout[:ic, :jc]
                    else:
                        cur_gout = out[i:i + ic, j:j + jc]
                    cur_gout.fill_(0.0)

                    # Compute
                    ddd = kernel._prepare(cur_gX1, cur_gX2)
                    kernel._apply(cur_gX1, cur_gX2.T, cur_gout)
                    cur_gout = kernel._finalize(cur_gout, ddd)

                    # Copy Back (GPU->CPU)
                    if not cuda_inputs:
                        # copy_ does not care about the contiguity of the copies as long as it is
                        # consistent; however, for C-contiguous inputs it creates an intermediate
                        # array, which is undesirable. cuda_memcpy2d_async works well with
                        # C-contiguous arrays, so we use it in that case.
                        if stride == "F":
                            copy_to_host(ic,
                                         jc,
                                         cur_gout,
                                         0,
                                         0,
                                         cpu_buf,
                                         0,
                                         0,
                                         s=stream)
                        else:
                            cuda_memcpy2d_async(dst=cpu_buf.data_ptr(),
                                                dpitch=cpu_buf.stride(0) * dts,
                                                src=cur_gout.data_ptr(),
                                                spitch=cur_gout.stride(0) *
                                                dts,
                                                width=jc * dts,
                                                height=ic,
                                                stream=stream._as_parameter_)
                        copy_ops[stream_id] = partial(do_copy_op, out, cpu_buf,
                                                      i, ic, j, jc)
                    elif change_dtype:
                        out.narrow(0, i,
                                   ic).narrow(1, j,
                                              jc).copy_(cur_gout,
                                                        non_blocking=True)
                j_iter += 1

            # Synchronize all streams and flush any pending copy-back before the next row block.
            for s_idx in range(num_streams):
                streams[s_idx].synchronize()
                wrap_copy_op(s_idx)

    return out
Example #4
def prune_and_eval(rank, size, orig_fit, acc_constraint, valid, es, ref_model,
                   num_runs, final_results):
    _valid = valid
    gpu_id = GPU_ID
    total_iterations = es.Tmax / es.popsize
    individual_iter_count = 0
    #ref_model = masked_models[rank]
    X = torch.Tensor(copy.deepcopy(es.pop))
    communicate_size = es.n + 4  # size of the tensors transferred across machines
    communicate_tensor = torch.FloatTensor(communicate_size * [0.])
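    # communicate_tensor layout: [ppl, acc, rank, sparsity, x_0 ... x_{n-1}]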
    fitness_list = []
    itr_best_remain = 0

    if rank == 0:  # rank 0 is the main process that collects fitnesses
        X.share_memory_()
        #fitness_list = [torch.FloatTensor([0.0,0.1,0.2,0.3]).share_memory_() for i in range(size)]
        fitness_list = [
            torch.FloatTensor(communicate_size * [0.]).share_memory_()
            for i in range(size)
        ]

    if rank >= 1 and rank < size:  # split tasks to different GPUs
        gpu_id = other_GPU_IDs[rank - 1]

    with cuda.device(gpu_id):
        local_fields = onmt.IO.load_fields(torch.load(TRAIN_DATA +
                                                      '.vocab.pt'))
        _valid.fields = local_fields  # fields cannot be packed, so reconstruct them in each thread

        while (individual_iter_count < total_iterations):
            if rank == 0:  # master node
                itr_X = torch.Tensor(es.ask())
                # broadcast the fathers
                X.copy_(itr_X)
                dist.broadcast(itr_X, 0)
            else:
                # receive the fathers from the source process
                dist.broadcast(X, 0)

            # apply MP on model
            x = X.numpy()[rank]
            ref_model.change_mask(x, apply_MP_on_mask)

            ref_model.apply_mask()

            # evaluate pruned network
            fitness = evaluate(ref_model, _valid, local_fields)
            communicate_tensor[0] = fitness[0]
            communicate_tensor[1] = fitness[1]
            communicate_tensor[2] = rank
            communicate_tensor[3] = ref_model.get_sparsity()
            for i in range(x.size):
                communicate_tensor[i + 4] = X[rank, i]  #x[i]

            # sync fitness
            if rank == 0:  # collect fitness across processes
                dist.gather(communicate_tensor, gather_list=fitness_list)
            else:
                dist.gather(communicate_tensor, dst=0)

            # judge new solutions
            if rank == 0:  # negatively correlated search in master node
                fit = []
                X_ = []
                for i in range(es.popsize):
                    the_fitness = 100
                    for j in range(len(
                            fitness_list)):  # results of fitness evaluation
                        if int(fitness_list[j]
                               [2]) == i:  # 0:ppl, 1:acc, 2:rank of individual
                            X_.append(fitness_list[j].numpy()[4:])
                            if orig_fit[1] - fitness_list[j][
                                    1] <= acc_constraint:
                                the_fitness = -fitness_list[j][3]
                            else:
                                the_fitness = (orig_fit[1] - fitness_list[j][1]
                                               ) / acc_constraint
                            continue
                    fit.append(the_fitness)

                es.tell(X_, fit)

                itr_best_remain = min(fit)

            final_results['result_NCS'].copy_(torch.Tensor(es.result()[0]))
            individual_iter_count += 1

            if rank == 0:  # record status
                logger.scalar_summary(
                    'ncs_%s_fitness' % num_runs,
                    es.result()[1],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'ncs_%s_best_itr_remain' % num_runs, itr_best_remain,
                    num_runs * total_iterations + individual_iter_count)
                logger.histo_summary(
                    'ncs_%s_pop' % num_runs,
                    es.result()[0],
                    num_runs * total_iterations + individual_iter_count)
                logger.histo_summary(
                    'pop of 1', X_[0],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'sp of 1', -fitness_list[0][3],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'rank of 1', fitness_list[0][2],
                    num_runs * total_iterations + individual_iter_count)
                logger.histo_summary(
                    'pop of 2', X_[1],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'sp of 2', -fitness_list[1][3],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'rank of 2', fitness_list[1][2],
                    num_runs * total_iterations + individual_iter_count)
                #logger.histo_summary('pop of 3', X_[2], num_runs*total_iterations + individual_iter_count)
                #logger.scalar_summary('sp of 3', -fitness_list[2][3], num_runs*total_iterations + individual_iter_count)
                #logger.scalar_summary('rank of 3', fitness_list[2][2], num_runs*total_iterations + individual_iter_count)

    ref_model.clear_cache()
Example #5
def main():

    data_path = "{}/data/penn".format(cfg.PROJECT_ROOT)
    model_path = "{}/model/original_model/language_model/{}".format(
        cfg.PROJECT_ROOT, 'lm_model_orignal.pt')
    total_times = 20
    run_times = 0
    original_acc = 0
    init_threshold = ...
    start_t = time.time()

    # get data
    corpus = data.Corpus(data_path)
    ntokens = len(corpus.dictionary)
    eval_batch_size = TEST_BATCH_SIZE
    train_data = batchify(corpus.train, TRAIN_BATCH_SIZE)
    val_data = batchify(corpus.valid, TEST_BATCH_SIZE)
    valid_data = val_data
    test_data = batchify(corpus.test, TEST_BATCH_SIZE)

    ref_model = None

    # Load the best saved model.

    with cuda.device(GPU_ID):
        ff = open(model_path, 'rb')
        ref_model = torch.load(ff)
        ref_model.eval()
        masked_model = MaskedModel(
            ref_model, group_dict, cuda.current_device(), cuda.current_device(
            ))  # ref_model is at current_device, no copy will happen
        #pdb.set_trace()
        ff.close()
    if GPU_ID:
        cuda.set_device(GPU_ID)

    print(time_now(), "get accuray of no pruning model")
    masked_model.make_evaluable()
    tmp_crate = len(masked_model.group_name_list) * [0]
    masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    masked_model.apply_mask()
    tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus,
                          TEST_BATCH_SIZE)
    # Only the original accuracy is needed
    acc_of_no_prune = tmp_fit[1]
    fit_of_no_prune = tmp_fit
    original_acc = acc_of_no_prune
    pruning_arr = []
    ppl_arr = []
    #acc_of_no_prune = int(acc_of_no_prune*10)/10
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)
    init_threshold = [0]
    while run_times < total_times:
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("start Iteration ", run_times)

        print("test model---------------")
        LR = LR_INIT
        previous_pr = None
        previous_fit = None
        best_pr = None
        best_fit = None
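        # Sweep uniform per-layer pruning rates from 1% to 99%; best_pr keeps the rate just
        # before validation accuracy first drops by more than acc_percent_prune.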
        for prune_rate in range(1, 100):
            tmp_crate = len(masked_model.group_name_list) * [0.01 * prune_rate]
            masked_model.change_mask(tmp_crate, apply_MP_on_mask)
            masked_model.apply_mask()
            tmp_fit = evaluate_lm(masked_model.masked_model, valid_data,
                                  corpus, TEST_BATCH_SIZE)
            print(
                "each layer {} \% | {} % in total => validation acc {}\%, validation ppl {}"
                .format(prune_rate,
                        masked_model.get_sparsity() * 100, tmp_fit[1] * 100.,
                        tmp_fit[0]))

            if (not best_pr) and (tmp_fit[1] +
                                  acc_percent_prune) < original_acc:
                best_pr = previous_pr
                best_fit = previous_fit

            previous_pr = tmp_crate
            previous_fit = tmp_fit
        print('==============================')
        print("The best pruning rates are: {}".format(best_pr))
        if (not best_pr) or (best_pr[0] == init_threshold[0]):
            print(
                "Not better than last iteration of pruning, stop the process.")
            exit()
        masked_model.change_mask(best_pr, apply_MP_on_mask)
        masked_model.apply_mask()
        test_fit = evaluate_lm(masked_model.masked_model, test_data, corpus,
                               TEST_BATCH_SIZE)
        print("{} \% => validation acc {}\%, validation ppl {}".format(
            best_pr[0], best_fit[1] * 100., best_fit[0]))
        print("{} \% => test acc {}\%, test ppl {}".format(
            best_pr[0], test_fit[1] * 100., test_fit[0]))
        print('==============================')

        init_threshold = best_pr
        saved_model_name = 'ncs_pruned_model_%s_iteration%s_%s_%s_acc_cons_%s.pt' % (
            name_mark, run_times, Model_type, layer_group_type,
            str(acc_percent_prune))
        torch.save(masked_model.masked_model,
                   cfg.LM_MODEL_TMP_FOLDER + saved_model_name)

        #--------------- start retraining --------------
        model_for_train = masked_model

        with open(cfg.LM_MODEL_TMP_FOLDER + saved_model_name, 'rb') as f:
            model_tmp_load = torch.load(f)
            model_for_train.masked_model = model_tmp_load

        model_for_train.change_mask(init_threshold, apply_MP_on_mask)
        model_for_train.apply_mask()
        model_for_train.make_trainable()
        recovered = False
        best_val_loss = None

        try:
            for epoch in range(1, RETRAIN_EPOCHS + 1):
                epoch_start_time = time.time()
                train(model_for_train, ntokens, train_data, TRAIN_BATCH_SIZE,
                      SEQ_LEN, corpus, GRAD_CLIP, TRAIN_LOG_INTERVAL, epoch)
                val_eval = evaluate_lm(model_for_train.masked_model, val_data,
                                       corpus, TEST_BATCH_SIZE)
                print('-' * 89)
                print(
                    '| end of epoch {:3d} | time: {:5.2f}s | valid acc {:5.2f} | '
                    'valid ppl {:8.2f}'.format(
                        epoch, (time.time() - epoch_start_time), val_eval[1],
                        val_eval[0]))
                val_loss = val_eval[2]
                print('-' * 89)
                # Save the model if the validation loss is the best we've seen so far.
                if not best_val_loss or val_loss < best_val_loss:
                    with open(
                            "{}/{}{}_iterative_retrain_model_runtime{}_epoch_{}.pt"
                            .format(cfg.LM_MODEL_PATH, name_mark,
                                    acc_percent_prune, run_times, epoch),
                            'wb') as f:
                        torch.save(model_for_train, f)
                    best_val_loss = val_loss
                else:
                    # Anneal the learning rate if no improvement has been seen in the validation dataset.
                    LR /= 4.0

                if val_eval[1] >= original_acc:
                    recovered = True
                    break
        except KeyboardInterrupt:
            print('-' * 89)
            print('Exiting from training early')

        print(time_now(), "finish retraining ")
        if not recovered:
            exit()
        else:
            print("------------Accuracy recorverd!--------------------")
            print("recovered accuracy (>= {})".format(acc_of_no_prune))
        model_for_train.make_evaluable()
        model_for_train.apply_mask()

        ref_model = model_for_train.masked_model

        print("validate acc of the model---------------")
        tmp_fit = evaluate_lm(ref_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('ref_model', 'acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        print("-------------print TEST  evaluation info ---------------")
        tmp_fit = evaluate_lm(ref_model, test_data, corpus, TEST_BATCH_SIZE)
        print('percentage %s => acc (%.4f), ppl (%.4f)' %
              (init_threshold[0] * 100, tmp_fit[1], tmp_fit[0]))
        masked_model = model_for_train
        run_times += 1
Example #6
def _work(process_id, model, dataset, args):

    databin = dataset[process_id]
    n_gpus = torch.cuda.device_count()
    data_loader = DataLoader(databin,
                             shuffle=False,
                             num_workers=args.num_workers // n_gpus,
                             pin_memory=False)

    with torch.no_grad(), cuda.device(process_id):

        model.cuda()

        for iter, pack in enumerate(data_loader):

            img_name = pack['name'][0]
            label = pack['label'][0]
            size = pack['size']

            strided_size = imutils.get_strided_size(size, 4)
            strided_up_size = imutils.get_strided_up_size(size, 16)

            outputs = [
                model(img[0].cuda(non_blocking=True)) for img in pack['img']
            ]

            strided_cam = torch.sum(
                torch.stack([
                    F.interpolate(torch.unsqueeze(o, 0),
                                  strided_size,
                                  mode='bilinear',
                                  align_corners=False)[0] for o in outputs
                ]), 0)

            highres_cam = [
                F.interpolate(torch.unsqueeze(o, 1),
                              strided_up_size,
                              mode='bilinear',
                              align_corners=False) for o in outputs
            ]
            highres_cam = torch.sum(torch.stack(highres_cam, 0),
                                    0)[:, 0, :size[0], :size[1]]

            valid_cat = torch.nonzero(label)[:, 0]
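            # valid_cat holds the indices of the classes present in this image; only their
            # CAMs are kept and each is normalized to [0, 1] below.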

            strided_cam = strided_cam[valid_cat]
            strided_cam /= F.adaptive_max_pool2d(strided_cam, (1, 1)) + 1e-5

            highres_cam = highres_cam[valid_cat]
            highres_cam /= F.adaptive_max_pool2d(highres_cam, (1, 1)) + 1e-5

            # save cams
            np.save(
                os.path.join(args.cam_out_dir, img_name + '.npy'), {
                    "keys": valid_cat,
                    "cam": strided_cam.cpu(),
                    "high_res": highres_cam.cpu().numpy()
                })

            if process_id == n_gpus - 1 and iter % (len(databin) // 20) == 0:
                print("%d " % ((5 * iter + 1) // (len(databin) // 20)), end='')
Example #7
def main():

    data_path = "{}/data/penn".format(DATA_PATH)
    model_path = "{}/deepModels/torch_models/language-model/{}".format(MODEL_PATH, 'model.pt')
    #model_path = "{}/deepModels/torch_models/language-model/{}".format(MODEL_PATH, 'lstm_3layer.pt')
    total_times = 1
    run_times = 0
    original_acc = 0
    init_threshold = ...
    start_t = time.time()

    # get data
    corpus = data.Corpus(data_path)
    ntokens = len(corpus.dictionary)
    eval_batch_size = TEST_BATCH_SIZE
    train_data = batchify(corpus.train, TRAIN_BATCH_SIZE)
    val_data = batchify(corpus.valid, TEST_BATCH_SIZE)
    valid_data = val_data
    test_data = batchify(corpus.test, TEST_BATCH_SIZE)

    ref_model = None

    # Load the best saved model.

    masked_models = []
    with cuda.device(GPU_ID):
        ff = open(model_path, 'rb')
        ref_model = torch.load(ff)
        ref_model.eval()
        masked_model = MaskedModel(ref_model, group_dict, cuda.current_device(), cuda.current_device()) # ref_model is at current_device, no copy will happen
        #pdb.set_trace()
        masked_models.append(masked_model)
        ff.close()
    if GPU_ID:
        cuda.set_device(GPU_ID)

    print(time_now(), "get accuray of no pruning model")
    masked_model.make_evaluable()
    tmp_crate = len(masked_model.group_name_list)*[0]
    masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    masked_model.apply_mask()
    tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus, TEST_BATCH_SIZE)
    # Only the original accuracy is needed
    acc_of_no_prune = tmp_fit[1]
    fit_of_no_prune = tmp_fit
    original_acc = acc_of_no_prune
    #acc_of_no_prune = int(acc_of_no_prune*10)/10
    print("=============TiPO start========================")
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)
    previous_pr = None
    best_pr = None
    ncs_std = 0.05
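    # ncs_std is the standard deviation used by the NCS search; it is adjusted further
    # down when the best sparsity stops improving.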
    while run_times < total_times:
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("start Iteration ", run_times)

        print("test model---------------")
        LR = LR_INIT
        #ref_model.generator.eval()
        print("test model---------------")
        masked_models[0].make_evaluable()
        tmp_fit = evaluate_lm(masked_models[0].masked_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('masked_models[0]','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        if run_times == 0:
            init_threshold = len(masked_models[0].group_name_list) * [0.6]

        itr_time = time.time()
        for gpu_candidate in other_GPU_IDs:
            with cuda.device(gpu_candidate):
                masked_models.append(MaskedModel(ref_model, group_dict, GPU_ID, gpu_candidate)) # if the gpu_candidate is the same as ref_model, it will return the ref_model
        
        #------------- Here -------------------------
        # del ref_model

        # do pruning
        ncs_start = time.time()
        print('Iteration %d, model loading: %d sec' % (run_times, ncs_start - itr_time))

        print("init threshold:", init_threshold)
        best_found, saved_model, best_masked_model = NCS_MP(init_threshold, ncs_std, masked_models, valid_data, corpus, acc_percent_prune, fit_of_no_prune, run_times)
        #best_found, saved_model, best_masked_model = init_threshold, '/raid/lab_tk/liguiying/deepModels/torch_models/language-model/prune_tmp/ncs_pruned_model_test_iteration0_LM_time_acc_cons_0.01.pt', masked_models[0]
        init_threshold = best_found
        #best_found, saved_model, best_masked_model = NCS_MP(init_threshold, 0.05, fields, masked_models, valid_data, 0.01, run_times, checkpoint)

        end_t = time.time()
        print('NCS Time: {} min'.format((end_t - itr_time)/60.))
        print('Best found thresholds:')
        for i in range(len(masked_models[0].group_name_list)):
            print("layer {}: {}%".format(masked_models[0].group_name_list[i], 100*best_found[i]))

        print("TEST PPL evaluation:")
        tmp_fit = evaluate_lm(best_masked_model.masked_model, test_data, corpus, TEST_BATCH_SIZE)
        print('Finished => acc (%.4f percent), ppl (%.4f)' % (tmp_fit[1]*100, tmp_fit[0]))

        # clear unused models
        for gpu_model in masked_models:
            del gpu_model
        
        if not best_pr:
            best_pr = best_masked_model.get_sparsity()
        else:
            tmp_pr = best_masked_model.get_sparsity()
            if best_pr > tmp_pr:
                print("No improvement! Stop the PROCESS.")
                exit()
            elif best_pr == tmp_pr:
                if tmp_fit[1] < fit_of_no_prune[1]:
                    ncs_std /= 10
                else:
                    ncs_std *= 10
            else:
                best_pr = tmp_pr
        #if run_times % 5 == 0:
        #   ncs_std /= 10


        #--------------- start retraining --------------
        model_for_train = best_masked_model
        #pretrained_leaf_dict = model_for_train.make_trainable()
        #print(model_for_train.map_dict.keys())
        #pdb.set_trace()
        #fix_no_leaf(model_for_train, pretrained_leaf_dict)
        #pdb.set_trace()
        
        with open(SAVE_MODEL_TMP_FOLDER + saved_model, 'rb') as f:
            model_tmp_load = torch.load(f)
            model_for_train.masked_model = model_tmp_load.masked_model

        model_for_train.change_mask(init_threshold, apply_MP_on_mask)
        model_for_train.apply_mask()
        model_for_train.make_trainable()
        recovered = False
        best_val_loss = None

        try:
            for epoch in range(1, RETRAIN_EPOCHS + 1):
                epoch_start_time = time.time()
                train(model_for_train, ntokens, train_data, TRAIN_BATCH_SIZE, SEQ_LEN, corpus, GRAD_CLIP, TRAIN_LOG_INTERVAL, epoch)
                val_eval = evaluate_lm(model_for_train.masked_model, val_data, corpus, TEST_BATCH_SIZE)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid acc {:5.2f} | '
                        'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                   val_eval[1], val_eval[0]))
                val_loss = val_eval[2]
                print('-' * 89)
                # Save the model if the validation loss is the best we've seen so far.
                if not best_val_loss or val_loss < best_val_loss:
                    with open("{}/{}{}_iterative_retrain_model_runtime{}_epoch_{}.pt".format(SAVE_MODEL_FOLDER, name_mark, acc_percent_prune,  run_times, epoch), 'wb') as f:
                        torch.save(model_for_train, f)
                    best_val_loss = val_loss
                else:
                    # Anneal the learning rate if no improvement has been seen in the validation dataset.
                    LR /= 4.0

                if val_eval[1] >= original_acc:
                    recovered = True
        except KeyboardInterrupt:
            print('-' * 89)
            print('Exiting from training early')

        print(time_now(), "finish retraining ")
        if not recovered:
            print("NOT RECORVER!")
            exit()
        else:
            print("------------Accuracy recorverd!--------------------")
            print("recovered accuracy (>= {})".format(acc_of_no_prune))
        model_for_train.make_evaluable()
        model_for_train.apply_mask()

        ref_model = model_for_train.masked_model
        masked_models = [MaskedModel(ref_model, group_dict, cuda.current_device(), cuda.current_device())]

        print("validate acc of the model---------------")
        tmp_fit = evaluate_lm(ref_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('ref_model','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        tmp_fit = evaluate_lm(masked_models[0].masked_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('masked_models[0]','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))
        
        print('------------- save checkpoint ---------------')
        saved_model = update_checkpoint(model_for_train, run_times, acc_percent_prune, t=True)
        print(time_now(), ' saving model:', saved_model)
        print("-------------print TEST  evaluation info ---------------")
        tmp_fit = evaluate_lm(model_for_train.masked_model, test_data, corpus, TEST_BATCH_SIZE)
        print('percentage %s => acc (%.4f), ppl (%.4f)' % (model_for_train.get_sparsity()*100, tmp_fit[1], tmp_fit[0]))
        run_times += 1
Example #8
def distk_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    T = v.size(1) if v is not None else w.size(1)
    dtype = X1.dtype

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we let avail_mem like it is
    #        for 32-bit data-types some copy fails. In such case we need
    #        to free up some more memory and then everything runs fine.
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D,
                             maxN=N,
                             coef_nd=1,
                             coef_n=M + T + 1,
                             coef_d=M,
                             rest=rest_coef + M,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream()
    s2 = tcd.Stream()

    with tcd.device(ddev), tcd.stream(s1):
        if v is not None:
            v_gpu = create_same_stride((M, T), v, dtype, ddev)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        K_gpu = create_same_stride((n, M), X1, dtype, ddev)
        X1ss_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        Kv_gpu = create_same_stride((n, T), X1, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        sq1_gpu = create_same_stride((n, ), X1, dtype, ddev)
        sq2_gpu = create_same_stride((M, ), X1, dtype, ddev)

        #if (d == D):
        #    with torch.cuda.stream(s2):
        #        cur_X2s_gpu = copy_to_device_noorder(M, d, X2, 0, 0, X2s_gpu, 0, 0, s=s2)
        #        torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)

        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu.narrow(0, 0, nb)  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                # Parallelize two matrix transfers (probably pointless)
                #if d < D:
                with torch.cuda.stream(s2):
                    cur_X2s_gpu = copy_to_device_noorder(M,
                                                         db,
                                                         X2,
                                                         0,
                                                         j,
                                                         X2s_gpu,
                                                         0,
                                                         0,
                                                         s=s2)
                    torch.norm(cur_X2s_gpu,
                               p=2,
                               dim=1,
                               keepdim=True,
                               out=sq2_gpu).pow_(2)
                cur_X1ss_gpu = copy_to_device_noorder(nb,
                                                      db,
                                                      X1,
                                                      i,
                                                      j,
                                                      X1ss_gpu,
                                                      0,
                                                      0,
                                                      s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True,
                           out=sq1_gpu).pow_(2)

                s2.synchronize()
                s1.synchronize()
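                # Accumulate squared Euclidean distances over this feature block:
                # K += ||X1||^2 + ||X2||^2 - 2 * X1 @ X2.T, clamped at zero for safety.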
                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu,
                                 mat2=cur_X2s_gpu.T,
                                 alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)
                cur_K_gpu.clamp_min_(0)

            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                # Copy split w to GPU into cur_Kv_gpu,
                cur_Kv_gpu = copy_to_device_noorder(nb,
                                                    T,
                                                    w,
                                                    i,
                                                    0,
                                                    Kv_gpu,
                                                    0,
                                                    0,
                                                    s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T
            s1.synchronize()
        s1.synchronize()

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
Example #9
def generic_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.size()
    M = X2.size(0)
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1d  : n x d
    # X2d  : M x d
    # Kv   : n x T
    # out2 : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we let avail_mem like it is
    #        for 32-bit data-types some copy fails. In such case we need
    #        to free up some more memory and then everything runs fine.
    if sizeof_dtype(dtype) == 4:
        avail_mem /= 2
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D,
                             maxN=N,
                             coef_nd=1,
                             coef_n=M + T + 1,
                             coef_d=M,
                             rest=rest_coef + M,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        ker_gpu = create_same_stride((n, M), out, dtype=dtype, device=ddev)
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        w_gpu = create_same_stride((n, T), ker_gpu, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T
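
        # Tile over the rows of X1 (n at a time) and over the feature dimension (d at a time);
        # the result is accumulated block by block as out += ker.T @ (ker @ v + w), dropping
        # whichever of v / w is None.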

        for i in range(0, N, n):
            ic = min(n, N - i)
            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)

            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, D, d):
                kc = min(d, D - k)
                c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0,
                                                 0)
                c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0,
                                                 0)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)
            if v is not None:
                c_g_w.addmm_(c_g_ker, v_gpu)
            out_gpu.addmm_(c_g_ker.T, c_g_w)

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
Example #10
def main():
    data_path = "{}/data/penn".format(DATA_PATH)
    model_path = "{}/deepModels/SiPO/original_model/language_model/{}".format(MODEL_PATH, 'lm_model_orignal.pt')
    total_times = 12
    run_times = 0
    original_acc = 0
    init_threshold = ...
    start_t = time.time()

    # get data
    corpus = data.Corpus(data_path)
    ntokens = len(corpus.dictionary)
    eval_batch_size = TEST_BATCH_SIZE
    train_data = batchify(corpus.train, TRAIN_BATCH_SIZE)
    val_data = batchify(corpus.valid, TEST_BATCH_SIZE)
    valid_data = val_data
    test_data = batchify(corpus.test, TEST_BATCH_SIZE)

    ref_model = None

    # Load the best saved model.

    with cuda.device(GPU_ID):
        ff = open(model_path, 'rb')
        ref_model = torch.load(ff)
        ref_model.eval()
        masked_model = MaskedModel(ref_model, group_dict, cuda.current_device(), cuda.current_device()) # ref_model is at current_device, no copy will happen
        #pdb.set_trace()
        ff.close()
    if GPU_ID:
        cuda.set_device(GPU_ID)

    print(time_now(), "get accuray of no pruning model")
    masked_model.make_evaluable()
    tmp_crate = len(masked_model.group_name_list)*[0]
    masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    masked_model.apply_mask()
    tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus, TEST_BATCH_SIZE)
    # Only the original accuracy is needed
    acc_of_no_prune = tmp_fit[1]
    fit_of_no_prune = tmp_fit
    original_acc = acc_of_no_prune
    pruning_arr = []
    ppl_arr = []
    #acc_of_no_prune = int(acc_of_no_prune*10)/10
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)
    init_threshold = [0]
    print("-----------------------------------------")
    print("-----------------------------------------")
    print("-----------------------------------------")
    SAVE_MODEL_FOLDER = '/fl/checkpoint/language-mode/prune/'
    print("test model---------------")
    LR = LR_INIT
    previous_pr = [0.01] * 4
    previous_fit = [0.01, 12.3]
    best_pr = [0.01] * 4
    best_fit = [0.01, 12.3]
    for prune_rate in range(1, 100):
        tmp_crate = len(masked_model.group_name_list)*[0.01 * prune_rate]
        masked_model.change_mask(tmp_crate, apply_MP_on_mask)
        masked_model.apply_mask()
        tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus, TEST_BATCH_SIZE)
        print("each layer {} \% | {} % in total => validation acc {}\%, validation ppl {}".format(prune_rate, masked_model.get_sparsity()*100, tmp_fit[1]*100., tmp_fit[0]))

        if (tmp_fit[1] + acc_percent_prune) > original_acc:
            best_pr = previous_pr
            best_fit = previous_fit
        
        previous_pr = tmp_crate
        previous_fit = tmp_fit
        

        
        test_fit = evaluate_lm(masked_model.masked_model, test_data, corpus, TEST_BATCH_SIZE)
        print("{} \% => validation acc {}\%, validation ppl {}".format(best_pr[0], best_fit[1]*100., best_fit[0]))
        print("{} \% => test acc {}\%, test ppl {}".format(best_pr[0], test_fit[1]*100., test_fit[0]))
        print('==============================')
        masked_model.apply_mask_init()
        init_threshold = best_pr
        saved_model_name = 'ncs_pruned_model_%s_iteration%s_%s_%s_acc_cons_%s.pt' % (name_mark, run_times, Model_type, layer_group_type, str(acc_percent_prune))
        torch.save(masked_model.masked_model, SAVE_MODEL_FOLDER+saved_model_name)

        #--------------- start retraining --------------
        model_for_train = masked_model
        
        with open(SAVE_MODEL_FOLDER + saved_model_name, 'rb') as f:
            model_tmp_load = torch.load(f)
            model_for_train.masked_model = model_tmp_load

        model_for_train.change_mask(init_threshold, apply_MP_on_mask)
        model_for_train.apply_mask()
        model_for_train.make_trainable()
        recovered = False
        best_val_loss = None
        train(model_for_train, ntokens, train_data, TRAIN_BATCH_SIZE, SEQ_LEN, corpus, GRAD_CLIP, TRAIN_LOG_INTERVAL, prune_rate)
                

        print(time_now(), "finish retraining ")
        print("------------Accuracy recorverd!--------------------")
        print("recovered accuracy (>= {})".format(acc_of_no_prune))
        model_for_train.make_evaluable()
        model_for_train.apply_mask()

        ref_model = model_for_train.masked_model

        print("validate acc of the model---------------")
        tmp_fit = evaluate_lm(ref_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('ref_model','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        print("-------------print TEST  evaluation info ---------------")
        tmp_fit = evaluate_lm(ref_model, test_data, corpus, TEST_BATCH_SIZE)
        print('percentage %s => acc (%.4f), ppl (%.4f)' % (init_threshold[0]*100, tmp_fit[1], tmp_fit[0]))
        masked_model = model_for_train


        print('==============================')
        print("The best pruning rates are: {}".format(best_pr))
        
        masked_model.change_mask(best_pr, apply_MP_on_mask)
        masked_model.apply_mask()
        test_fit = evaluate_lm(masked_model.masked_model, test_data, corpus, TEST_BATCH_SIZE)
        print("{} \% => validation acc {}\%, validation ppl {}".format(best_pr[0], best_fit[1]*100., best_fit[0]))
        print("{} \% => test acc {}\%, test ppl {}".format(best_pr[0], test_fit[1]*100., test_fit[0]))
        print('==============================')
Example #11
def main():

    data_path = "{}/data/penn".format(DATA_PATH)
    model_path = "{}/deepModels/torch_models/language-model/{}".format(
        MODEL_PATH, 'model.pt')
    total_times = 12
    run_times = 0
    original_acc = 0
    init_threshold = ...
    start_t = time.time()

    # get data
    corpus = data.Corpus(data_path)
    ntokens = len(corpus.dictionary)
    eval_batch_size = TEST_BATCH_SIZE
    train_data = batchify(corpus.train, TRAIN_BATCH_SIZE)
    val_data = batchify(corpus.valid, TEST_BATCH_SIZE)
    valid_data = val_data
    test_data = batchify(corpus.test, TEST_BATCH_SIZE)

    ref_model = None
    masked_model = None

    # Load the best saved model.

    with cuda.device(GPU_ID):
        ff = open(model_path, 'rb')
        ref_model = torch.load(ff)
        ref_model.eval()
        masked_model = MaskedModel(
            ref_model, group_dict, cuda.current_device(), cuda.current_device(
            ))  # ref_model is at current_device, no copy will happen
        ff.close()
    if GPU_ID:
        cuda.set_device(GPU_ID)

    if layer_group_type == 'simple':
        print("MP start for LM")
    elif layer_group_type == 'layer':
        print("LMP start for LM")
    print(time_now(), "get accuray of no pruning model")
    masked_model.make_evaluable()
    tmp_crate = len(masked_model.group_name_list) * [0]
    masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    masked_model.apply_mask()
    tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus,
                          TEST_BATCH_SIZE)
    # Only the original accuracy is needed
    acc_of_no_prune = tmp_fit[1]
    fit_of_no_prune = tmp_fit
    original_acc = acc_of_no_prune
    #acc_of_no_prune = int(acc_of_no_prune*10)/10
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)
    # the best pruning rate
    previous_pr = None
    previous_fit = None
    best_pr = None
    best_fit = None
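    # Sweep uniform pruning rates from 1% to 99%; best_pr keeps the rate just before
    # validation accuracy first drops by more than acc_percent_prune.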
    for prune_rate in range(1, 100):
        tmp_crate = len(masked_model.group_name_list) * [0.01 * prune_rate]
        masked_model.change_mask(tmp_crate, apply_MP_on_mask)
        masked_model.apply_mask()
        tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus,
                              TEST_BATCH_SIZE)
        print("{} \% => validation acc {}\%, validation ppl {}".format(
            prune_rate, tmp_fit[1] * 100., tmp_fit[0]))

        if (not best_pr) and (tmp_fit[1] + acc_percent_prune) < original_acc:
            best_pr = previous_pr
            best_fit = previous_fit

        previous_pr = tmp_crate
        previous_fit = tmp_fit
    print('==============================')
    print("The best pruning rates are: {}".format(best_pr))
    masked_model.change_mask(best_pr, apply_MP_on_mask)
    masked_model.apply_mask()
    test_fit = evaluate_lm(masked_model.masked_model, test_data, corpus,
                           TEST_BATCH_SIZE)
    print("{} \% => validation acc {}\%, validation ppl {}".format(
        best_pr[0], best_fit[1] * 100., best_fit[0]))
    print("{} \% => test acc {}\%, test ppl {}".format(best_pr[0],
                                                       test_fit[1] * 100.,
                                                       test_fit[0]))
    print('==============================')
Example #12
def main():

    data_path = "{}/data/penn".format(cfg.PROJECT_ROOT)
    model_path = "{}/model/original_model/language_model/{}".format(cfg.PROJECT_ROOT, 'lm_model_orignal.pt')
    total_times = 2
    run_times = 0
    original_acc = 0
    init_threshold = ...
    start_t = time.time()

    # get data
    corpus = data.Corpus(data_path)
    ntokens = len(corpus.dictionary)
    eval_batch_size = TEST_BATCH_SIZE
    train_data = batchify(corpus.train, TRAIN_BATCH_SIZE)
    val_data = batchify(corpus.valid, TEST_BATCH_SIZE)
    valid_data = val_data
    test_data = batchify(corpus.test, TEST_BATCH_SIZE)

    ref_model = None

    # Load the best saved model.

    masked_models = []
    with cuda.device(GPU_ID):
        ff = open(model_path, 'rb')
        ref_model = torch.load(ff)
        ref_model.eval()
        masked_model = MaskedModel(ref_model, group_dict, cuda.current_device(), cuda.current_device()) # ref_model is at current_device, no copy will happen
        #pdb.set_trace()
        masked_models.append(masked_model)
        ff.close()
    if GPU_ID:
        cuda.set_device(GPU_ID)

    print(time_now(), "get accuray of no pruning model")
    masked_model.make_evaluable()
    tmp_crate = len(masked_model.group_name_list)*[0]
    masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    masked_model.apply_mask()
    tmp_fit = evaluate_lm(masked_model.masked_model, valid_data, corpus, TEST_BATCH_SIZE)
    # Only the original accuracy is needed
    acc_of_no_prune = tmp_fit[1]
    fit_of_no_prune = tmp_fit
    original_acc = acc_of_no_prune
    pruning_arr = []
    ppl_arr = []
    #acc_of_no_prune = int(acc_of_no_prune*10)/10
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)
    while run_times < total_times:
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("start Iteration ", run_times)

        print("test model---------------")
        LR = LR_INIT
        ref_model.eval()
        #ref_model.generator.eval()
        tmp_fit = evaluate_lm(ref_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('ref_model','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        print("test model---------------")
        masked_models[0].make_evaluable()
        tmp_fit = evaluate_lm(masked_models[0].masked_model, valid_data, corpus, TEST_BATCH_SIZE)
        print('masked_models[0]','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))


        itr_time = time.time()
        for gpu_candidate in other_GPU_IDs:
            with cuda.device(gpu_candidate):
                masked_models.append(MaskedModel(ref_model, group_dict, GPU_ID, gpu_candidate)) # if the gpu_candidate is the same as ref_model, it will return the ref_model
        
        #------------- Here -------------------------
        # del ref_model

        # do pruning
        ncs_start = time.time()
        print('Iteration %d, model loading: %d sec' % (run_times, ncs_start - itr_time))
        if run_times == 0:
            init_threshold = len(masked_models[0].group_name_list)*[0.10]

        print("init threshold:", init_threshold)
        best_found, saved_model, best_masked_model = NCS_MP(init_threshold, 0.05, masked_models, valid_data, corpus, acc_percent_prune, fit_of_no_prune, run_times)
        #best_found, saved_model, best_masked_model = init_threshold, '/raid/lab_tk/liguiying/deepModels/torch_models/language-model/prune_tmp/ncs_pruned_model_test_iteration0_LM_time_acc_cons_0.01.pt', masked_models[0]
        #init_threshold = best_found
        #best_found, saved_model, best_masked_model = NCS_MP(init_threshold, 0.05, fields, masked_models, valid_data, 0.01, run_times, checkpoint)

        end_t = time.time()
        print('NCS Time: {} min'.format((end_t - itr_time)/60.))
        print('Best found thresholds:')
        for i in range(len(masked_models[0].group_name_list)):
            print("layer {}: {}%".format(masked_models[0].group_name_list[i], 100*best_found[i]))

        print("TEST PPL evaluation:")
        tmp_fit = evaluate_lm(best_masked_model.masked_model, test_data, corpus, TEST_BATCH_SIZE)
        print('Finished => acc (%.4f percent), ppl (%.4f)' % (tmp_fit[1]*100, tmp_fit[0]))
        pruning_arr.append(masked_models[0].get_sparsity())
        ppl_arr.append(tmp_fit[0])

        # clear unused models
        for gpu_model in masked_models:
            del gpu_model
        run_times += 1
    pruning_arr = np.array(pruning_arr)
    ppl_arr = np.array(ppl_arr)
    print("Prunig rate : mean({}) std({})".format(pruning_arr.mean(), pruning_arr.std()))
    print("PPL : mean({}) std({})".format(ppl_arr.mean(), ppl_arr.std()))
Example #13
proc_pipe = transf.Compose(
    [transf.ToPILImage(), img_resize,
     transf.ToTensor(), img_norm])
train_dir = 'train_images'
val_dir = 'test_images'
train_loader = DataLoader(DataFeed(train_dir,
                                   nat_sort=True,
                                   transform=proc_pipe),
                          batch_size=batch_size,
                          shuffle=False)
val_loader = DataLoader(DataFeed(val_dir, nat_sort=True, transform=proc_pipe),
                        batch_size=batch_size,
                        shuffle=False)

# Network training:
with cuda.device(0):
    top_1 = np.zeros((1, len(train_size)))
    top_2 = np.zeros((1, len(train_size)))
    top_3 = np.zeros((1, len(train_size)))
    acc_loss = 0
    itr = []
    for idx, n in enumerate(train_size):
        print('```````````````````````````````````````````````````````')
        print('Training size is {}'.format(n))
        # Build the network:
        net = resnet18_mod(pretrained=True, progress=True, num_classes=64)
        net = net.cuda()
        layers = list(net.children())

        #  Optimization parameters:
        criterion = nn.CrossEntropyLoss()
Example #14
def _work(process_id, model, dataset, args):

    n_gpus = torch.cuda.device_count()
    databin = dataset[process_id]
    # data_loader = DataLoader(databin,
    #                          shuffle=False, num_workers=args.num_workers // n_gpus, pin_memory=False)
    data_loader = DataLoader(databin, shuffle=False, pin_memory=False)

    with torch.no_grad(), cuda.device(process_id):

        model.cuda()

        for iter, pack in enumerate(tqdm(data_loader)):
            # img_name = voc12.dataloader.decode_int_filename(pack['name'][0])
            img_name = pack['name'][0]
            orig_img_size = np.asarray(pack['size'])

            if args.dataset == 'voc12':
                img_path = voc12.dataloader.get_img_path(
                    img_name, args.dev_root)
            elif args.dataset in ['adp_morph', 'adp_func']:
                img_path = adp.dataloader.get_img_path(
                    img_name, args.dev_root, args.split == 'evaluation')
            elif args.dataset in ['deepglobe', 'deepglobe_balanced']:
                img_path = deepglobe.dataloader.get_img_path(
                    img_name, args.dev_root)
            else:
                raise KeyError('Dataset %s not yet implemented' % args.dataset)

            edge, dp = model(pack['img'][0].cuda(non_blocking=True))
            # if img_name == '2007_001185':
            #     cv2.imwrite('edge.png', np.uint8(255 * cv2.resize(edge.cpu().numpy()[0], tuple(orig_img_size[::-1]))))
            #     D = dp.cpu().numpy()
            #     hsv = np.zeros((D.shape[1], D.shape[2], 3), dtype='uint8')
            #     hsv[..., 1] = 255
            #     mag, ang = cv2.cartToPolar(-D[0], -D[1])
            #     hsv[..., 0] = ang * 180 / np.pi / 2
            #     hsv[..., 2] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX)
            #
            #     rgb = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
            #     cv2.imwrite('dp.png', cv2.resize(rgb[:, :, ::-1], tuple(orig_img_size[::-1])))
            #     a=1
            cam_dict = np.load(args.cam_out_dir + '/' + img_name + '.npy',
                               allow_pickle=True).item()

            cam_downsized_values = torch.from_numpy(cam_dict['cam']).cuda()
            if args.dataset == 'voc12':
                if len(cam_dict['keys']) > 0:
                    keys = np.pad(cam_dict['keys'] + 1, (1, 0),
                                  mode='constant')
                    if edge.shape[1:] != cam_downsized_values.shape[1:]:
                        edge = F.interpolate(
                            edge.unsqueeze(0),
                            size=(cam_downsized_values.shape[1:]),
                            mode='bilinear',
                            align_corners=False)

                    rw = indexing.propagate_to_edge(cam_downsized_values,
                                                    edge,
                                                    beta=args.beta,
                                                    exp_times=args.exp_times,
                                                    radius=5)  # radius=5
                    # rw = indexing.propagate_to_edge(cam_downsized_values, edge, beta=args.beta, exp_times=3, radius=5)

                    rw_up = F.interpolate(
                        rw,
                        size=tuple(orig_img_size),
                        mode='bilinear',
                        align_corners=False)[
                            ..., 0, :orig_img_size[0], :orig_img_size[1]]
                    rw_up = rw_up / torch.max(rw_up)

                    rw_up_bg = F.pad(rw_up, (0, 0, 0, 0, 1, 0),
                                     value=args.sem_seg_bg_thres)
                    rw_pred = torch.argmax(rw_up_bg, dim=0).cpu().numpy()

                    rw_pred = keys[rw_pred]
                else:
                    rw_pred = np.zeros(orig_img_size, dtype='uint8')
            elif args.dataset in ['adp_morph', 'adp_func']:
                keys = cam_dict['keys']

                if edge.shape[1:] != cam_downsized_values.shape[1:]:
                    edge = F.interpolate(edge.unsqueeze(0),
                                         size=(cam_downsized_values.shape[1:]),
                                         mode='bilinear',
                                         align_corners=False)

                rw = indexing.propagate_to_edge(cam_downsized_values,
                                                edge,
                                                beta=args.beta,
                                                exp_times=args.exp_times,
                                                radius=5)
                rw_up = F.interpolate(
                    rw,
                    size=tuple(orig_img_size),
                    mode='bilinear',
                    align_corners=False)[
                        ..., 0, :orig_img_size[0], :orig_img_size[1]]
                rw_up = rw_up / torch.max(rw_up)
                rw_pred = torch.argmax(rw_up, dim=0).cpu().numpy()

                rw_pred = keys[rw_pred]
            elif args.dataset in ['deepglobe', 'deepglobe_balanced']:
                if len(cam_dict['keys']) > 0:
                    keys = cam_dict['keys']

                    down_fac = 6
                    cam_downsized_values = F.interpolate(
                        cam_downsized_values.unsqueeze(0),
                        size=[
                            x // down_fac
                            for x in cam_downsized_values.shape[1:]
                        ],
                        mode='bilinear',
                        align_corners=False)[0]
                    edge = F.interpolate(edge.unsqueeze(0),
                                         size=(cam_downsized_values.shape[1:]),
                                         mode='bilinear',
                                         align_corners=False)

                    rw = indexing.propagate_to_edge(cam_downsized_values,
                                                    edge,
                                                    beta=args.beta,
                                                    exp_times=args.exp_times,
                                                    radius=5)
                    rw_up = F.interpolate(
                        rw,
                        size=tuple(orig_img_size // 4),
                        mode='bilinear',
                        align_corners=False)[..., 0, :orig_img_size[0] //
                                             4, :orig_img_size[1] // 4]
                    rw_up = rw_up / torch.max(rw_up)
                    rw_pred = torch.argmax(rw_up, dim=0).cpu().numpy()

                    rw_pred = keys[rw_pred]
                else:
                    rw_pred = 5 * np.ones(tuple(orig_img_size // 4))
            else:
                raise KeyError('Dataset %s not yet implemented' % args.dataset)

            imageio.imsave(
                os.path.join(args.sem_seg_out_dir, img_name + '.png'),
                rw_pred.astype(np.uint8))
            # Save with colour
            rw_pred_clr = np.zeros(list(rw_pred.shape) + [3], dtype=np.uint8)
            off = 0
            for t in ['bg', 'fg']:
                for i, c in enumerate(args.class_colours[t]):
                    for ch in range(3):
                        rw_pred_clr[:, :,
                                    ch] += c[ch] * np.uint8(rw_pred == (i +
                                                                        off))
                off += len(args.class_colours[t])
            imageio.imsave(
                os.path.join(args.sem_seg_clr_out_dir, img_name + '.png'),
                rw_pred_clr)
            # Save with colour, overlaid on original image
            if args.dataset not in ['deepglobe', 'deepglobe_balanced']:
                orig_img = cv2.cvtColor(cv2.imread(img_path),
                                        cv2.COLOR_BGR2RGB)
            else:
                orig_img = cv2.resize(
                    cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB),
                    rw_pred_clr.shape[:2])
            if args.dataset in ['adp_morph', 'adp_func']:
                rw_pred_clr = cv2.resize(rw_pred_clr, orig_img.shape[:2])
            rw_pred_clr_over = np.uint8(
                (1 - args.overlay_r) * np.float32(orig_img) +
                args.overlay_r * np.float32(rw_pred_clr))
            imageio.imsave(
                os.path.join(args.sem_seg_clr_out_dir,
                             img_name + '_overlay.png'), rw_pred_clr_over)
예제 #15
0
def _work(process_id, model, dataset, args):

    n_gpus = torch.cuda.device_count()
    databin = dataset[process_id]
    data_loader = DataLoader(databin,
                             shuffle=False, num_workers=args.num_workers // n_gpus, pin_memory=False)

    with torch.no_grad(), cuda.device(process_id):

        model.cuda()

        for iter, pack in enumerate(data_loader):
            img_name = voc12.data.decode_int_filename(pack['name'][0])
            orig_img_size = np.asarray(pack['size'])

            strided_size = imutils.get_strided_size(orig_img_size, 4)

            out_setting = {
                "upsize": strided_size,
                "flip": True
            }

            img_o = pack['img'][0][0]

            edge, dp = model(img_o.cuda(non_blocking=True), out_setting)

            edge_avg = edge
            edge_avg = torch.sigmoid(edge_avg)

            cam_dict = np.load(args.cam_dir + '/' + img_name + '.npy', allow_pickle=True).item()

            cams = cam_dict['cam']
            keys = np.pad(cam_dict['keys'] + 1, (1, 0), mode='constant')

            cam_downsized_values = cams.cuda()

            edge_padded = F.pad(edge_avg, (5, 5, 0, 5), mode='constant', value=1.0)
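            # F.pad with pad=(5, 5, 0, 5) adds 5 columns on each side and 5
            # rows at the bottom, so the padded edge map spans
            # (strided_size[0] + 5) x (strided_size[1] + 10) -- the same shape
            # passed to PathIndex.default_size below.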

            path_index = adv_indexing.PathIndex(radius=5, default_size=(strided_size[0] + 5, strided_size[1] + 10))

            sparse_aff = adv_indexing.edge_to_affinity(torch.unsqueeze(edge_padded, 0),
                                                       path_index.default_path_indices)
            dense_aff = affinity_sparse2dense(sparse_aff, path_index.default_src_indices,
                                              path_index.default_dst_indices, (strided_size[0] + 5) * (strided_size[1] + 10))
            dense_aff = dense_aff.view(strided_size[0] + 5, strided_size[1] + 10, strided_size[0] + 5, -1)[:-5, 5:-5, :-5, 5:-5]
            dense_aff = dense_aff.reshape(strided_size[0]*strided_size[1], -1)
            trans_mat = to_transition_matrix(dense_aff, beta=args.beta, times=args.t)
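            # Down-weight CAM responses on predicted edges, then propagate them
            # along the random-walk transition matrix derived from the affinity
            # map above (the matmul below performs the propagation).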

            cam_expanded = cam_downsized_values*(1 - edge_avg)

            rw = torch.matmul(cam_expanded.view(cam_expanded.size(0), -1), trans_mat)
            rw = rw.view(rw.size(0), 1, strided_size[0], strided_size[1])
            rw_up = F.interpolate(rw, scale_factor=4, mode='bilinear', align_corners=False)[..., :orig_img_size[0], :orig_img_size[1]]

            rw_up = rw_up[:, 0]

            rw_up_norm = rw_up / torch.max(rw_up)

            rw_up_norm_bg = F.pad(rw_up_norm, (0, 0, 0, 0, 1, 0), value=args.sem_seg_bg_thres)
            rw_pred = torch.argmax(rw_up_norm_bg, dim=0).cpu().numpy()

            rw_pred = keys[rw_pred]

            imageio.imsave(os.path.join(args.segm_out_dir, img_name + '.png'), rw_pred.astype(np.uint8))

            if process_id == n_gpus - 1 and iter % (len(databin) // 20) == 0:
                print("%d " % ((5*iter+1)//(len(databin) // 20)), end='')
예제 #16
0
def main():
    # load valid data
    cuda.set_device(GPU_ID)
    print('loading data...')
    start_time = time.time()
    valid_data = torch.load(os.path.join(data_path, 'len50_pywmt14.valid.pt'))
    fields = onmt.IO.load_fields(
        torch.load(os.path.join(data_path, 'len50_pywmt14.vocab.pt')))
    valid_data.fields = fields
    print('data loaded. time {} seconds.'.format(time.time() - start_time))

    # load model
    print('load pretrained model...')
    start_time = time.time()
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']
    with cuda.device(GPU_ID):
        ref_model = onmt.ModelConstructor.make_base_model(
            model_opt, fields, True, checkpoint)
        ref_model.eval()
        ref_model.generator.eval()

    lds_model = LDSMaskedNMT(ref_model, MODEL_TYPE)
    lds_model.mask_pruned_model()
    print('model loaded. time {} seconds.'.format(time.time() - start_time))

    lds_model.cal_LDS_for_all_matrices()
    lds_model.cal_dis_temp()
    lds_model.cal_sorted_params()

    tmp_fit = evaluate(lds_model.pruned_model, valid_data, fields)
    print(tmp_fit)
    test_metrics(lds_model.pruned_model, fields)
    get_sparity(lds_model.pruned_model)

    tea_model = copy.deepcopy(lds_model.pruned_model)

    lds_model.prun_rnn(1.0, 1.0)
    lds_model.prun_other(0.6)

    lds_model.mask_pruned_model()
    tmp_fit = evaluate(lds_model.pruned_model, valid_data, fields)
    print(tmp_fit)
    test_metrics(lds_model.pruned_model, fields)

    stu_model = copy.deepcopy(lds_model.pruned_model)

    # load train data
    cuda.set_device(GPU_ID)
    print('loading data...')
    start_time = time.time()
    train_data = torch.load(os.path.join(data_path, 'len50_pywmt14.train.pt'))
    train_fields = onmt.IO.load_fields(
        torch.load(os.path.join(data_path, 'len50_pywmt14.vocab.pt')))
    train_data.fields = train_fields
    print('data loaded. time {} seconds.'.format(time.time() - start_time))

    # retrain
    train_ts(tea_model, stu_model, train_data, epoch=1)

    tmp_fit = evaluate(stu_model, valid_data, fields)
    print(tmp_fit)
    test_metrics(stu_model, fields)
    get_sparity(stu_model)

    lds_model.mask_pruned_model()
    tmp_fit = evaluate(stu_model, valid_data, fields)
    print(tmp_fit)
    test_metrics(stu_model, fields)
    get_sparity(stu_model)
예제 #17
0
def sparse_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, w, out = a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.shape
    M = X2.size(0)
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory needs:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2       : dtot + 2 * D * M * density (because is transposed)
    # sparse_out : ntot + 2 * ntot * M * density (assume here density = 1)
    # ker_gpu  : M * ntot
    # w_gpu    : ntot * T
    # v_gpu    : M * T
    # out_gpu  : M * T
    avail_mem = max_mem / sizeof_dtype(dtype)
    den = 2 * D * X1.density + 2 + 3 * M + T
    sub = D + 2 * D * M * X2.density + M * T
    if v is not None:
        sub += M * T
    n = (avail_mem - sub) / den
    n = min(int(n), N)
    if n < 1:
        raise MemoryError("Not enough memory to run sparse dfmmv")
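    # n is the largest row-block satisfying the linear budget
    #   n * den + sub <= avail_mem,
    # where den collects the per-row costs (X1 chunk, sparse output, kernel
    # block, w buffer) and sub the fixed costs (X2, out, and v when present),
    # matching the itemization above.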

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        w_gpu = create_same_stride((n, T), out, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        ker_gpu = create_fortran((n, M), dtype, ddev)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        X2_d = SparseTensor.from_scipy(
            X2.transpose_csc().to_scipy().tocsr(copy=False)) \
            .index_to_int() \
            .to(device=ddev)

        for i in range(0, N, n):
            ic = min(n, N - i)
            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)

            ker_chunk = ker_gpu[:ic]
            ker_chunk.fill_(0.0)

            # TODO: This is wasteful (X2 will be prepared many times over)
            ddd = kernel._prepare_sparse(X1_chunk, X2)
            ker_chunk = kernel._apply_sparse(X1_chunk_d, X2_d, ker_chunk)
            ker_chunk = kernel._finalize(ker_chunk, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)

            if v is not None:
                c_g_w.addmm_(ker_chunk, v_gpu)
            out_gpu.addmm_(ker_chunk.T, c_g_w)
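            # out_gpu accumulates ker_chunk.T @ c_g_w over the row blocks of
            # X1, i.e. the full product K.T @ (K v + w) (a missing v or w is
            # treated as zero).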
            del ddd, X1_chunk, X1_chunk_d

        if not out.is_cuda:
            copy_to_device_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
예제 #18
0
def main():

    total_times = 100
    run_times = 0
    init_threshold = ...
    start_t = time.time()

    valid_data = torch.load(TRAIN_DATA + '.valid.pt')
    fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + '.vocab.pt'))
    # fields = onmt.IO.load_fields_from_vocab(torch.load(TRAIN_DATA + '.vocab.pt'))
    valid_data.fields = fields  # we need to clear this assignment if we want to transfer valid_data among threads

    checkpoint = torch.load(weights, map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']
    # masked_models = []
    with cuda.device(GPU_ID):
        ref_model = onmt.ModelConstructor.make_base_model(
            model_opt, fields, True, checkpoint)
        ref_model.eval()
        ref_model.generator.eval()
        masked_model = MaskedModel(
            ref_model, group_dict, cuda.current_device(), cuda.current_device(
            ))  # ref_model is at current_device, no copy will happen
        # masked_models.append(masked_model)
    train_opt, _, _ = opt_initialize(checkpoint, 'opennmt_translate_opt.pt',
                                     'opennmt_translate_dummy_opt.pt')
    if GPU_ID:
        cuda.set_device(GPU_ID)

    # only the original (unpruned) accuracy is needed
    acc_of_no_prune = 0
    get_acc_of_no_prune = False
    print(time_now(), "start while")
    while run_times < total_times:
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("start Iteration ", run_times)
        # init threshold
        best_threshold = 0
        itr_time = time.time()

        xxx = np.arange(0., 1, 0.01)
        print(time_now(), "start testing pruning")
        masked_model.make_evaluable()
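        # Sweep pruning rates 0.00-0.99 in steps of 0.01; stop once accuracy
        # drops more than acc_percent_prune below the unpruned accuracy and
        # keep the previous rate as best_threshold.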
        for i in range(len(xxx)):
            # best_threshold = 0.2
            # break
            tmp_crate = len(masked_model.group_name_list) * [xxx[i]]
            masked_model.change_mask(tmp_crate, apply_MP_on_mask)
            masked_model.apply_mask()
            tmp_fit = evaluate(masked_model, valid_data, fields)

            print('percentage %s => acc (%.4f), ppl (%.4f)' %
                  (xxx[i] * 100, tmp_fit[1], tmp_fit[0]))
            if i == 0 and not get_acc_of_no_prune:
                acc_of_no_prune = tmp_fit[1]
                acc_of_no_prune = int(acc_of_no_prune * 10) / 10
                get_acc_of_no_prune = True
            elif acc_of_no_prune - tmp_fit[1] > acc_percent_prune:
                best_threshold = xxx[i] - 0.01
                break
        # -------------------------------------------------
        # Start writing
        # prune again
        print(time_now(), " init accuracy of model:", acc_of_no_prune)
        print("accuracy constraint:", acc_percent_prune)
        print("-------test------------:", get_acc_of_no_prune)
        print(time_now(), " apply pruning with threshold:", best_threshold)
        tmp_crate = len(masked_model.group_name_list) * [best_threshold]
        masked_model.change_mask(tmp_crate, apply_MP_on_mask)
        masked_model.apply_mask()

        # print information
        tmp_fit = evaluate(masked_model, valid_data, fields)
        print('percentage %s => acc (%.4f), ppl (%.4f)' %
              (best_threshold * 100, tmp_fit[1], tmp_fit[0]))
        model_sparsity = masked_model.get_sparsity()
        print('Sparsity: {}'.format(model_sparsity))

        #--------------- start retraining --------------
        # first store model
        print(time_now(), "start saving model")
        _, saved_model = update_checkpoint(checkpoint, masked_model, run_times,
                                           acc_percent_prune)
        print(time_now(), "finish saving model:", saved_model)

        model_for_train = masked_model
        pretrained_leaf_dict = model_for_train.make_trainable()
        optim = build_optim(model_for_train.masked_model, checkpoint,
                            train_opt, pretrained_leaf_dict)
        print("finish building optim")

        print(time_now(), "start loading data for retraining")
        train = torch.load(train_opt.data + '.train.pt')
        valid = torch.load(train_opt.data + '.valid.pt')
        train_fields = load_fields(train, valid, checkpoint, train_opt)
        print(time_now(), "finish data loading")

        recovered = train_model(model_for_train, train, valid, train_fields,
                                optim, train_opt, run_times, acc_of_no_prune)
        print(time_now(), "finish retraining ")
        if not recovered:
            exit()
        else:
            print("------------Accuracy recorverd!--------------------")
            print("recovered accuracy:", acc_of_no_prune)

        #-------------------------------------------------
        print('------------- save checkpoint ---------------')
        _, saved_model = update_checkpoint(checkpoint,
                                           model_for_train,
                                           run_times,
                                           t=True)
        print(time_now(), ' saving model:', saved_model)
        print("-------------print evaluation info ---------------")
        model_for_train.make_evaluable()
        tmp_fit = evaluate(model_for_train, valid_data, fields)
        print('percentage %s => acc (%.4f), ppl (%.4f)' %
              (best_threshold * 100, tmp_fit[1], tmp_fit[0]))
        model_sparsity = model_for_train.get_sparsity()
        print('Sparsity: {}'.format(model_sparsity))
        #--------------------------------------------------
        print("BLEU evaluation:")
        translate_opt, translate_dummy_opt = translate_opt_initialize(
            'opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
        translator = init_translate_model(translate_opt, translate_dummy_opt)
        del translator.model
        translator.model = model_for_train.masked_model
        tt = open(translate_opt.tgt, 'r')
        references = [[t] for t in tt]
        translate_data = onmt.IO.ONMTDataset(translate_opt.src,
                                             translate_opt.tgt,
                                             fields,
                                             use_filter_pred=False)
        prune_data = onmt.IO.OrderedIterator(dataset=translate_data,
                                             device=GPU_ID,
                                             batch_size=1,
                                             train=False,
                                             sort=False,
                                             shuffle=False)
        tmp_fit2 = evaluate_trans(translator, references, prune_data,
                                  translate_data)
        print('Finished => bleu (%.4f), ppl (%.4f)' %
              (tmp_fit2[1] * 100, tmp_fit2[0]))
        #--------------------------------------------------

        run_times += 1
예제 #19
0
def distk_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem

    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    dtype = X1.dtype

    # GPU memory usage:
    # X1s : n x D
    # X2s : m x D
    # vs  : m x T
    # nm  : n x m
    # out : n x T
    # -----------
    # total: n*m + n * (D + T) + m * (D + T) = R
    avail_mem = max_mem / sizeof_dtype(dtype)
    #if sizeof_dtype(dtype) == 4:
    #    avail_mem /= 2
    n, m = select_dim_over_m(maxM=M,
                             maxN=N,
                             coef_nm=1.0,
                             coef_n=D + T,
                             coef_m=D + T,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        nm_gpu = create_same_stride((n, m), X1, dtype, ddev)
        out_gpu = create_same_stride((n, T), out, dtype, ddev)
        X1s_gpu = create_same_stride((n, D), X1, dtype, ddev)
        X2s_gpu = create_same_stride((m, D), X2, dtype, ddev)
        vs_gpu = create_same_stride((m, T), v, dtype, ddev)

        for i in range(0, N, n):
            nb = min(n, N - i)
            cur_X1s_gpu = copy_to_device_noorder(nb, D, X1, i, 0, X1s_gpu, 0,
                                                 0)
            sq1 = torch.norm(cur_X1s_gpu, p=2, dim=1, keepdim=True).pow_(2)
            cur_out_gpu = out_gpu.narrow(0, 0, nb)  # n x T
            cur_out_gpu.fill_(0.0)

            for j in range(0, M, m):
                mb = min(m, M - j)
                cur_X2s_gpu = copy_to_device_noorder(mb, D, X2, j, 0, X2s_gpu,
                                                     0, 0)
                cur_vs_gpu = copy_to_device_noorder(mb, T, v, j, 0, vs_gpu, 0,
                                                    0)  # m x T
                cur_nm_gpu = nm_gpu[:nb, :mb]  # n x m
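                # The next few lines compute pairwise squared distances via
                # ||x - y||^2 = ||x||^2 - 2<x, y> + ||y||^2, clamped at zero to
                # absorb negative values from floating-point round-off.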

                sq2 = torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True).pow_(2)
                torch.mm(cur_X1s_gpu, cur_X2s_gpu.T, out=cur_nm_gpu)

                cur_nm_gpu.mul_(-2.0)
                cur_nm_gpu.add_(sq1)
                cur_nm_gpu.add_(sq2.T)
                cur_nm_gpu.clamp_min_(0)
                kernel._transform(cur_nm_gpu)

                # Multiply by the vector v
                # FIXME: This is the cause of mapping errors in case of float32 calculations.
                cur_out_gpu.addmm_(cur_nm_gpu, cur_vs_gpu)  # n x T
            # send result to CPU
            copy_to_host_noorder(nb, T, out_gpu, 0, 0, out, i, 0)

    return out
예제 #20
0
def main():

    total_times = 10
    run_times = 0
    init_threshold = ...
    start_t = time.time()

    valid_data = torch.load(TRAIN_DATA + '.valid.pt')
    fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + '.vocab.pt'))
    valid_data.fields = fields  # we need to clear this assignment if we want to transfer valid_data among threads

    if GPU_ID:
        cuda.set_device(GPU_ID)

    while run_times < total_times:

        itr_time = time.time()
        # init pruning
        checkpoint = torch.load(weights,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
        masked_models = []
        with cuda.device(GPU_ID):
            ref_model = onmt.ModelConstructor.make_base_model(
                model_opt, fields, True, checkpoint)
            ref_model.eval()
            ref_model.generator.eval()
            masked_model = MaskedModel(
                ref_model, group_dict, cuda.current_device(),
                cuda.current_device(
                ))  # ref_model is at current_device, no copy will happen
            masked_models.append(masked_model)
        '''
       display all the names of parameters
      '''
        '''
       aa=ref_model.named_parameters
       aa_namelist = [ak[0] for ak in aa]
      '''
        '''
       test MP
      '''
        '''
      translate_opt, translate_dummy_opt = translate_opt_initialize('opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
      translator = init_translate_model(translate_opt, translate_dummy_opt)
      del translator.model
      translator.model = masked_model
      tt=open(translate_opt.tgt, 'r')
      references = [[t] for t in tt]

      xxx=np.arange(0.,1, 0.01)
      #for i in range(len(masked_model.group_name_list)):
      # tmp_crate = len(masked_model.group_name_list)*[0.]
      for i in range(len(xxx)):
       translate_data = onmt.IO.ONMTDataset(
        translate_opt.src, translate_opt.tgt, fields,
        use_filter_pred=False)
       prune_data = onmt.IO.OrderedIterator(
        dataset=translate_data, device=GPU_ID,
        batch_size=1, train=False, sort=False,
        shuffle=False)
       tmp_crate = len(masked_model.group_name_list)*[xxx[i]]
       #tmp_crate[i] = 0.01
       masked_model.change_mask(tmp_crate, apply_MP_on_mask)
       masked_model.apply_mask()
       tmp_fit = evaluate(masked_model, valid_data, fields)
       #tmp_fit = evaluate_trans(translator, references, prune_data, translate_data)
       #logger.scalar_summary('test_bleu', tmp_fit[1]*100, int(xxx[i]*100))
       logger.scalar_summary('acc', tmp_fit[1], int(xxx[i]*100))
       logger.scalar_summary('ppl', tmp_fit[0], int(xxx[i]*100))
       #logger.scalar_summary('test_ppl', tmp_fit[0], int(xxx[i]*100))
       #print('group %s => acc (%.4f), ppl (%.4f)' % (masked_model.group_name_list[i], tmp_fit[1], tmp_fit[0]))
       #print('percentage %s => bleu (%.4f), ppl (%.4f)' % (xxx[i]*100, tmp_fit[1]*100, tmp_fit[0]))
       print('percentage %s => acc (%.4f), ppl (%.4f)' % (xxx[i]*100, tmp_fit[1], tmp_fit[0]))
      exit()
      '''

        with cuda.device(GPU_ID):
            masked_models.append(
                MaskedModel(ref_model, group_dict, GPU_ID, gpu_candidate)
            )  # if the gpu_candidate is the same as ref_model, it will return the ref_model

        del ref_model

        # do pruning
        ncs_start = time.time()
        print('Iteration %d, model loading: %d sec' %
              (run_times, ncs_start - itr_time))

        init_threshold = len(masked_models[0].group_name_list) * [0.10]
        best_found, saved_model, best_masked_model = NCS_MP(
            init_threshold, 0.05, fields, masked_models, valid_data, 1.0,
            run_times, checkpoint)
        init_thresholds = best_found[0]  # best pop found

        end_t = time.time()
        print('NCS Time: {} min'.format((end_t - itr_time) / 60.))

        # clear unused models
        for gpu_model in masked_models:
            del gpu_model
        #-----------------------------------------------------------------
        '''
      masked_models = []
      with cuda.device(GPU_ID):
         ref_model = best_masked_model
         masked_model = MaskedModel(ref_model, group_dict2, cuda.current_device(), cuda.current_device()) # ref_model is at current_device, no copy will happen
         masked_models.append(masked_model)

      for gpu_candidate in other_GPU_IDs:
       with cuda.device(gpu_candidate):
         masked_models.append(MaskedModel(ref_model, group_dict2, GPU_ID, gpu_candidate)) # if the gpu_candidate is the same as ref_model, it will return the ref_model
      
      del ref_model

      # do pruning
      ncs_start = time.time()
      print('*Itration %d, model loading: %d sec' % (run_times, ncs_start - itr_time))

    
      init_threshold = len(masked_models[0].group_name_list)*[0.10]
      best_found, saved_model, best_masked_model = NCS_MP(init_threshold, 0.05, fields, masked_models, valid_data, 0.5, run_times, checkpoint)
      init_thresholds = best_found[0] # best pop found

      end_t = time.time()
      print('NCS Time: {} min'.format((end_t - itr_time)/60.))

      # clear no used models
      for gpu_model in masked_models:
        del gpu_model
      '''

        exit()

        # training
        checkpoint = torch.load(saved_model,
                                map_location=lambda storage, loc: storage)
        train_opt, _, _ = opt_initialize(checkpoint,
                                         'opennmt_translate_opt.pt',
                                         'opennmt_translate_dummy_opt.pt')
        # train data loading
        train = torch.load(train_opt.data + '.train.pt')
        valid = torch.load(train_opt.data + '.valid.pt')

        train_fields = load_fields(train, valid, checkpoint, train_opt)
        model_for_train = init_train_model(checkpoint, train_opt,
                                           train_fields)  # fields need data
        optim = build_optim(model_for_train, checkpoint, train_opt)

        model_sparsity = get_sparsity(model_for_train)
        logger.scalar_summary('model_sparsity_%s' % num_runs, model_sparsity,
                              run_times)
        print('Sparsity: {}'.format(model_sparsity))

        train_model(model_for_train, train, valid, train_fields, optim,
                    train_opt, run_times)

        # update global variable weights
        run_times += 1
예제 #21
0
def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk : N + 2*D*N*density
    # X2_chunk : D + 2*D*M*density (because is transposed)
    # sparse_out : N + 2*N*M*(density) (assume density = 1)
    # ker_gpu  : M*N
    # mmv_gpu  : N*T
    # v_gpu    : M*T
    # Other: GPU buffer
    n, m = select_dim_over_m(
        maxM=mtot,
        maxN=ntot,
        tot=avail_mem,
        coef_nm=3,
        coef_n=2 + 2 * dtot * X1.density + T,
        coef_m=2 * dtot * X2.density + T,
        rest=dtot,
    )

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        v_gpu = v.to(device=ddev)  # M x T
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)
        # ker_gpu should be fortran-ordered due to cusparse csr2dense function
        ker_gpu = create_fortran((n, m), dtype=dtype, device=ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                   cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # send result to CPU
            copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
예제 #22
0
파일: OntoEmma.py 프로젝트: lucylw/ontoemma
    def _align_nn(self,
                  model_path,
                  source_kb,
                  target_kb,
                  candidate_selector,
                  cuda_device,
                  batch_size=128):
        """
        Align using neural network model
        :param source_kb:
        :param target_kb:
        :param candidate_selector:
        :param cuda_device: GPU device number
        :return:
        """

        # returns json representation of entity
        def _form_json_entity(ent_to_json, kb):
            all_rels = [
                kb.relations[r_id] for r_id in ent_to_json.relation_ids
            ]
            par_ents = [
                kb.get_entity_by_research_entity_id(r.entity_ids[1])
                for r in all_rels
                if r.relation_type in constants.UMLS_PARENT_REL_LABELS
            ]
            chd_ents = [
                kb.get_entity_by_research_entity_id(r.entity_ids[1])
                for r in all_rels
                if r.relation_type in constants.UMLS_CHILD_REL_LABELS
            ]
            return {
                'research_entity_id': ent_to_json.research_entity_id,
                'canonical_name': ent_to_json.canonical_name,
                'aliases': ent_to_json.aliases,
                'definition': ent_to_json.definition,
                'other_contexts': ent_to_json.other_contexts,
                'par_relations': [e.canonical_name for e in par_ents],
                'chd_relations': [e.canonical_name for e in chd_ents]
            }

        from emma.allennlp_classes.ontoemma_dataset_reader import OntologyMatchingDatasetReader
        from emma.allennlp_classes.ontoemma_model import OntoEmmaNN
        from emma.allennlp_classes.ontoemma_predictor import OntoEmmaPredictor

        alignment, s_ent_ids, t_ent_ids = self._align_string_equiv(
            source_kb, target_kb)
        sys.stdout.write("%i alignments with string equivalence\n" %
                         len(alignment))

        if cuda_device > 0:
            with device(cuda_device):
                archive = load_archive(model_path, cuda_device=cuda_device)
        else:
            archive = load_archive(model_path, cuda_device=cuda_device)

        predictor = Predictor.from_archive(archive, 'ontoemma-predictor')

        sys.stdout.write("Making predictions...\n")
        s_ent_tqdm = tqdm.tqdm(s_ent_ids, total=len(s_ent_ids))
        temp_alignments = defaultdict(list)

        if cuda_device > 0:
            with device(cuda_device):
                batch_json_data = []

                for s_ent_id in s_ent_tqdm:
                    s_ent = source_kb.get_entity_by_research_entity_id(
                        s_ent_id)
                    for t_ent_id in candidate_selector.select_candidates(
                            s_ent_id)[:constants.KEEP_TOP_K_CANDIDATES]:
                        t_ent = target_kb.get_entity_by_research_entity_id(
                            t_ent_id)
                        json_data = {
                            'source_ent': _form_json_entity(s_ent, source_kb),
                            'target_ent': _form_json_entity(t_ent, target_kb),
                            'label': 0
                        }
                        batch_json_data.append(json_data)

                        if len(batch_json_data) == batch_size:
                            results = predictor.predict_batch_json(
                                batch_json_data, cuda_device)
                            for model_input, output in zip(
                                    batch_json_data, results):
                                if output['predicted_label'] == [1.0]:
                                    temp_alignments[model_input['source_ent'][
                                        'research_entity_id']].append(
                                            (model_input['target_ent']
                                             ['research_entity_id'],
                                             output['score'][0]))
                            batch_json_data = []
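                # Note (added): any partial batch left in batch_json_data after
                # this loop is not scored here, so trailing candidate pairs
                # (fewer than batch_size) are skipped as written.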
        else:
            for s_ent_id in s_ent_tqdm:
                s_ent = source_kb.get_entity_by_research_entity_id(s_ent_id)
                for t_ent_id in candidate_selector.select_candidates(
                        s_ent_id)[:constants.KEEP_TOP_K_CANDIDATES]:
                    t_ent = target_kb.get_entity_by_research_entity_id(
                        t_ent_id)
                    json_data = {
                        'source_ent': _form_json_entity(s_ent, source_kb),
                        'target_ent': _form_json_entity(t_ent, target_kb),
                        'label': 0
                    }
                    output = predictor.predict_json(json_data, cuda_device)
                    if output['predicted_label'] == [1.0]:
                        temp_alignments[json_data['source_ent'][
                            'research_entity_id']].append(
                                (json_data['target_ent']['research_entity_id'],
                                 output['score'][0]))

        alignment = []

        for s_ent_id, matches in temp_alignments.items():
            if len(matches) > 0:
                m_sort = sorted(matches, key=lambda p: p[1], reverse=True)
                if m_sort[0][1] >= constants.NN_SCORE_THRESHOLD:
                    alignment.append((s_ent_id, m_sort[0][0], m_sort[0][1]))

        return alignment
예제 #23
0
파일: make_cam.py 프로젝트: jnyborg/irn
def _work(process_id, model, dataset, args):

    databin = dataset[process_id]
    n_gpus = torch.cuda.device_count()
    data_loader = DataLoader(databin,
                             shuffle=False,
                             num_workers=args.num_workers // n_gpus,
                             pin_memory=False)

    with torch.no_grad(), cuda.device(process_id):

        model.cuda()

        for iter, pack in enumerate(tqdm(data_loader)):

            img_name = pack['name'][0]
            label = pack['label'][0]
            size = pack['size']

            strided_size = imutils.get_strided_size(size, 4)
            strided_up_size = imutils.get_strided_up_size(size, 16)

            # Run through each scale of image
            outputs = [
                model(img[0].cuda(non_blocking=True)) for img in pack['img']
            ]

            # Each output is resized to strided_size (lower than original) and summed
            strided_cam = torch.sum(
                torch.stack([
                    F.interpolate(torch.unsqueeze(o, 0),
                                  strided_size,
                                  mode='bilinear',
                                  align_corners=False)[0] for o in outputs
                ]), 0)

            # Each output is resized to strided_up_size (which should be original size?)
            highres_cam = [
                F.interpolate(torch.unsqueeze(o, 1),
                              strided_up_size,
                              mode='bilinear',
                              align_corners=False) for o in outputs
            ]
            highres_cam = torch.sum(torch.stack(highres_cam, 0),
                                    0)[:, 0, :size[0], :size[1]]

            # Pick the cams corresponding to image-level labels
            # Normalize by max value across H x W dimension for each channel
            valid_cat = torch.nonzero(label, as_tuple=False)[:, 0]

            strided_cam = strided_cam[valid_cat]
            strided_cam /= F.adaptive_max_pool2d(strided_cam, (1, 1)) + 1e-5

            highres_cam = highres_cam[valid_cat]
            highres_cam /= F.adaptive_max_pool2d(highres_cam, (1, 1)) + 1e-5

            # save cams
            np.save(
                os.path.join(args.cam_out_dir, img_name + '.npy'), {
                    "keys": valid_cat,
                    "cam": strided_cam.cpu(),
                    "high_res": highres_cam.cpu().numpy()
                })

            if process_id == n_gpus - 1 and iter % (len(databin) // 20) == 0:
                print("%d " % ((5 * iter + 1) // (len(databin) // 20)), end='')
                sys.stdout.flush()
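# Note (added): a minimal sketch of how the dictionary saved above can be read
# back, following the np.load(..., allow_pickle=True).item() pattern used by
# the earlier _work examples in this listing; cam_out_dir and img_name are
# assumed to match the np.save arguments above.
#
#     cam_dict = np.load(os.path.join(cam_out_dir, img_name + '.npy'),
#                        allow_pickle=True).item()
#     keys, strided_cam = cam_dict['keys'], cam_dict['cam']
#     highres_cam = cam_dict['high_res']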
예제 #24
0
    def __init__(self, pretrained, group_dict, source_device_id,
                 target_device_id):
        super(MaskedModel, self).__init__()
        self.skip_mark = 'skip'  #skip the layer while pruning
        self.mask_dict = {}
        #self.sort_tensors = {}
        self.layer_element_num = []
        self.layer_num = 0
        self.layer_name_dict = {}
        self.sparsity = 1.0
        self.total_parameter_num = 0

        # used for group parameters
        self.group_name_list = []
        self.map_dict = {}
        self.group_num_dict = {}
        self.group_parameter_dict = {}
        self.group_threshold_list = {}

        # GPU related
        self.sgpu_id = source_device_id
        self.tgpu_id = target_device_id

        # settings for retraining
        self.pre_forward_fn = None
        self.forward_fn = None

        # init group dicts
        self.group_name_list = [
            k for k in group_dict.keys()
        ]  # the list indices map to the thresholds list accepted at pruning
        self.group_threshold_list = len(self.group_name_list) * [0.]
        for key, layer_names in group_dict.items():
            self.group_num_dict[key] = 0
            # map each layer name to its group name
            for layer_name in layer_names:
                self.map_dict[layer_name] = key

        pretrained_model_on_device = None
        # for each retrieval, transfer the pretrained model to a dictionary
        if source_device_id == target_device_id:
            pretrained_model_on_device = pretrained
        else:
            pretrained_model_on_device = my_replicate(
                pretrained, source_device_id,
                target_device_id)  #[source_device_id, target1, ...]

        with cuda.device(target_device_id):
            self.pretrained_model_dict = dict([
                (n, v)
                for n, v in pretrained_model_on_device.named_parameters()
            ])  # 20191023 lgy
            #self.pretrained_model_dict = dict([(n,v) for n,v in pretrained_model_on_device.state_dict().items()])
            for param_name, module_param in self.pretrained_model_dict.items():
                if param_name in self.map_dict:  # ignore no-grouped layers
                    self.group_num_dict[
                        self.map_dict[param_name]] += module_param.nelement()
            for group_name in self.group_name_list:
                self.group_parameter_dict[group_name] = torch.cuda.FloatTensor(
                    1, self.group_num_dict[group_name])
            self.generate_mask(
                self.pretrained_model_dict)  # init self.mask_dict
            self.masked_model = my_replicate(pretrained, source_device_id,
                                             target_device_id)
            #self.generator = self.masked_model.generator
            self.encoder = self.masked_model.encoder
            self.decoder = self.masked_model.decoder
예제 #25
0
def _work(process_id, model, dataset, args):

    databin = dataset[process_id]
    n_gpus = torch.cuda.device_count()
    data_loader = DataLoader(databin, shuffle=False, pin_memory=False)

    with torch.no_grad(), cuda.device(process_id):

        model.cuda()

        with tqdm(total=len(data_loader)) as pbar:
            for iter, pack in enumerate(data_loader):

                img_name = pack['name'][0]
                size = pack['size']

                strided_size = imutils.get_strided_size(size, 4)
                strided_up_size = imutils.get_strided_up_size(size, 16)

                if args.dataset in ['adp_morph', 'adp_func']:
                    outputs, labels = zip(*[
                        model(img.cuda(
                            non_blocking=True), orig_img.cuda(
                                non_blocking=True))
                        for img, orig_img in zip(pack['img'], pack['orig_img'])
                    ])
                else:
                    outputs, labels = zip(*[
                        model(img.cuda(non_blocking=True))
                        for img in pack['img']
                    ])
                if 'train' in args.split:
                    label = pack['label'][0]
                else:
                    label = labels[0][args.use_cls]

                valid_cat = torch.nonzero(label)[:, 0]
                if args.dataset in ['adp_morph', 'adp_func']:
                    if torch.cuda.is_available():
                        valid_cat = torch.cat(
                            (torch.from_numpy(
                                np.array(range(len(args.class_names['bg'])),
                                         dtype=np.int64)).cuda(),
                             valid_cat.cuda() + len(args.class_names['bg'])))
                    else:
                        valid_cat = torch.cat(
                            (torch.from_numpy(
                                np.array(range(len(args.class_names['bg'])),
                                         dtype=np.int64)),
                             valid_cat + len(args.class_names['bg'])))

                if len(valid_cat) > 0:
                    strided_cam = torch.sum(
                        torch.stack([
                            F.interpolate(torch.unsqueeze(o, 0),
                                          strided_size,
                                          mode='bilinear',
                                          align_corners=False)[0]
                            for o in outputs
                        ]), 0)

                    highres_cam = [
                        F.interpolate(torch.unsqueeze(o, 1),
                                      strided_up_size,
                                      mode='bilinear',
                                      align_corners=False) for o in outputs
                    ]
                    highres_cam = torch.sum(torch.stack(tuple(highres_cam), 0),
                                            0)[:, 0, :size[0], :size[1]]

                    strided_cam = strided_cam[valid_cat]
                    strided_cam /= F.adaptive_max_pool2d(strided_cam,
                                                         (1, 1)) + 1e-5

                    highres_cam = highres_cam[valid_cat]
                    highres_cam /= F.adaptive_max_pool2d(highres_cam,
                                                         (1, 1)) + 1e-5

                    # save cams
                    if args.dataset not in ['deepglobe', 'deepglobe_balanced']:
                        np.save(
                            os.path.join(args.cam_out_dir, img_name + '.npy'),
                            {
                                "keys": valid_cat.cpu().numpy(),
                                "cam": strided_cam.cpu().numpy(),
                                "high_res": highres_cam.cpu().numpy()
                            })
                    else:
                        np.save(
                            os.path.join(args.cam_out_dir, img_name + '.npy'),
                            {
                                "keys": valid_cat.cpu().numpy(),
                                "cam": strided_cam.cpu().numpy()
                            })
                else:
                    np.save(
                        os.path.join(args.cam_out_dir, img_name + '.npy'), {
                            "keys": np.empty(0),
                            "cam": np.empty(0),
                            "high_res": np.empty(0)
                        })
                pbar.update(1)
예제 #26
0
def main():

    total_times = 100
    run_times = 0
    init_threshold = ...
    start_t = time.time()

    valid_data = torch.load(TRAIN_DATA + '.valid.pt')
    fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + '.vocab.pt'))
    # fields = onmt.IO.load_fields_from_vocab(torch.load(TRAIN_DATA + '.vocab.pt'))
    valid_data.fields = fields # we need to clear this assignment if we want to transfer valid_data among threads

    checkpoint = torch.load(weights, map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']
    masked_models = []
    with cuda.device(GPU_ID):
        ref_model = onmt.ModelConstructor.make_base_model(model_opt, fields, True, checkpoint)
        ref_model.eval()
        ref_model.generator.eval()
        masked_model = MaskedModel(ref_model, group_dict, cuda.current_device(), cuda.current_device()) # ref_model is at current_device, no copy will happen
        masked_models.append(masked_model)
    train_opt, _, _ = opt_initialize(checkpoint, 'opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
    if GPU_ID:
        cuda.set_device(GPU_ID)

    print("BLEU evaluation:")
    translate_opt, translate_dummy_opt = translate_opt_initialize('opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
    translator = init_translate_model(translate_opt, translate_dummy_opt)
    del translator.model
    translator.model = masked_model.masked_model
    tt=open(translate_opt.tgt, 'r')
    references = [[t] for t in tt]
    translate_data = onmt.IO.ONMTDataset(translate_opt.src, translate_opt.tgt,fields,use_filter_pred=False)
    prune_data = onmt.IO.OrderedIterator(dataset=translate_data, device=GPU_ID,batch_size=1, train=False, sort=False,shuffle=False)
    tmp_fit2 = evaluate_trans(translator, references, prune_data, translate_data)
    print('Finished => bleu (%.4f), ppl (%.4f)' % (tmp_fit2[1]*100, tmp_fit2[0]))
    exit()
    # print(time_now(), "get accuray of no pruning model")
    # masked_model.make_evaluable()
    # tmp_crate = len(masked_model.group_name_list)*[0]
    # masked_model.change_mask(tmp_crate, apply_MP_on_mask)
    # masked_model.apply_mask()
    # tmp_fit = evaluate(masked_model, valid_data, fields)
    # # only the original (unpruned) accuracy is needed
    # acc_of_no_prune = tmp_fit[1]
    # acc_of_no_prune = int(acc_of_no_prune*10)/10
    print("init accuracy of model:", acc_of_no_prune)
    print("accuracy constraint:", acc_percent_prune)
    while run_times < total_times:
        print("-----------------------------------------")
        print("-----------------------------------------")
        print("-----------------------------------------")
        print(time_now(), "start Iteration ", run_times)

        print("test model---------------")
        ref_model.eval()
        ref_model.generator.eval()
        tmp_fit = evaluate(ref_model, valid_data, fields)
        print('ref_model','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        print("test model---------------")
        masked_models[0].make_evaluable()
        tmp_fit = evaluate(masked_models[0], valid_data, fields)
        print('masked_models[0]','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))
        model_sparsity = masked_models[0].get_sparsity()
        print('masked_models[0] Sparsity: {}'.format(model_sparsity))


        itr_time = time.time()
        for gpu_candidate in other_GPU_IDs:
            with cuda.device(gpu_candidate):
                masked_models.append(MaskedModel(ref_model, group_dict, GPU_ID, gpu_candidate)) # if the gpu_candidate is the same as ref_model, it will return the ref_model
        
        #------------- Here -------------------------
        # del ref_model

        # do pruning
        ncs_start = time.time()
        print('Iteration %d, model loading: %d sec' % (run_times, ncs_start - itr_time))
        if run_times == 0:
            if START_THRESHOLD is not None:
                init_threshold = START_THRESHOLD
            else:
                init_threshold = len(masked_models[0].group_name_list)*[0.25]
        # if run_times == 0:
        #     init_threshold = len(masked_models[0].group_name_list)*[0.25]
        print("init threshold:", init_threshold)
        prune_acc_now = acc_percent_prune+tmp_fit[1]-acc_of_no_prune
        print('pruning acc now:', prune_acc_now)
        best_found, saved_model, best_masked_model = NCS_MP(init_threshold, 0.05, fields, masked_models, valid_data, prune_acc_now, run_times, checkpoint)
        init_threshold = best_found
        #best_found, saved_model, best_masked_model = NCS_MP(init_threshold, 0.05, fields, masked_models, valid_data, 0.01, run_times, checkpoint)

        end_t = time.time()
        print('NCS Time: {} min'.format((end_t - itr_time)/60.))
        print('Best found thresholds:')
        for i in range(len(masked_models[0].group_name_list)):
            print("layer {}: {}%".format(masked_models[0].group_name_list[i], 100*best_found[i]))

        print("BLEU evaluation:")
        translate_opt, translate_dummy_opt = translate_opt_initialize('opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
        translator = init_translate_model(translate_opt, translate_dummy_opt)
        del translator.model
        translator.model = best_masked_model
        tt=open(translate_opt.tgt, 'r')
        references = [[t] for t in tt]
        translate_data = onmt.IO.ONMTDataset(translate_opt.src, translate_opt.tgt,fields,use_filter_pred=False)
        prune_data = onmt.IO.OrderedIterator(dataset=translate_data, device=GPU_ID,batch_size=1, train=False, sort=False,shuffle=False)
        tmp_fit = evaluate_trans(translator, references, prune_data, translate_data)
        print('Finished => bleu (%.4f), ppl (%.4f)' % (tmp_fit[1]*100, tmp_fit[0]))

        # clear unused models
        for gpu_model in masked_models:
            del gpu_model

        #--------------- start retraining --------------
        model_for_train = best_masked_model
        pretrained_leaf_dict = model_for_train.make_trainable()
        optim = build_optim(model_for_train.masked_model, checkpoint, train_opt, pretrained_leaf_dict)

        print(time_now(), "start loading data for retraining")
        train = torch.load(train_opt.data+ '.train.pt')
        valid = torch.load(train_opt.data + '.valid.pt')
        train_fields = load_fields(train, valid, checkpoint, train_opt)
        print(time_now(), "finish data loading")
        model_for_train.change_mask(init_threshold, apply_MP_on_mask)
        model_for_train.apply_mask()
        model_for_train.make_trainable()

        recovered = train_model(model_for_train, train, valid, train_fields, optim, train_opt, run_times, acc_of_no_prune)
        print(time_now(), "finish retraining ")
        if not recovered:
            exit()
        else:
            print("------------Accuracy recorverd!--------------------")
            print("recovered accuracy:", acc_of_no_prune)
        model_for_train.make_evaluable()

        ref_model = model_for_train.masked_model
        masked_models = [MaskedModel(ref_model, group_dict, cuda.current_device(), cuda.current_device())]

        print("test model---------------")
        tmp_fit = evaluate(ref_model, valid_data, fields)
        print('ref_model','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))

        print("test model---------------")
        tmp_fit = evaluate(masked_models[0], valid_data, fields)
        print('masked_models[0]','acc (%.4f), ppl (%.4f)' % (tmp_fit[1], tmp_fit[0]))
        model_sparsity = masked_models[0].get_sparsity()
        print('masked_models[0] Sparsity: {}'.format(model_sparsity))
        
        print('------------- save checkpoint ---------------')
        _, saved_model = update_checkpoint(checkpoint, model_for_train, run_times, acc_percent_prune, t=True)
        print(time_now(), ' saving model:', saved_model)
        print("-------------print evaluation info ---------------")
        tmp_fit = evaluate(model_for_train, valid_data, fields)
        print('percentage %s => acc (%.4f), ppl (%.4f)' % ([100 * t for t in best_found], tmp_fit[1], tmp_fit[0]))
        #--------------------------------------------------
        print("BLEU evaluation:")
        translate_opt, translate_dummy_opt = translate_opt_initialize('opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
        translator = init_translate_model(translate_opt, translate_dummy_opt)
        del translator.model
        translator.model = model_for_train.masked_model
        tt = open(translate_opt.tgt, 'r')
        references = [[t] for t in tt]
        translate_data = onmt.IO.ONMTDataset(translate_opt.src, translate_opt.tgt, fields, use_filter_pred=False)
        prune_data = onmt.IO.OrderedIterator(dataset=translate_data, device=GPU_ID, batch_size=1, train=False, sort=False, shuffle=False)
        tmp_fit2 = evaluate_trans(translator, references, prune_data, translate_data)
        print('Finished => bleu (%.4f), ppl (%.4f)' % (tmp_fit2[1]*100, tmp_fit2[0]))
        #--------------------------------------------------
        run_times += 1
def _work(process_id, model, dataset, args):
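    # Per-process CAM worker: each process takes its shard of the dataset, runs Grad-CAM
    # for every labelled class at four image scales, refines the input with adversarial-
    # climbing steps, and saves the accumulated CAMs to args.cam_out_dir as .npy files
    # (images that already have an output file are skipped).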
    databin = dataset[process_id]
    n_gpus = torch.cuda.device_count()
    data_loader = DataLoader(databin,
                             shuffle=False,
                             num_workers=args.num_workers // n_gpus,
                             pin_memory=True)
    print("dcpu", args.num_workers // n_gpus)
    cam_sizes = [[], [], [], []]  # scale 0,1,2,3
    with cuda.device(process_id):
        model.cuda()
        gcam = GradCAM(model=model, candidate_layers=[args.target_layer])
        for iter, pack in enumerate(data_loader):
            img_name = pack['name'][0]
            if os.path.exists(os.path.join(args.cam_out_dir,
                                           img_name + '.npy')):
                continue
            size = pack['size']
            strided_size = imutils.get_strided_size(size, 4)
            strided_up_size = imutils.get_strided_up_size(size, 16)
            outputs_cam = []
            n_classes = len(list(torch.nonzero(pack['label'][0])[:, 0]))

            for s_count, size_idx in enumerate([1, 0, 2, 3]):
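                # Scales are visited in the order [1, 0, 2, 3], so the size_idx == 1 branch
                # below runs first and initialises mul_for_scale before the other scales reuse it.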
                orig_img = pack['img'][size_idx].clone()
                for c_idx, c in enumerate(
                        list(torch.nonzero(pack['label'][0])[:, 0])):
                    pack['img'][size_idx] = orig_img
                    img_single = pack['img'][size_idx].detach()[
                        0]  # [:, 1]: flip

                    if size_idx != 1:
                        total_adv_iter = args.adv_iter
                    else:  # size_idx == 1
                        if args.adv_iter > 10:
                            total_adv_iter = args.adv_iter // 2
                            mul_for_scale = 2
                        elif args.adv_iter < 6:
                            total_adv_iter = args.adv_iter
                            mul_for_scale = 1
                        else:
                            total_adv_iter = 5
                            mul_for_scale = float(total_adv_iter) / 5

                    for it in range(total_adv_iter):
                        img_single.requires_grad = True

                        outputs = gcam.forward(
                            img_single.cuda(non_blocking=True))

                        if c_idx == 0 and it == 0:
                            cam_all_classes = torch.zeros([
                                n_classes, outputs.shape[2], outputs.shape[3]
                            ])

                        gcam.backward(ids=c)

                        regions = gcam.generate(target_layer=args.target_layer)
                        regions = regions[0] + regions[1].flip(-1)

                        if it == 0:
                            init_cam = regions.detach()

                        cam_all_classes[c_idx] += regions[0].data.cpu(
                        ) * mul_for_scale
                        logit = outputs
                        logit = F.relu(logit)
                        logit = torchutils.gap2d(logit, keepdims=True)[:, :, 0,
                                                                       0]

                        valid_cat = torch.nonzero(pack['label'][0])[:, 0]
                        logit_loss = -2 * (logit[:,
                                                 c]).sum() + torch.sum(logit)

                        expanded_mask = torch.zeros(regions.shape)
                        expanded_mask = add_discriminative(
                            expanded_mask, regions, score_th=args.score_th)

                        L_AD = torch.sum((torch.abs(regions - init_cam)) *
                                         expanded_mask.cuda())

                        loss = -logit_loss - L_AD * args.AD_coeff

                        model.zero_grad()
                        img_single.grad.zero_()
                        loss.backward()

                        data_grad = img_single.grad.data

                        perturbed_data = adv_climb(img_single,
                                                   args.AD_stepsize, data_grad)
                        img_single = perturbed_data.detach()

                outputs_cam.append(cam_all_classes)

            strided_cam = torch.sum(
                torch.stack([
                    F.interpolate(torch.unsqueeze(o, 0),
                                  strided_size,
                                  mode='bilinear',
                                  align_corners=False)[0] for o in outputs_cam
                ]), 0)
            highres_cam = [
                F.interpolate(torch.unsqueeze(o, 1),
                              strided_up_size,
                              mode='bilinear',
                              align_corners=False) for o in outputs_cam
            ]

            highres_cam = torch.sum(torch.stack(highres_cam, 0),
                                    0)[:, 0, :size[0], :size[1]]
            strided_cam /= F.adaptive_max_pool2d(strided_cam, (1, 1)) + 1e-5
            highres_cam /= F.adaptive_max_pool2d(highres_cam, (1, 1)) + 1e-5

            np.save(
                os.path.join(args.cam_out_dir, img_name + '.npy'), {
                    "keys": valid_cat,
                    "cam": strided_cam.cpu(),
                    "high_res": highres_cam.cpu().numpy()
                })
Example #28
0
    def _align_nn(self,
                  model_path,
                  source_kb,
                  target_kb,
                  candidate_selector,
                  cuda_device,
                  batch_size=128):
        """
        Align using neural network model
        :param source_kb:
        :param target_kb:
        :param candidate_selector:
        :param cuda_device: GPU device number
        :return:
        """
        from emma.allennlp_classes.ontoemma_dataset_reader import OntologyMatchingDatasetReader
        from emma.allennlp_classes.ontoemma_model import OntoEmmaNN
        from emma.allennlp_classes.ontoemma_predictor import OntoEmmaPredictor

        alignment, s_ent_ids, t_ent_ids = self._align_string_equiv(
            source_kb, target_kb, candidate_selector)
        sys.stdout.write("%i alignments with string equivalence\n" %
                         len(alignment))

        if cuda_device > 0:
            with device(cuda_device):
                archive = load_archive(model_path, cuda_device=cuda_device)
        else:
            archive = load_archive(model_path, cuda_device=cuda_device)

        predictor = Predictor.from_archive(archive, 'ontoemma-predictor')

        sys.stdout.write("Making predictions...\n")
        s_ent_tqdm = tqdm.tqdm(s_ent_ids, total=len(s_ent_ids))
        sim_scores = dict()
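        # sim_scores maps (source research_entity_id, target research_entity_id) to the
        # model's predicted alignment score.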

        if cuda_device > 0:
            with device(cuda_device):
                batch_json_data = []

                for s_ent_id in s_ent_tqdm:
                    s_ent = source_kb.get_entity_by_research_entity_id(
                        s_ent_id)
                    for t_ent_id in candidate_selector.select_candidates(
                            s_ent_id)[:constants.KEEP_TOP_K_CANDIDATES]:
                        t_ent = target_kb.get_entity_by_research_entity_id(
                            t_ent_id)
                        json_data = {
                            'source_ent': s_ent.form_json(),
                            'target_ent': t_ent.form_json(),
                            'label': 0
                        }
                        batch_json_data.append(json_data)

                        if len(batch_json_data) == batch_size:
                            results = predictor.predict_batch_json(
                                batch_json_data, cuda_device)
                            for model_input, output in zip(
                                    batch_json_data, results):
                                sim_scores[(model_input['source_ent']
                                            ['research_entity_id'],
                                            model_input['target_ent']
                                            ['research_entity_id']
                                            )] = output['score'][0]
                            batch_json_data = []

                # finish last batch
                if batch_json_data:
                    results = predictor.predict_batch_json(
                        batch_json_data, cuda_device)
                    for model_input, output in zip(batch_json_data, results):
                        sim_scores[(
                            model_input['source_ent']['research_entity_id'],
                            model_input['target_ent']['research_entity_id']
                        )] = output['score'][0]
        else:
            for s_ent_id in s_ent_tqdm:
                s_ent = source_kb.get_entity_by_research_entity_id(s_ent_id)
                for t_ent_id in candidate_selector.select_candidates(
                        s_ent_id)[:constants.KEEP_TOP_K_CANDIDATES]:
                    t_ent = target_kb.get_entity_by_research_entity_id(
                        t_ent_id)
                    json_data = {
                        'source_ent': s_ent.form_json(),
                        'target_ent': t_ent.form_json(),
                        'label': 0
                    }
                    output = predictor.predict_json(json_data, cuda_device)
                    sim_scores[(json_data['source_ent']['research_entity_id'],
                                json_data['target_ent']['research_entity_id']
                                )] = output['score'][0]

        return sim_scores
def main():
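    # Iterative magnitude-pruning driver: in each iteration, sweep a global pruning ratio
    # until validation accuracy drops by more than acc_percent_prune, apply the largest
    # acceptable ratio, save a checkpoint, and retrain until the accuracy is recovered.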

    total_times = 100
    run_times = 0
    init_threshold = ...
    start_t = time.time()

    valid_data = torch.load(TRAIN_DATA + '.valid.pt')
    fields = onmt.IO.load_fields(torch.load(TRAIN_DATA + '.vocab.pt'))
    # fields = onmt.IO.load_fields_from_vocab(torch.load(TRAIN_DATA + '.vocab.pt'))
    valid_data.fields = fields  # this assignment must be cleared if valid_data is to be transferred between threads

    checkpoint = torch.load(weights, map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']
    masked_models = []
    with cuda.device(GPU_ID):
        ref_model = onmt.ModelConstructor.make_base_model(
            model_opt, fields, True, checkpoint)
        ref_model.eval()
        ref_model.generator.eval()
        masked_model = MaskedModel(
            ref_model, group_dict, cuda.current_device(), cuda.current_device(
            ))  # ref_model is at current_device, no copy will happen
        masked_models.append(masked_model)

    if GPU_ID:
        cuda.set_device(GPU_ID)

    # 1 means 1% acc
    acc_percent_prune = 1
    # only the original (unpruned) accuracy is needed
    acc_of_no_prune = 0
    get_acc_of_no_prune = False
    print(time_now(), "start while")
    while run_times < total_times:
        print("-----------------------------------------")
        print("start Iteration ", run_times)
        # init threshold
        best_threshold = 0
        itr_time = time.time()
        # To display all parameter names:
        #   aa = ref_model.named_parameters()
        #   aa_namelist = [ak[0] for ak in aa]
        # test MP
        translate_opt, translate_dummy_opt = translate_opt_initialize(
            'opennmt_translate_opt.pt', 'opennmt_translate_dummy_opt.pt')
        translator = init_translate_model(translate_opt, translate_dummy_opt)
        del translator.model
        translator.model = masked_model
        tt = open(translate_opt.tgt, 'r')
        references = [[t] for t in tt]

        xxx = np.arange(0., 1, 0.01)
        #for i in range(len(masked_model.group_name_list)):
        #   tmp_crate = len(masked_model.group_name_list)*[0.]
        print(time_now(), "start testing pruning")
        masked_model.make_evaluable()
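        # Sweep global pruning ratios from 0.00 to 0.99 in steps of 0.01; on the first pass,
        # record the unpruned accuracy at ratio 0, then stop at the first ratio whose accuracy
        # drop exceeds acc_percent_prune, keeping the previous ratio as best_threshold.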
        for i in range(len(xxx)):
            # best_threshold = 0.55
            # break
            translate_data = onmt.IO.ONMTDataset(translate_opt.src,
                                                 translate_opt.tgt,
                                                 fields,
                                                 use_filter_pred=False)
            prune_data = onmt.IO.OrderedIterator(dataset=translate_data,
                                                 device=GPU_ID,
                                                 batch_size=1,
                                                 train=False,
                                                 sort=False,
                                                 shuffle=False)
            tmp_crate = len(masked_model.group_name_list) * [xxx[i]]
            #tmp_crate[i] = 0.01
            masked_model.change_mask(tmp_crate, apply_MP_on_mask)
            masked_model.apply_mask()
            tmp_fit = evaluate(masked_model, valid_data, fields)
            #tmp_fit = evaluate_trans(translator, references, prune_data, translate_data)
            #logger.scalar_summary('test_bleu', tmp_fit[1]*100, int(xxx[i]*100))
            #logger.scalar_summary('acc', tmp_fit[1], int(xxx[i]*100))
            #logger.scalar_summary('ppl', tmp_fit[0], int(xxx[i]*100))
            #logger.scalar_summary('test_ppl', tmp_fit[0], int(xxx[i]*100))
            #print('group %s => acc (%.4f), ppl (%.4f)' % (masked_model.group_name_list[i], tmp_fit[1], tmp_fit[0]))
            #print('percentage %s => bleu (%.4f), ppl (%.4f)' % (xxx[i]*100, tmp_fit[1]*100, tmp_fit[0]))
            # print('percentage %s => acc (%.4f), ppl (%.4f)' % (xxx[i]*100, tmp_fit[1], tmp_fit[0]))
            if i == 0 and not get_acc_of_no_prune:
                acc_of_no_prune = tmp_fit[1]
                acc_of_no_prune = int(acc_of_no_prune * 100) / 100
                get_acc_of_no_prune = True
            elif acc_of_no_prune - tmp_fit[1] > acc_percent_prune:
                best_threshold = xxx[i] - 0.01
                break
        # -------------------------------------------------
        # Start writing
        # prune again
        print(time_now(), " start accuracy:", acc_of_no_prune)
        print("-------test------------:", get_acc_of_no_prune)
        print(time_now(), " apply pruning with threshold:", best_threshold)
        tmp_crate = len(masked_model.group_name_list) * [best_threshold]
        masked_model.change_mask(tmp_crate, apply_MP_on_mask)
        masked_model.apply_mask()

        # print information
        tmp_fit = evaluate(masked_model, valid_data, fields)
        print('percentage %s => acc (%.4f), ppl (%.4f)' %
              (best_threshold * 100, tmp_fit[1], tmp_fit[0]))
        model_sparsity = masked_model.get_sparsity()
        print('Sparsity: {}'.format(model_sparsity))

        #--------------- start retraining --------------
        # first store model
        print(time_now(), "start saving model")
        _, saved_model = update_checkpoint(checkpoint, masked_model, run_times)
        print(time_now(), "finish saving model")
        print(time_now(), "start loading model")
        checkpoint = torch.load(SAVE_MODEL_TMP_FOLDER + saved_model,
                                map_location=lambda storage, loc: storage)
        train_opt, _, _ = opt_initialize(checkpoint,
                                         'opennmt_translate_opt.pt',
                                         'opennmt_translate_dummy_opt.pt')

        # train data loading
        print(time_now(), "start loading data for retraining")
        train = torch.load(train_opt.data + '.train.pt')
        valid = torch.load(train_opt.data + '.valid.pt')
        print(time_now(), "finish data loading")

        train_fields = load_fields(train, valid, checkpoint, train_opt)
        model_for_train = init_train_model(checkpoint, train_opt, train_fields)
        masked_model = MaskedModel(model_for_train, group_dict,
                                   cuda.current_device(),
                                   cuda.current_device())

        masked_model.make_trainable()

        print(time_now(), "building optm")
        optim = build_optim(model_for_train, checkpoint, train_opt)

        print(time_now(), "start restraining")
        recovered = train_model(model_for_train, train, valid, train_fields,
                                optim, train_opt, run_times, acc_of_no_prune)
        print(time_now(), "finish retraining ")
        if not recovered:
            exit()
        else:
            print("------------Accuracy recorverd!--------------------")
            print("recovered accuracy:", acc_of_no_prune)
        run_times += 1

        masked_model.make_evaluable()
        tmp_fit = evaluate(masked_model, valid_data, fields)
        print("------------------for test-------------------")
        print('percentage %s => acc (%.4f), ppl (%.4f)' %
              (best_threshold * 100, tmp_fit[1], tmp_fit[0]))
Example #30
0
def _work_gpu(process_id, model, dataset, args):
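    # Per-GPU worker: turns the class CAMs saved earlier into instance segmentations by
    # clustering displacement-field centroids, propagating per-instance scores along the
    # predicted edges with a random walk, and saving the detected instances per image
    # to args.ins_seg_out_dir.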
    n_gpus = torch.cuda.device_count()
    databin = dataset[process_id]
    data_loader = DataLoader(databin,
                             shuffle=False,
                             num_workers=args.num_workers // n_gpus,
                             pin_memory=False)

    with torch.no_grad(), cuda.device(process_id):

        model.cuda()

        for iter, pack in tqdm(enumerate(data_loader), total=len(databin)):
            img_name = pack['name'][0]
            size = np.asarray(pack['size'])

            edge, dp = model(pack['img'][0].cuda(non_blocking=True))

            dp = dp.cpu().numpy()

            cam_dict = np.load(args.cam_out_dir + '/' + img_name + '.npy',
                               allow_pickle=True).item()

            cams = cam_dict['cam'].cuda()
            keys = cam_dict['keys']

            centroids = find_centroids_with_refinement(dp)
            instance_map = cluster_centroids(centroids, dp)
            instance_cam = separte_score_by_mask(cams, instance_map)

            rw = indexing.propagate_to_edge(instance_cam,
                                            edge,
                                            beta=args.beta,
                                            exp_times=args.exp_times,
                                            radius=5)

            rw_up = F.interpolate(rw,
                                  scale_factor=4,
                                  mode='bilinear',
                                  align_corners=False)[:,
                                                       0, :size[0], :size[1]]
            rw_up = rw_up / torch.max(rw_up)

            rw_up_bg = F.pad(rw_up, (0, 0, 0, 0, 1, 0),
                             value=args.ins_seg_bg_thres)

            num_classes = len(keys)
            num_instances = instance_map.shape[0]

            instance_shape = torch.argmax(rw_up_bg, 0).cpu().numpy()
            instance_shape = pyutils.to_one_hot(
                instance_shape,
                maximum_val=num_instances * num_classes + 1)[1:]
            instance_class_id = np.repeat(keys, num_instances)

            detected = detect_instance(rw_up.cpu().numpy(),
                                       instance_shape,
                                       instance_class_id,
                                       max_fragment_size=size[0] * size[1] *
                                       0.01)

            np.save(os.path.join(args.ins_seg_out_dir, img_name + '.npy'),
                    detected)

            if process_id == n_gpus - 1 and iter % (len(databin) // 4) == 0:
                print("%d " % ((5 * iter + 1) // (len(databin) // 4)), end='')
Example #31
0
    def clear_cache(self):
        with cuda.device(self.tgpu_id):
            cuda.empty_cache()
Example #32
0
File: fmmv_cuda.py Project: ymohit/falkon
def distk_fdmmv(proc_idx, queue, device_id):
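    # Blockwise double kernel-vector product for an L2-distance kernel: computes
    # out = K.T @ (K @ v + w) (terms with v or w omitted when they are None), tiling over
    # the rows of X1 (blocks of n) and the feature dimension (blocks of d) so that all
    # temporaries fit in max_mem bytes of GPU memory, and overlapping the X1 / X2 chunk
    # transfers on two CUDA streams.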
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    T = v.shape[1] if v is not None else w.shape[1]
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
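    # Choose tile sizes (n, d) so that the per-tile total above fits in the available
    # number of elements (max_mem / sizeof(dtype)).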
    avail_mem = max_mem / sizeof_dtype(dtype)
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_nd(max_n=N,
                              max_d=D,
                              coef_nd=1,
                              coef_n=M + T + 1,
                              coef_d=M,
                              rest=rest_coef + M,
                              max_mem=avail_mem)
    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream(ddev)
    s2 = tcd.Stream(ddev)

    with tcd.device(ddev), tcd.stream(s1):
        # First collect necessary memory
        mem_needed = n * M + n * T + n + M
        if not cuda_inputs:
            mem_needed += n * d + M * d
            if v is not None:
                mem_needed += M * T
        if not out.is_cuda:
            mem_needed += M * T
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed, ),
                                  dtype=dtype,
                                  device=ddev)
        # Extract the sub-tensors
        flat_offset = 0
        if v is not None:
            if not cuda_inputs:
                v_gpu = extract_same_stride(flat_gpu_tn,
                                            size=(M, T),
                                            other=v,
                                            offset=flat_offset)
                flat_offset += np.prod(v_gpu.shape)
                copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
            else:
                v_gpu = v
        K_gpu = extract_same_stride(flat_gpu_tn,
                                    size=(n, M),
                                    other=X1,
                                    offset=flat_offset)
        flat_offset += np.prod(K_gpu.shape)
        Kv_gpu = extract_same_stride(flat_gpu_tn,
                                     size=(n, T),
                                     other=X1,
                                     offset=flat_offset)
        flat_offset += np.prod(Kv_gpu.shape)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(M, T),
                                          other=out,
                                          offset=flat_offset)
            flat_offset += np.prod(out_gpu.shape)
        out_gpu.fill_(0.0)
        if not cuda_inputs:
            X1ss_gpu = extract_same_stride(flat_gpu_tn,
                                           size=(n, d),
                                           other=X1,
                                           offset=flat_offset)
            flat_offset += np.prod(X1ss_gpu.shape)
            X2s_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(M, d),
                                          other=X2,
                                          offset=flat_offset)
            flat_offset += np.prod(X2s_gpu.shape)
        sq1_gpu = extract_same_stride(flat_gpu_tn,
                                      size=(n, ),
                                      other=X1,
                                      offset=flat_offset)
        flat_offset += np.prod(sq1_gpu.shape)
        sq2_gpu = extract_same_stride(flat_gpu_tn,
                                      size=(M, ),
                                      other=X1,
                                      offset=flat_offset)

        for i in range(0, N, n):
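            # Process the rows of X1 in blocks of n: accumulate the squared L2 distances
            # over feature chunks of size d, apply the kernel transform, then update the
            # Kv block and the output.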
            nb = min(N - i, n)

            cur_K_gpu = K_gpu[:nb]  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                s1.synchronize(
                )  # need that the add_(sq2_gpu.T) op is complete to avoid overwrite
                # Parallelize two matrix transfers
                with tcd.stream(s2):
                    if cuda_inputs:
                        cur_X2s_gpu = X2[:, j:j + db]
                    else:
                        cur_X2s_gpu = copy_to_device_noorder(M,
                                                             db,
                                                             X2,
                                                             0,
                                                             j,
                                                             X2s_gpu,
                                                             0,
                                                             0,
                                                             s=s2)
                    torch.norm(cur_X2s_gpu,
                               p=2,
                               dim=1,
                               keepdim=True,
                               out=sq2_gpu).pow_(2)
                if cuda_inputs:
                    cur_X1ss_gpu = X1[i:i + nb, j:j + db]
                else:
                    cur_X1ss_gpu = copy_to_device_noorder(nb,
                                                          db,
                                                          X1,
                                                          i,
                                                          j,
                                                          X1ss_gpu,
                                                          0,
                                                          0,
                                                          s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True,
                           out=sq1_gpu).pow_(2)

                s2.synchronize(
                )  # need that cur_X2s_gpu and sq2_gpu are available.
                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu,
                                 mat2=cur_X2s_gpu.T,
                                 alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)
                cur_K_gpu.clamp_min_(0)

            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                cur_Kv_gpu = copy_to_device_noorder(nb,
                                                    T,
                                                    w,
                                                    i,
                                                    0,
                                                    Kv_gpu,
                                                    0,
                                                    0,
                                                    s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
        s1.synchronize()
    return out