Example #1
def sweep_step(work_items, tgroup_id, I, sigma, new_sigma, coefs, directions,
               sigma_a, sigma_s):
    I[1:-1, 1:-1, 1:-1, :, -1] = np.nan
    tgroup_id[0] = 0
    # Sweep across the graph for the differencing scheme for the gradient.
    chunk_size = 1024
    num_blocks = (work_items.shape[0] + chunk_size - 1) // chunk_size
    assert I.strides[3] == 4
    I_flat = np.swapaxes(I, 3, 4).ravel()
    sigma_flat = sigma.ravel()
    # direction_offset = 1
    frequency_offset = I.shape[3]
    cp.cuda.get_current_stream().synchronize()
    start = perf_counter()
    cuda.profile_start()
    compute_fluxes[num_blocks, chunk_size, 0, uint_t_nbytes](
        work_items, sigma.shape[0], sigma.shape[1], sigma.shape[2],
        sigma.shape[3], I.shape[4], I_flat, sigma_flat, directions,
        sigma_a + sigma_s, tgroup_id, 1. / I.shape[1],
        I.strides[0] // float_t_nbytes, I.strides[1] // float_t_nbytes,
        I.strides[2] // float_t_nbytes, I.strides[3] // float_t_nbytes,
        I.strides[4] // float_t_nbytes)
    cp.cuda.get_current_stream().synchronize()
    cuda.profile_stop()
    stop = perf_counter()
    print("sweep kernel time:", stop - start)
    # Compute the scattering terms in the collision operator.
    compute_new_scattering(sigma_s, I, coefs, new_sigma)
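
All of these examples share one pattern: bracket the region of interest with numba.cuda's profile_start()/profile_stop() so an external profiler started with capture disabled (e.g. nvprof --profile-from-start off) only records that window. Below is a minimal, self-contained sketch of the pattern; the scale kernel, array size, and launch configuration are placeholders and not taken from Example #1.

from time import perf_counter

import numpy as np
from numba import cuda


@cuda.jit
def scale(x, factor):
    # Trivial placeholder kernel: multiply every element by a scalar.
    i = cuda.grid(1)
    if i < x.shape[0]:
        x[i] *= factor


def profiled_launch():
    x = cuda.to_device(np.ones(1 << 20, dtype=np.float32))
    threads = 256
    blocks = (x.shape[0] + threads - 1) // threads

    cuda.synchronize()      # keep earlier work out of the profiled window
    start = perf_counter()
    cuda.profile_start()    # profiler capture begins here
    scale[blocks, threads](x, 2.0)
    cuda.synchronize()      # wait for the kernel before closing the window
    cuda.profile_stop()     # profiler capture ends here
    print("kernel time:", perf_counter() - start)
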
Example #2
def train():
    # Launch recv thread
    print("worker_id(rank)", worker_id, "  size:", str(worker_num),
          " batch_size=", batch_size)
    init_processes(worker_id, worker_num, 'gloo')

    input("Worker End Connection Initialized")
    
    sub_net.train()
    inputs = None
    outputs = None
    train_loss = 0
    correct = 0
    total = 0
    iteration_num = 100
    iter_n = 0
    loss = None
    sub_optimizer.zero_grad()
    sta = time.time()
    
    while True:
        inputs = fake_input.to(device)
        targets = fake_target.to(device)
        outputs = sub_net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        comm_time_sta = time.time()
        para_num = 0
        for name, parameters in sub_net.named_parameters():
            if(parameters.grad is not None):
                grad_content = parameters.grad.to("cpu")
                para_num += grad_content.numel()
                dist.all_reduce(tensor=grad_content, op = dist.ReduceOp.SUM)
                grad_content = grad_content/worker_num
                parameters.grad = grad_content.to(device)
        comm_time_ed = time.time()
        sub_optimizer.step()
        sub_optimizer.zero_grad()
        print("iter=",iter_n," comm_time=",str(comm_time_ed-comm_time_ed))
        if iter_n == 10:
            cuda.profile_start()
            print("cuda profile start...")
        if iter_n == 30:
            cuda.profile_stop()
            print("cuda profile end...")
        iter_n = iter_n + 1
        if iter_n % 10 == 0:
            ed = time.time()
            print("iter_n=", iter_n, " time=", (ed - sta), "comm_num=", para_num)
        if iter_n == iteration_num:
            exit(0)        
Example #3
    def every_n_step_begin(self, step):

        if self.ended:
            return

        first_check_step = 305
        last_check_step = 325
        if (not self.started) and step > first_check_step:
            print("Profile Start!")
            self.started = True
            cuda.profile_start()
        elif self.started and step > last_check_step:
            print("Profile End! Calling profile_stop().")
            self.ended = True
            cuda.profile_stop()
            print("Done calling profile_stop().")
Example #4
def train():
    # Launch recv thread
    print("worker_id(rank)", worker_id, "  size:", str(worker_num),
          " batch_size=", batch_size)
    init_processes(worker_id, worker_num, 'gloo')

    print("Worker End Connection Initialized")
    global sub_net, sub_optimizer, device
    is_cpu_mode = False
    sub_net.train()
    inputs = None
    outputs = None
    train_loss = 0
    correct = 0
    total = 0
    iteration_num = 100
    iter_n = 0
    loss = None
    sub_optimizer.zero_grad()
    sta = time.time()
    with torch.autograd.profiler.emit_nvtx():
        cuda.profile_start()
        while iter_n <= 10:
            inputs = fake_input.to(device)
            targets = fake_target.to(device)
            outputs = sub_net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            comm_time_sta = time.time()
            for name, parameters in sub_net.named_parameters():
                if (parameters.grad is not None):
                    grad_content = parameters.grad.to("cpu")
                    dist.all_reduce(tensor=grad_content, op=dist.ReduceOp.SUM)
                    grad_content = grad_content / worker_num
                    parameters.grad = grad_content.to(device)
            comm_time_ed = time.time()
            sub_optimizer.step()
            sub_optimizer.zero_grad()
            print("iter=", iter_n)
            iter_n = iter_n + 1

            if iter_n == 5:
                print("Stop")
                cuda.profile_stop()
            if iter_n % 10 == 0:
                ed = time.time()
                print("iter_n=", iter_n, " time=", (ed - sta * 1.0))
Example #5
    def main(self):
        self.stats.start()
        self.dynamic_adjustment.start()

        if Config.PLAY_MODE:
            for trainer in self.trainers:
                trainer.enabled = False

        learning_rate_multiplier = (
            Config.LEARNING_RATE_END -
            Config.LEARNING_RATE_START) / Config.ANNEALING_EPISODE_COUNT
        beta_multiplier = (Config.BETA_END -
                           Config.BETA_START) / Config.ANNEALING_EPISODE_COUNT

        while self.stats.episode_count.value < Config.EPISODES:

            #CUDA PROFILING - GUY
            if self.stats.episode_count.value == 1000:
                cuda.profile_start()

            if self.stats.episode_count.value == 2000:
                cuda.profile_stop()
            #CUDA PROFILING - GUY

            step = min(self.stats.episode_count.value,
                       Config.ANNEALING_EPISODE_COUNT - 1)
            self.model.learning_rate = Config.LEARNING_RATE_START + learning_rate_multiplier * step
            self.model.beta = Config.BETA_START + beta_multiplier * step

            # Saving is async - even if we start saving at a given episode, we may save the model at a later episode
            if Config.SAVE_MODELS and self.stats.should_save_model.value > 0:
                self.save_model()
                self.stats.should_save_model.value = 0

            time.sleep(0.01)

        self.dynamic_adjustment.exit_flag = True
        while self.agents:
            self.remove_agent()
        while self.predictors:
            self.remove_predictor()
        while self.trainers:
            self.remove_trainer()
Example #6
    def _fit(
            self,
            train_iter: data_io.ParallelBucketSentenceIter,
            val_iter: data_io.ParallelBucketSentenceIter,
            output_folder: str,
            max_params_files_to_keep: int,
            metrics: List[AnyStr],
            max_updates: int,
            checkpoint_frequency: int,
            max_num_not_improved: int,
            min_num_epochs: Optional[int] = None,
            # <EcoSys> Parametrizing profiler
            profiler_on: bool = False,
            profiler_start: int = 4500,
            profiler_stop: int = 4600,
            # </EcoSys>
            mxmonitor: Optional[mx.monitor.Monitor] = None):
        """
        Internal fit method. Runtime determined by early stopping.

        :param train_iter: Training data iterator.
        :param val_iter: Validation data iterator.
        :param output_folder: Model output folder.
        :param max_params_files_to_keep: Maximum number of params files to keep in the output folder (last n are kept).
        :param metrics: List of metric names to track on training and validation data.
        :param max_updates: Maximum number of batches to process.
        :param checkpoint_frequency: Frequency of checkpointing.
        :param max_num_not_improved: Maximum number of checkpoints until fitting is stopped if model does not improve,
               -1 for no early stopping.
        :param min_num_epochs: Minimum number of epochs to train, even if validation scores did not improve.
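        :param profiler_on: Whether to enable the profiling window (EcoSys addition).
        :param profiler_start: Update count at which profiling starts.
        :param profiler_stop: Update count at which profiling stops and training exits.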
        :param mxmonitor: Optional MXNet monitor instance.
        """
        metric_train = self._create_eval_metric(metrics)
        metric_val = self._create_eval_metric(metrics)

        tic = time.time()

        training_state_dir = os.path.join(output_folder,
                                          C.TRAINING_STATE_DIRNAME)
        if os.path.exists(training_state_dir):
            train_state = self.load_checkpoint(training_state_dir, train_iter)
        else:
            train_state = _TrainingState(num_not_improved=0,
                                         epoch=0,
                                         checkpoint=0,
                                         updates=0,
                                         samples=0)

        next_data_batch = train_iter.next()
        logfile = expanduser("~") + "/profiler-" + str(plt.node()) + ".json"
        mx.profiler.profiler_set_config(mode='all', filename=logfile)

        while max_updates == -1 or train_state.updates < max_updates:

            # <EcoSys> Added the profiler start and end point.
            if profiler_on:
                import numba.cuda as cuda

                if train_state.updates == profiler_start:
                    cuda.profile_start()
                    mx.profiler.profiler_set_state('run')
                if train_state.updates == profiler_stop:
                    mx.profiler.profiler_set_state('stop')
                    mx.profiler.dump_profile()
                    cuda.profile_stop()
                    exit()
            # </EcoSys>

            if not train_iter.iter_next():
                train_state.epoch += 1
                train_iter.reset()

            # process batch
            batch = next_data_batch

            if mxmonitor is not None:
                mxmonitor.tic()

            self.module.forward_backward(batch)
            self.module.update()

            if mxmonitor is not None:
                results = mxmonitor.toc()
                if results:
                    for _, k, v in results:
                        logger.info('Monitor: Batch [{:d}] {:s} {:s}'.format(
                            train_state.updates, k, v))

            if train_iter.iter_next():
                # pre-fetch next batch
                next_data_batch = train_iter.next()
                self.module.prepare(next_data_batch)

            self.module.update_metric(metric_train, batch.label)

            self.training_monitor.batch_end_callback(train_state.epoch,
                                                     train_state.updates,
                                                     metric_train)
            train_state.updates += 1
            train_state.samples += train_iter.batch_size

            if train_state.updates > 0 and train_state.updates % checkpoint_frequency == 0:
                train_state.checkpoint += 1
                self._save_params(output_folder, train_state.checkpoint)
                cleanup_params_files(
                    output_folder, max_params_files_to_keep,
                    train_state.checkpoint,
                    self.training_monitor.get_best_checkpoint())
                self.training_monitor.checkpoint_callback(
                    train_state.checkpoint, metric_train)

                toc = time.time()
                logger.info(
                    "Checkpoint [%d]\tUpdates=%d Epoch=%d Samples=%d Time-cost=%.3f",
                    train_state.checkpoint, train_state.updates,
                    train_state.epoch, train_state.samples, (toc - tic))
                tic = time.time()

                for name, val in metric_train.get_name_value():
                    logger.info('Checkpoint [%d]\tTrain-%s=%f',
                                train_state.checkpoint, name, val)
                metric_train.reset()

                # evaluation on validation set
                has_improved, best_checkpoint = self._evaluate(
                    train_state, val_iter, metric_val)
                if self.lr_scheduler is not None:
                    self.lr_scheduler.new_evaluation_result(has_improved)

                if has_improved:
                    best_path = os.path.join(output_folder, C.PARAMS_BEST_NAME)
                    if os.path.lexists(best_path):
                        os.remove(best_path)
                    actual_best_fname = C.PARAMS_NAME % best_checkpoint
                    os.symlink(actual_best_fname, best_path)
                    train_state.num_not_improved = 0
                else:
                    train_state.num_not_improved += 1
                    logger.info("Model has not improved for %d checkpoints",
                                train_state.num_not_improved)

                if max_num_not_improved >= 0 and train_state.num_not_improved >= max_num_not_improved:
                    logger.info(
                        "Maximum number of not improved checkpoints (%d) reached: %d",
                        max_num_not_improved, train_state.num_not_improved)
                    stop_fit = True

                    if min_num_epochs is not None and train_state.epoch < min_num_epochs:
                        logger.info(
                            "Minimum number of epochs (%d) not reached yet: %d",
                            min_num_epochs, train_state.epoch)
                        stop_fit = False

                    if stop_fit:
                        logger.info("Stopping fit")
                        self.training_monitor.stop_fit_callback()
                        final_training_state_dirname = os.path.join(
                            output_folder, C.TRAINING_STATE_DIRNAME)
                        if os.path.exists(final_training_state_dirname):
                            shutil.rmtree(final_training_state_dirname)
                        break

                self._checkpoint(train_state, output_folder, train_iter)
        cleanup_params_files(output_folder, max_params_files_to_keep,
                             train_state.checkpoint,
                             self.training_monitor.get_best_checkpoint())
Example #7
def main():
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)

    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model,
                            test_ratings,
                            test_negs,
                            args.topk,
                            use_cuda=use_cuda,
                            processes=args.processes)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    for epoch in range(args.epochs):

        model.train()
        losses = utils.AverageMeter()

        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        length = len(loader)
        if length < 101:
            print(
                'Exiting, cannot profile the required 100 iterations. Please re-run with a smaller batch size.'
            )
            cuda.profile_stop()
            exit()
        for batch_index, (user, item, label) in enumerate(loader):
            if batch_index == length // 2 and epoch == 0:
                print('Starting profiling for 100 iterations.')
                cuda.profile_start()

            if batch_index == length // 2 + 100 and epoch == 0:
                print(
                    'Profiling completed, stopping profiling and continuing training.'
                )
                cuda.profile_stop()

            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save stats to file
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=np.mean(hits),
                ndcg=np.mean(ndcgs),
                train_time=train_time,
                val_time=val_time))

        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                return 0
Example #8
    def __call__(self, param):
        import numba.cuda as cuda
        if self.nbatch == param.nbatch and self.nepoch == param.epoch:
            cuda.profile_stop()
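
The snippet in Example #8 looks like the __call__ of an MXNet batch-end callback: it receives a BatchEndParam-style object with .epoch and .nbatch and stops the profiler at a chosen batch. Here is a hypothetical sketch of the surrounding class and how it might be registered; the class name ProfileStopCallback and the fit() wiring are assumptions, only the __call__ body comes from the example.

import numba.cuda as cuda


class ProfileStopCallback(object):
    """Stop the CUDA profiler once a given (epoch, batch) is reached."""

    def __init__(self, nepoch, nbatch):
        self.nepoch = nepoch
        self.nbatch = nbatch

    def __call__(self, param):
        # param is expected to carry .epoch and .nbatch, as mx.model.BatchEndParam does.
        if self.nbatch == param.nbatch and self.nepoch == param.epoch:
            cuda.profile_stop()


# Hypothetical wiring with an mx.mod.Module (mod and train_iter assumed defined):
# mod.fit(train_iter, num_epoch=2,
#         batch_end_callback=ProfileStopCallback(nepoch=1, nbatch=200))
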
Example #9
def train(train_loader, model, criterion, optimizer, epoch, args):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    length = len(train_loader)
    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        if i == length // 2:
            print('Starting profiling for 100 iterations.')
            cuda.profile_start()
        if i == length // 2 + 100:
            print('Profiling completed, stopping profiling and exiting.')
            cuda.profile_stop()
            exit()

        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            input = input.cuda(args.gpu, non_blocking=True)
        target = target.cuda(args.gpu, non_blocking=True)

        # compute output
        output = model(input)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        top5.update(acc5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch,
                      i,
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses,
                      top1=top1,
                      top5=top5))
Example #10
from timeit import default_timer as timer

import numpy as np
from numba import jit
from numba.cuda import profile_stop  # assumed source of the bare profile_stop() call below


@jit
def mandel(x, y, max_iters):
    # Escape-time iteration: return the iteration at which the point escapes,
    # or 255 if it stays bounded for max_iters iterations.
    c = complex(x, y)
    z = 0.0j
    for i in range(max_iters):
        z = z * z + c
        if (z.real * z.real + z.imag * z.imag) >= 4:
            return i

    return 255


@jit
def create_fractal(min_x, max_x, min_y, max_y, image, iters):
    height = image.shape[0]
    width = image.shape[1]

    pixel_size_x = (max_x - min_x) / width
    pixel_size_y = (max_y - min_y) / height
    for x in range(width):
        real = min_x + x * pixel_size_x
        for y in range(height):
            imag = min_y + y * pixel_size_y
            color = mandel(real, imag, iters)
            image[y, x] = color

    return image


image = np.zeros((500 * 2, 750 * 2), dtype=np.uint8)
s = timer()
create_fractal(-2.0, 1.0, -1.0, 1.0, image, 20)
e = timer()
print(e - s)

profile_stop()
Example #11
def main():

    args = parse_arguments()

    if args.use_env and 'LOCAL_RANK' in os.environ:
        args.local_rank = int(os.environ['LOCAL_RANK'])
        
    random.seed(args.seed + args.local_rank)
    np.random.seed(args.seed + args.local_rank)
    torch.manual_seed(args.seed + args.local_rank)
    torch.cuda.manual_seed(args.seed + args.local_rank)
    worker_init = WorkerInitObj(args.seed + args.local_rank)

    device, args = setup_training(args)
    dllogger.log(step="PARAMETER", data={"Config": [str(args)]})

    # Prepare optimizer
    model, optimizer, lr_scheduler, checkpoint, global_step, criterion = prepare_model_and_optimizer(args, device)

    if is_main_process():
        dllogger.log(step="PARAMETER", data={"SEED": args.seed})

    raw_train_start = time.time()
    if args.do_train:
        if is_main_process():
            dllogger.log(step="PARAMETER", data={"train_start": True})
            dllogger.log(step="PARAMETER", data={"batch_size_per_gpu": args.train_batch_size})
            dllogger.log(step="PARAMETER", data={"learning_rate": args.learning_rate})

        model.train()
        most_recent_ckpts_paths = []
        average_loss = 0.0  # averaged loss every args.log_freq steps
        epoch = 0
        training_steps = 0

        pool = ProcessPoolExecutor(1)

        running_total = 0
        running_count = 0
        # Note: We loop infinitely over epochs, termination is handled via iteration count
        while True:
            thread = None
            if not args.resume_from_checkpoint or epoch > 0 or (args.phase2 and global_step < 1) or args.init_checkpoint:
                files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
                         os.path.isfile(os.path.join(args.input_dir, f)) and 'training' in f]
                files.sort()
                num_files = len(files)
                random.shuffle(files)
                f_start_id = 0
            else:
                f_start_id = checkpoint['files'][0]
                files = checkpoint['files'][1:]
                args.resume_from_checkpoint = False
                num_files = len(files)


            shared_file_list = {}

            if torch.distributed.is_initialized() and torch.distributed.get_world_size() > num_files:
                remainder = torch.distributed.get_world_size() % num_files
                data_file = files[(f_start_id*torch.distributed.get_world_size()+torch.distributed.get_rank() + remainder*f_start_id)%num_files]
            else:
                data_file = files[(f_start_id*torch.distributed.get_world_size()+torch.distributed.get_rank())%num_files]

            previous_file = data_file

            train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                          batch_size=args.train_batch_size * args.n_gpu,
                                          num_workers=4, worker_init_fn=worker_init,
                                          pin_memory=True)
            # shared_file_list["0"] = (train_dataloader, data_file)

            overflow_buf = None
            if args.allreduce_post_accumulation:
                overflow_buf = torch.cuda.IntTensor([0])
            
            if len(files) == 1:
                f_start_id = -1

            for f_id in range(f_start_id + 1, len(files)):

                if torch.distributed.get_world_size() > num_files:
                    data_file = files[(f_id*torch.distributed.get_world_size()+torch.distributed.get_rank() + remainder*f_id)%num_files]
                else:
                    data_file = files[(f_id*torch.distributed.get_world_size()+torch.distributed.get_rank())%num_files]

                previous_file = data_file

                dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init)

                train_iter = tqdm(train_dataloader, desc="Iteration", disable=args.disable_progress_bar) if is_main_process() else train_dataloader
                for step, batch in enumerate(train_iter):
                    if global_step >= 500:
                        batch_start_time = time.time()

                    # profile the file if it has at least 200 batches
                    if args.profile and len(train_dataloader.dataset) > 200 and step == 100:
                        print("Profiling the kernel for 100 iterations")
                        cuda.profile_start()

                    if args.profile and len(train_dataloader.dataset) > 200 and step == 200:
                        cuda.profile_stop()
                        print("Profiling complete, exiting")
                        exit()

                    training_steps += 1
                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    prediction_scores, seq_relationship_score = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
                    loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels)
                    if args.n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.

                    divisor = args.gradient_accumulation_steps
                    if args.gradient_accumulation_steps > 1:
                        if not args.allreduce_post_accumulation:
                            # this division was merged into predivision
                            loss = loss / args.gradient_accumulation_steps
                            divisor = 1.0
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer, delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    average_loss += loss.item()

                    if training_steps % args.gradient_accumulation_steps == 0:
                        lr_scheduler.step()  # learning rate warmup
                        global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)

                    if global_step >= args.max_steps:
                        train_time_raw = time.time() - raw_train_start
                        last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
                        last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
                        average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
                        average_loss = average_loss / (last_num_steps * divisor)
                        if (torch.distributed.is_initialized()):
                            average_loss /= torch.distributed.get_world_size()
                            torch.distributed.all_reduce(average_loss)
                        final_loss = average_loss.item()
                        if is_main_process():
                            dllogger.log(step=(epoch, global_step, ), data={"final_loss": final_loss})
                    elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
                        if is_main_process():
                            dllogger.log(step=(epoch, global_step, ), data={"average_loss": average_loss / (args.log_freq * divisor),
                                                                            "step_loss": loss.item() * args.gradient_accumulation_steps / divisor,
                                                                            "learning_rate": optimizer.param_groups[0]['lr']})
                        average_loss = 0

                    if global_step >= args.max_steps or training_steps % (
                            args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0:
                        if is_main_process() and not args.skip_checkpoint:
                            # Save a trained model
                            dllogger.log(step="PARAMETER", data={"checkpoint_step": global_step})
                            model_to_save = model.module if hasattr(model,
                                                                    'module') else model  # Only save the model it-self
                            if args.resume_step < 0 or not args.phase2:
                                output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
                            else:
                                output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step))
                            if args.do_train:
                                torch.save({'model': model_to_save.state_dict(),
                                            'optimizer': optimizer.state_dict(),
                                            'master params': list(amp.master_params(optimizer)),
                                            'files': [f_id] + files}, output_save_file)

                                most_recent_ckpts_paths.append(output_save_file)
                                if len(most_recent_ckpts_paths) > 3:
                                    ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
                                    os.remove(ckpt_to_be_removed)

                        if global_step >= args.max_steps:
                            del train_dataloader
                            # thread.join()
                            return args, final_loss, train_time_raw
                    # give some warmup period and record throughput
                    if global_step > 500:
                        batch_duration = time.time() - batch_start_time
                        running_total += args.train_batch_size * args.max_seq_length
                        running_count += batch_duration
                del train_dataloader
                print("Running throughput average:", running_total/running_count)
                # thread.join()
                # Make sure pool has finished and switch train_dataloader
                # NOTE: Will block until complete
                train_dataloader, data_file = dataset_future.result(timeout=None)

            epoch += 1
Example #12
def train():
    # kvstore
    kv = mx.kvstore.create(args.kv_store)

    model_prefix = args.model_prefix
    if model_prefix is not None:
        model_prefix += "-%d" % (kv.rank)
    save_model_prefix = args.save_model_prefix
    if save_model_prefix is None:
        save_model_prefix = model_prefix

    log_config(args.log_dir, args.log_file, save_model_prefix, kv.rank)

    devs = mx.cpu() if args.gpus is None else [
        mx.gpu(int(i)) for i in args.gpus.split(',')
    ]

    epoch_size = args.num_examples / args.batch_size

    if args.kv_store == 'dist_sync':
        epoch_size /= kv.num_workers

    # disable kvstore for single device
    if 'local' in kv.type and (args.gpus is None
                               or len(args.gpus.split(',')) == 1):
        kv = None

    # module
    dataiter = rl_data.GymDataIter('Breakout-v0',
                                   args.batch_size,
                                   args.input_length,
                                   web_viz=True)
    net = sym.get_symbol_atari(dataiter.act_dim)
    module = mx.mod.Module(net,
                           data_names=[d[0] for d in dataiter.provide_data],
                           label_names=('policy_label', 'value_label'),
                           context=devs)
    module.bind(data_shapes=dataiter.provide_data,
                label_shapes=[('policy_label', (args.batch_size, )),
                              ('value_label', (args.batch_size, 1))],
                grad_req='add')

    # load model

    if args.load_epoch is not None:
        assert model_prefix is not None
        _, arg_params, aux_params = mx.model.load_checkpoint(
            model_prefix, args.load_epoch)
    else:
        arg_params = aux_params = None

    # save model
    checkpoint = None if save_model_prefix is None else mx.callback.do_checkpoint(
        save_model_prefix)

    init = mx.init.Mixed(['fc_value_weight|fc_policy_weight', '.*'], [
        mx.init.Uniform(0.001),
        mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2)
    ])
    module.init_params(initializer=init,
                       arg_params=arg_params,
                       aux_params=aux_params)

    # optimizer
    module.init_optimizer(kvstore=kv,
                          optimizer='adam',
                          optimizer_params={
                              'learning_rate': args.lr,
                              'wd': args.wd,
                              'epsilon': 1e-3
                          })

    # logging
    np.set_printoptions(precision=3, suppress=True)

    T = 0
    dataiter.reset()
    score = np.zeros((args.batch_size, 1))
    final_score = np.zeros((args.batch_size, 1))
    iteration = 0
    for epoch in range(args.num_epochs):
        if save_model_prefix:
            module.save_params('%s-%04d.params' % (save_model_prefix, epoch))

        for _ in range(int(epoch_size / args.t_max)):

            # <EcoSys> Added this profiling check.
            if iteration == args.profile_start:
                print("Profile start.")
                cuda.profile_start()
            elif iteration == args.profile_stop:
                print("Calling profile_stop().")
                cuda.profile_stop()
                print("Done calling profile_stop().")
            # </EcoSys>

            tic = time.time()
            # clear gradients
            for exe in module._exec_group.grad_arrays:
                for g in exe:
                    g[:] = 0

            S, A, V, r, D = [], [], [], [], []
            for t in range(args.t_max + 1):
                data = dataiter.data()
                module.forward(mx.io.DataBatch(data=data, label=None),
                               is_train=False)
                act, _, val = module.get_outputs()
                V.append(val.asnumpy())
                if t < args.t_max:
                    act = act.asnumpy()
                    act = [
                        np.random.choice(dataiter.act_dim, p=act[i])
                        for i in range(act.shape[0])
                    ]
                    reward, done = dataiter.act(act)
                    S.append(data)
                    A.append(act)
                    r.append(reward.reshape((-1, 1)))
                    D.append(done.reshape((-1, 1)))

            err = 0
            R = V[args.t_max]
            for i in reversed(range(args.t_max)):
                R = r[i] + args.gamma * (1 - D[i]) * R
                adv = np.tile(R - V[i], (1, dataiter.act_dim))

                batch = mx.io.DataBatch(
                    data=S[i], label=[mx.nd.array(A[i]),
                                      mx.nd.array(R)])
                module.forward(batch, is_train=True)

                pi = module.get_outputs()[1]
                h = -args.beta * (mx.nd.log(pi + 1e-7) * pi)
                out_acts = np.amax(pi.asnumpy(), 1)
                out_acts = np.reshape(out_acts, (-1, 1))
                out_acts_tile = np.tile(-np.log(out_acts + 1e-7),
                                        (1, dataiter.act_dim))
                module.backward([mx.nd.array(out_acts_tile * adv), h])

                print('pi', pi[0].asnumpy())
                print('h', h[0].asnumpy())
                err += (adv**2).mean()
                score += r[i]
                final_score *= (1 - D[i])
                final_score += score * D[i]
                score *= 1 - D[i]
                T += D[i].sum()

            module.update()
            logging.info('fps: %f err: %f score: %f final: %f T: %f' %
                         (args.batch_size / (time.time() - tic), err /
                          args.t_max, score.mean(), final_score.mean(), T))
            print(score.squeeze())
            print(final_score.squeeze())
            iteration += 1
Example #13
def train_step(sess, train_op, global_step, train_step_kwargs):
    """Function that takes a gradient step and specifies whether to stop.
  Args:
    sess: The current session.
    train_op: An `Operation` that evaluates the gradients and returns the
      total loss.
    global_step: A `Tensor` representing the global training step.
    train_step_kwargs: A dictionary of keyword arguments.
  Returns:
    The total loss and a boolean indicating whether or not to stop training.
  Raises:
    ValueError: if 'should_trace' is in `train_step_kwargs` but `logdir` is not.
  """
    start_time = time.time()

    trace_run_options = None
    run_metadata = None
    if 'should_trace' in train_step_kwargs:
        if 'logdir' not in train_step_kwargs:
            raise ValueError(
                'logdir must be present in train_step_kwargs when '
                'should_trace is present')
        if sess.run(train_step_kwargs['should_trace']):
            trace_run_options = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.FULL_TRACE)
            run_metadata = config_pb2.RunMetadata()

    total_loss, np_global_step = sess.run([train_op, global_step],
                                          options=trace_run_options,
                                          run_metadata=run_metadata)
    time_elapsed = time.time() - start_time

    if 'nvprof_on' in train_step_kwargs:
        import numba.cuda as cuda
        if np_global_step == train_step_kwargs['nvprof_start_step']:
            cuda.profile_start()
        if np_global_step == train_step_kwargs['nvprof_stop_step']:
            cuda.profile_stop()

    if run_metadata is not None:
        tl = timeline.Timeline(run_metadata.step_stats)
        trace = tl.generate_chrome_trace_format()
        trace_filename = os.path.join(train_step_kwargs['logdir'],
                                      'tf_trace-%d.json' % np_global_step)
        logging.info('Writing trace to %s', trace_filename)
        file_io.write_string_to_file(trace_filename, trace)
        if 'summary_writer' in train_step_kwargs:
            train_step_kwargs['summary_writer'].add_run_metadata(
                run_metadata, 'run_metadata-%d' % np_global_step)

    if 'should_log' in train_step_kwargs:
        if sess.run(train_step_kwargs['should_log']):
            logging.info('global step %d: loss = %.4f (%.3f sec/step)',
                         np_global_step, total_loss, time_elapsed)

    # TODO(nsilberman): figure out why we can't put this into sess.run. The
    # issue right now is that the stop check depends on the global step. The
    # increment of global step often happens via the train op, which is
    # created using optimizer.apply_gradients.
    #
    # Since running `train_op` causes the global step to be incremented, one
    # would expected that using a control dependency would allow the
    # should_stop check to be run in the same session.run call:
    #
    #   with ops.control_dependencies([train_op]):
    #     should_stop_op = ...
    #
    # However, this actually seems not to work on certain platforms.
    if 'should_stop' in train_step_kwargs:
        should_stop = sess.run(train_step_kwargs['should_stop'])
    else:
        should_stop = False

    return total_loss, should_stop
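
The profiling branch in Example #13 only fires when the nvprof keys are present in train_step_kwargs. A minimal sketch of a kwargs dict that would trigger it follows; the step values are placeholders.

# Presence of 'nvprof_on' is what enables the branch; the two step values pick the window.
train_step_kwargs = {
    'nvprof_on': True,
    'nvprof_start_step': 500,   # cuda.profile_start() when the global step reaches 500
    'nvprof_stop_step': 600,    # cuda.profile_stop() when the global step reaches 600
}

# total_loss, should_stop = train_step(sess, train_op, global_step, train_step_kwargs)
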
Example #14
def train_proc(conv_wid, conv_wn, fc_wid, fc_wn, wid, wn, pred_wid, succ_wid,
               bs, subbs, pd, input_shp, output_shp, sub_net, fp_head_list,
               fp_tail_list, bp_head_list, bp_tail_list, shared_cnters,
               train_step, global_step, sta_lidx, end_lidx):

    pid = os.getpid()
    print("train_proc pid=", pid)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    optimizer = optim.SGD(sub_net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    iter_thresh = int(bs / subbs)
    fp_iter = 0
    bp_iter = 0
    inputs = None
    outputs = None
    if sta_lidx == 0:
        fake_input = torch.randn(input_shp)
        print(fake_input.size())
    if end_lidx == -1:
        fake_target = torch.from_numpy(
            np.random.randint(0, 999, size=int(subbs / pd)))
        criterion = nn.CrossEntropyLoss()
        print(fake_target.size())
    qu = Queue.Queue()
    local_step = 0
    sta = time.time()
    prof_on = False
    #with torch.autograd.profiler.emit_nvtx():
    while True:
        if local_step == 5 and prof_on == False:
            cuda.profile_start()
            prof_on = True
            print("Prof Start")
        if local_step == 10 and prof_on == True:
            cuda.profile_stop()
            prof_on = False
            print("Prof Stop")

        if not (local_step == global_step):
            time.sleep(0.001)
            continue
        if wid == 0 or wid == 1:
            # Check BP first, then FP
            #fp_head_tensor_list, fp_tail_tensor_list, bp_head_tensor_list, bp_tail_tensor_list
            if bp_iter < fp_iter:
                if bp_iter < shared_cnters[3]:
                    backward_ctx = bp_tail_list[bp_iter].cuda()
                    outputs = qu.get()
                    outputs.backward(backward_ctx)
                    bp_iter += 1
                    #print(wid," ", rank, "  bp complete  ", fp_iter, " ", bp_iter)
                if bp_iter == iter_thresh:
                    #bp_to_recv has reached bs, then it is time to update grad and reset cnter
                    optimizer.step()
                    optimizer.zero_grad()
                    train_step += 1
                    fp_iter = 0
                    bp_iter = 0
                    shared_cnters[3].zero_()
                    local_step += 1
                    #print(wid, " ", sync_iter)
            #FP has not reached the threshold and can be executed
            if fp_iter < shared_cnters[0]:
                inputs = fake_input.cuda()
                outputs = sub_net(inputs)
                fp_tail_list[fp_iter].copy_(outputs)
                qu.put(outputs)
                shared_cnters[1] += 1
                fp_iter += 1
                #print(wid, "  fp complete  ", fp_iter, "  ", bp_iter)
        elif wid == wn - 1:
            #print("last worker")
            #FP has not reached the threshold and can be executed
            if fp_iter < shared_cnters[0]:
                fp_head_list[fp_iter].requires_grad = True
                inputs = fp_head_list[fp_iter].cuda()
                outputs = sub_net(inputs)
                #shared_cnters[1] += 1
                fp_iter += 1
                target = fake_target.cuda()
                loss = criterion(outputs, target)
                loss.backward()
                #print(HookFunc.hook_dict)
                #time.sleep(5)
                #bp_ctx = HookFunc.hook_dict[pid]
                if HookFunc.hook_dict[pid] is not None:
                    #should be forked
                    bp_head_list[bp_iter].copy_(HookFunc.hook_dict[pid])
                    HookFunc.hook_dict[pid] = None
                    shared_cnters[2] += 1
                else:
                    print("Err")
                    exit(-1)
                bp_iter += 1
                if bp_iter == iter_thresh:
                    #bp_to_recv has reached bs, then it is time to update grad and reset cnter
                    optimizer.step()
                    optimizer.zero_grad()
                    train_step += 1
                    global_step += 1
                    fp_iter = 0
                    bp_iter = 0
                    shared_cnters[0].zero_()
                    local_step += 1
                    #print("wid={:d} global_step={:d}".format( int(wid), int(global_step) ))

        else:
            #middle
            #print("ff ", fp_iter, "  ", shared_cnters[0], " ", bp_iter)
            if bp_iter < fp_iter:
                #print("Pre fp vs bp ", fp_iter, " ", bp_iter)
                if bp_iter < shared_cnters[3]:
                    backward_ctx = bp_tail_list[bp_iter].cuda()
                    outputs = qu.get()
                    outputs.backward(backward_ctx)
                    #bp_ctx = HookFunc.hook_dict[pid]
                    #exec('bp_ctx = HookFunc_{}.hook_dict["backward_ctx"]'.format(rank))
                    if HookFunc.hook_dict[pid] is not None:
                        #should be forked
                        bp_head_list[bp_iter].copy_(HookFunc.hook_dict[pid])
                        #exec('HookFunc_{}.hook_dict["backward_ctx"]=None'.format(rank))
                        HookFunc.hook_dict[pid] = None
                        shared_cnters[2] += 1
                    else:
                        print("Err")
                        exit(-1)
                    bp_iter += 1
                    #print("fp vs bp ", fp_iter, " ", bp_iter)
                if bp_iter == iter_thresh:
                    #bp_to_recv has reached bs, then it is time to update grad and reset cnter
                    optimizer.step()
                    optimizer.zero_grad()
                    train_step += 1
                    global_step += 1
                    fp_iter = 0
                    bp_iter = 0
                    shared_cnters[0].zero_()
                    shared_cnters[3].zero_()
                    local_step += 1
                    #print("wid={:d} global_step={:d}".format(int(wid), int(global_step)))

            #FP has not reached the threshold and can be executed
            #print("ff ", fp_iter, "  ", shared_cnters[0])
            if fp_iter < shared_cnters[0]:
                fp_head_list[fp_iter].requires_grad = True
                inputs = fp_head_list[fp_iter].cuda()
                outputs = sub_net(inputs)
                qu.put(outputs)
                #print("debug: ", outputs.size(), output_shp)
                fp_tail_list[fp_iter].copy_(outputs)
                shared_cnters[1] += 1
                fp_iter += 1
Example #15
def train(hparams, scope=None, target_session="", single_cell_fn=None):
    """Train a translation model."""
    log_device_placement = hparams.log_device_placement
    out_dir = hparams.out_dir
    num_train_steps = hparams.num_train_steps
    steps_per_stats = hparams.steps_per_stats
    steps_per_external_eval = hparams.steps_per_external_eval
    steps_per_eval = 10 * steps_per_stats
    if not steps_per_external_eval:
        steps_per_external_eval = 5 * steps_per_eval

    if not hparams.attention:
        model_creator = nmt_model.Model
    elif hparams.attention_architecture == "standard":
        model_creator = attention_model.AttentionModel
    elif hparams.attention_architecture in ["gnmt", "gnmt_v2"]:
        model_creator = gnmt_model.GNMTModel
    else:
        raise ValueError("Unknown model architecture")

    train_model = create_train_model(model_creator, hparams, scope,
                                     single_cell_fn)
    eval_model = create_eval_model(model_creator, hparams, scope,
                                   single_cell_fn)
    infer_model = inference.create_infer_model(model_creator, hparams, scope,
                                               single_cell_fn)

    # Preload data for sample decoding.
    dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src)
    dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt)
    sample_src_data = inference.load_data(dev_src_file)
    sample_tgt_data = inference.load_data(dev_tgt_file)

    summary_name = "train_log"
    model_dir = hparams.out_dir

    # Log and output files
    log_file = os.path.join(out_dir, "log_%d" % time.time())
    log_f = tf.gfile.GFile(log_file, mode="a")
    utils.print_out("# log_file=%s" % log_file, log_f)

    avg_step_time = 0.0

    # TensorFlow model
    config_proto = utils.get_config_proto(
        log_device_placement=log_device_placement)

    train_sess = tf.Session(target=target_session,
                            config=config_proto,
                            graph=train_model.graph)
    eval_sess = tf.Session(target=target_session,
                           config=config_proto,
                           graph=eval_model.graph)
    infer_sess = tf.Session(target=target_session,
                            config=config_proto,
                            graph=infer_model.graph)

    with train_model.graph.as_default():
        loaded_train_model, global_step = model_helper.create_or_load_model(
            train_model.model, model_dir, train_sess, "train")

    # Summary writer
    summary_writer = tf.summary.FileWriter(os.path.join(out_dir, summary_name),
                                           train_model.graph)

    # First evaluation
    run_full_eval(model_dir, infer_model, infer_sess, eval_model, eval_sess,
                  hparams, summary_writer, sample_src_data, sample_tgt_data)

    last_stats_step = global_step
    last_eval_step = global_step
    last_external_eval_step = global_step

    # This is the training loop.
    step_time, checkpoint_loss, checkpoint_predict_count = 0.0, 0.0, 0.0
    # <EcoSys> Added the measurements on total number of samples.
    checkpoint_total_count, checkpoint_total_samples = 0.0, 0.0
    # checkpoint_total_count = 0.0
    # </EcoSys>
    speed, train_ppl = 0.0, 0.0
    start_train_time = time.time()

    utils.print_out(
        "# Start step %d, lr %g, %s" %
        (global_step, loaded_train_model.learning_rate.eval(
            session=train_sess), time.ctime()), log_f)

    # Initialize all of the iterators
    skip_count = hparams.batch_size * hparams.epoch_step
    utils.print_out("# Init train iterator, skipping %d elements" % skip_count)
    train_sess.run(train_model.iterator.initializer,
                   feed_dict={train_model.skip_count_placeholder: skip_count})

    while global_step < num_train_steps:

        # <EcoSys> Added the profiler start and end point.
        import numba.cuda as cuda

        if global_step == 501:
            cuda.profile_start()
        if global_step == 511:
            cuda.profile_stop()
        # </EcoSys>

        ### Run a step ###
        start_time = time.time()
        try:
            step_result = loaded_train_model.train(train_sess)
            (_, step_loss, step_predict_count, step_summary, global_step,
             step_word_count, batch_size) = step_result
            hparams.epoch_step += 1
        except tf.errors.OutOfRangeError:
            # Finished going through the training dataset.  Go to next epoch.
            hparams.epoch_step = 0
            utils.print_out(
                "# Finished an epoch, step %d. Perform external evaluation" %
                global_step)
            run_sample_decode(infer_model, infer_sess, model_dir, hparams,
                              summary_writer, sample_src_data, sample_tgt_data)
            dev_scores, test_scores, _ = run_external_eval(
                infer_model, infer_sess, model_dir, hparams, summary_writer)
            train_sess.run(train_model.iterator.initializer,
                           feed_dict={train_model.skip_count_placeholder: 0})
            continue

        # Write step summary.
        summary_writer.add_summary(step_summary, global_step)

        # update statistics
        step_time += (time.time() - start_time)

        checkpoint_loss += (step_loss * batch_size)
        checkpoint_predict_count += step_predict_count
        checkpoint_total_count += float(step_word_count)
        # <EcoSys> Increase the total number of samples by batch size.
        checkpoint_total_samples += float(batch_size)
        # </EcoSys>

        # Once in a while, we print statistics.
        if global_step - last_stats_step >= steps_per_stats:
            last_stats_step = global_step

            # Print statistics for the previous epoch.
            avg_step_time = step_time / steps_per_stats
            train_ppl = utils.safe_exp(checkpoint_loss /
                                       checkpoint_predict_count)
            speed = checkpoint_total_count / (1000 * step_time)
            # <EcoSys> Added samples per second to the log file.
            speed_samples_per_sec = checkpoint_total_samples / (step_time)

            utils.print_out(
                "  global step %d lr %g "
                "step-time %.2fs wps %.2fK sps %5.2f ppl %.2f %s" %
                (global_step,
                 loaded_train_model.learning_rate.eval(session=train_sess),
                 avg_step_time, speed, speed_samples_per_sec, train_ppl,
                 _get_best_results(hparams)), log_f)
            # </EcoSys>
            """
      utils.print_out(
          "  global step %d lr %g "
          "step-time %.2fs wps %.2fK ppl %.2f %s" %
          (global_step,
           loaded_train_model.learning_rate.eval(session=train_sess),
           avg_step_time, speed, train_ppl, _get_best_results(hparams)),
          log_f)
      """
            if math.isnan(train_ppl):
                break

            # Reset timer and loss.
            step_time, checkpoint_loss, checkpoint_predict_count = 0.0, 0.0, 0.0
            checkpoint_total_count = 0.0
            checkpoint_total_samples = 0.0

        if global_step - last_eval_step >= steps_per_eval:
            last_eval_step = global_step

            utils.print_out("# Save eval, global step %d" % global_step)
            utils.add_summary(summary_writer, global_step, "train_ppl",
                              train_ppl)

            # Save checkpoint
            loaded_train_model.saver.save(train_sess,
                                          os.path.join(out_dir,
                                                       "translate.ckpt"),
                                          global_step=global_step)

            # Evaluate on dev/test
            run_sample_decode(infer_model, infer_sess, model_dir, hparams,
                              summary_writer, sample_src_data, sample_tgt_data)
            dev_ppl, test_ppl = run_internal_eval(eval_model, eval_sess,
                                                  model_dir, hparams,
                                                  summary_writer)

        if global_step - last_external_eval_step >= steps_per_external_eval:
            last_external_eval_step = global_step

            # Save checkpoint
            loaded_train_model.saver.save(train_sess,
                                          os.path.join(out_dir,
                                                       "translate.ckpt"),
                                          global_step=global_step)
            run_sample_decode(infer_model, infer_sess, model_dir, hparams,
                              summary_writer, sample_src_data, sample_tgt_data)
            dev_scores, test_scores, _ = run_external_eval(
                infer_model, infer_sess, model_dir, hparams, summary_writer)

    # Done training
    loaded_train_model.saver.save(train_sess,
                                  os.path.join(out_dir, "translate.ckpt"),
                                  global_step=global_step)

    result_summary, _, dev_scores, test_scores, dev_ppl, test_ppl = run_full_eval(
        model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams,
        summary_writer, sample_src_data, sample_tgt_data)
    utils.print_out(
        "# Final, step %d lr %g "
        "step-time %.2f wps %.2fK ppl %.2f, %s, %s" %
        (global_step,
         loaded_train_model.learning_rate.eval(session=train_sess),
         avg_step_time, speed, train_ppl, result_summary, time.ctime()), log_f)
    utils.print_time("# Done training!", start_train_time)

    utils.print_out("# Start evaluating saved best models.")
    for metric in hparams.metrics:
        best_model_dir = getattr(hparams, "best_" + metric + "_dir")
        result_summary, best_global_step, _, _, _, _ = run_full_eval(
            best_model_dir, infer_model, infer_sess, eval_model, eval_sess,
            hparams, summary_writer, sample_src_data, sample_tgt_data)
        utils.print_out(
            "# Best %s, step %d "
            "step-time %.2f wps %.2fK, %s, %s" %
            (metric, best_global_step, avg_step_time, speed, result_summary,
             time.ctime()), log_f)

    summary_writer.close()
    return (dev_scores, test_scores, dev_ppl, test_ppl, global_step)
Example #16
from numba import cuda, float32

# The kernel name, the TPB value, and this preamble are assumptions; only the
# body starting at the threadIdx reads comes from the original snippet.
TPB = 16  # threads per block along each axis of the shared-memory tile


@cuda.jit
def fast_matmul(A, B, C):
    # Shared-memory tiles for sub-blocks of A and B.
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x  # blocks per grid

    if x >= C.shape[0] and y >= C.shape[1]:
        # Quit if (x, y) is outside of valid C boundary
        return

    # Each thread computes one element in the result matrix.
    # The dot product is chunked into dot products of TPB-long vectors.
    tmp = 0.
    for i in range(bpg):
        # Preload data into shared memory
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]

        # Wait until all threads finish preloading
        cuda.syncthreads()

        # Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]

        # Wait until all threads finish computing
        cuda.syncthreads()

    C[x, y] = tmp


cuda.profile_stop()
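
Only the kernel body survives in Example #16, so here is a hedged sketch of the host-side launch that the tiled matmul pattern implies; fast_matmul and TPB refer to the reconstructed header above, and the matrix size is an assumption.

import numpy as np
from numba import cuda

n = 1024
A = np.random.rand(n, n).astype(np.float32)
B = np.random.rand(n, n).astype(np.float32)
C = np.zeros((n, n), dtype=np.float32)

threads_per_block = (TPB, TPB)
blocks_per_grid = ((n + TPB - 1) // TPB, (n + TPB - 1) // TPB)

cuda.profile_start()                 # capture only the kernel launch below
fast_matmul[blocks_per_grid, threads_per_block](A, B, C)
cuda.synchronize()
cuda.profile_stop()
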
Example #17
def gpu_merge(points, err, numobj):
    cuda.profile_start()
    start = time.time()
    #reshape points for gpu
    centre = np.zeros((len(points), 4), dtype='float32')
    related = np.zeros((len(points), len(points)), dtype='int32')
    sources = np.zeros((len(points), numobj, 3), dtype='float32')

    e = 0  #global error
    a_e = []
    #populate arrays for gpu
    for i, p in enumerate(points):
        centre[i, 0] = p[0]
        centre[i, 1] = p[1]
        centre[i, 2] = p[2]
        centre[i, 3] = p[8]
        e += p[8]
        for j, r in enumerate(p[6]):
            if r[0][6] == True:
                for k in range(len(points)):
                    if (points[k][0] == r[1][0]) and (points[k][1] == r[1][1]):
                        related[i, j] = k
                        break
                else:
                    related[i, j] = -1
        for j in range(len(p[6]), len(points) // 2 + 1):
            related[i, j] = -2
        for j, s in enumerate(p[7]):
            sources[i, j, 0] = s[0]
            sources[i, j, 1] = s[1]
            sources[i, j, 2] = s[2]
    end = time.time()
    print('reshape time: {0}'.format(end - start))

    start = time.time()
    #transfer arrays to gpu
    d_results = cuda.device_array((len(points), 7), np.float32)
    d_centre = cuda.to_device(centre)
    d_related = cuda.to_device(related)
    d_sources = cuda.to_device(sources)
    end = time.time()
    print('transfer time: {0}'.format(end - start))

    p = len(points)
    #get grid and block sizes
    b = 32
    g = len(points) // b + 1

    start = time.time()
    while (True):
        #call kernel
        d_get_best[g, b](d_centre, p, d_results, d_related, d_sources, err,
                         numobj)

        results = d_results.copy_to_host()

        a_e.append(e)

        best = np.array([0, 0, 0, 0, 0, 0, err])
        for r in range(results.shape[0]):
            if results[r, 6] < best[6]:
                for q in range(7):
                    best[q] = results[r, q]

        if best[6] + e > err:
            print "Merge criteria met"
            print "Final Error: {0}".format(e)
            break
        else:
            e += best[6]

        h_do_merge(best, points)
        d_best = cuda.to_device(best)
        d_do_merge[g, b](d_best, d_centre, d_related, d_sources, p, numobj,
                         err)
    end = time.time()
    print('compute time: {0}'.format(end - start))
    cuda.profile_stop()
    return (a_e)