def _get_sampler(self, epoch) -> "DistributedSampler":
     if self.split == "train":
         # For video model training, we don't necessarily want to use all possible
         # clips in the video in one training epoch. More often, we randomly
         # sample at most N clips per training video. In practice, N is often 1
         clip_sampler = RandomClipSampler(self.video_clips,
                                          self.clips_per_video)
     else:
         # For video model testing, we sample N evenly spaced clips per test
         # video. We will simply average predictions over them
         clip_sampler = UniformClipSampler(self.video_clips,
                                           self.clips_per_video)
     clip_sampler = MaxLengthClipSampler(clip_sampler,
                                         num_samples=self.num_samples)
     world_size = get_world_size()
     rank = get_rank()
     sampler = DistributedSampler(
         clip_sampler,
         num_replicas=world_size,
         rank=rank,
         shuffle=self.shuffle,
         group_size=self.clips_per_video,
     )
     sampler.set_epoch(epoch)
     return sampler
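A minimal usage sketch (not part of the original snippet) of how a sampler built this way is typically consumed; build_loader and its argument values are illustrative assumptions:

from torch.utils.data import DataLoader

def build_loader(dataset, epoch, batch_size=32, num_workers=4):
    # Re-create the sampler every epoch so that the set_epoch(epoch) call
    # above yields a different shuffle per epoch on every replica.
    sampler = dataset._get_sampler(epoch)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,  # a sampler is mutually exclusive with shuffle=True
        num_workers=num_workers,
    )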
Example #2
    def _log_performance_metrics(self, task: "tasks.ClassyTask") -> None:
        """
        Compute and log performance metrics.
        """
        phase_type = task.phase_type
        batches = len(task.losses)

        if self.start_time is None:
            logging.warning("start_time not initialized")
        else:
            # Average batch time calculation
            total_batch_time = time.time() - self.start_time
            average_batch_time = total_batch_time / batches
            logging.info(
                "Average %s batch time (ms) for %d batches: %d"
                % (phase_type, batches, 1000.0 * average_batch_time)
            )

        # Train step time breakdown
        if not hasattr(task, "perf_stats") or task.perf_stats is None:
            logging.warning('"perf_stats" not set in task')
        elif task.train:
            logging.info(
                "Train step time breakdown (rank {}):\n{}".format(
                    get_rank(), task.perf_stats.report_str()
                )
            )
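For example, if a phase processed 50 batches over 20 seconds of wall-clock time since start_time, the hook reports an average batch time of 1000 * 20 / 50 = 400 ms.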
Example #3
    def _log_loss_meters(self, task: "tasks.ClassyTask",
                         local_variables: Dict[str, Any]):
        """
        Compute and log the loss and meters. Returns the collected list of meters.
        """

        phase_type = task.phase_type
        phase_type_idx = task.train_phase_idx if task.train else task.eval_phase_idx
        batches = len(task.losses)

        # Loss for the phase
        loss = sum(task.losses) / (batches * task.get_batchsize_per_replica())

        log_strs = [
            "Rank: {}, {} phase: {}, processed batches: {}".format(
                get_rank(), phase_type, phase_type_idx, batches),
            "{} loss: {}".format(phase_type, loss),
            "Meters:",
        ]
        acc = []
        for meter in task.meters:
            log_strs.append("{}".format(meter))
            acc.append(meter)
        logging.info("\n".join(log_strs))
        return acc
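For example, with 10 processed batches, a per-replica batch size of 32, and sum(task.losses) equal to 64.0, the reported phase loss is 64.0 / (10 * 32) = 0.2.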
Example #4
 def _get_sampler(self, epoch):
     world_size = get_world_size()
     rank = get_rank()
     sampler = DistributedSampler(self,
                                  num_replicas=world_size,
                                  rank=rank,
                                  shuffle=self.shuffle)
     sampler.set_epoch(epoch)
     return sampler
Example #5
    def __init__(self, buffer_params, temperature: float):
        super(SimclrInfoNCECriterion, self).__init__()

        self.use_gpu = get_cuda_device_index() > -1
        self.temperature = temperature
        self.num_pos = 2
        self.buffer_params = buffer_params
        self.criterion = nn.CrossEntropyLoss()
        self.dist_rank = get_rank()
        self.pos_mask = None
        self.neg_mask = None
        self.precompute_pos_neg_mask()
        logging.info(f"Creating Info-NCE loss on Rank: {self.dist_rank}")
Example #6
 def compute_partition_function(self, out):
     num_items = self.memory.size(0)
     with torch.no_grad():
         batch_mean = out.mean()
         # NOTE: this relies on the "mean" computation being stable and deterministic
         # across all nodes. It could be replaced with a smarter approach.
         if torch.distributed.is_available(
         ) and torch.distributed.is_initialized():
             batch_mean_gathered = gather_from_all(batch_mean)
             all_batch_mean = batch_mean_gathered.mean().squeeze().item()
         else:
             all_batch_mean = batch_mean.item()
     self.params[2] = all_batch_mean * num_items
     Z = self.params[2].clone().detach().item()
     rank = get_rank()
     logging.info(f"Rank: {rank}; Normalization constant Z is set to {Z}")
Example #7
 def __init__(self, cfg: AttrDict, data_source: str, path: str, split: str,
              dataset_name: str):
     super(AirstoreDataset, self).__init__(
         queue_size=cfg["DATA"][split]["BATCHSIZE_PER_REPLICA"])
     self.pathmanager = create_path_manager()
     self.cfg = cfg
     self.batch_size = cfg["DATA"][split]["BATCHSIZE_PER_REPLICA"]
     self.airstore_uri = path
     self.split = split
     self.epoch = 0
     self.start_iter = 0
     self.enable_queue_dataset = cfg["DATA"][
         self.split]["ENABLE_QUEUE_DATASET"]
     self.global_rank = get_rank()
     self.global_world_size = get_world_size()
     self._iterator = None
Example #8
    def _get_sampler(self, epoch: int):
        """
        Return a :class:`torch.utils.data.sampler.Sampler` to sample the data.

        This is used to distribute the data across the replicas. If shuffling
        is enabled, every epoch will have a different shuffle.

        Args:
            epoch: The epoch being fetched.

        Returns:
            A sampler which tells the data loader which sample to load next.
        """
        world_size = get_world_size()
        rank = get_rank()
        sampler = DistributedSampler(self,
                                     num_replicas=world_size,
                                     rank=rank,
                                     shuffle=self.shuffle)
        sampler.set_epoch(epoch)
        return sampler
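A minimal sketch of an epoch loop that relies on this behaviour; run_training, train_step, and the DataLoader arguments are illustrative, not taken from the original code:

from torch.utils.data import DataLoader

def run_training(dataset, num_epochs, batch_size, train_step):
    for epoch in range(num_epochs):
        # set_epoch(epoch) was already called inside _get_sampler, so each
        # epoch produces a different shuffle on every replica.
        sampler = dataset._get_sampler(epoch)
        loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
        for batch in loader:
            train_step(batch)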
Example #9
    def __init__(
        self,
        temperature: float,
        crops_for_assign: List[int],
        num_crops: int,
        num_iters: int,
        epsilon: float,
        use_double_prec: bool,
        num_prototypes: List[int],
        local_queue_length: int,
        embedding_dim: int,
        temp_hard_assignment_iters: int,
        output_dir: str,
    ):
        super(SwAVCriterion, self).__init__()

        self.use_gpu = get_cuda_device_index() > -1

        self.temperature = temperature
        self.crops_for_assign = crops_for_assign
        self.num_crops = num_crops
        self.nmb_sinkhornknopp_iters = num_iters
        self.epsilon = epsilon
        self.use_double_prec = use_double_prec
        self.num_prototypes = num_prototypes
        self.nmb_heads = len(self.num_prototypes)
        self.embedding_dim = embedding_dim
        self.temp_hard_assignment_iters = temp_hard_assignment_iters
        self.local_queue_length = local_queue_length
        self.dist_rank = get_rank()
        self.world_size = get_world_size()
        self.log_softmax = nn.LogSoftmax(dim=1).cuda()
        self.softmax = nn.Softmax(dim=1).cuda()
        self.register_buffer("num_iteration", torch.zeros(1, dtype=int))
        self.use_queue = False
        if local_queue_length > 0:
            self.initialize_queue()
        self.output_dir = output_dir
Example #10
    def on_update(self, task: "tasks.ClassyTask") -> None:
        """
        Executed after a parameter update. If the current phase is training
        and it is a logging iteration, we compute and log several helpful
        training stats to keep track of the ongoing training.

        For monitoring the batch time (average training iteration time), the
        stats can optionally be monitored every N iterations to get a better
        idea of the batch time and the training ETA.

        Set the btime_freq input using cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY=N
        and ensure that cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS = True.
        """
        phase_type = "train" if task.train else "test"
        if is_primary() and phase_type == "train":
            train_phase_idx = task.train_phase_idx
            log_freq = task.config["LOG_FREQUENCY"]
            iteration = task.iteration

            if torch.cuda.is_available():
                peak_mem_used = int(torch.cuda.max_memory_allocated() /
                                    1024.0 / 1024.0)
            else:
                peak_mem_used = -1

            if ((iteration == 1) or (iteration % log_freq == 0)
                    or (iteration <= 100 and iteration % 5 == 0)):
                loss_val = round(task.last_batch.loss.data.cpu().item(), 5)
                if len(task.batch_time) > 0:
                    batch_times = task.batch_time
                else:
                    batch_times = [0]
                avg_time = sum(batch_times) / len(batch_times)

                eta_secs = avg_time * (task.max_iteration - iteration)
                eta_string = str(datetime.timedelta(seconds=int(eta_secs)))
                if isinstance(task.optimizer.options_view.lr, set):
                    lr_val = list(task.optimizer.options_view.lr)
                else:
                    lr_val = round(task.optimizer.options_view.lr, 5)
                batch_time = int(1000.0 * avg_time)
                rank = get_rank()
                log_data = {
                    "Rank": rank,
                    "ep": train_phase_idx,
                    "iter": iteration,
                    "lr": lr_val,
                    "loss": loss_val,
                    "btime(ms)": batch_time,
                    "eta": eta_string,
                    "peak_mem(M)": peak_mem_used,
                }

                if iteration == 1:
                    # Set max iterations. Currently used in benchmark_suite_scheduler.py
                    log_data["max_iterations"] = task.max_iteration

                if self.btime_freq and len(batch_times) >= self.btime_freq:
                    rolling_avg_time = (sum(batch_times[-self.btime_freq:]) /
                                        self.btime_freq)
                    rolling_eta_secs = int(rolling_avg_time *
                                           (task.max_iteration - iteration))
                    rolling_eta_str = str(
                        datetime.timedelta(seconds=int(rolling_eta_secs)))
                    rolling_btime = int(1000.0 * rolling_avg_time)
                    log_data[
                        f"btime({self.btime_freq}iters)(ms)"] = rolling_btime
                    log_data["rolling_eta"] = rolling_eta_str

                # To maintain backwards compatibility with the log.txt logs,
                # we convert the json dict to the previous flat format.
                # stdout.json can be used to consume the logs in json format.
                stdout_data = ""
                for key, value in log_data.items():
                    stdout_data = (f"{stdout_data}[{key}: {value}] "
                                   if key == "ep" else
                                   f"{stdout_data}{key}: {value}; ")
                logging.info(stdout_data.strip())
                self.json_stdout_logger.write(json.dumps(log_data) + "\n")
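For illustration (the values here are made up), the loop above turns the log_data dict into a single legacy log line in which only the "ep" key is bracketed:

Rank: 0; [ep: 3] iter: 500; lr: 0.1; loss: 0.48321; btime(ms): 245; eta: 1:12:33; peak_mem(M): 9120;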
Example #11
    def _log_training_epoch(self, task):
        train_phase_idx = task.train_phase_idx
        log_freq = task.config["LOG_FREQUENCY"]
        iteration = task.iteration
        if torch.cuda.is_available():
            peak_mem_used = int(torch.cuda.max_memory_allocated() / 1024.0 /
                                1024.0)
        else:
            peak_mem_used = -1

        if ((iteration == 1) or (iteration % log_freq == 0)
                or (iteration <= 100 and iteration % 5 == 0)):
            loss_val = round(task.last_batch.loss.data.cpu().item(), 5)
            if len(task.batch_time) > 0:
                batch_times = task.batch_time
            else:
                batch_times = [0]
            avg_time = sum(batch_times) / len(batch_times)

            eta_secs = avg_time * (task.max_iteration - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_secs)))
            if isinstance(task.optimizer.options_view.lr, (set, list)):
                lr_val = list(task.optimizer.options_view.lr)
            else:
                lr_val = round(task.optimizer.options_view.lr, 5)
            if isinstance(task.optimizer.options_view.weight_decay,
                          (set, list)):
                wd_val = list(task.optimizer.options_view.weight_decay)
            else:
                wd_val = round(task.optimizer.options_view.weight_decay, 5)
            batch_time = int(1000.0 * avg_time)
            rank = get_rank()

            log_data = {
                "Rank": rank,
                "ep": train_phase_idx,
                "iter": iteration,
                "lr": lr_val,
                "loss": loss_val,
                "btime(ms)": batch_time,
                "eta": eta_string,
                "peak_mem(M)": peak_mem_used,
                "weight_decay": wd_val,
            }

            # Add customized data registered by other hooks
            log_data.update(task.additional_log_data)

            if iteration == 1:
                # Set max iterations. Currently used in benchmark_suite_scheduler.py
                log_data["max_iterations"] = task.max_iteration

            if self.btime_freq and len(batch_times) >= self.btime_freq:
                rolling_avg_time = (sum(batch_times[-self.btime_freq:]) /
                                    self.btime_freq)
                rolling_eta_secs = int(rolling_avg_time *
                                       (task.max_iteration - iteration))
                rolling_eta_str = str(
                    datetime.timedelta(seconds=int(rolling_eta_secs)))
                rolling_btime = int(1000.0 * rolling_avg_time)
                log_data[f"btime({self.btime_freq}iters)(ms)"] = rolling_btime
                log_data["rolling_eta"] = rolling_eta_str

            # To maintain backwards compatibility with the log.txt logs,
            # we convert the json dict to the previous flat format.
            # stdout.json can be used to consume the logs in json format.
            stdout_data = ""
            for key, value in log_data.items():
                stdout_data = (f"{stdout_data}[{key}: {value}] " if key == "ep"
                               else f"{stdout_data}{key}: {value}; ")
            logging.info(stdout_data.strip())
            self.json_stdout_logger.write(json.dumps(log_data) + "\n")
Example #12
    def on_update(self, task: "tasks.ClassyTask") -> None:
        """
        Executed after a parameter update. If the current phase is training
        and it is a logging iteration, we compute and log several helpful
        training stats to keep track of the ongoing training.

        For monitoring the batch time (average training iteration time), the
        stats can optionally be monitored every N iterations to get a better
        idea of the batch time and the training ETA.

        Set the btime_freq input using cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY=N
        and ensure that cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS = True.
        """
        phase_type = "train" if task.train else "test"
        if is_primary() and phase_type == "train":
            train_phase_idx = task.train_phase_idx
            log_freq = task.config["LOG_FREQUENCY"]
            iteration = task.iteration

            if torch.cuda.is_available():
                peak_mem_used = int(torch.cuda.max_memory_allocated() /
                                    1024.0 / 1024.0)
            else:
                peak_mem_used = -1

            if ((iteration == 1) or (iteration % log_freq == 0)
                    or (iteration <= 100 and iteration % 5 == 0)):
                loss_val = round(task.last_batch.loss.data.cpu().item(), 5)
                if len(task.batch_time) > 0:
                    batch_times = task.batch_time
                else:
                    batch_times = [0]
                avg_time = sum(batch_times) / len(batch_times)

                eta_secs = avg_time * (task.max_iteration - iteration)
                eta_string = str(datetime.timedelta(seconds=int(eta_secs)))
                if isinstance(task.optimizer.options_view.lr, set):
                    lr_val = list(task.optimizer.options_view.lr)
                else:
                    lr_val = round(task.optimizer.options_view.lr, 5)
                batch_time = int(1000.0 * avg_time)
                rank = get_rank()
                log_str = (f"Rank: {rank}; "
                           f"[ep: {train_phase_idx}] "
                           f"iter: {iteration}; "
                           f"lr: {lr_val}; "
                           f"loss: {loss_val}; "
                           f"btime(ms): {batch_time}; "
                           f"eta: {eta_string}; "
                           f"peak_mem: {peak_mem_used}M")
                if self.btime_freq and len(batch_times) >= self.btime_freq:
                    rolling_avg_time = (sum(batch_times[-self.btime_freq:]) /
                                        self.btime_freq)
                    rolling_eta_secs = int(rolling_avg_time *
                                           (task.max_iteration - iteration))
                    rolling_eta_str = str(
                        datetime.timedelta(seconds=int(rolling_eta_secs)))
                    rolling_btime = int(1000.0 * rolling_avg_time)
                    log_str = (
                        f"{log_str}; "
                        f"btime({self.btime_freq}iters): {rolling_btime} ms; "
                        f"rolling_eta: {rolling_eta_str}")
                logging.info(log_str)
Example #13
    def cluster_memory(self):
        self.start_idx = 0
        j = 0
        with torch.no_grad():
            for i_K, K in enumerate(self.num_clusters):
                # run distributed k-means

                # init centroids with elements from memory bank of rank 0
                centroids = torch.empty(
                    K, self.embedding_dim).cuda(non_blocking=True)
                if get_rank() == 0:
                    random_idx = torch.randperm(
                        len(self.local_memory_embeddings[j]))[:K]
                    assert len(random_idx
                               ) >= K, "please reduce the number of centroids"
                    centroids = self.local_memory_embeddings[j][random_idx]
                dist.broadcast(centroids, 0)

                for n_iter in range(self.nmb_kmeans_iters + 1):

                    # E step
                    dot_products = torch.mm(self.local_memory_embeddings[j],
                                            centroids.t())
                    _, assignments = dot_products.max(dim=1)

                    # finish
                    if n_iter == self.nmb_kmeans_iters:
                        break

                    # M step
                    where_helper = get_indices_sparse(
                        assignments.cpu().numpy())
                    counts = torch.zeros(K).cuda(non_blocking=True).int()
                    emb_sums = torch.zeros(
                        K, self.embedding_dim).cuda(non_blocking=True)
                    for k in range(len(where_helper)):
                        if len(where_helper[k][0]) > 0:
                            emb_sums[k] = torch.sum(
                                self.local_memory_embeddings[j][where_helper[k]
                                                                [0]],
                                dim=0,
                            )
                            counts[k] = len(where_helper[k][0])
                    all_reduce_sum(counts)
                    mask = counts > 0
                    all_reduce_sum(emb_sums)
                    centroids[mask] = emb_sums[mask] / counts[mask].unsqueeze(
                        1)

                    # normalize centroids
                    centroids = nn.functional.normalize(centroids, dim=1, p=2)

                getattr(self, "centroids" + str(i_K)).copy_(centroids)
                # gather the assignments
                assignments_all = gather_from_all(assignments)
                indexes_all = gather_from_all(self.local_memory_index)
                self.assignments[i_K] = -100
                self.assignments[i_K][indexes_all] = assignments_all

                j = (j + 1) % self.nmb_mbs

        logging.info(f"Rank: {get_rank()}, clustering of the memory bank done")