def test_load_linear(self):

        # Initialize model with new random seed.
        set_random_seed(33)
        model = MNISTSparseCNN()
        model.eval()

        # Check that the parameters differ from the saved model.
        for param1, param2 in zip(model.parameters(), self.model.parameters()):
            tot_eq = (param1 == param2).sum().item()
            self.assertNotEqual(tot_eq, np.prod(param1.shape))

        # Check output through the lower network.
        out = lower_forward(model, self.in_1)
        num_matches = out.isclose(self.out_lower, atol=1e-2).sum().item()
        self.assertEqual(num_matches, 1337)  # some correct

        # Check output through the upper network.
        out = upper_forward(model, self.in_2)
        num_matches = out.isclose(self.out_upper, atol=1e-2).sum().item()
        self.assertEqual(num_matches, 1)  # some correct

        # Restore full model.
        model = load_multi_state(model, restore_linear=self.checkpoint_path)
        model.eval()

        # Check output through the lower network.
        out = lower_forward(model, self.in_1)
        num_matches = out.isclose(self.out_lower, atol=1e-2).sum().item()
        self.assertEqual(num_matches, 1337)  # some correct

        # Check output through the upper network.
        out = upper_forward(model, self.in_2)
        num_matches = out.isclose(self.out_upper, atol=1e-2).sum().item()
        self.assertEqual(num_matches, 20)  # all correct
    def setUp(self):

        set_random_seed(20)
        self.model = MNISTSparseCNN()
        self.model.eval()

        # Make all params twice as large to differentiate the saved model from a
        # freshly initialized one.
        for name, param in self.model.named_parameters():
            if ("cnn" in name or "linear" in name) and ("weight" in name):
                param.data[:] = param.data * 2

        # self.model.eval()
        self.in_1 = torch.rand(2, 1, 28, 28)
        self.in_2 = torch.rand(2, 1024)
        self.out_full = full_forward(self.model, self.in_1)
        self.out_lower = lower_forward(self.model, self.in_1)
        self.out_upper = upper_forward(self.model, self.in_2)

        # Create temporary results directory.
        self.tempdir = tempfile.TemporaryDirectory()
        self.results_dir = Path(self.tempdir.name) / Path("results")
        self.results_dir.mkdir()

        # Save model state.
        state = {}
        with io.BytesIO() as buffer:
            serialize_state_dict(buffer,
                                 self.model.state_dict(),
                                 compresslevel=-1)
            state["model"] = buffer.getvalue()

        self.checkpoint_path = self.results_dir / Path("mymodel")
        with open(self.checkpoint_path, "wb") as f:
            pickle.dump(state, f)
Example #3
    def __init__(self, config):
        super(MobileNetCIFAR10, self).__init__()

        self.logger = get_logger(config["name"], config["verbose"])
        self.logger.debug("Config: %s", config)

        # Setup random seed
        seed = config["seed"]
        set_random_seed(seed)

        self._configure_dataloaders(config)

        # Configure Model
        model_type = config["model_type"]
        model_params = config["model_params"]
        self.model = model_type(**model_params)
        self.logger.debug("Model: %s", self.model)

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
            self.model = self.model.cuda()
        else:
            self.device = torch.device("cpu")

        # Configure optimizer. Skip weight decay on the depthwise ("deepwise") layers.
        params = [
            {
                "params": self.model.conv.parameters()
            },
            {
                "params": self.model.deepwise.parameters(),
                "weight_decay": 0
            },
            {
                "params": self.model.classifier.parameters()
            },
        ]

        self.optimizer = torch.optim.RMSprop(
            params,
            lr=config["learning_rate"],
            weight_decay=config["weight_decay"])
        self.loss_function = config["loss_function"]
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=config["lr_step_size"],
            gamma=config["learning_rate_gamma"],
        )
        if torch.cuda.device_count() > 1:
            self.model = torch.nn.DataParallel(self.model)

        self.batches_in_epoch = config["batches_in_epoch"]
        self.batches_in_first_epoch = config["batches_in_first_epoch"]
        self.test_batches_in_epoch = config["test_batches_in_epoch"]

        self.config = config
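
For reference, a hypothetical config sketch for MobileNetCIFAR10, inferred only from the keys this constructor reads; keys consumed by _configure_dataloaders() are omitted, and every value (including the MobileNetV1 placeholder for model_type) is illustrative rather than taken from the project.

import torch.nn.functional as F

mobilenet_config = {
    "name": "mobilenet_cifar10",
    "verbose": 2,
    "seed": 42,
    "model_type": MobileNetV1,            # hypothetical model class (any nn.Module)
    "model_params": {"num_classes": 10},
    "learning_rate": 0.1,
    "weight_decay": 1e-4,
    "loss_function": F.cross_entropy,
    "lr_step_size": 30,
    "learning_rate_gamma": 0.1,
    "batches_in_epoch": 400,
    "batches_in_first_epoch": 400,
    "test_batches_in_epoch": 100,
    # ...plus whatever keys _configure_dataloaders() expects
}
experiment = MobileNetCIFAR10(mobilenet_config)
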
Example #4
def run_ray_many(tune_config, exp_config, experiments, fix_seed=False):

    # update config
    tune_config["config"] = exp_config

    # override when running local for test
    if not torch.cuda.is_available():
        tune_config["config"]["device"] = "cpu"
        tune_config["resources_per_trial"] = {"cpu": 1}

    # Workaround code for an unknown Ray serialization bug
    def serializer(obj):
        if obj.is_cuda:
            return obj.cpu().numpy()
        else:
            return obj.numpy()

    def deserializer(serialized_obj):
        return serialized_obj

    for t in [
        torch.FloatTensor,
        torch.DoubleTensor,
        torch.HalfTensor,
        torch.ByteTensor,
        torch.CharTensor,
        torch.ShortTensor,
        torch.IntTensor,
        torch.LongTensor,
        torch.Tensor,
    ]:
        ray.register_custom_serializer(
            t, serializer=serializer, deserializer=deserializer
        )

    # fix seed
    if fix_seed:
        set_random_seed(32)

    # multiple experiments
    exp_configs = [
        (name, new_experiment(exp_config, c)) for name, c in experiments.items()
    ]

    # init ray
    ray.init()
    results = [
        run_experiment.remote(name, RayTrainable, c, tune_config)
        for name, c in exp_configs
    ]
    ray.get(results)
    ray.shutdown()
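
A hedged usage sketch for run_ray_many(); the config keys below are illustrative assumptions, and new_experiment() is presumed to merge each per-experiment override dict into the base experiment config before the trials are launched.

base_exp_config = {
    "device": "cuda",
    "dataset_name": "CIFAR10",   # illustrative key/value
    "epochs": 30,
}
tune_config = {
    "name": "sparse_vs_dense",
    "num_samples": 1,
    "resources_per_trial": {"cpu": 1, "gpu": 1},
}
experiments = {
    "dense_baseline": {"on_perc": 1.0},   # per-experiment overrides (assumed keys)
    "sparse_10pct": {"on_perc": 0.1},
}
run_ray_many(tune_config, base_exp_config, experiments, fix_seed=True)
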
    def setUp(self):
        set_random_seed(42)
        self.device = torch.device("cuda")

        # Config for model with sparse encoder and sparse embedding layer.
        self.config = CONFIG_MAPPING["fully_static_sparse_bert"](
            num_attention_heads=2,
            num_hidden_layers=2,
            hidden_size=128,
            intermediate_size=512,
            max_position_embeddings=128,
            sparsity=0.75,
        )
        self.sparse_model = AutoModelForMaskedLM.from_config(self.config)
        self.sparse_model.resize_token_embeddings()
        self.sparse_model.apply(rezero_weights)
Example #6
def run_ray(tune_config, exp_config, fix_seed=False):

    # update config
    tune_config["config"] = exp_config
    download_dataset(exp_config)

    # override when running local for test
    if not torch.cuda.is_available():
        tune_config["config"]["device"] = "cpu"
        tune_config["resources_per_trial"] = {"cpu": 1}

    # init ray
    ray.init(load_code_from_local=True)

    # Workaround code for an unknown Ray serialization bug
    def serializer(obj):
        if obj.is_cuda:
            return obj.cpu().numpy()
        else:
            return obj.numpy()

    def deserializer(serialized_obj):
        return serialized_obj

    for t in [
            torch.FloatTensor,
            torch.DoubleTensor,
            torch.HalfTensor,
            torch.ByteTensor,
            torch.CharTensor,
            torch.ShortTensor,
            torch.IntTensor,
            torch.LongTensor,
            torch.Tensor,
    ]:
        ray.register_custom_serializer(t,
                                       serializer=serializer,
                                       deserializer=deserializer)

    # fix seed
    if fix_seed:
        set_random_seed(32)

    tune.run(Trainable, **tune_config)
    def test_load_full(self):

        # Initialize model with new random seed.
        set_random_seed(33)
        model = MNISTSparseCNN()
        model.eval()

        # Check that the parameters differ from the saved model.
        for param1, param2 in zip(model.parameters(), self.model.parameters()):
            tot_eq = (param1 == param2).sum().item()
            self.assertNotEqual(tot_eq, np.prod(param1.shape))

        # Restore full model.
        model = load_multi_state(model,
                                 restore_full_model=self.checkpoint_path)
        model.eval()

        # Check that the parameters now match the saved model.
        for param1, param2 in zip(model.parameters(), self.model.parameters()):
            tot_eq = (param1 == param2).sum().item()
            self.assertEqual(tot_eq, np.prod(param1.shape))

        for buffer1, buffer2 in zip(model.buffers(), self.model.buffers()):
            if buffer1.dtype == torch.float16:
                buffer1 = buffer1.float()
                buffer2 = buffer2.float()

            tot_eq = (buffer1 == buffer2).sum().item()
            self.assertEqual(tot_eq, np.prod(buffer1.shape))

        out = full_forward(model, self.in_1)
        num_matches = out.isclose(self.out_full, atol=1e-2,
                                  rtol=0).sum().item()
        self.assertEqual(num_matches, 20)  # all correct

        # Check output through the lower network.
        out = lower_forward(model, self.in_1)
        num_matches = out.isclose(self.out_lower, atol=1e-2).sum().item()
        self.assertEqual(num_matches, 2048)  # all correct

        # Check output through the upper network.
        out = upper_forward(model, self.in_2)
        num_matches = out.isclose(self.out_upper, atol=1e-2).sum().item()
        self.assertEqual(num_matches, 20)  # all correct
    def setup_experiment(self, config):
        """
        :param config: Dictionary containing the configuration parameters

            - local_dir: Results path
            - logdir: Directory generated by Ray Tune for this Trial
            - seed: the seed to be used for pytorch, python, and numpy
            - checkpoint_at_init: boolean argument for whether to create a checkpoint
                                  of the initialized model. This differs from
                                  `checkpoint_at_start` for which the checkpoint occurs
                                  after the first epoch of training as opposed to
                                  before it
        """
        self._logger = self.create_logger(config)
        self.logdir = config.get("logdir", None)

        # Configure seed
        self.seed = config.get("seed", 42)
        set_random_seed(self.seed, False)
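
A minimal config sketch for this setup_experiment(), limited to the keys named in its docstring; all values are placeholders.

config = {
    "local_dir": "~/nta/results",
    "logdir": "~/nta/results/my_trial",   # normally generated by Ray Tune
    "seed": 42,
    "checkpoint_at_init": False,
}
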
    def setUp(self):

        set_random_seed(20)
        self.model = torch.nn.Sequential(
            torch.nn.Linear(8, 8),
            KWinners(8, percent_on=0.1),
        )

        # Create temporary results directory.
        self.tempdir = tempfile.TemporaryDirectory()
        self.results_dir = Path(self.tempdir.name) / Path("results")
        self.results_dir.mkdir()

        # Save model state.
        state = {}
        with io.BytesIO() as buffer:
            serialize_state_dict(buffer, self.model.state_dict(), compresslevel=-1)
            state["model"] = buffer.getvalue()

        self.checkpoint_path = self.results_dir / Path("mymodel")
        with open(self.checkpoint_path, "wb") as f:
            pickle.dump(state, f)
Example #10
def main(args=CONFIG):
    if args.seed is not None:
        set_random_seed(args.seed, args.deterministic_mode)

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker,
                 nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
def main(args):
    # Get experiment configuration
    config = copy.deepcopy(CONFIGS[args.name])
    config.update(vars(args))
    device = "cuda" if torch.cuda.is_available() else "cpu"
    config["device"] = device

    # Replace dynamic seed (i.e. 'tune.sample_from') with constant
    seed = config.get("seed", 42)
    if not isinstance(seed, int):
        seed = 42
        config["seed"] = seed
    set_random_seed(seed)

    q_model = quantize(config)

    # Save quantized model
    output_file_name = os.path.join(args.output,
                                    f"{args.name}.{args.backend}.pt")
    print(
        f"Saving quantized model '{args.name}' weights to '{output_file_name}'"
    )
    torch.jit.save(torch.jit.script(q_model), output_file_name)
    def setUp(self):
        set_random_seed(42)
    def __init__(self, config):
        """Called once at the beginning of each experiment."""
        super(MNISTSparseExperiment, self).__init__()
        self.start_time = time.time()
        self.logger = get_logger(config["name"], config.get("verbose", 2))
        self.logger.debug("Config: %s", config)

        # Setup random seed
        seed = config["seed"]
        set_random_seed(seed)

        self.data_dir = config["data_dir"]
        self.batch_size = config["batch_size"]
        self.test_batch_size = config["test_batch_size"]
        self.first_epoch_batch_size = config["first_epoch_batch_size"]
        self.validation = config.get("validation", 50000.0 / 60000.0)
        self.learning_rate_factor = config["learning_rate_factor"]
        self.lr_scheduler_params = config.get("lr_scheduler_params", None)
        self.num_classes = 10

        self._configure_dataloaders()

        # Configure Model
        model = LeSparseNet(
            input_shape=(1, 28, 28),
            cnn_out_channels=config["cnn_out_channels"],
            cnn_activity_percent_on=config["cnn_percent_on"],
            cnn_weight_percent_on=config["cnn_weight_sparsity"],
            linear_n=config["linear_n"],
            linear_activity_percent_on=config["linear_percent_on"],
            linear_weight_percent_on=config["weight_sparsity"],
            boost_strength=config["boost_strength"],
            boost_strength_factor=config["boost_strength_factor"],
            use_batch_norm=config["use_batch_norm"],
            dropout=config.get("dropout", 0.0),
            num_classes=self.num_classes,
            k_inference_factor=config["k_inference_factor"],
            activation_fct_before_max_pool=config.get(
                "activation_fct_before_max_pool", False),
            consolidated_sparse_weights=config.get(
                "consolidated_sparse_weights", False),
            use_kwinners_local=config.get("use_kwinner_local", False),
        )

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
            model = model.cuda()
        else:
            self.device = torch.device("cpu")

        if torch.cuda.device_count() > 1:
            self.logger.debug("Using", torch.cuda.device_count(), "GPUs")
            model = torch.nn.DataParallel(model)

        self.model = model
        self.logger.debug("Model: %s", self.model)
        self.learning_rate = config["learning_rate"]
        self.momentum = config["momentum"]

        self.batches_in_epoch = config["batches_in_epoch"]
        self.batches_in_first_epoch = config["batches_in_first_epoch"]
        self.config = config

        self.optimizer = self._create_optimizer(name=config["optimizer"],
                                                model=self.model)
        self.lr_scheduler = self._create_learning_rate_scheduler(
            name=config.get("lr_scheduler", None), optimizer=self.optimizer)
Example #14
    def __init__(self, config):
        """Called once at the beginning of each experiment."""
        self.start_time = time.time()
        self.logger = get_logger(config["name"], config.get("verbose", 2))
        self.logger.debug("Config: %s", config)

        # Setup random seed
        seed = config["seed"]
        set_random_seed(seed)

        # Get our directories correct
        self.data_dir = config["data_dir"]

        # Configure Model
        self.model_type = config["model_type"]
        self.num_classes = 12
        self.log_interval = config["log_interval"]
        self.batches_in_epoch = config["batches_in_epoch"]
        self.batch_size = config["batch_size"]
        self.background_noise_dir = config["background_noise_dir"]
        self.noise_values = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

        self.load_datasets()

        if self.model_type == "le_sparse":
            model = LeSparseNet(
                input_shape=config.get("input_shape", (1, 32, 32)),
                cnn_out_channels=config["cnn_out_channels"],
                cnn_activity_percent_on=config["cnn_percent_on"],
                cnn_weight_percent_on=config["cnn_weight_sparsity"],
                linear_n=config["linear_n"],
                linear_activity_percent_on=config["linear_percent_on"],
                linear_weight_percent_on=config["weight_sparsity"],
                boost_strength=config["boost_strength"],
                boost_strength_factor=config["boost_strength_factor"],
                use_batch_norm=config["use_batch_norm"],
                dropout=config.get("dropout", 0.0),
                num_classes=self.num_classes,
                k_inference_factor=config["k_inference_factor"],
                activation_fct_before_max_pool=config.get(
                    "activation_fct_before_max_pool", False),
                consolidated_sparse_weights=config.get(
                    "consolidated_sparse_weights", False),
                use_kwinners_local=config.get("use_kwinner_local", False),
            )

        elif self.model_type == "resnet9":
            model = resnet9(
                num_classes=self.num_classes, in_channels=1
            )

        elif self.model_type == "gsc_sparse_cnn":
            model = GSCSparseCNN()

        elif self.model_type == "gsc_super_sparse_cnn":
            model = GSCSuperSparseCNN()

        else:
            raise RuntimeError("Unknown model type: " + self.model_type)

        self.use_cuda = torch.cuda.is_available()
        self.logger.debug("use_cuda %s", self.use_cuda)
        if self.use_cuda:
            self.device = torch.device("cuda")
            model = model.cuda()
        else:
            self.device = torch.device("cpu")

        self.logger.debug("device %s", self.device)
        if torch.cuda.device_count() > 1:
            self.logger.debug("Using %s GPUs", torch.cuda.device_count())
            model = torch.nn.DataParallel(model)

        self.model = model
        self.logger.debug("Model: %s", self.model)
        self.logger.debug("Model non-zero params: %s", count_nonzero_params(self.model))
        self.learning_rate = config["learning_rate"]
        self.optimizer = self.create_optimizer(config, self.model)
        self.lr_scheduler = self.create_learning_rate_scheduler(config, self.optimizer)
    def __init__(self, config):
        """Called once at the beginning of each experiment."""
        self.start_time = time.time()
        self.logger = get_logger(config["name"], config.get("verbose", 2))
        self.logger.debug("Config: %s", config)

        # Setup random seed
        seed = config["seed"]
        set_random_seed(seed)

        # Get our directories correct
        self.data_dir = config["data_dir"]

        # Configure Model
        self.model_type = config["model_type"]
        self.num_classes = 12
        self.log_interval = config["log_interval"]
        self.batches_in_epoch = config["batches_in_epoch"]
        self.batch_size = config["batch_size"]
        self.background_noise_dir = config["background_noise_dir"]
        self.noise_values = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
        cnn_input_shape = config.get("cnn_input_shape", (1, 32, 32))
        linear_n = config["linear_n"]
        linear_percent_on = config["linear_percent_on"]
        cnn_out_channels = config["cnn_out_channels"]
        cnn_percent_on = config["cnn_percent_on"]
        boost_strength = config["boost_strength"]
        weight_sparsity = config["weight_sparsity"]
        cnn_weight_sparsity = config["cnn_weight_sparsity"]
        boost_strength_factor = config["boost_strength_factor"]
        k_inference_factor = config["k_inference_factor"]
        use_batch_norm = config["use_batch_norm"]
        dropout = config.get("dropout", 0.0)

        self.load_datasets()

        model = nn.Sequential()

        if self.model_type == "cnn":
            # Add CNN Layers
            input_shape = cnn_input_shape
            cnn_layers = len(cnn_out_channels)
            if cnn_layers > 0:
                for i in range(cnn_layers):
                    in_channels, height, width = input_shape
                    add_sparse_cnn_layer(
                        network=model,
                        suffix=i + 1,
                        in_channels=in_channels,
                        out_channels=cnn_out_channels[i],
                        use_batch_norm=use_batch_norm,
                        weight_sparsity=cnn_weight_sparsity,
                        percent_on=cnn_percent_on[i],
                        k_inference_factor=k_inference_factor,
                        boost_strength=boost_strength,
                        boost_strength_factor=boost_strength_factor,
                    )

                    # Feed this layer output into next layer input
                    in_channels = cnn_out_channels[i]

                    # Compute next layer input shape
                    wout = (width - 5) + 1
                    maxpool_width = wout // 2
                    input_shape = (in_channels, maxpool_width, maxpool_width)

            # Flatten CNN output before passing to linear layer
            model.add_module("flatten", Flatten())

            # Add Linear layers
            input_size = np.prod(input_shape)
            for i in range(len(linear_n)):
                add_sparse_linear_layer(
                    network=model,
                    suffix=i + 1,
                    input_size=input_size,
                    linear_n=linear_n[i],
                    dropout=dropout,
                    use_batch_norm=use_batch_norm,
                    weight_sparsity=weight_sparsity,
                    percent_on=linear_percent_on[i],
                    k_inference_factor=k_inference_factor,
                    boost_strength=boost_strength,
                    boost_strength_factor=boost_strength_factor,
                )
                input_size = linear_n[i]

            # Output layer
            model.add_module(
                "output", nn.Linear(input_size, self.num_classes)
            )
            model.add_module("softmax", nn.LogSoftmax(dim=1))

        elif self.model_type == "resnet9":
            model = resnet9(
                num_classes=self.num_classes, in_channels=1
            )

        elif self.model_type == "gsc_sparse_cnn":
            model = GSCSparseCNN()

        elif self.model_type == "gsc_super_sparse_cnn":
            model = GSCSuperSparseCNN()

        else:
            raise RuntimeError("Unknown model type")

        self.use_cuda = torch.cuda.is_available()
        self.logger.debug("use_cuda %s", self.use_cuda)
        if self.use_cuda:
            self.device = torch.device("cuda")
            model = model.cuda()
        else:
            self.device = torch.device("cpu")

        self.logger.debug("device %s", self.device)
        if torch.cuda.device_count() > 1:
            self.logger.debug("Using %s GPUs", torch.cuda.device_count())
            model = torch.nn.DataParallel(model)

        self.model = model
        self.logger.debug("Model: %s", self.model)
        self.learning_rate = config["learning_rate"]
        self.optimizer = self.create_optimizer(config, self.model)
        self.lr_scheduler = self.create_learning_rate_scheduler(config, self.optimizer)
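
As a quick sanity check of the shape bookkeeping in the "cnn" branch above (which assumes 5x5 convolutions followed by 2x2 max pooling), a (1, 32, 32) input works out as follows:

width = 32                  # cnn_input_shape = (1, 32, 32)
wout = (width - 5) + 1      # 28 after a 5x5 convolution with no padding
maxpool_width = wout // 2   # 14 after 2x2 max pooling
# With cnn_out_channels = [64], the flattened size fed to the first sparse
# linear layer would be 64 * 14 * 14 = 12544.
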
Example #16
    # Run experiments
    results = []
    for exp in configs:
        config = configs[exp]
        config["name"] = exp

        # Make sure path and data_dir are relative to the project location,
        # handling both ~/nta and ../results style paths.
        path = config.get("path", ".")
        config["path"] = str(Path(path).expanduser().resolve())

        data_dir = config.get("data_dir", "data")
        config["data_dir"] = str(Path(data_dir).expanduser().resolve())

        # Run each experiment in parallel
        results.append(run_noise_test.remote(config))

    # Wait until all experiments complete
    ray.get(results)
    ray.shutdown()


if __name__ == "__main__":
    # Set a random random seed, and print it for reproducibility
    # This enables variability in the random seeds that Ray generates for
    # experiments with multiple repetitions.
    seed = int(time.time())
    print("Global random seed set to", seed)
    set_random_seed(seed)
    cli()
Example #17
    def setup_experiment(self, config):
        """
        Configure the experiment for training

        :param config: Dictionary containing the configuration parameters

            - distributed: Whether or not to use Pytorch Distributed training
            - backend: Pytorch Distributed backend ("nccl", "gloo")
                    Default: nccl
            - world_size: Total number of processes participating
            - rank: Rank of the current process
            - data: Dataset path
            - train_dir: Dataset training data relative path
            - batch_size: Training batch size
            - val_dir: Dataset validation data relative path
            - val_batch_size: Validation batch size
            - workers: how many data loading processes to use
            - num_classes: Limit the dataset size to the given number of classes
            - model_class: Model class. Must inherit from "torch.nn.Module"
            - model_args: model class arguments passed to the constructor
            - init_batch_norm: Whether or not to Initialize running batch norm
                               mean to 0.
            - optimizer_class: Optimizer class.
                               Must inherit from "torch.optim.Optimizer"
            - optimizer_args: Optimizer class arguments passed to the
                              constructor
            - batch_norm_weight_decay: Whether or not to apply weight decay to
                                       batch norm modules parameters
            - bias_weight_decay: Whether or not to apply weight decay to
                                       bias parameters
            - lr_scheduler_class: Learning rate scheduler class.
                                 Must inherit from "_LRScheduler"
            - lr_scheduler_args: Learning rate scheduler class arguments
                                 passed to the constructor
            - loss_function: Loss function. See "torch.nn.functional"
            - local_dir: Results path
            - logdir: Directory generated by Ray Tune for this Trial
            - epochs: Number of epochs to train
            - batches_in_epoch: Number of batches per epoch.
                                Useful for debugging
            - progress: Show progress during training
            - name: Experiment name. Used as logger name
            - log_level: Python Logging level
            - log_format: Python Logging format
            - seed: the seed to be used for pytorch, python, and numpy
            - mixed_precision: Whether or not to enable apex mixed precision
            - mixed_precision_args: apex mixed precision arguments.
                                    See "amp.initialize"
            - sample_transform: Transform acting on the training samples. To be used
                                additively after default transform or auto-augment.
            - target_transform: Transform acting on the training targets.
            - replicas_per_sample: Number of replicas to create per sample in the batch.
                                   (each replica is transformed independently)
                                   Used in maxup.
            - train_model_func: Optional user defined function to train the model,
                                expected to behave similarly to `train_model`
                                in terms of input parameters and return values
            - evaluate_model_func: Optional user defined function to validate the model
                                   expected to behave similarly to `evaluate_model`
                                   in terms of input parameters and return values
            - checkpoint_file: if not None, will start from this model. The model
                               must have the same model_args and model_class as the
                               current experiment.
            - checkpoint_at_init: boolean argument for whether to create a checkpoint
                                  of the initialized model. This differs from
                                  `checkpoint_at_start` for which the checkpoint occurs
                                  after the first epoch of training as opposed to
                                  before it
            - epochs_to_validate: list of epochs to run validate(). A -1 asks
                                  to run validate before any training occurs.
                                  Default: last three epochs.
            - launch_time: time the config was created (via time.time). Used to report
                           wall clock time until the first batch is done.
                           Default: time.time() in this setup_experiment().
        """
        # Configure logging related stuff
        log_format = config.get("log_format", logging.BASIC_FORMAT)
        log_level = getattr(logging, config.get("log_level", "INFO").upper())
        console = logging.StreamHandler()
        console.setFormatter(logging.Formatter(log_format))
        self.logger = logging.getLogger(config.get("name",
                                                   type(self).__name__))
        self.logger.setLevel(log_level)
        self.logger.addHandler(console)
        self.progress = config.get("progress", False)
        self.launch_time = config.get("launch_time", time.time())
        self.logdir = config.get("logdir", None)

        # Configure seed
        self.seed = config.get("seed", self.seed)
        set_random_seed(self.seed, False)

        # Configure distribute pytorch
        self.distributed = config.get("distributed", False)
        self.rank = config.get("rank", 0)

        if self.rank == 0:
            self.logger.info(
                f"Execution order: {pformat(self.get_execution_order())}")

        if self.distributed:
            dist_url = config.get("dist_url", "tcp://127.0.0.1:54321")
            backend = config.get("backend", "nccl")
            world_size = config.get("world_size", 1)
            dist.init_process_group(
                backend=backend,
                init_method=dist_url,
                rank=self.rank,
                world_size=world_size,
            )
            # Only enable logs from first process
            self.logger.disabled = self.rank != 0
            self.progress = self.progress and self.rank == 0

        # Configure model
        self.model = self.create_model(config, self.device)
        if self.rank == 0:
            self.logger.debug(self.model)

        # Configure optimizer
        optimizer_class = config.get("optimizer_class", torch.optim.SGD)
        optimizer_args = config.get("optimizer_args", {})
        batch_norm_weight_decay = config.get("batch_norm_weight_decay", True)
        bias_weight_decay = config.get("bias_weight_decay", True)
        self.optimizer = create_optimizer(
            model=self.model,
            optimizer_class=optimizer_class,
            optimizer_args=optimizer_args,
            batch_norm_weight_decay=batch_norm_weight_decay,
            bias_weight_decay=bias_weight_decay,
        )

        # Validate mixed precision requirements
        self.mixed_precision = config.get("mixed_precision", False)
        if self.mixed_precision and amp is None:
            self.mixed_precision = False
            self.logger.error(
                "Mixed precision requires NVIDA APEX."
                "Please install apex from https://www.github.com/nvidia/apex"
                "Disabling mixed precision training.")

        # Configure mixed precision training
        if self.mixed_precision:
            amp_args = config.get("mixed_precision_args", {})
            self.model, self.optimizer = amp.initialize(
                self.model, self.optimizer, **amp_args)
            self.logger.info("Using mixed precision")

        # Apply DistributedDataParallel after all other model mutations
        if self.distributed:
            self.model = DistributedDataParallel(self.model)
        else:
            self.model = DataParallel(self.model)

        self._loss_function = config.get("loss_function",
                                         torch.nn.functional.cross_entropy)

        # Configure data loaders
        self.epochs = config.get("epochs", 1)
        self.batches_in_epoch = config.get("batches_in_epoch", sys.maxsize)
        self.epochs_to_validate = config.get(
            "epochs_to_validate", range(self.epochs - 3, self.epochs + 1))
        self.current_epoch = 0

        # Get initial batch size
        self.batch_size = config.get("batch_size", 1)

        # CUDA runtime does not support the fork start method.
        # See https://pytorch.org/docs/stable/notes/multiprocessing.html
        if torch.cuda.is_available():
            multiprocessing.set_start_method("spawn")

        # Configure data loaders
        self.train_loader = self.create_train_dataloader(config)
        self.val_loader = self.create_validation_dataloader(config)
        self.total_batches = len(self.train_loader)

        # Configure learning rate scheduler
        lr_scheduler_class = config.get("lr_scheduler_class", None)
        if lr_scheduler_class is not None:
            lr_scheduler_args = config.get("lr_scheduler_args", {})
            self.logger.info("LR Scheduler args:")
            self.logger.info(pformat(lr_scheduler_args))
            self.logger.info("steps_per_epoch=%s", self.total_batches)
            self.lr_scheduler = create_lr_scheduler(
                optimizer=self.optimizer,
                lr_scheduler_class=lr_scheduler_class,
                lr_scheduler_args=lr_scheduler_args,
                steps_per_epoch=self.total_batches)

        # Set train and validate methods.
        self.train_model = config.get("train_model_func", train_model)
        self.evaluate_model = config.get("evaluate_model_func", evaluate_model)
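
To make the long parameter list in the docstring above concrete, here is a minimal, hypothetical config sketch covering the most common keys; all values are placeholders, the dataset path is made up, and torchvision's resnet50 factory merely stands in for any torch.nn.Module model class.

import torch
from torchvision import models

config = {
    "name": "resnet50_example",
    "model_class": models.resnet50,       # stand-in for any nn.Module class
    "model_args": {"num_classes": 1000},
    "optimizer_class": torch.optim.SGD,
    "optimizer_args": {"lr": 0.1, "momentum": 0.9, "weight_decay": 1e-4},
    "lr_scheduler_class": torch.optim.lr_scheduler.StepLR,
    "lr_scheduler_args": {"step_size": 30, "gamma": 0.1},
    "loss_function": torch.nn.functional.cross_entropy,
    "data": "~/datasets/imagenet",        # hypothetical dataset path
    "train_dir": "train",
    "val_dir": "val",
    "batch_size": 128,
    "val_batch_size": 128,
    "workers": 4,
    "epochs": 90,
    "seed": 42,
    "distributed": False,
}
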
Example #18
    def __init__(self, config):
        super(NotSoDenseExperiment, self).__init__()
        self.logger = get_logger(config["name"], config["verbose"])
        self.logger.debug("Config: %s", config)

        seed = config["seed"]
        set_random_seed(seed)
        self.batches_in_epoch = config["batches_in_epoch"]
        self.epochs = config["iterations"]
        self.batch_size = config["batch_size"]
        self.test_batch_size = config["test_batch_size"]
        self.test_batches_in_epoch = config.get("test_batches_in_epoch",
                                                sys.maxsize)
        data_dir = config["data_dir"]

        normalize_tensor = [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010))
        ]
        data_augmentation = []
        if config.get("data_augmentation", False):
            data_augmentation = [
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip()
            ]

        train_dataset = datasets.CIFAR10(
            root=data_dir,
            train=True,
            download=True,
            transform=transforms.Compose(data_augmentation + normalize_tensor))
        self.train_loader = torch.utils.data.DataLoader(
            dataset=train_dataset, batch_size=self.batch_size, shuffle=True)
        test_dataset = datasets.CIFAR10(
            root=data_dir,
            train=False,
            download=False,
            transform=transforms.Compose(normalize_tensor))
        self.test_loader = torch.utils.data.DataLoader(
            dataset=test_dataset,
            batch_size=self.test_batch_size,
            shuffle=True)

        self.model = NoSoDenseNetCIFAR(
            block_config=config.get("block_config"),
            depth=config.get("depth"),
            growth_rate=config["growth_rate"],
            reduction=config["reduction"],
            num_classes=config["num_classes"],
            bottleneck_size=config["bottleneck_size"],
            avg_pool_size=config["avg_pool_size"],
            dense_percent_on=config["dense_percent_on"],
            transition_percent_on=config["transition_percent_on"],
            classifier_percent_on=config["classifier_percent_on"],
            k_inference_factor=config["k_inference_factor"],
            boost_strength=config["boost_strength"],
            boost_strength_factor=config["boost_strength_factor"],
            duty_cycle_period=config["duty_cycle_period"],
        )
        self.logger.debug("Model: %s", self.model)

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
            self.model = self.model.cuda()
        else:
            self.device = torch.device("cpu")

        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=config["learning_rate"],
                                         momentum=config["momentum"],
                                         nesterov=config["nesterov"],
                                         weight_decay=config["weight_decay"])
        self.loss_function = config["loss_function"]

        if "learning_scheduler_milestones" in config:
            self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
                self.optimizer,
                gamma=config["learning_scheduler_gamma"],
                milestones=config["learning_scheduler_milestones"])
        else:
            self.scheduler = torch.optim.lr_scheduler.StepLR(
                self.optimizer,
                gamma=config["learning_scheduler_gamma"],
                step_size=config["learning_scheduler_step_size"])
Example #19
def run_ray(tune_config, exp_config, fix_seed=False):

    # update config
    tune_config["config"] = exp_config

    # override when running local for test
    if not torch.cuda.is_available():
        tune_config["config"]["device"] = "cpu"
        tune_config["resources_per_trial"] = {"cpu": 1}

    # move epochs to tune_config, to keep track
    if "stop" not in tune_config:
        if "epochs" in exp_config:
            tune_config["stop"] = {"training_iteration": exp_config["epochs"]}

    # expand path in dir
    if "local_dir" in tune_config:
        tune_config["local_dir"] = os.path.expanduser(tune_config["local_dir"])
    else:
        tune_config["local_dir"] = os.path.expanduser("~/nta/results")
    # saves a copy of local dir to exp config for LT experiments
    exp_config["local_dir"] = tune_config["local_dir"]

    if "data_dir" not in exp_config:
        exp_config["data_dir"] = os.path.expanduser("~/nta/datasets")

    download_dataset(exp_config)

    # set default checkpoint dir
    # temp: name and checkpoint dir in tune_config for backwards compatibility
    exp_config["name"] = tune_config["name"]
    if "checkpoint dir" in tune_config:
        exp_config["checkpoint_dir"] = os.path.expanduser(exp_config["checkpoint_dir"])
    else:
        exp_config["checkpoint_dir"] = os.path.expanduser("~/nta/checkpoints")

    # init ray
    ray.init(load_code_from_local=True)

    # Workaround code for an unknown Ray serialization bug
    def serializer(obj):
        if obj.is_cuda:
            return obj.cpu().numpy()
        else:
            return obj.numpy()

    def deserializer(serialized_obj):
        return serialized_obj

    for t in [
        torch.FloatTensor,
        torch.DoubleTensor,
        torch.HalfTensor,
        torch.ByteTensor,
        torch.CharTensor,
        torch.ShortTensor,
        torch.IntTensor,
        torch.LongTensor,
        torch.Tensor,
    ]:
        ray.register_custom_serializer(
            t, serializer=serializer, deserializer=deserializer
        )

    # fix seed
    if fix_seed:
        set_random_seed(32)

    # allows different kind of experiments to run
    run_experiment = base_experiment
    if "experiment_type" in exp_config:
        if exp_config["experiment_type"] in custom_experiments:
            run_experiment = custom_experiments[exp_config["experiment_type"]]
        else:
            raise ValueError("Experiment type not available.")

    # run
    run_experiment(tune_config)
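
A hypothetical invocation sketch for this run_ray() variant; the key names mirror the ones read above (name, local_dir, epochs, data_dir, checkpoint_dir, experiment_type), but every value, and in particular the experiment_type string, is a placeholder.

tune_config = {
    "name": "lt_experiment",
    "local_dir": "~/nta/results",
    "checkpoint_dir": "~/nta/checkpoints",
}
exp_config = {
    "device": "cuda",
    "epochs": 30,                          # copied into tune_config["stop"]
    "data_dir": "~/nta/datasets",
    "experiment_type": "my_custom_type",   # must be a key in custom_experiments
}
run_ray(tune_config, exp_config, fix_seed=True)
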
Example #20
    num_gpus = float(num_gpus / num_cpus)
    run_noise_test._num_gpus = num_gpus
    run_noise_test.num_cpus = 1

    # Run experiments
    results = []
    for exp in configs:
        config = configs[exp]
        config["name"] = exp

        # Make sure local directories are relative to the project location
        path = config.get("path", None)
        if path and not os.path.isabs(path):
            config["path"] = os.path.join(project_dir, path)

        data_dir = config.get("data_dir", "data")
        if not os.path.isabs(data_dir):
            config["data_dir"] = os.path.join(project_dir, data_dir)

        # Run each experiment in parallel
        results.append(run_noise_test.remote(config))

    # Wait until all experiments complete
    ray.get(results)
    ray.shutdown()


if __name__ == "__main__":
    set_random_seed(18)
    cli()
Example #21
    def setup_experiment(self, config):
        """
        Configure the experiment for training

        :param config: Dictionary containing the configuration parameters

            - distributed: Whether or not to use Pytorch Distributed training
            - backend: Pytorch Distributed backend ("nccl", "gloo")
            - world_size: Total number of processes participating
            - rank: Rank of the current process
            - data: Dataset path
            - train_dir: Dataset training data relative path
            - batch_size: Training batch size
            - val_dir: Dataset validation data relative path
            - val_batch_size: Validation batch size
            - workers: how many data loading processes to use
            - num_classes: Limit the dataset size to the given number of classes
            - model_class: Model class. Must inherit from "torch.nn.Module"
            - model_args: model class arguments passed to the constructor
            - init_batch_norm: Whether or not to Initialize running batch norm
                               mean to 0.
            - progressive_resize: Progressive resize schedule
                                  dict(start_epoch: image_size)
            - dynamic_batch_size: dynamic batch size schedule.
                                  dict(start_epoch: batch_size)
                                  Works with progressive_resize and the
                                  available GPU memory to fit as many images as
                                  possible in each batch
            - optimizer_class: Optimizer class.
                               Must inherit from "torch.optim.Optimizer"
            - optimizer_args: Optimizer class arguments passed to the
                              constructor
            - batch_norm_weight_decay: Whether or not to apply weight decay to
                                       batch norm modules parameters
            - lr_scheduler_class: Learning rate scheduler class.
                                 Must inherit from "_LRScheduler"
            - lr_scheduler_args: Learning rate scheduler class arguments
                                 passed to the constructor
            - loss_function: Loss function. See "torch.nn.functional"
            - local_dir: Results path
            - epochs: Number of epochs to train
            - batches_in_epoch: Number of batches per epoch.
                                Useful for debugging
            - progress: Show progress during training
            - name: Experiment name. Used as logger name
            - log_level: Python Logging level
            - log_format: Python Logging format
            - seed: the seed to be used for pytorch, python, and numpy
        """
        # Configure logger
        log_format = config.get("log_format", logging.BASIC_FORMAT)
        log_level = getattr(logging, config.get("log_level", "INFO").upper())
        console = logging.StreamHandler()
        console.setFormatter(logging.Formatter(log_format))
        self.logger = logging.getLogger(config.get("name",
                                                   type(self).__name__))
        self.logger.setLevel(log_level)
        self.logger.addHandler(console)
        self.progress = config.get("progress", False)

        # Configure seed
        self.seed = config.get("seed", self.seed)
        set_random_seed(self.seed, False)

        # Configure distribute pytorch
        self.distributed = config.get("distributed", False)
        self.rank = config.get("rank", 0)
        if self.distributed:
            dist_url = config.get("dist_url", "tcp://127.0.0.1:54321")
            backend = config.get("backend", "nccl")
            world_size = config.get("world_size", 1)
            dist.init_process_group(
                backend=backend,
                init_method=dist_url,
                rank=self.rank,
                world_size=world_size,
            )

        # Configure model
        model_class = config["model_class"]
        model_args = config.get("model_args", {})
        init_batch_norm = config.get("init_batch_norm", False)
        self.model = _create_model(
            model_class=model_class,
            model_args=model_args,
            init_batch_norm=init_batch_norm,
            distributed=self.distributed,
            device=self.device,
        )
        if self.rank == 0:
            self.logger.debug(self.model)
            params_sparse, nonzero_params_sparse2 = count_nonzero_params(
                self.model)
            self.logger.debug("Params total/nnz %s / %s", params_sparse,
                              nonzero_params_sparse2)

        # Configure optimizer
        optimizer_class = config.get("optimizer_class", torch.optim.SGD)
        optimizer_args = config.get("optimizer_args", {})
        batch_norm_weight_decay = config.get("batch_norm_weight_decay", True)
        self.optimizer = _create_optimizer(
            model=self.model,
            optimizer_class=optimizer_class,
            optimizer_args=optimizer_args,
            batch_norm_weight_decay=batch_norm_weight_decay,
        )

        self.loss_function = config.get("loss_function",
                                        torch.nn.functional.cross_entropy)

        # Configure data loaders
        self.epochs = config.get("epochs", 1)
        self.batches_in_epoch = config.get("batches_in_epoch", sys.maxsize)
        workers = config.get("workers", 0)
        data_dir = config["data"]
        train_dir = config.get("train_dir", "train")
        progressive_resize = config.get("progressive_resize", None)
        num_classes = config.get("num_classes", 1000)

        # Get initial batch size
        self.batch_size = config.get("batch_size", 1)

        # Configure dynamic training batch size
        dynamic_batch_size = config.get("dynamic_batch_size", None)
        if dynamic_batch_size is not None:
            # Convert dynamic_batch_size dict from {str:int} to {int:int}
            self.dynamic_batch_size = {
                int(k): v
                for k, v in dynamic_batch_size.items()
            }

            # Override initial batch size from dynamic_batch_size schedule
            milestones = sorted(self.dynamic_batch_size.keys())
            self.batch_size = self.dynamic_batch_size[milestones[0]]

            # Scale LR proportionally to initial batch size for each epoch milestone
            # See https://arxiv.org/pdf/1706.02677.pdf
            lr_scale = {milestones[0]: 1.0}
            lr_scale.update({
                k: self.dynamic_batch_size[k] / self.batch_size
                for k in milestones[1:]
            })

            # Create chained scaled LR scheduler to be called after the main scheduler
            self.scaled_lr_scheduler = ScaledLR(
                optimizer=self.optimizer,
                lr_scale=lr_scale,
            )

        # Configure Training data loader
        self.train_loader = _create_train_dataloader(
            data_dir=data_dir,
            train_dir=train_dir,
            batch_size=self.batch_size,
            workers=workers,
            distributed=self.distributed,
            progressive_resize=progressive_resize,
            num_classes=num_classes,
        )
        self.total_batches = len(self.train_loader)

        # Compute total steps required by the OneCycleLR
        if dynamic_batch_size is None:
            self.total_steps = len(self.train_loader) * self.epochs
        else:
            total_images = len(self.train_loader.dataset)

            # Initial batch size
            from_epoch = 0
            batch_size = self.batch_size
            steps_per_epoch = -(-total_images // batch_size)
            self.total_steps = 0

            milestones = sorted(self.dynamic_batch_size.keys())
            for epoch in milestones[1:]:
                self.total_steps += steps_per_epoch * (epoch - from_epoch)
                batch_size = self.dynamic_batch_size[epoch]
                steps_per_epoch = -(-total_images // batch_size)
                from_epoch = epoch

            # Add last epochs
            self.total_steps += steps_per_epoch * (self.epochs - from_epoch)

        # Configure Validation data loader
        val_dir = config.get("val_dir", "val")
        val_batch_size = config.get("val_batch_size", self.batch_size)
        self.val_loader = _create_validation_dataloader(
            data_dir=data_dir,
            val_dir=val_dir,
            batch_size=val_batch_size,
            workers=workers,
            num_classes=num_classes,
        )

        # Configure learning rate scheduler
        lr_scheduler_class = config.get("lr_scheduler_class", None)
        if lr_scheduler_class is not None:
            lr_scheduler_args = config.get("lr_scheduler_args", {})
            if self.rank == 0:
                self.logger.debug("LR Scheduler args:")
                self.logger.debug(pformat(lr_scheduler_args))
            self.lr_scheduler = _create_lr_scheduler(
                optimizer=self.optimizer,
                lr_scheduler_class=lr_scheduler_class,
                lr_scheduler_args=lr_scheduler_args,
                total_steps=self.total_steps,
                steps_per_epoch=self.total_batches)
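
To make the dynamic-batch-size bookkeeping above concrete, here is a small worked example under assumed values (a three-milestone schedule and 1,281,167 training images, roughly ImageNet-sized); it mirrors the ceil-division and LR-scaling arithmetic in the code:

dynamic_batch_size = {0: 128, 15: 256, 30: 512}    # epoch -> batch size (assumed)
milestones = sorted(dynamic_batch_size)            # [0, 15, 30]
initial_batch_size = dynamic_batch_size[0]         # 128
lr_scale = {0: 1.0, 15: 256 / 128, 30: 512 / 128}  # {0: 1.0, 15: 2.0, 30: 4.0}

# Total OneCycleLR steps for epochs=45, using the same ceil division
# -(-total_images // batch_size):
# epochs  0-14: -(-1281167 // 128) = 10010 steps/epoch -> 150150 steps
# epochs 15-29: -(-1281167 // 256) =  5005 steps/epoch ->  75075 steps
# epochs 30-44: -(-1281167 // 512) =  2503 steps/epoch ->  37545 steps
# total_steps = 150150 + 75075 + 37545 = 262770
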
    def __init__(self, config):
        """Called once at the beginning of each experiment."""
        super(MNISTSparseExperiment, self).__init__()
        self.start_time = time.time()
        self.logger = get_logger(config["name"], config.get("verbose", 2))
        self.logger.debug("Config: %s", config)

        # Setup random seed
        seed = config["seed"]
        set_random_seed(seed)

        self.data_dir = config["data_dir"]
        self.batch_size = config["batch_size"]
        self.test_batch_size = config["test_batch_size"]
        self.first_epoch_batch_size = config["first_epoch_batch_size"]
        self.validation = config.get("validation", 50000.0 / 60000.0)
        self.learning_rate_factor = config["learning_rate_factor"]
        self.lr_scheduler_params = config.get("lr_scheduler_params", None)

        self._configure_dataloaders()

        # Configure Model
        cnn_input_shape = config.get("cnn_input_shape", (1, 28, 28))
        linear_n = config["linear_n"]
        linear_percent_on = config["linear_percent_on"]
        cnn_out_channels = config["cnn_out_channels"]
        cnn_percent_on = config["cnn_percent_on"]
        boost_strength = config["boost_strength"]
        weight_sparsity = config["weight_sparsity"]
        cnn_weight_sparsity = config["cnn_weight_sparsity"]
        boost_strength_factor = config["boost_strength_factor"]
        k_inference_factor = config["k_inference_factor"]
        use_batch_norm = config["use_batch_norm"]
        dropout = config.get("dropout", 0.0)

        model = nn.Sequential()

        # Add CNN Layers
        input_shape = cnn_input_shape
        cnn_layers = len(cnn_out_channels)
        if cnn_layers > 0:
            for i in range(cnn_layers):
                in_channels, height, width = input_shape
                add_sparse_cnn_layer(
                    network=model,
                    suffix=i + 1,
                    in_channels=in_channels,
                    out_channels=cnn_out_channels[i],
                    use_batch_norm=use_batch_norm,
                    weight_sparsity=cnn_weight_sparsity,
                    percent_on=cnn_percent_on[i],
                    k_inference_factor=k_inference_factor,
                    boost_strength=boost_strength,
                    boost_strength_factor=boost_strength_factor,
                )

                # Feed this layer output into next layer input
                in_channels = cnn_out_channels[i]

                # Compute next layer input shape
                wout = (width - 5) + 1
                maxpool_width = wout // 2
                input_shape = (in_channels, maxpool_width, maxpool_width)

        # Flatten CNN output before passing to linear layer
        model.add_module("flatten", Flatten())

        # Add Linear layers
        input_size = np.prod(input_shape)
        for i in range(len(linear_n)):
            add_sparse_linear_layer(
                network=model,
                suffix=i + 1,
                input_size=input_size,
                linear_n=linear_n[i],
                dropout=dropout,
                use_batch_norm=False,
                weight_sparsity=weight_sparsity,
                percent_on=linear_percent_on[i],
                k_inference_factor=k_inference_factor,
                boost_strength=boost_strength,
                boost_strength_factor=boost_strength_factor,
            )
            input_size = linear_n[i]

        # Output layer
        model.add_module("output", nn.Linear(input_size, 10))
        model.add_module("softmax", nn.LogSoftmax(dim=1))

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
            model = model.cuda()
        else:
            self.device = torch.device("cpu")

        if torch.cuda.device_count() > 1:
            self.logger.debug("Using", torch.cuda.device_count(), "GPUs")
            model = torch.nn.DataParallel(model)

        self.model = model
        self.logger.debug("Model: %s", self.model)
        self.learning_rate = config["learning_rate"]
        self.momentum = config["momentum"]

        self.batches_in_epoch = config["batches_in_epoch"]
        self.batches_in_first_epoch = config["batches_in_first_epoch"]
        self.config = config

        self.optimizer = self._create_optimizer(name=config["optimizer"],
                                                model=self.model)
        self.lr_scheduler = self._create_learning_rate_scheduler(
            name=config.get("lr_scheduler", None), optimizer=self.optimizer)
    def setup_experiment(self, config):
        """
        Configure the experiment for training

        :param config: Dictionary containing the configuration parameters

            - distributed: Whether or not to use Pytorch Distributed training
            - backend: Pytorch Distributed backend ("nccl", "gloo")
                    Default: nccl
            - world_size: Total number of processes participating
            - rank: Rank of the current process
            - data: Dataset path
            - train_dir: Dataset training data relative path
            - batch_size: Training batch size
            - val_dir: Dataset validation data relative path
            - val_batch_size: Validation batch size
            - workers: how many data loading processes to use
            - train_loader_drop_last: Whether to skip last batch if it is
                                      smaller than the batch size
            - num_classes: Limit the dataset size to the given number of classes
            - model_class: Model class. Must inherit from "torch.nn.Module"
            - model_args: model class arguments passed to the constructor
            - init_batch_norm: Whether or not to initialize running batch norm
                               mean to 0.
            - optimizer_class: Optimizer class.
                               Must inherit from "torch.optim.Optimizer"
            - optimizer_args: Optimizer class arguments passed to the
                              constructor
            - batch_norm_weight_decay: Whether or not to apply weight decay to
                                       batch norm modules parameters
                                       See https://arxiv.org/abs/1807.11205
            - bias_weight_decay: Whether or not to apply weight decay to
                                 bias parameters
            - lr_scheduler_class: Learning rate scheduler class.
                                  Must inherit from "_LRScheduler"
            - lr_scheduler_args: Learning rate scheduler class arguments
                                 passed to the constructor
            - lr_scheduler_step_every_batch: Whether to step the lr-scheduler
                                             after every batch (e.g. for OneCycleLR)
            - loss_function: Loss function. See "torch.nn.functional"
            - local_dir: Results path
            - logdir: Directory generated by Ray Tune for this Trial
            - epochs: Number of epochs to train
            - batches_in_epoch: Number of batches per epoch.
                                Useful for debugging
            - log_timestep_freq: Configures mixins and subclasses that log every
                                 timestep to only log every nth timestep (in
                                 addition to the final timestep of each epoch).
                                 Set to 0 to log only at the end of each epoch.
            - progress: Show progress during training
            - name: Experiment name. Used as logger name
            - log_level: Python Logging level
            - log_format: Python Logging format
            - seed: the seed to be used for pytorch, python, and numpy
            - mixed_precision: Whether or not to enable apex mixed precision
            - mixed_precision_args: apex mixed precision arguments.
                                    See "amp.initialize"
            - sample_transform: Transform acting on the training samples. To be used
                                additively after default transform or auto-augment.
            - target_transform: Transform acting on the training targets.
            - replicas_per_sample: Number of replicas to create per sample in the batch.
                                   (each replica is transformed independently)
                                   Used in maxup.
            - train_model_func: Optional user defined function to train the model,
                                expected to behave similarly to `train_model`
                                in terms of input parameters and return values
            - evaluate_model_func: Optional user defined function to validate the model
                                   expected to behave similarly to `evaluate_model`
                                   in terms of input parameters and return values
            - checkpoint_file: if not None, will start from this model. The model
                               must have the same model_args and model_class as the
                               current experiment.
            - load_checkpoint_args: args to be passed to `load_state_from_checkpoint`
            - checkpoint_at_init: boolean argument for whether to create a checkpoint
                                  of the initialized model. This differs from
                                  `checkpoint_at_start`, for which the checkpoint occurs
                                  after the first epoch of training as opposed to
                                  before it.
            - epochs_to_validate: list of epochs to run validate(). A -1 asks
                                  to run validate before any training occurs.
                                  Default: last three epochs.
            - extra_validations_per_epoch: number of additional validations to
                                           perform mid-epoch. Additional
                                           validations are distributed evenly
                                           across training batches.
            - launch_time: time the config was created (via time.time). Used to report
                           wall clock time until the first batch is done.
                           Default: time.time() in this setup_experiment().
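
            A minimal illustrative config, assuming a hypothetical `MyModel`
            nn.Module subclass (values are examples, not defaults; every key
            used here is documented above)::

                config = dict(
                    model_class=MyModel,
                    model_args=dict(num_classes=10),
                    optimizer_class=torch.optim.SGD,
                    optimizer_args=dict(lr=0.1, momentum=0.9, weight_decay=1e-4),
                    data="~/datasets",
                    batch_size=128,
                    epochs=30,
                    seed=42,
                )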
        """
        # Configure logging related stuff
        log_format = config.get("log_format", logging.BASIC_FORMAT)
        log_level = getattr(logging, config.get("log_level", "INFO").upper())
        console = logging.StreamHandler()
        console.setFormatter(logging.Formatter(log_format))
        self.logger = logging.getLogger(config.get("name", type(self).__name__))
        self.logger.setLevel(log_level)
        self.logger.addHandler(console)
        self.progress = config.get("progress", False)
        self.launch_time = config.get("launch_time", time.time())
        self.logdir = config.get("logdir", None)

        # Configure seed
        self.seed = config.get("seed", self.seed)
        set_random_seed(self.seed, False)

        # Configure distributed pytorch
        self.distributed = config.get("distributed", False)
        self.rank = config.get("rank", 0)

        if self.rank == 0:
            self.logger.info(
                f"Execution order: {pformat(self.get_execution_order())}")

        if self.distributed:
            dist_url = config.get("dist_url", "tcp://127.0.0.1:54321")
            backend = config.get("backend", "nccl")
            world_size = config.get("world_size", 1)
            dist.init_process_group(
                backend=backend,
                init_method=dist_url,
                rank=self.rank,
                world_size=world_size,
            )
            # Only enable logs from first process
            self.logger.disabled = self.rank != 0
            self.progress = self.progress and self.rank == 0

        # Configure model
        self.device = config.get("device", self.device)
        self.model = self.create_model(config, self.device)
        self.transform_model()

        if self.rank == 0:
            self.logger.debug(self.model)

        # Configure optimizer
        group_decay, group_no_decay = [], []
        for module in self.model.modules():
            for name, param in module.named_parameters(recurse=False):
                if self.should_decay_parameter(module, name, param, config):
                    group_decay.append(param)
                else:
                    group_no_decay.append(param)

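        # The second parameter group below overrides weight_decay=0 so that
        # parameters excluded by should_decay_parameter (typically batch-norm
        # and bias parameters, per batch_norm_weight_decay / bias_weight_decay
        # above) are not decayed, while group_decay keeps the weight decay
        # given in optimizer_args.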
        optimizer_class = config.get("optimizer_class", torch.optim.SGD)
        optimizer_args = config.get("optimizer_args", {})
        self.optimizer = optimizer_class([dict(params=group_decay),
                                          dict(params=group_no_decay,
                                               weight_decay=0.)],
                                         **optimizer_args)

        # Validate mixed precision requirements
        self.mixed_precision = config.get("mixed_precision", False)
        if self.mixed_precision and amp is None:
            self.mixed_precision = False
            self.logger.error(
                "Mixed precision requires NVIDIA APEX. "
                "Please install apex from https://www.github.com/nvidia/apex. "
                "Disabling mixed precision training.")

        # Configure mixed precision training
        if self.mixed_precision:
            amp_args = config.get("mixed_precision_args", {})
            self.model, self.optimizer = amp.initialize(
                self.model, self.optimizer, **amp_args)
            self.logger.info("Using mixed precision")

        # Apply DistributedDataParallel after all other model mutations
        if self.distributed:
            self.model = DistributedDataParallel(self.model)
        else:
            self.model = DataParallel(self.model)

        self._loss_function = config.get(
            "loss_function", torch.nn.functional.cross_entropy
        )

        self.num_classes = config.get("num_classes", 1000)
        self.epochs = config.get("epochs", 1)
        self.batches_in_epoch = config.get("batches_in_epoch", sys.maxsize)
        self.current_epoch = 0

        # Get initial batch size
        self.batch_size = config.get("batch_size", 1)

        # CUDA runtime does not support the fork start method.
        # See https://pytorch.org/docs/stable/notes/multiprocessing.html
        multiprocessing.set_start_method("spawn", force=True)

        # Configure data loaders
        self.train_loader = self.create_train_dataloader(config)
        self.val_loader = self.create_validation_dataloader(config)
        self.total_batches = len(self.train_loader)

        self.epochs_to_validate = config.get("epochs_to_validate",
                                             range(self.epochs - 3,
                                                   self.epochs + 1))

        extra_validations = config.get("extra_validations_per_epoch", 0)
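        # Spread the extra validations evenly over the epoch. For example, with
        # 100 batches per epoch and extra_validations=3, the linspace below
        # yields [25, 50, 75, 100]; the final index (the end-of-epoch
        # validation) is dropped by the [:-1] slice.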
        batches_to_validate = np.linspace(
            min(self.total_batches, self.batches_in_epoch),
            0,
            1 + extra_validations,
            endpoint=False
        )[::-1].round().astype("int").tolist()
        self.additional_batches_to_validate = batches_to_validate[:-1]
        if extra_validations > 0:
            self.logger.info(
                f"Extra validations per epoch: {extra_validations}, "
                f"batch indices: {self.additional_batches_to_validate}")

        # Used for logging. Conceptually, it is a version number for the model's
        # parameters. By default, this is the elapsed number of batches that the
        # model has been trained on. Experiments may also increment this on
        # other events like model prunings. When validation is performed after a
        # training batch, the validation results are assigned to the next
        # timestep after that training batch, since it was performed on the
        # subsequent version of the parameters.
        self.current_timestep = 0
        self.log_timestep_freq = config.get("log_timestep_freq", 1)

        # A list of [(timestep, result), ...] for the current epoch.
        self.extra_val_results = []

        # Configure learning rate scheduler
        self.lr_scheduler = self.create_lr_scheduler(
            config, self.optimizer, self.total_batches)
        if self.lr_scheduler is not None:
            lr_scheduler_class = self.lr_scheduler.__class__.__name__
            lr_scheduler_args = config.get("lr_scheduler_args", {})
            self.logger.info("LR Scheduler class: " + lr_scheduler_class)
            self.logger.info("LR Scheduler args:")
            self.logger.info(pformat(lr_scheduler_args))
            self.logger.info("steps_per_epoch=%s", self.total_batches)

        self.step_lr_every_batch = config.get("lr_scheduler_step_every_batch", False)
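        # OneCycleLR and ComposedLRScheduler define their schedules per
        # optimizer step, so they are always stepped after every batch
        # regardless of the lr_scheduler_step_every_batch flag.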
        if isinstance(self.lr_scheduler, (OneCycleLR, ComposedLRScheduler)):
            self.step_lr_every_batch = True

        # Set train and validate methods.
        self.train_model = config.get("train_model_func", train_model)
        self.evaluate_model = config.get("evaluate_model_func", evaluate_model)
Example #24
import ray
import torch

from nupic.research.archive.dynamic_sparse.common.ray_custom_loggers import (
    DEFAULT_LOGGERS,
)
from nupic.research.archive.dynamic_sparse.common.utils import (
    Trainable,
    new_experiment,
    run_experiment,
)
from nupic.research.frameworks.pytorch.model_utils import set_random_seed

# Set seed for `random`, `numpy`, and `pytorch`.
set_random_seed(32)


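# Custom (de)serialization helpers, presumably for registering with Ray's
# object store: CUDA tensors are moved to the CPU and converted to numpy
# arrays, and deserialization is a pass-through.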
def serializer(obj):
    if obj.is_cuda:
        return obj.cpu().numpy()
    else:
        return obj.numpy()


def deserializer(serialized_obj):
    return serialized_obj


# experiment configurations
base_exp_config = dict(
    def setup_experiment(self, config):
        """
        Configure the experiment for training

        :param config: Dictionary containing the configuration parameters

            - distributed: Whether or not to use Pytorch Distributed training
            - backend: Pytorch Distributed backend ("nccl", "gloo")
            - world_size: Total number of processes participating
            - rank: Rank of the current process
            - data: Dataset path
            - train_dir: Dataset training data relative path
            - batch_size: Training batch size
            - val_dir: Dataset validation data relative path
            - val_batch_size: Validation batch size
            - workers: how many data loading processes to use
            - num_classes: Limit the dataset size to the given number of classes
            - model_class: Model class. Must inherit from "torch.nn.Module"
            - model_args: model class arguments passed to the constructor
            - init_batch_norm: Whether or not to initialize running batch norm
                               mean to 0.
            - optimizer_class: Optimizer class.
                               Must inherit from "torch.optim.Optimizer"
            - optimizer_args: Optimizer class arguments passed to the
                              constructor
            - batch_norm_weight_decay: Whether or not to apply weight decay to
                                       batch norm modules parameters
            - lr_scheduler_class: Learning rate scheduler class.
                                  Must inherit from "_LRScheduler"
            - lr_scheduler_args: Learning rate scheduler class arguments
                                 passed to the constructor
            - loss_function: Loss function. See "torch.nn.functional"
            - local_dir: Results path
            - epochs: Number of epochs to train
            - batches_in_epoch: Number of batches per epoch.
                                Useful for debugging
            - progress: Show progress during training
            - profile: Whether or not to enable torch.autograd.profiler.profile
                       during training
            - name: Experiment name. Used as logger name
            - log_level: Python Logging level
            - log_format: Python Logging format
            - seed: the seed to be used for pytorch, python, and numpy
            - mixed_precision: Whether or not to enable apex mixed precision
            - mixed_precision_args: apex mixed precision arguments.
                                    See "amp.initialize"
            - init_hooks: list of hooks (functions) to call on the model
                          just following its initialization
            - post_epoch_hooks: list of hooks (functions) to call on the model
                                following each epoch of training
            - checkpoint_file: if not None, will start from this model. The model
                               must have the same model_args and model_class as the
                               current experiment.
            - validate_after_epoch: will only run validate after this epoch.
                                    Default: epochs - 3
        """
        # Configure logger
        log_format = config.get("log_format", logging.BASIC_FORMAT)
        log_level = getattr(logging, config.get("log_level", "INFO").upper())
        console = logging.StreamHandler()
        console.setFormatter(logging.Formatter(log_format))
        self.logger = logging.getLogger(config.get("name",
                                                   type(self).__name__))
        self.logger.setLevel(log_level)
        self.logger.addHandler(console)
        self.progress = config.get("progress", False)

        # Configure seed
        self.seed = config.get("seed", self.seed)
        set_random_seed(self.seed, False)

        # Configure distributed pytorch
        self.distributed = config.get("distributed", False)
        self.rank = config.get("rank", 0)
        if self.distributed:
            dist_url = config.get("dist_url", "tcp://127.0.0.1:54321")
            backend = config.get("backend", "nccl")
            world_size = config.get("world_size", 1)
            dist.init_process_group(
                backend=backend,
                init_method=dist_url,
                rank=self.rank,
                world_size=world_size,
            )
            # Only enable logs from first process
            self.logger.disabled = self.rank != 0
            self.progress = self.progress and self.rank == 0

        # Configure model
        model_class = config["model_class"]
        model_args = config.get("model_args", {})
        init_batch_norm = config.get("init_batch_norm", False)
        init_hooks = config.get("init_hooks", None)
        self.model = create_model(model_class=model_class,
                                  model_args=model_args,
                                  init_batch_norm=init_batch_norm,
                                  device=self.device,
                                  init_hooks=init_hooks,
                                  checkpoint_file=config.get(
                                      "checkpoint_file", None))
        if self.rank == 0:
            self.logger.debug(self.model)
            params_sparse, nonzero_params_sparse2 = count_nonzero_params(
                self.model)
            self.logger.debug("Params total/nnz %s / %s = %s ", params_sparse,
                              nonzero_params_sparse2,
                              float(nonzero_params_sparse2) / params_sparse)

        # Configure optimizer
        optimizer_class = config.get("optimizer_class", torch.optim.SGD)
        optimizer_args = config.get("optimizer_args", {})
        batch_norm_weight_decay = config.get("batch_norm_weight_decay", True)
        self.optimizer = create_optimizer(
            model=self.model,
            optimizer_class=optimizer_class,
            optimizer_args=optimizer_args,
            batch_norm_weight_decay=batch_norm_weight_decay,
        )

        # Validate mixed precision requirements
        self.mixed_precision = config.get("mixed_precision", False)
        if self.mixed_precision and amp is None:
            self.mixed_precision = False
            self.logger.error(
                "Mixed precision requires NVIDIA APEX. "
                "Please install apex from https://www.github.com/nvidia/apex. "
                "Disabling mixed precision training.")

        # Configure mixed precision training
        if self.mixed_precision:
            amp_args = config.get("mixed_precision_args", {})
            self.model, self.optimizer = amp.initialize(
                self.model, self.optimizer, **amp_args)
            self.logger.info("Using mixed precision")

        # Apply DistributedDataParallel after all other model mutations
        if self.distributed:
            self.model = DistributedDataParallel(self.model)
        else:
            self.model = DataParallel(self.model)

        self.loss_function = config.get("loss_function",
                                        torch.nn.functional.cross_entropy)

        # Configure data loaders
        self.epochs = config.get("epochs", 1)
        self.batches_in_epoch = config.get("batches_in_epoch", sys.maxsize)
        self.validate_after_epoch = config.get("validate_after_epoch",
                                               self.epochs - 3)
        workers = config.get("workers", 0)
        data_dir = config["data"]
        train_dir = config.get("train_dir", "train")
        num_classes = config.get("num_classes", 1000)

        # Get initial batch size
        self.batch_size = config.get("batch_size", 1)

        # CUDA runtime does not support the fork start method.
        # See https://pytorch.org/docs/stable/notes/multiprocessing.html
        if torch.cuda.is_available():
            multiprocessing.set_start_method("spawn")

        # Configure Training data loader
        self.train_loader = create_train_dataloader(
            data_dir=data_dir,
            train_dir=train_dir,
            batch_size=self.batch_size,
            workers=workers,
            distributed=self.distributed,
            num_classes=num_classes,
            use_auto_augment=config.get("use_auto_augment", False),
        )
        self.total_batches = len(self.train_loader)

        # Configure Validation data loader
        val_dir = config.get("val_dir", "val")
        val_batch_size = config.get("val_batch_size", self.batch_size)
        self.val_loader = create_validation_dataloader(
            data_dir=data_dir,
            val_dir=val_dir,
            batch_size=val_batch_size,
            workers=workers,
            num_classes=num_classes,
        )

        # Configure learning rate scheduler
        lr_scheduler_class = config.get("lr_scheduler_class", None)
        if lr_scheduler_class is not None:
            lr_scheduler_args = config.get("lr_scheduler_args", {})
            self.logger.debug("LR Scheduler args:")
            self.logger.debug(pformat(lr_scheduler_args))
            self.lr_scheduler = create_lr_scheduler(
                optimizer=self.optimizer,
                lr_scheduler_class=lr_scheduler_class,
                lr_scheduler_args=lr_scheduler_args,
                steps_per_epoch=self.total_batches)

        # Only profile from rank 0
        self.profile = config.get("profile", False) and self.rank == 0

        # Register post-epoch hooks. To be used as `self.model.apply(post_epoch_hook)`
        self.post_epoch_hooks = config.get("post_epoch_hooks", [])