def exp_process_function(
    thread_device_id: int,
    thread_exp_id: int,
    thread_exp_args: Dict,
    res_q: mp.Queue,
):
    logging_utils.setup_unique_log_file(
        root_folder_path=thread_exp_args["prune_out_folder_path"],
        file_name_format=FILE_NAME_FORMAT_SUB_LOG,
    )
    logger = logging_utils.get_logger(LOGGER_NAME)
    logger.info(""">>> process <<<
    thread_device_id: {thread_device_id}
    thread_exp_id: {thread_exp_id}
    thread_exp_args: {thread_exp_args}
    """.format(
        thread_device_id=thread_device_id,
        thread_exp_id=thread_exp_id,
        thread_exp_args=thread_exp_args,
    ))

    with torch.cuda.device(thread_device_id):
        res = run_single_experiment(**thread_exp_args)
        res_q.put(res, block=True)
        res_q.close()
        clear_mem(logger)
def main() -> None:
    args = get_args()

    # Logging.
    # Assuming that an existing log file means that the corresponding results
    # file will be taken.
    datetime_string: Text = logging_utils.setup_unique_log_file(
        root_folder_path=args.out_folder,
        file_name_format=FILE_NAME_FORMAT_MAIN_LOG,
    )
    logger = logging_utils.get_logger(LOGGER_NAME)
    logger.info(args)

    exp_config: exp_config_utils.ExpConfig = exp_config_utils.get_config_from_file(
        config_file_loc=args.exp_config)

    model_config: Optional[model_config_utils.ModelConfig] = None
    if args.model_config:
        model_config = model_config_utils.get_config_from_file(
            config_file_loc=args.model_config)

    # Set mp 'spawn' method for torch.
    mp.set_start_method("spawn")

    # Run everything.
    run_experiments(
        exp_config=exp_config,
        model_checkpoint_path=args.model_checkpoint,
        out_folder_path=args.out_folder,
        model_config=model_config,
        datetime_string=datetime_string,
    )
def get_layer_craig_subset(layer: Union[nn.Linear, nn.Conv2d],
                           original_num_nodes: int,
                           prune_percent_per_layer: float,
                           similarity_metric: Union[Text, Dict] = "",
                           prune_type: Text = "craig",
                           **kwargs) -> Tuple[List[int], List[float]]:
    logger = logging_utils.get_logger(LOGGER_NAME)

    assert (0 <= prune_percent_per_layer) and (
        prune_percent_per_layer <= 1
    ), "prune_percent_per_layer ({}) must be within [0,1]".format(
        prune_percent_per_layer)
    assert prune_type in (
        "craig",
        "random",
    ), "prune_type must be 'craig' or 'random'"
    assert (prune_type == "random") or (
        similarity_metric
    ), "similarity_metric must be set for prune_type '{}'".format(prune_type)

    target_num_nodes: int = math.ceil(
        (1 - prune_percent_per_layer) * original_num_nodes)

    subset_nodes: List
    subset_weights: List
    if prune_type == "random":
        subset_nodes = random.sample(list(range(original_num_nodes)),
                                     target_num_nodes)
        subset_weights = [1 for _ in subset_nodes]
    else:
        # Assumes similarity_metric is set correctly.
        similarity_matrix: Any
        if isinstance(similarity_metric, dict):
            similarity_matrix = getattr(
                SimilarityMetrics,
                similarity_metric["name"])(layer=layer, **similarity_metric)
        else:
            similarity_matrix = getattr(SimilarityMetrics,
                                        similarity_metric)(layer=layer)

        (
            subset_nodes,
            subset_weights,
            craig_time,
        ) = craig.get_craig_subset_and_weights(
            similarity_matrix=similarity_matrix, target_size=target_num_nodes)
        logger.info("craig runtime (s): {}".format(craig_time))

    return subset_nodes, subset_weights
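
# Minimal usage sketch (added for illustration; not part of the original
# module). It exercises only the "random" prune path of
# get_layer_craig_subset, which needs no similarity metric; the layer sizes
# below are arbitrary assumptions.
def _example_random_subset_usage() -> None:
    example_layer = nn.Linear(in_features=64, out_features=100)
    kept_nodes, kept_weights = get_layer_craig_subset(
        layer=example_layer,
        original_num_nodes=example_layer.out_features,
        prune_percent_per_layer=0.5,  # keep ceil(0.5 * 100) = 50 nodes
        prune_type="random",
    )
    # Random pruning returns a uniform weight of 1 for every kept node.
    assert len(kept_nodes) == 50
    assert all(w == 1 for w in kept_weights)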
def main() -> None:
    args = get_args()

    experiment_folder_path: Text = args.out_folder
    if not os.path.exists(experiment_folder_path):
        os.makedirs(experiment_folder_path)

    logging_utils.setup_logging(
        os.path.join(
            experiment_folder_path,
            "log-{}.txt".format(datetime.now().strftime("%Y_%m_%d-%H_%M_%S")),
        ))
    logger = logging_utils.get_logger(__name__)
    logger.info(args)

    model_config_or_checkpoint: Union[model_config_utils.ModelConfig, Text]
    if args.model_config:
        model_config_or_checkpoint = model_config_utils.get_config_from_file(
            args.model_config)
    elif args.checkpoint:
        model_config_or_checkpoint = args.checkpoint
    else:
        err_msg = "Either --model_config or --checkpoint must be provided."
        logger.error(err_msg)
        raise ValueError(err_msg)

    train_config: train_config_utils.TrainConfig = train_config_utils.get_config_from_file(
        args.train_config)

    with torch.cuda.device(args.cuda_device_id):
        train_model_with_configs(
            model_config_or_checkpoint=model_config_or_checkpoint,
            train_config=train_config,
            experiment_folder_path=experiment_folder_path,
            resume_training=args.resume_training,
            save_interval=args.save_interval,
            save_best_checkpoint=args.save_best_checkpoint,
            use_gpu=not args.no_cuda,
        )
def main() -> None:
    args = get_args()

    config: prune_config_utils.PruneConfig = prune_config_utils.get_config_from_file(
        args.config)

    pruned_output_folder: Text = (args.out_folder if args.out_folder else
                                  config.pruned_model_out_folder)

    # Logging.
    datetime_string: Text = datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
    logging_utils.setup_logging(log_file_loc=os.path.join(
        pruned_output_folder,
        FILE_NAME_FORMAT_LOG.format(datetime_string),
    ))
    logger = logging_utils.get_logger(LOGGER_NAME)
    logger.info(args)

    prune_network(
        prune_config=config,
        pruned_output_folder=pruned_output_folder,
        model_checkpoint_path=args.model,
    )
import json
import traceback

import boto3

from utils.aws_utils import setup_s3_client, put_template_into_s3
from utils.pipeline_utils import put_job_failure, put_job_success, continue_job_later, \
    PipelineUserParameters, PipelineStackConfig, load_pipeline_artifacts, \
    parse_override_params, get_file_from_artifact, generate_output_artifact
from utils.stack_utils import stack_exists, get_stack_status, \
    stack_delete, change_set_exists, execute_change_set, get_change_set_status, delete_change_set, create_change_set, \
    update_stack, create_stack, get_stack_output
from utils.logging_utils import get_logger

logger = get_logger()


def start_stack_create_or_update(cf, job_id, stack_name, template_url,
                                 config: PipelineStackConfig, update=False,
                                 role_arn=None):
    if update:
        status = get_stack_status(cf, stack_name)
        if status not in [
                'CREATE_COMPLETE', 'ROLLBACK_COMPLETE', 'UPDATE_COMPLETE',
                'UPDATE_ROLLBACK_COMPLETE'
        ]:
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import with_statement
from __future__ import absolute_import

import tarfile
import argparse
import json
import shutil

from utils.logging_utils import get_logger
from utils.sha_utils import get_digest_sha256

logger = get_logger(__name__)


def arg_parse():
    parser = argparse.ArgumentParser(
        description='Make resulting update docker image')
    parser.add_argument('--difftar',
                        help='diff tar previously produced',
                        type=str,
                        required=True)
    parser.add_argument('--oldimg',
                        help='old img tar',
                        type=str,
                        required=True)
    parser.add_argument('--output',
                        help='new tar to load to docker',
                        type=str,
import json
import tempfile
import zipfile
import os

import boto3

from utils.aws_utils import file_to_dict
from utils.logging_utils import get_logger

code_pipeline = boto3.client('codepipeline')
logger = get_logger()


class PipelineUserParameters:
    def __init__(self, job_data, lambda_ctx):
        """Decodes the JSON user parameters and validates the required
        properties passed into the Lambda function.

        :param job_data: The job data structure containing the UserParameters
            string, which should be a valid JSON structure
        :param lambda_ctx: Lambda context

        Possible ActionMode:
            - CREATE_UPDATE
            - DELETE_ONLY
            - REPLACE_ON_FAILURE
            - CHANGE_SET_REPLACE
            - CHANGE_SET_EXECUTE
        """
        logger.debug("getting user parameters")
        user_parameters = None
        self.TemplateFile = None
        self.TemplateArtifact = None
def run_experiments(
    exp_config: exp_config_utils.ExpConfig,
    model_checkpoint_path: Text,
    out_folder_path: Text,
    model_config: Optional[model_config_utils.ModelConfig],
    datetime_string: Text,
) -> None:
    """
    For each experiment:
    - Prune the network and save the pruned model (pruner).
    - Get train and test accuracy for the pruned model on MNIST (eval_model).
    - Fine-tune the pruned model.
    - Get train and test accuracy for the pruned+finetuned model on MNIST (eval_model).

    Accumulate size, train accuracy, and test accuracy for each experiment,
    and write them out in CSV format.
    """
    # Logging.
    logger = logging_utils.get_logger(name=LOGGER_NAME)

    # Track total run time.
    full_start_time = time.time()

    # Original model.
    original_model_name: Text = os.path.basename(model_checkpoint_path)
    original_size, original_train_acc, original_test_acc = evaluate_model(
        model_path=model_checkpoint_path,
        dataset_name=exp_config.evaluation_dataset_name,
        batch_size=exp_config.evaluation_dataset_batch_size,
        model_size_type="numel",
    )
    original_model_results: List = [
        original_size,
        original_train_acc,
        original_test_acc,
    ]
    logger.info(
        PRINT_FORMAT.format("original", original_size, original_train_acc,
                            original_test_acc))
    clear_mem(logger)

    # Experiment results container.
    # experiment_vals: List[List] = []
    experiment_vals: Dict[int, List] = {}

    try:
        prune_type: Text = exp_config.prune_type
        if prune_type == "craig":
            run_craig_experiments(
                experiment_vals=experiment_vals,
                exp_config=exp_config,
                original_model_name=original_model_name,
                original_model_path=model_checkpoint_path,
                original_model_config=model_config,
                original_model_results=original_model_results,
                out_folder_path=out_folder_path,
                datetime_string=datetime_string,
            )
        # TODO: Make Mussay compatible again.
        # elif prune_type == "mussay":
        #     run_mussay_experiments(
        #         experiment_vals=experiment_vals,
        #         prune_out_folder_root_path=prune_out_folder_root_path,
        #         original_model_name=original_model_name,
        #         original_model_path=original_model_path,
        #         original_model_config_path=original_model_config_path,
        #         original_model_train_config_path=original_model_train_config_path,
        #         original_model_results=original_model_results,
        #         evaluation_epochs_list=evaluation_epochs_list,
        #     )
        else:
            raise ValueError("prune_type not supported: {}".format(prune_type))
    finally:
        # Write results to csv.
        logger.info("writing final results...")
        out_csv_path: Text = write_results_to_csv(
            experiment_vals=experiment_vals,
            out_folder_path=out_folder_path,
            file_name_format=FILE_NAME_FORMAT_MAIN_RESULTS,
            datetime_string=datetime_string,
        )
        logger.info("results written to: {}".format(out_csv_path))

    logger.info("Total run time: {}".format(
        str(timedelta(seconds=time.time() - full_start_time))))
def run_craig_experiments(
    experiment_vals: Dict[int, List],
    exp_config: exp_config_utils.ExpConfig,
    original_model_name: Text,
    original_model_path: Text,
    original_model_config: Optional[model_config_utils.ModelConfig],
    original_model_results: List,
    out_folder_path: Text,
    datetime_string: Text,
) -> None:
    # Logging.
    logger = logging_utils.get_logger(name=LOGGER_NAME)

    clear_mem(logger)

    # Set up multiprocessing and cuda.
    num_cuda_devices: int = torch.cuda.device_count()
    mem_per_cuda_device: List[int] = [
        torch.cuda.get_device_properties(device_id).
        total_memory  # TODO: Make sure this is the right value, in bytes.
        for device_id in range(num_cuda_devices)
    ]
    cuda_device_names: List[Text] = [
        torch.cuda.get_device_properties(device_id).name
        for device_id in range(num_cuda_devices)
    ]
    max_model_size: int = -1 if (exp_config.cuda_model_max_mb == -1) else (
        1000 * 1000 * exp_config.cuda_model_max_mb)
    max_procs_per_device: List[int] = [
        int(  # Take the floor.
            1 if (max_model_size == -1) else
            ((mem * exp_config.cuda_max_percent_mem_usage) / max_model_size))
        for mem in mem_per_cuda_device
    ]
    max_process_count: int = sum(max_procs_per_device)
    logger.info("Found the following cuda devices: {}".format([
        "(name='{cdn}', mem={cdm}, max_exp={cde})".format(cdn=cdn,
                                                          cdm=cdm,
                                                          cde=cde)
        for cdn, cdm, cde in zip(cuda_device_names, mem_per_cuda_device,
                                 max_procs_per_device)
    ]))

    # TODO: Give each config an id. Allow each process to save its results to a list.

    # Set up root configs.
    prune_config_root: prune_config_utils.PruneConfig = prune_config_utils.PruneConfig(
        {
            "prune_type": "craig",
            "prune_params": {},
            "original_model_path": original_model_path,
        })
    if "model_input_shape" in exp_config._raw_dict:
        prune_config_root.model_input_shape = exp_config.model_input_shape
    if "data_transform_name" in exp_config._raw_dict:
        prune_config_root.data_transform_name = exp_config.data_transform_name

    finetuning_train_config: train_config_utils.TrainConfig = exp_config.finetuning_train_config

    # Create experiment parameters.
    # prune_layer_params: OrderedDict = OrderedDict(
    prune_layer_params: Dict = exp_config.prune_params[
        prune_config_utils.KEY_LAYER_PARAMS]
    prune_param_values: List = []
    layer_name_map: List[Text] = []
    param_name_map: List[Text] = []
    for layer_name, layer_params in prune_layer_params.items():
        for param_name, param_list in layer_params.items():
            prune_param_values.append(param_list)
            layer_name_map.append(layer_name)
            param_name_map.append(param_name)
    exp_value_permutations: List[List] = list(
        itertools.product(*prune_param_values))

    # Create list of experiment function arguments.
    exp_function_arguments: List[Dict] = []
    exp_names: List[Text] = []
    for exp_id, param_permutation in enumerate(exp_value_permutations):
        # Start with an exp_id of 0.

        # Build layer params from this param_permutation.
        exp_layer_params: Dict = {}
        for exp_param_ind, exp_param in enumerate(param_permutation):
            exp_param_dict = exp_layer_params.setdefault(
                layer_name_map[exp_param_ind], {})
            exp_param_dict[param_name_map[exp_param_ind]] = exp_param
        prune_config_root.prune_params = {
            prune_config_utils.KEY_LAYER_PARAMS: exp_layer_params
        }

        # Create experiment name.
        exp_name_temp_list = []
        for e_layer_name, e_layer in exp_layer_params.items():
            e_params = [
                get_exp_str_from_param(e_p) for e_p in e_layer.values()
            ]
            exp_name_temp_list.append("{}-{}".format(e_layer_name,
                                                     "_".join(e_params)))
        exp_name = "--".join(exp_name_temp_list)

        # Name the output folder after the experiment name.
        prune_out_folder_path: Text = os.path.join(out_folder_path, exp_name)

        exp_names.append(exp_name)
        exp_function_arguments.append(
            dict(
                exp_id=exp_id,
                prune_config=prune_config_utils.PruneConfig(
                    prune_config_root._raw_dict.copy()),
                prune_out_folder_path=prune_out_folder_path,
                finetuning_train_config=finetuning_train_config,
                original_model_config=original_model_config,
                evaluation_epochs_list=exp_config.evaluation_epochs,
            ))
    logger.info("All experiment configs: {}".format(exp_function_arguments))

    num_experiments_total: int = len(exp_function_arguments)
    num_experiments_complete: int = 0
    next_exp_id: int = 0
    processes_per_device: List[Dict] = [{} for i in range(num_cuda_devices)]

    # Results queue.
    exp_results_q: mp.Queue = mp.Queue()

    def exp_thread_function(thread_device_id: int, thread_exp_id: int,
                            thread_exp_args: Dict):
        # NOTE: Using an mp.Queue because it is process-safe; this is not the
        # most elegant solution.
        thread_q: mp.Queue = mp.Queue()
        proc = mp.Process(
            target=exp_process_function,
            kwargs=dict(
                thread_device_id=thread_device_id,
                thread_exp_id=thread_exp_id,
                thread_exp_args=thread_exp_args,
                res_q=thread_q,
            ),
        )
        try:
            proc.start()
            proc.join()
        except Exception as e:
            logger.error(e, exc_info=True)
        finally:
            # Adding this empty result so the thread can exit if something breaks.
            thread_q.put([])
        exp_results_q.put(
            (thread_device_id, thread_exp_id, thread_q.get(block=True)))
        thread_q.close()

    # First, attempt to start new processes.
    for device_id, max_procs in enumerate(max_procs_per_device):
        if next_exp_id >= num_experiments_total:
            break
        for pid in range(max_procs):
            if next_exp_id >= num_experiments_total:
                break
            thread = threading.Thread(
                target=exp_thread_function,
                kwargs=dict(
                    thread_device_id=device_id,
                    thread_exp_id=next_exp_id,
                    thread_exp_args=exp_function_arguments[next_exp_id],
                ),
            )
            thread.start()
            processes_per_device[device_id][next_exp_id] = thread
            next_exp_id += 1

    # Now, continually check for free devices.
    while num_experiments_complete < num_experiments_total:
        # Since there are still experiments to complete, just wait for results.
        next_result = exp_results_q.get(block=True)
        res_device_id, res_exp_id, res_vals = next_result
        logger.info("Got result for {} : {}".format(exp_names[res_exp_id],
                                                    next_result))

        # If a result is available, then an experiment is done.
        # Join on the thread, then remove it from the list.
        processes_per_device[res_device_id][res_exp_id].join()
        del processes_per_device[res_device_id][res_exp_id]

        # Increment the completion count.
        num_experiments_complete += 1

        if not res_vals:
            logger.warning(
                "Results were empty so exp may have failed. Not saving.")
        else:
            # Add results to total results.
            experiment_vals[res_exp_id] = ([
                res_exp_id,
                original_model_name,
                exp_names[res_exp_id],
                "",
            ] + original_model_results.copy() + res_vals)

            # Incrementally save experiment_vals.
            write_results_to_csv(
                experiment_vals=experiment_vals,
                out_folder_path=out_folder_path,
                file_name_format=FILE_NAME_FORMAT_MAIN_RESULTS,
                datetime_string=datetime_string,
            )

        logger.info("Jobs complete: {}/{} ({:.2%})".format(
            num_experiments_complete,
            num_experiments_total,
            num_experiments_complete / num_experiments_total,
        ))

        # Add a new process to this device, if needed.
        if next_exp_id < num_experiments_total:
            num_seconds_to_sleep: float = 5
            logger.info(
                "Sleeping for {} seconds to allow VRAM to clear...".format(
                    num_seconds_to_sleep))
            time.sleep(num_seconds_to_sleep)
            logger.info("Waking up from sleep.")

            logger.info("Starting next exp on device {} : ({}) {}".format(
                res_device_id, next_exp_id, exp_names[next_exp_id]))
            thread = threading.Thread(
                target=exp_thread_function,
                kwargs=dict(
                    thread_device_id=res_device_id,
                    thread_exp_id=next_exp_id,
                    thread_exp_args=exp_function_arguments[next_exp_id],
                ),
            )
            thread.start()
            processes_per_device[res_device_id][next_exp_id] = thread
            next_exp_id += 1

        # Print running procs for reference.
        logger.info("Current process count per device: {}".format(
            [len(procs) for procs in processes_per_device]))
        logger.info("Current experiment IDs per device: {}".format(
            [list(procs.keys()) for procs in processes_per_device]))

    logger.info("All {}/{} jobs completed".format(num_experiments_complete,
                                                  num_experiments_total))
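
# Minimal, self-contained sketch (added for illustration; not part of the
# original module) of the scheduling pattern used above: each experiment runs
# in its own spawned process, a wrapper thread joins that process and forwards
# its result through a shared multiprocessing.Queue, and the main loop blocks
# on that queue to collect completions (in the real scheduler this is how a
# freed device slot is detected; the per-device slot accounting is omitted
# here). _demo_worker is a stand-in for exp_process_function.
import multiprocessing as _mp
import threading as _threading


def _demo_worker(task_id: int, res_q: _mp.Queue) -> None:
    # Stand-in for the real experiment: just echo the task id.
    res_q.put([task_id])


def _demo_scheduler(num_tasks: int = 3) -> None:
    results_q: _mp.Queue = _mp.Queue()

    def _wrapper(task_id: int) -> None:
        local_q: _mp.Queue = _mp.Queue()
        proc = _mp.Process(target=_demo_worker, args=(task_id, local_q))
        proc.start()
        proc.join()
        # Forward the per-process result to the shared queue.
        results_q.put((task_id, local_q.get(block=True)))

    threads = [
        _threading.Thread(target=_wrapper, args=(task_id,))
        for task_id in range(num_tasks)
    ]
    for t in threads:
        t.start()
    for _ in range(num_tasks):
        task_id, vals = results_q.get(block=True)
        print("task {} finished with {}".format(task_id, vals))
    for t in threads:
        t.join()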
def run_single_experiment(
    exp_id: int,
    prune_config: prune_config_utils.PruneConfig,
    prune_out_folder_path: Text,
    finetuning_train_config: train_config_utils.TrainConfig,
    original_model_config: Optional[model_config_utils.ModelConfig],
    evaluation_epochs_list: Sequence[Union[Text, int]],
) -> List:
    # Logging.
    logger = logging_utils.get_logger(name=LOGGER_NAME)
    logger.info("Starting experiment with exp_id: {}".format(exp_id))

    # Set up prune folder.
    if not os.path.exists(prune_out_folder_path):
        os.makedirs(prune_out_folder_path)

    # Copy original model config.
    if original_model_config:
        general_config_utils.write_config_to_file(
            original_model_config,
            os.path.join(prune_out_folder_path,
                         "config-model-original_model.json"),
        )

    # Prune.
    logger.info("pruning...")
    pruner.prune_network(prune_config=prune_config,
                         pruned_output_folder=prune_out_folder_path)
    pruned_model_path: Text = os.path.join(prune_out_folder_path,
                                           pruner.FILE_NAME_MODEL)
    # pruned_model_path: Text = os.path.join(
    #     prune_out_folder_path, pruner.FILE_NAME_STATE_DICT
    # )

    # Finetune.
    logger.info("finetuning...")
    finetuning_folder_path: Text = os.path.join(prune_out_folder_path,
                                                "finetuning")
    stat_counters: Dict[
        Text,
        train_utils.StatCounter] = train_algo_1.train_model_with_configs(
            model_config_or_checkpoint=pruned_model_path,
            train_config=finetuning_train_config,
            experiment_folder_path=finetuning_folder_path,
            save_interval=0,  # Set to zero to never save per epoch, to save space.
            save_best_checkpoint=True,
            use_gpu=True,
        )

    # Save results from stat_counters: train/test accuracy, and size.
    eval_results: List = []
    evaluation_epochs_list = [(finetuning_train_config.num_epochs if
                               (epoch == -1) else epoch)
                              for epoch in evaluation_epochs_list]
    model_size_epochs: train_utils.StatCounter = stat_counters[
        "model_size_epochs"]
    train_acc_epochs: train_utils.StatCounter = stat_counters[
        "train_acc_epochs"]
    test_acc_epochs: train_utils.StatCounter = stat_counters["test_acc_epochs"]
    for epoch in evaluation_epochs_list:
        model_size: int
        train_acc: float
        test_acc: float
        if epoch == "best":
            test_acc_ind = max(
                range(len(test_acc_epochs._counter)),
                key=lambda x: test_acc_epochs._counter[x],
            )
            test_acc = test_acc_epochs._counter[test_acc_ind]
            train_acc = train_acc_epochs._counter[test_acc_ind]
            model_size = model_size_epochs._counter[test_acc_ind]
        elif isinstance(epoch, int):
            test_acc = test_acc_epochs._counter[epoch]
            train_acc = train_acc_epochs._counter[epoch]
            model_size = model_size_epochs._counter[epoch]
        else:
            raise TypeError(
                "Found unsupported type in evaluation_epochs_list: {}".format(
                    epoch))
        eval_results.extend([
            "",
            epoch,
            model_size,
            train_acc,
            test_acc,
        ])
        logger.info(
            PRINT_FORMAT.format(
                "Epoch {}".format(epoch),
                model_size,
                train_acc,
                test_acc,
            ))

    return eval_results
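
# Minimal sketch (added for illustration; not part of the original module) of
# the "best" epoch lookup above: take the argmax over the per-epoch test
# accuracies, then read the other per-epoch stats at that same index. The
# lists below are made-up example values.
def _demo_best_epoch_lookup() -> None:
    test_acc_per_epoch = [0.10, 0.85, 0.92, 0.90]
    train_acc_per_epoch = [0.12, 0.88, 0.95, 0.97]
    model_size_per_epoch = [1000, 1000, 800, 800]

    best_ind = max(range(len(test_acc_per_epoch)),
                   key=lambda i: test_acc_per_epoch[i])
    # Index 2 wins: the reported train accuracy and model size come from the
    # same epoch as the best test accuracy.
    assert best_ind == 2
    assert (test_acc_per_epoch[best_ind], train_acc_per_epoch[best_ind],
            model_size_per_epoch[best_ind]) == (0.92, 0.95, 800)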
def train_model_with_configs(
    model_config_or_checkpoint: Union[model_config_utils.ModelConfig, Text],
    train_config: train_config_utils.TrainConfig,
    experiment_folder_path: Text,
    resume_training: bool = False,
    save_interval: int = 1,
    save_best_checkpoint: bool = True,
    use_gpu: bool = True,
    # cuda_device_id: int = 0,
) -> Dict[Text, train_utils.StatCounter]:
    logger = logging_utils.get_logger(__name__)

    log_interval: int = 100

    assert save_interval >= 0, "save_interval must be >= 0"
    save_checkpoint_per_epoch: bool = (save_interval != 0)

    torch_device = torch.device("cuda" if use_gpu else "cpu")

    if "random_seed" in train_config._raw_dict:
        random.seed(train_config.random_seed)
        np.random.seed(train_config.random_seed)
        torch.manual_seed(train_config.random_seed)
        torch.cuda.manual_seed(train_config.random_seed)
        # Using this for reproducibility.
        torch.backends.cudnn.deterministic = True
    random_info_str: Text = """Random info:
    random.setstate({random})
    np.random.set_state({nprandom})
    torch.manual_seed({torch})
    torch.cuda.manual_seed({torchcuda})
    torch.backends.cudnn.deterministic = {torchcudnn}
    """.format(
        random=random.getstate(),
        nprandom=np.random.get_state(),
        torch=torch.initial_seed(),
        torchcuda=torch.cuda.initial_seed(),
        torchcudnn=torch.backends.cudnn.deterministic,
    )
    logger.info(random_info_str)

    # Set up some experiment directories.
    checkpoints_folder_path: Text = os.path.join(experiment_folder_path,
                                                 FOLDER_NAME_CHECKPOINTS)
    if not os.path.exists(checkpoints_folder_path):
        os.makedirs(checkpoints_folder_path)
    stats_folder_path: Text = os.path.join(experiment_folder_path, "stats")

    # Set up counters.
    train_loss_batches: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="train_loss_batches",
            xlabel="batch",
            ylabel="loss",
            title_prefix="train_loss_batches",
        ))
    train_loss_epochs: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="train_loss_epochs",
            xlabel="epoch",
            ylabel="loss",
            title_prefix="train_loss_epochs",
        ))
    train_acc_batches: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="train_accuracy_batches",
            xlabel="batch",
            ylabel="accuracy",
            title_prefix="train_accuracy_batches",
        ))
    train_acc_epochs: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="train_accuracy_epochs",
            xlabel="epoch",
            ylabel="accuracy",
            title_prefix="train_accuracy_epochs",
        ))
    test_loss_epochs: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="test_loss_epochs",
            xlabel="epoch",
            ylabel="loss",
            title_prefix="test_loss_epochs",
        ))
    test_acc_epochs: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="test_accuracy_epochs",
            xlabel="epoch",
            ylabel="accuracy",
            title_prefix="test_accuracy_epochs",
        ))
    model_size_epochs: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="model_size_epochs",
            xlabel="epoch",
            ylabel="number of model parameters",
            title_prefix="model_size_epochs",
        ))
    stat_counters: Dict[Text, train_utils.StatCounter] = {
        "train_loss_batches": train_loss_batches,
        "train_loss_epochs": train_loss_epochs,
        "train_acc_batches": train_acc_batches,
        "train_acc_epochs": train_acc_epochs,
        "test_loss_epochs": test_loss_epochs,
        "test_acc_epochs": test_acc_epochs,
        "model_size_epochs": model_size_epochs,
    }

    # Get data.
    data_transform = train_utils.DATASET_TRANSFORMS[train_config.dataset_name]
    train_loader = torch.utils.data.DataLoader(
        train_utils.DATASET_FUNCTIONS[train_config.dataset_name](
            train_utils.DATA_FOLDER_PATH,
            train=True,
            download=True,
            transform=data_transform,
        ),
        batch_size=train_config.batch_size_train,
        shuffle=True,
    )
    test_loader = torch.utils.data.DataLoader(
        train_utils.DATASET_FUNCTIONS[train_config.dataset_name](
            train_utils.DATA_FOLDER_PATH,
            train=False,
            download=True,
            transform=data_transform,
        ),
        batch_size=train_config.batch_size_test,
        shuffle=True,
    )

    # Load model.
    model_config: Optional[model_config_utils.ModelConfig] = None
    optimizer_state_dict: Optional[Any] = None
    scheduler_state_dict: Optional[Any] = None
    resume_epoch: Optional[int] = None
    model: torch.nn.Module
    if isinstance(model_config_or_checkpoint, model_config_utils.ModelConfig):
        model_config = model_config_or_checkpoint
        model_py_module = importlib.import_module("models.{}".format(
            model_config.model_architecture))
        Model = model_py_module.Model  # type: ignore
        model = Model(**model_config.model_params)
    elif isinstance(model_config_or_checkpoint, Text):
        model_checkpoint_path: Text = model_config_or_checkpoint
        loaded = torch.load(model_checkpoint_path, map_location=torch_device)
        if isinstance(loaded, torch.nn.Module):
            # Model.
            model = loaded
        else:
            # State dict.
            model_config = model_config_utils.ModelConfig(
                loaded["model_config"])
            model_py_module = importlib.import_module("models.{}".format(
                model_config.model_architecture))
            Model = model_py_module.Model  # type: ignore
            model = Model(**model_config.model_params)
            model.load_state_dict(loaded["model_state_dict"])
            if resume_training:
                optimizer_state_dict = loaded.get("optimizer_state_dict",
                                                  None)
                scheduler_state_dict = loaded.get("scheduler_state_dict",
                                                  None)
                resume_epoch = loaded.get("epoch", None)
    else:
        err_msg: Text = "Model config or path to model checkpoint must be provided."
        logger.error(err_msg)
        raise TypeError(err_msg)
    model = model.to(device=torch_device)

    # Just using basic Stochastic Gradient Descent.
    # TODO: Add weight decay? May not be necessary for this task.
    optimizer = torch.optim.SGD(
        params=model.parameters(),
        lr=train_config.learning_rate,
        momentum=train_config.momentum,
        weight_decay=train_config.weight_decay,
    )
    if optimizer_state_dict:
        optimizer.load_state_dict(optimizer_state_dict)
    # optimizer = torch.optim.Adadelta(model.parameters(), lr=learning_rate)

    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=train_config.lr_step_size,
        gamma=train_config.gamma)
    if scheduler_state_dict:
        scheduler.load_state_dict(scheduler_state_dict)

    # Set up the first epoch, in case training needs to resume.
    first_epoch: int = 1
    if resume_epoch:
        first_epoch = resume_epoch

    try:
        # First, get initial train and test scores.
        initial_train_acc, initial_train_loss = eval_model.evaluate_model(
            model=model, dataloader=train_loader, torch_device=torch_device)
        train_acc_batches.add(initial_train_acc)
        train_acc_epochs.add(initial_train_acc)
        train_loss_batches.add(initial_train_loss)
        train_loss_epochs.add(initial_train_loss)

        initial_test_acc, initial_test_loss = eval_model.evaluate_model(
            model=model, dataloader=test_loader, torch_device=torch_device)
        test_acc_epochs.add(initial_test_acc)
        test_loss_epochs.add(initial_test_loss)

        model_size_epochs.add(
            eval_model.get_number_of_model_parameters(model=model))

        # Save initial model checkpoint.
        if save_checkpoint_per_epoch:
            save_model_and_state_dict_checkpoint(
                model=model,
                checkpoints_folder_path=checkpoints_folder_path,
                epoch=0,
                model_config=model_config,
                optimizer=optimizer,
                scheduler=scheduler,
            )

        clear_mem(logger)

        # Track best test accuracy.
        best_test_acc: float = initial_test_acc

        # Train.
        for epoch in range(first_epoch, train_config.num_epochs + 1):
            train(
                logger,
                log_interval,
                model,
                train_loader,
                epoch,
                optimizer,
                scheduler,
                torch_device,
                train_loss_batches,
                train_loss_epochs,
                train_acc_batches,
                train_acc_epochs,
            )
            test_acc, test_loss = eval_model.evaluate_model(
                model=model, dataloader=test_loader,
                torch_device=torch_device)
            test_acc_epochs.add(test_acc)
            test_loss_epochs.add(test_loss)
            model_size_epochs.add(
                eval_model.get_number_of_model_parameters(model=model))
            scheduler.step()

            # Save best model checkpoint, if needed.
            if test_acc > best_test_acc:
                best_test_acc = test_acc
                if save_best_checkpoint:
                    save_model_and_state_dict_checkpoint(
                        model=model,
                        checkpoints_folder_path=checkpoints_folder_path,
                        epoch=epoch,
                        checkpoint_name=BEST_CHECKPOINT_EPOCH_TEXT,
                        model_config=model_config,
                        optimizer=optimizer,
                        scheduler=scheduler,
                    )

            # Save incremental checkpoint, if needed.
            if save_checkpoint_per_epoch and (
                    (epoch == 1) or (epoch == train_config.num_epochs) or
                    ((epoch % save_interval) == 0)):
                save_model_and_state_dict_checkpoint(
                    model=model,
                    checkpoints_folder_path=checkpoints_folder_path,
                    epoch=epoch,
                    model_config=model_config,
                    optimizer=optimizer,
                    scheduler=scheduler,
                )

            # Incrementally save losses per epoch.
            for stat_counter in stat_counters.values():
                stat_counter.save_default()

            clear_mem(logger)
    except Exception as exception:
        logger.error(exception, exc_info=True)
    finally:
        # Save losses.
        for stat_counter in stat_counters.values():
            stat_counter.save_default()

    return stat_counters
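
# Minimal sketch (added for illustration; not part of the original module) of
# the checkpoint dictionary that the loading branch above expects when
# torch.load returns a state dict rather than a full nn.Module: the keys
# "model_config", "model_state_dict", "optimizer_state_dict",
# "scheduler_state_dict", and "epoch". The helper name and default path are
# hypothetical stand-ins, not the module's save_model_and_state_dict_checkpoint.
def _demo_save_resumable_checkpoint(model: torch.nn.Module,
                                    optimizer: torch.optim.Optimizer,
                                    scheduler: Any,
                                    model_config_dict: Dict,
                                    epoch: int,
                                    out_path: Text = "checkpoint-demo.pth"
                                    ) -> None:
    torch.save(
        {
            "model_config": model_config_dict,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "epoch": epoch,
        },
        out_path,
    )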
def prune_network(prune_config: prune_config_utils.PruneConfig,
                  pruned_output_folder: Text,
                  model_checkpoint_path: Optional[Text] = None,
                  **kwargs) -> None:
    """
    Can provide a model_checkpoint_path to override any model checkpoint path
    specified in prune_config.
    """
    logger = logging_utils.get_logger(LOGGER_NAME)

    # Create output folder, if it does not exist.
    if not os.path.exists(pruned_output_folder):
        os.makedirs(pruned_output_folder)

    # Save original prune config.
    general_config_utils.write_config_to_file(
        prune_config, os.path.join(pruned_output_folder,
                                   FILE_NAME_PRUNE_CONFIG))

    # Load model.
    model_path: Text
    if model_checkpoint_path:
        model_path = model_checkpoint_path
        prune_config.original_model_path = model_checkpoint_path
    else:
        model_path = prune_config.original_model_path
    logger.info("Loading model checkpoint from: {}".format(model_path))
    load_location = torch.device("cpu")  # Can make this None, as default.
    model = torch.load(model_path, map_location=load_location)

    with torch.no_grad():
        # Perform pruning.
        model.eval()
        logger.info("Starting pruning for prune_type: {}".format(
            prune_config.prune_type))
        if prune_config.prune_type == "craig":
            prune_network_with_craig(model=model,
                                     prune_config=prune_config,
                                     **kwargs)
        elif prune_config.prune_type == "mussay":
            torch_device: torch.device = torch.device("cpu")
            prune_network_with_mussay(model=model,
                                      prune_config=prune_config,
                                      torch_device=torch_device,
                                      **kwargs)
        else:
            raise ValueError("prune_type not supported: {}".format(
                prune_config.prune_type))

    # Save pruned model.
    out_model_path: Text = os.path.join(pruned_output_folder, FILE_NAME_MODEL)
    torch.save(model, out_model_path)
    logger.info("Pruning complete")

    logger.info(model)

    # Save new model config.
    model_architecture = model.ARCHITECTURE_NAME
    out_model_config: Dict
    if model_architecture == "vgg":
        out_model_config = {
            "model_architecture": "vgg",
            "model_params": {
                "vgg_version": model.vgg_version,
                "num_classes": model.num_classes,
                "pretrained_imagenet": getattr(model, "pretrained_imagenet",
                                               False),
            },
        }
    elif model_architecture == "fc_classifier":
        fc_layers = [
            layer for layer in model.sequential_module
            if isinstance(layer, nn.Linear)
        ]
        out_model_config = {
            "model_architecture": "fc_classifier",
            "model_params": {
                "input_shape": [28, 28],
                "layers": [l.out_features for l in fc_layers[:-1]],
                "output_dim": 10,
            },
        }
    elif model_architecture == "fc_2":
        fc_layers = [
            layer for layer in model.sequential_module
            if isinstance(layer, nn.Linear)
        ]
        out_model_config = {
            "model_architecture": "fc_2",
            "model_params": {
                "input_shape": [28, 28],
                "layer_1_dim": fc_layers[0].out_features,
                "layer_2_dim": fc_layers[1].out_features,
                "output_dim": 10,
            },
        }
    else:
        # Not supported.
        logger.info("Model architecture config not supported: {}".format(
            model_architecture))
        return

    out_model_config_path: Text = os.path.join(pruned_output_folder,
                                               FILE_NAME_MODEL_CONFIG)
    with open(out_model_config_path, "w") as out_model_config_file:
        json.dump(out_model_config, out_model_config_file)
    logger.info("Wrote model config to: {}".format(out_model_config_path))
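
# Illustrative example (added; not part of the original module) of the model
# config dictionary that prune_network writes as JSON for the "fc_2"
# architecture above; the layer dimensions are made-up values.
_EXAMPLE_FC_2_CONFIG: Dict = {
    "model_architecture": "fc_2",
    "model_params": {
        "input_shape": [28, 28],
        "layer_1_dim": 150,  # out_features of the first pruned Linear layer
        "layer_2_dim": 50,   # out_features of the second pruned Linear layer
        "output_dim": 10,
    },
}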
def prune_network_with_craig(model: nn.Module,
                             prune_config: prune_config_utils.PruneConfig,
                             **kwargs) -> None:
    """This currently assumes that all fully connected layers are directly in
    one sequence, and that there are no non-FC layers after the last FC layer
    of that sequence."""
    logger = logging_utils.get_logger(LOGGER_NAME)

    model = model.to(torch.device("cpu"))

    # Get params for each layer.
    layer_params: Dict = prune_config.prune_params[
        prune_config_utils.KEY_LAYER_PARAMS]

    # Get list of model layers/parameters.
    model_layers: List[nn.Module] = model.ordered_unpacking
    num_layers: int = len(model_layers)
    output_layer_index: int = num_layers - 1
    model_data_shapes: List = [[] for _ in model_layers]

    # Use model input shape to get data output shape for each layer.
    def layer_shape_hook(layer_ind):
        def inner(self, input, output):
            # Discard the batch size.
            model_data_shapes[layer_ind] = output.data.shape[1:]

        return inner

    model_hooks = []
    for layer_ind, layer in enumerate(model_layers):
        model_hooks.append(
            layer.register_forward_hook(layer_shape_hook(layer_ind)))
    run_single_data_point(
        model=model,
        model_input_shape=prune_config.model_input_shape,
        data_transform_name=prune_config.data_transform_name,
    )
    for mhook in model_hooks:
        mhook.remove()

    curr_layer_i: int = 0
    while curr_layer_i < output_layer_index:
        # Iterate through layers, prune as necessary.
        curr_layer: nn.Module = model_layers[curr_layer_i]
        curr_layer_type: Type[nn.Module] = type(curr_layer)

        curr_layer_params: Optional[Dict]
        curr_layer_prune_func: Optional[Callable[..., Tuple[List[int],
                                                            List[float]]]]
        curr_layer_prune_func = CRAIG_LAYER_FUNCTION_MAP.get(
            curr_layer_type, None)
        curr_layer_params = layer_params.get(
            LAYER_NAME_MAP.get(
                curr_layer_type,
                None),  # First try to get the current layer params.
            layer_params.get(
                prune_config_utils.
                KEY_LAYER_ALL,  # Otherwise try to get an "all" overriding param.
                None,
            ),
        )

        if (not curr_layer_prune_func) or (not curr_layer_params):
            # If either the prune function or prune params was not found, skip.
            curr_layer_i += 1
            continue

        # Prune the current layer.
        subset_nodes: List[int]
        subset_weights: List[float]
        subset_nodes, subset_weights = curr_layer_prune_func(
            layer=curr_layer, **(curr_layer_params))
        subset_len: int = len(subset_nodes)

        next_layer_i: int = curr_layer_i + 1
        while next_layer_i < num_layers:
            # Find the next prunable layer and update the weights accordingly.
            next_layer: nn.Module = model_layers[next_layer_i]
            next_layer_type: Type[nn.Module] = type(next_layer)

            if next_layer_type not in CRAIG_LAYER_FUNCTION_MAP:
                # If this layer is not prunable, skip.
                next_layer_i += 1
                continue

            if isinstance(next_layer, nn.Conv2d):
                # Change conv in channels to match the pruned subset.
                next_layer.weight = nn.Parameter(
                    next_layer.weight[:, subset_nodes])
                next_layer.in_channels = subset_len
                next_layer._in_channels = (
                    subset_len  # Not sure if this is necessary.
                )
            elif isinstance(next_layer, nn.Linear):
                # Assuming a pre-Linear flatten op, need to find the weights
                # that correspond to the channels that were kept in the pruning
                # of the previous layer.
                num_weights_per_channel: int
                if isinstance(curr_layer, nn.Conv2d):
                    # If the initially pruned layer was a conv, then re-iterate
                    # from curr_layer to next_layer, searching for the last
                    # conv/pooling/relu/etc before a flatten-esque operation.
                    for temp_i in range(curr_layer_i, next_layer_i):
                        if len(model_data_shapes[temp_i]) != 3:
                            break
                        num_weights_per_channel = int(
                            np.prod(model_data_shapes[temp_i][1:]))
                else:
                    # Otherwise, the initially pruned layer must have been a
                    # linear layer. In that case, we are currently assuming
                    # that only other linear/relu/flatten/etc layers lie in
                    # between. So, we can simply use the number of original
                    # channels/features, which should be =1.
                    num_weights_per_channel = int(
                        np.prod(model_data_shapes[curr_layer_i][1:]))

                weights_to_keep: List[int] = []
                for si in subset_nodes:
                    weights_to_keep.extend(
                        list(
                            range(
                                num_weights_per_channel * si,
                                num_weights_per_channel * (si + 1),
                            )))
                next_layer.weight = nn.Parameter(
                    next_layer.weight[:, weights_to_keep])
                next_layer.in_features = len(weights_to_keep)
            else:
                logger.warning(
                    "No pruning adjustment made to layer {} of type {}".format(
                        next_layer_i, next_layer_type))

            # Adjustments were attempted; now continue to the next layer for
            # pruning.
            break

        # Now that we have found the next prunable layer, we can jump to it.
        curr_layer_i = next_layer_i
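
# Minimal sketch (added for illustration; not part of the original module) of
# the index expansion used above when a Linear layer follows a pruned Conv2d:
# each kept channel index expands to the block of flattened weight columns it
# produced. The channel count and spatial size below are made-up example values.
def _demo_expand_kept_channels_to_weight_columns() -> None:
    subset_nodes = [0, 2]  # e.g. channels kept by pruning the conv layer
    num_weights_per_channel = 4  # e.g. a 2x2 spatial map flattened per channel

    weights_to_keep: List[int] = []
    for si in subset_nodes:
        weights_to_keep.extend(
            list(
                range(
                    num_weights_per_channel * si,
                    num_weights_per_channel * (si + 1),
                )))
    # Channel 0 -> columns 0..3, channel 2 -> columns 8..11.
    assert weights_to_keep == [0, 1, 2, 3, 8, 9, 10, 11]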
""" CS224N 2018-19: Homework 3 parser_model.py: Feed-Forward Neural Network for Dependency Parsing Sahil Chopra <*****@*****.**> """ import pickle import os import time import torch import torch.nn as nn import torch.nn.functional as F import logging from utils import logging_utils logger = logging_utils.get_logger(module=__name__, loglevel=logging.INFO) class ParserModel(nn.Module): """ Feedforward neural network with an embedding layer and single hidden layer. The ParserModel will predict which transition should be applied to a given partial parse configuration. PyTorch Notes: - Note that "ParserModel" is a subclass of the "nn.Module" class. In PyTorch all neural networks are a subclass of this "nn.Module". - The "__init__" method is where you define all the layers and their respective parameters (embedding layers, linear layers, dropout layers, etc.). - "__init__" gets automatically called when you create a new instance of your class, e.g. when you write "m = ParserModel()". - Other methods of ParserModel can access variables that have "self." prefix. Thus,
import pickle
import math
import time

from torch import nn, optim
import torch
from tqdm import tqdm
import sys

from parser_model import ParserModel
from utils.parser_utils import minibatches, load_and_preprocess_data, AverageMeter

import logging
from utils import logging_utils

logger = logging_utils.get_logger(loglevel=logging.DEBUG)


# -----------------
# Primary Functions
# -----------------
def train(parser, train_data, dev_data, output_path, batch_size=1024,
          n_epochs=10, lr=0.0005):
    """ Train the neural dependency parser.

    @param parser (Parser): Neural Dependency Parser
    @param train_data ():
    @param dev_data ():
    @param output_path (str): Path to which model weights and results are written.
    @param batch_size (int): Number of examples in a single batch
    @param n_epochs (int): Number of training epochs
    @param lr (float): Learning rate
    """