def train_epoch(epoch,
                net,
                train_metric,
                train_data,
                use_cuda,
                L,
                optimizer,
                # lr_scheduler,
                batch_size,
                log_interval):
    """
    Train model on particular epoch.

    Parameters:
    ----------
    epoch : int
        Epoch number (0-based).
    net : Module
        Model.
    train_metric : EvalMetric
        Metric object instance.
    train_data : DataLoader
        Data loader (training subset).
    use_cuda : bool
        Whether to use CUDA.
    L : Loss
        Loss function.
    optimizer : Optimizer
        Optimizer.
    batch_size : int
        Training batch size.
    log_interval : int
        Batch count period for logging.

    Returns:
    -------
    float
        Mean loss value over the epoch.
    """
    tic = time.time()
    net.train()
    train_metric.reset()
    train_loss = 0.0

    btic = time.time()
    for i, (data, target) in enumerate(train_data):
        if use_cuda:
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
        output = net(data)
        loss = L(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Accumulate scalar loss for the epoch average computed below.
        train_loss += loss.item()
        train_metric.update(labels=target, preds=output)
        # `not (i + 1) % log_interval` is true every log_interval-th batch.
        if log_interval and not (i + 1) % log_interval:
            speed = batch_size * log_interval / (time.time() - btic)
            btic = time.time()
            train_accuracy_msg = report_accuracy(metric=train_metric)
            logging.info(
                "Epoch[{}] Batch [{}]\tSpeed: {:.2f} samples/sec\t{}\tlr={:.5f}".format(
                    epoch + 1, i, speed, train_accuracy_msg,
                    optimizer.param_groups[0]["lr"]))

    # NOTE(review): `i` is read after the loop, so this assumes train_data
    # yields at least one batch — confirm (an empty loader raises NameError).
    throughput = int(batch_size * (i + 1) / (time.time() - tic))
    logging.info(
        "[Epoch {}] speed: {:.2f} samples/sec\ttime cost: {:.2f} sec".format(
            epoch + 1, throughput, time.time() - tic))

    train_loss /= (i + 1)
    train_accuracy_msg = report_accuracy(metric=train_metric)
    logging.info("[Epoch {}] training: {}\tloss={:.4f}".format(
        epoch + 1, train_accuracy_msg, train_loss))

    return train_loss
def test(net,
         test_data,
         metric,
         use_cuda,
         input_image_size,
         in_channels,
         calc_weight_count=False,
         calc_flops=False,
         calc_flops_only=True,
         extended_log=False):
    """
    Main test routine.

    Parameters:
    ----------
    net : Module
        Model.
    test_data : DataLoader
        Data loader.
    metric : EvalMetric
        Metric object instance.
    use_cuda : bool
        Whether to use CUDA.
    input_image_size : tuple of 2 ints
        Spatial size of the expected input image.
    in_channels : int
        Number of input channels.
    calc_weight_count : bool, default False
        Whether to calculate count of weights.
    calc_flops : bool, default False
        Whether to calculate FLOPs.
    calc_flops_only : bool, default True
        Whether to only calculate FLOPs without testing.
    extended_log : bool, default False
        Whether to log more precise accuracy values.
    """
    if not calc_flops_only:
        tic = time.time()
        validate(
            metric=metric,
            net=net,
            val_data=test_data,
            use_cuda=use_cuda)
        accuracy_msg = report_accuracy(
            metric=metric,
            extended_log=extended_log)
        logging.info("Test: {}".format(accuracy_msg))
        logging.info("Time cost: {:.4f} sec".format(
            time.time() - tic))

    if calc_weight_count:
        weight_count = calc_net_weight_count(net)
        if not calc_flops:
            # Logged here only when FLOPs are not computed; otherwise the
            # parameter count is included in the FLOPs statistics below.
            logging.info("Model: {} trainable parameters".format(weight_count))
    if calc_flops:
        num_flops, num_macs, num_params = measure_model(net, in_channels, input_image_size)
        # Cross-check: parameter count from the FLOPs pass must agree with the
        # directly calculated weight count (short-circuits when not computed).
        assert (not calc_weight_count) or (weight_count == num_params)
        stat_msg = "Params: {params} ({params_m:.2f}M), FLOPs: {flops} ({flops_m:.2f}M)," \
                   " FLOPs/2: {flops2} ({flops2_m:.2f}M), MACs: {macs} ({macs_m:.2f}M)"
        logging.info(stat_msg.format(
            params=num_params,
            params_m=num_params / 1e6,
            flops=num_flops,
            flops_m=num_flops / 1e6,
            flops2=num_flops / 2,
            flops2_m=num_flops / 2 / 1e6,
            macs=num_macs,
            macs_m=num_macs / 1e6))
def test(net,
         test_data,
         metric,
         use_cuda,
         input_image_size,
         in_channels,
         calc_weight_count=False,
         calc_flops=False,
         calc_flops_only=True,
         extended_log=False,
         show_bad_samples=False):
    """
    Main test routine.

    Parameters:
    ----------
    net : Module
        Model.
    test_data : DataLoader
        Data loader.
    metric : EvalMetric
        Metric object instance.
    use_cuda : bool
        Whether to use CUDA.
    input_image_size : tuple of 2 ints
        Spatial size of the expected input image.
    in_channels : int
        Number of input channels.
    calc_weight_count : bool, default False
        Whether to calculate count of weights.
    calc_flops : bool, default False
        Whether to calculate FLOPs.
    calc_flops_only : bool, default True
        Whether to only calculate FLOPs without testing.
    extended_log : bool, default False
        Whether to log more precise accuracy values.
    show_bad_samples : bool, default False
        Whether to log file names for bad samples.
    """
    if not calc_flops_only:
        tic = time.time()
        validate(
            metric=metric,
            net=net,
            val_data=test_data,
            use_cuda=use_cuda)
        accuracy_msg = report_accuracy(
            metric=metric,
            extended_log=extended_log)
        logging.info("Test: {}".format(accuracy_msg))
        logging.info("Time cost: {:.4f} sec".format(
            time.time() - tic))

    if calc_weight_count:
        weight_count = calc_net_weight_count(net)
        if not calc_flops:
            # Logged here only when FLOPs are not computed; otherwise the
            # parameter count is included in the FLOPs statistics below.
            logging.info("Model: {} trainable parameters".format(weight_count))
    if calc_flops:
        num_flops, num_macs, num_params = measure_model(net, in_channels, input_image_size)
        # Cross-check: parameter count from the FLOPs pass must agree with the
        # directly calculated weight count (short-circuits when not computed).
        assert (not calc_weight_count) or (weight_count == num_params)
        stat_msg = "Params: {params} ({params_m:.2f}M), FLOPs: {flops} ({flops_m:.2f}M)," \
                   " FLOPs/2: {flops2} ({flops2_m:.2f}M), MACs: {macs} ({macs_m:.2f}M)"
        logging.info(stat_msg.format(
            params=num_params,
            params_m=num_params / 1e6,
            flops=num_flops,
            flops_m=num_flops / 1e6,
            flops2=num_flops / 2,
            flops2_m=num_flops / 2 / 1e6,
            macs=num_macs,
            macs_m=num_macs / 1e6))

    if show_bad_samples:
        # Second validation pass that records indices of misclassified samples.
        store_misses = StoreMisses()
        validate(
            metric=store_misses,
            net=net,
            val_data=test_data,
            use_cuda=use_cuda)
        _, misses_list = store_misses.get()
        if len(misses_list) > 0:
            # test_data may be wrapped in tqdm; unwrap to reach the dataset.
            dataset = test_data.iterable.dataset if isinstance(test_data, tqdm) else test_data.dataset
            for i, miss_ind in enumerate(misses_list):
                # assumes the dataset exposes get_file_name(index) — TODO confirm
                logging.info("Miss [{}]: {}".format(i, dataset.get_file_name(miss_ind)))
def calc_model_accuracy(net,
                        test_data,
                        metric,
                        use_cuda,
                        input_image_size,
                        in_channels,
                        calc_weight_count=False,
                        calc_flops=False,
                        calc_flops_only=True,
                        extended_log=False,
                        ml_type="cls"):
    """
    Estimating particular model accuracy.

    Parameters:
    ----------
    net : Module
        Model.
    test_data : DataLoader
        Data loader.
    metric : EvalMetric
        Metric object instance.
    use_cuda : bool
        Whether to use CUDA.
    input_image_size : tuple of 2 ints
        Spatial size of the expected input image.
    in_channels : int
        Number of input channels.
    calc_weight_count : bool, default False
        Whether to calculate count of weights.
    calc_flops : bool, default False
        Whether to calculate FLOPs.
    calc_flops_only : bool, default True
        Whether to only calculate FLOPs without testing.
    extended_log : bool, default False
        Whether to log more precise accuracy values.
    ml_type : str, default 'cls'
        Machine learning type.

    Returns:
    -------
    list of floats
        Accuracy values.
    """
    if not calc_flops_only:
        tic = time.time()
        validate(
            metric=metric,
            net=net,
            val_data=test_data,
            use_cuda=use_cuda)
        accuracy_msg = report_accuracy(
            metric=metric,
            extended_log=extended_log)
        logging.info("Test: {}".format(accuracy_msg))
        logging.info("Time cost: {:.4f} sec".format(
            time.time() - tic))
        acc_values = metric.get()[1]
        # Normalize to a list; isinstance (unlike `type(...) == list`) also
        # accepts list subclasses.
        acc_values = acc_values if isinstance(acc_values, list) else [acc_values]
    else:
        acc_values = []

    if calc_weight_count:
        weight_count = calc_net_weight_count(net)
        if not calc_flops:
            # Logged here only when FLOPs are not computed; otherwise the
            # parameter count is included in the FLOPs statistics below.
            logging.info("Model: {} trainable parameters".format(weight_count))
    if calc_flops:
        # ASR models are measured on a flattened feature vector plus a length
        # input; image models on a (1, C, H, W) tensor.
        in_shapes = [(1, 640 * 25 * 5), (1,)] if ml_type == "asr" else\
            [(1, in_channels, input_image_size[0], input_image_size[1])]
        num_flops, num_macs, num_params = measure_model(
            model=net,
            in_shapes=in_shapes)
        # Cross-check: parameter count from the FLOPs pass must agree with the
        # directly calculated weight count (short-circuits when not computed).
        assert (not calc_weight_count) or (weight_count == num_params)
        stat_msg = "Params: {params} ({params_m:.2f}M), FLOPs: {flops} ({flops_m:.2f}M)," \
                   " FLOPs/2: {flops2} ({flops2_m:.2f}M), MACs: {macs} ({macs_m:.2f}M)"
        logging.info(stat_msg.format(
            params=num_params,
            params_m=num_params / 1e6,
            flops=num_flops,
            flops_m=num_flops / 1e6,
            flops2=num_flops / 2,
            flops2_m=num_flops / 2 / 1e6,
            macs=num_macs,
            macs_m=num_macs / 1e6))

    return acc_values
def train_net(batch_size,
              num_epochs,
              start_epoch1,
              train_data,
              val_data,
              net,
              optimizer,
              lr_scheduler,
              lp_saver,
              log_interval,
              num_classes,
              val_metric,
              train_metric,
              use_cuda):
    """
    Main procedure for training model.

    Parameters:
    ----------
    batch_size : int
        Training batch size.
    num_epochs : int
        Number of training epochs.
    start_epoch1 : int
        Number of starting epoch (1-based).
    train_data : DataLoader
        Data loader (training subset).
    val_data : DataLoader
        Data loader (validation subset).
    net : Module
        Model.
    optimizer : Optimizer
        Optimizer.
    lr_scheduler : LRScheduler
        Learning rate scheduler.
    lp_saver : TrainLogParamSaver
        Model/trainer state saver.
    log_interval : int
        Batch count period for logging.
    num_classes : int
        Number of model classes.
    val_metric : EvalMetric
        Metric object instance (validation subset).
    train_metric : EvalMetric
        Metric object instance (training subset).
    use_cuda : bool
        Whether to use CUDA.
    """
    assert (num_classes > 0)

    L = nn.CrossEntropyLoss()
    if use_cuda:
        L = L.cuda()

    assert (type(start_epoch1) == int)
    assert (start_epoch1 >= 1)
    if start_epoch1 > 1:
        # Resuming: report validation accuracy of the restored model before
        # any further training.
        logging.info("Start training from [Epoch {}]".format(start_epoch1))
        validate(metric=val_metric, net=net, val_data=val_data, use_cuda=use_cuda)
        val_accuracy_msg = report_accuracy(metric=val_metric)
        logging.info("[Epoch {}] validation: {}".format(
            start_epoch1 - 1, val_accuracy_msg))

    gtic = time.time()
    for epoch in range(start_epoch1 - 1, num_epochs):
        # NOTE(review): scheduler is stepped BEFORE the epoch's optimizer
        # steps — PyTorch >= 1.1 expects scheduler.step() after training.
        # Confirm this ordering is intentional before changing it.
        lr_scheduler.step()

        train_loss = train_epoch(
            epoch=epoch,
            net=net,
            train_metric=train_metric,
            train_data=train_data,
            use_cuda=use_cuda,
            L=L,
            optimizer=optimizer,
            # lr_scheduler,
            batch_size=batch_size,
            log_interval=log_interval)

        validate(metric=val_metric, net=net, val_data=val_data, use_cuda=use_cuda)
        val_accuracy_msg = report_accuracy(metric=val_metric)
        logging.info("[Epoch {}] validation: {}".format(
            epoch + 1, val_accuracy_msg))

        if lp_saver is not None:
            # Checkpoint payload: 1-based epoch plus model/optimizer state.
            state = {
                "epoch": epoch + 1,
                "state_dict": net.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            lp_saver_kwargs = {"state": state}
            val_acc_values = val_metric.get()[1]
            train_acc_values = train_metric.get()[1]
            # Metric values may be scalar or list; normalize to lists so they
            # can be concatenated into the saver's params vector.
            val_acc_values = val_acc_values if type(
                val_acc_values) == list else [val_acc_values]
            train_acc_values = train_acc_values if type(
                train_acc_values) == list else [train_acc_values]
            lp_saver.epoch_test_end_callback(
                epoch1=(epoch + 1),
                params=(val_acc_values + train_acc_values +
                        [train_loss, optimizer.param_groups[0]["lr"]]),
                **lp_saver_kwargs)

    logging.info("Total time cost: {:.2f} sec".format(time.time() - gtic))
    if lp_saver is not None:
        opt_metric_name = get_metric_name(val_metric, lp_saver.acc_ind)
        logging.info("Best {}: {:.4f} at {} epoch".format(
            opt_metric_name, lp_saver.best_eval_metric_value,
            lp_saver.best_eval_metric_epoch))
def train_epoch(epoch,
                net,
                train_metric,
                train_data,
                use_cuda,
                L,
                optimizer,
                # lr_scheduler,
                batch_size,
                log_interval):
    """
    Run a single training pass over the whole training data loader.

    Parameters:
    ----------
    epoch : int
        Epoch number (0-based).
    net : Module
        Model.
    train_metric : EvalMetric
        Metric object instance.
    train_data : DataLoader
        Data loader (training subset).
    use_cuda : bool
        Whether to use CUDA.
    L : Loss
        Loss function.
    optimizer : Optimizer
        Optimizer.
    batch_size : int
        Training batch size.
    log_interval : int
        Batch count period for logging.

    Returns
    -------
    float
        Mean loss value over the epoch.
    """
    epoch_start = time.time()
    net.train()
    train_metric.reset()
    loss_sum = 0.0

    interval_start = time.time()
    for batch_idx, (x, y) in enumerate(train_data):
        if use_cuda:
            x = x.cuda(non_blocking=True)
            y = y.cuda(non_blocking=True)

        # Forward / backward / parameter update.
        preds = net(x)
        batch_loss = L(preds, y)
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        train_metric.update(labels=y, preds=preds)

        # Periodic progress line every `log_interval` batches.
        if log_interval and (batch_idx + 1) % log_interval == 0:
            rate = batch_size * log_interval / (time.time() - interval_start)
            interval_start = time.time()
            acc_msg = report_accuracy(metric=train_metric)
            logging.info(
                "Epoch[{}] Batch [{}]\tSpeed: {:.2f} samples/sec\t{}\tlr={:.5f}".format(
                    epoch + 1, batch_idx, rate, acc_msg,
                    optimizer.param_groups[0]["lr"]))

    # `batch_idx` survives the loop (loader assumed non-empty, as before).
    num_batches = batch_idx + 1
    throughput = int(batch_size * num_batches / (time.time() - epoch_start))
    logging.info(
        "[Epoch {}] speed: {:.2f} samples/sec\ttime cost: {:.2f} sec".format(
            epoch + 1, throughput, time.time() - epoch_start))

    mean_loss = loss_sum / num_batches
    acc_msg = report_accuracy(metric=train_metric)
    logging.info("[Epoch {}] training: {}\tloss={:.4f}".format(
        epoch + 1, acc_msg, mean_loss))

    return mean_loss
def train_net(batch_size,
              num_epochs,
              start_epoch1,
              train_data,
              val_data,
              net,
              optimizer,
              lr_scheduler,
              lp_saver,
              log_interval,
              num_classes,
              val_metric,
              train_metric,
              use_cuda):
    """
    Main procedure for training model.

    Parameters:
    ----------
    batch_size : int
        Training batch size.
    num_epochs : int
        Number of training epochs.
    start_epoch1 : int
        Number of starting epoch (1-based).
    train_data : DataLoader
        Data loader (training subset).
    val_data : DataLoader
        Data loader (validation subset).
    net : Module
        Model.
    optimizer : Optimizer
        Optimizer.
    lr_scheduler : LRScheduler
        Learning rate scheduler.
    lp_saver : TrainLogParamSaver
        Model/trainer state saver.
    log_interval : int
        Batch count period for logging.
    num_classes : int
        Number of model classes.
    val_metric : EvalMetric
        Metric object instance (validation subset).
    train_metric : EvalMetric
        Metric object instance (training subset).
    use_cuda : bool
        Whether to use CUDA.
    """
    assert (num_classes > 0)

    L = nn.CrossEntropyLoss()
    if use_cuda:
        L = L.cuda()

    assert (type(start_epoch1) == int)
    assert (start_epoch1 >= 1)
    if start_epoch1 > 1:
        # Resuming: report validation accuracy of the restored model before
        # any further training.
        logging.info("Start training from [Epoch {}]".format(start_epoch1))
        validate(
            metric=val_metric,
            net=net,
            val_data=val_data,
            use_cuda=use_cuda)
        val_accuracy_msg = report_accuracy(metric=val_metric)
        logging.info("[Epoch {}] validation: {}".format(start_epoch1 - 1, val_accuracy_msg))

    gtic = time.time()
    for epoch in range(start_epoch1 - 1, num_epochs):
        # NOTE(review): scheduler is stepped BEFORE the epoch's optimizer
        # steps — PyTorch >= 1.1 expects scheduler.step() after training.
        # Confirm this ordering is intentional before changing it.
        lr_scheduler.step()

        train_loss = train_epoch(
            epoch=epoch,
            net=net,
            train_metric=train_metric,
            train_data=train_data,
            use_cuda=use_cuda,
            L=L,
            optimizer=optimizer,
            # lr_scheduler,
            batch_size=batch_size,
            log_interval=log_interval)

        validate(
            metric=val_metric,
            net=net,
            val_data=val_data,
            use_cuda=use_cuda)
        val_accuracy_msg = report_accuracy(metric=val_metric)
        logging.info("[Epoch {}] validation: {}".format(epoch + 1, val_accuracy_msg))

        if lp_saver is not None:
            # Checkpoint payload: 1-based epoch plus model/optimizer state.
            state = {
                "epoch": epoch + 1,
                "state_dict": net.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            lp_saver_kwargs = {"state": state}
            val_acc_values = val_metric.get()[1]
            train_acc_values = train_metric.get()[1]
            # Metric values may be scalar or list; normalize to lists so they
            # can be concatenated into the saver's params vector.
            val_acc_values = val_acc_values if type(val_acc_values) == list else [val_acc_values]
            train_acc_values = train_acc_values if type(train_acc_values) == list else [train_acc_values]
            lp_saver.epoch_test_end_callback(
                epoch1=(epoch + 1),
                params=(val_acc_values + train_acc_values +
                        [train_loss, optimizer.param_groups[0]["lr"]]),
                **lp_saver_kwargs)

    logging.info("Total time cost: {:.2f} sec".format(time.time() - gtic))
    if lp_saver is not None:
        opt_metric_name = get_metric_name(val_metric, lp_saver.acc_ind)
        logging.info("Best {}: {:.4f} at {} epoch".format(
            opt_metric_name,
            lp_saver.best_eval_metric_value,
            lp_saver.best_eval_metric_epoch))