def eval_acc(inference, val_loader, ctx, return_meta=False):
    """Evaluate classification accuracy of ``inference`` over ``val_loader``.

    :param inference: network returning ``(logits, features)`` for a batch
    :param val_loader: iterable yielding ``(X, y)`` batches
    :param ctx: list of MXNet contexts; only ``ctx[0]`` is used
    :param return_meta: when True, also return labels, raw predictions and features
    :return: accuracy, or ``(accuracy, labels, preds, features)`` when
        ``return_meta`` is True
    """
    mtc_acc = Accuracy()
    mtc_acc.reset()
    feature_nest, y_nest, y_hat_nest = [], [], []
    for X, y in val_loader:
        X = X.as_in_context(ctx[0])
        y = y.as_in_context(ctx[0])
        # forward pass only; train_mode=False keeps train-only behaviors off
        with autograd.record(train_mode=False):
            y_hat, features = inference(X)
        # update metric
        mtc_acc.update([y], [y_hat])
        if return_meta:
            y_nest.extend(y.asnumpy())
            feature_nest.extend(features.asnumpy())
            y_hat_nest.extend(y_hat.asnumpy())
    if return_meta:
        # FIX: only materialize the numpy arrays when the caller asked for
        # them (previously they were built unconditionally, even when empty)
        return (mtc_acc.get()[1], np.array(y_nest), np.array(y_hat_nest),
                np.array(feature_nest))
    return mtc_acc.get()[1]
def eval(self, inference, val_loader, log=True, target=True, epoch=True):
    """
    Evaluate the model
    :param inference: network
    :param val_loader: data loader
    :param log: log flag
    :param target: target flag for updating the record and log
    :param epoch: epoch flag for updating the record and log
    :return: accuracy, labels, predictions, features
    """
    mtc_acc = Accuracy()
    mtc_acc.reset()
    # val_loader.reset()
    feature_nest, y_nest, y_hat_nest = [], [], []
    for X, Y in val_loader:
        # shard the batch across all configured contexts
        X_lst = split_and_load(X, self.args.ctx, even_split=False)
        Y_lst = split_and_load(Y, self.args.ctx, even_split=False)
        for x, y in zip(X_lst, Y_lst):
            y_hat, features = inference(x)
            # update metric
            mtc_acc.update([y], [y_hat])
            y_nest.extend(y.asnumpy())
            feature_nest.extend(features.asnumpy())
            y_hat_nest.extend(y_hat.asnumpy())
    feature_nest = np.array(feature_nest)
    y_nest = np.array(y_nest).astype(int)
    y_hat_nest = np.array(y_hat_nest)
    if log:
        target_key = 'Tgt' if target else 'Src'
        epoch_key = 'Epoch' if epoch else 'Iter'
        record = self.cur_epoch if epoch else self.cur_iter
        # keep only the best-so-far result (and checkpoint) per key
        if mtc_acc.get()[1] > self.records[epoch_key]['%s-Acc' % target_key]:
            if target:
                self.records[epoch_key][epoch_key] = record
            self.records[epoch_key]['%s-Acc' % target_key] = mtc_acc.get()[1]
            self.records[epoch_key]['%s-label' % target_key] = y_nest
            self.records[epoch_key]['%s-preds' % target_key] = y_hat_nest
            self.records[epoch_key]['%s-features' % target_key] = feature_nest
            self.save_params(inference, 0, epoch_key)
        self.logger.update_scalar(
            '%s [%d]: Eval-Acc-%s' % (epoch_key, record, target_key),
            mtc_acc.get()[1])
        if self.sw:
            # BUG FIX: the tensorboard tag previously interpolated the boolean
            # `epoch` flag (rendering 'True'/'False'); use epoch_key instead so
            # the tag reads 'Acc/Eval-Epoch-Acc-Tgt' / 'Acc/Eval-Iter-Acc-Src'.
            self.sw.add_scalar('Acc/Eval-%s-Acc-%s' % (epoch_key, target_key),
                               mtc_acc.get()[1],
                               global_step=record)
    return mtc_acc.get()[1], y_nest, y_hat_nest, feature_nest
def train_block(self, data_iter: DataLoader, docs: Sequence[Document]) -> float:
    """Run one training pass of the RNN tagger over ``data_iter``.

    :param data_iter: batches of ``(doc ids, sentence ids, inputs, labels)``.
    :param docs: documents used to recover each sentence's true length so that
        padded positions are excluded from the accuracy metric.
    :return: token accuracy accumulated over this pass.
    """
    acc = Accuracy()
    for dids, sids, data, label in tqdm(data_iter, leave=False):
        # batch_size, sequence_length, input_size -> sequence_length, batch_size, input_size
        X = nd.transpose(data, axes=(1, 0, 2)).as_in_context(self.ctx)
        # batch_size, sequence_length -> sequence_length, batch_size
        Y = label.T.as_in_context(self.ctx)
        state = self.model.begin_state(batch_size=X.shape[1], ctx=self.ctx)
        for s in state:
            # truncated BPTT: do not backpropagate into the previous batch's state
            s.detach()
        with autograd.record():
            output, state = self.model(X, state)
            l = self.loss(output, Y)
        l.backward()
        grads = [param.grad(self.ctx) for param in self.model.collect_params().values()]
        # NOTE(review): the clip threshold is scaled by seq_len * batch_size —
        # presumably to match a loss summed over tokens; confirm against self.loss.
        clip_global_norm(grads, self.model.rnn_layer.clip * X.shape[0] * X.shape[1])
        # sequence_length, batch_size -> batch_size, sequence_length
        for batch, (preds, labels) in enumerate(zip(nd.argmax(output, axis=2).T, label)):
            # look up the real sentence to know how much of the row is padding
            sen = docs[dids[batch].asscalar()].sentences[sids[batch].asscalar()]
            sequence_length = len(sen)
            preds = preds[:sequence_length]
            labels = labels[:sequence_length]
            acc.update(labels=labels, preds=preds)
        self.trainer.step(data.shape[0])
    return float(acc.get()[1])
def evaluate_block(self, data_iter: DataLoader, docs: Sequence[Document]) -> float:
    """Decode ``docs`` with the current model and score the predictions.

    :param data_iter: batches to decode.
    :param docs: documents that receive (and are scored on) predicted labels.
    :return: chunk F1 when chunking is enabled, otherwise token accuracy.
    """
    self.decode_block(data_iter=data_iter, docs=docs)
    sentences = (sen for doc in docs for sen in doc.sentences)
    if self.chunking:
        metric = ChunkF1()
        for sen in sentences:
            metric.update(labels=sen[to_gold(self.key)], preds=sen[self.key])
    else:
        metric = Accuracy()
        cid = self.label_map.cid
        for sen in sentences:
            gold = nd.array([cid(label) for label in sen[to_gold(self.key)]])
            guess = nd.array([cid(pred) for pred in sen[self.key]])
            metric.update(labels=gold, preds=guess)
    return metric.get()[1]
def eval_epoch(self):
    """Evaluate on the test set; track and checkpoint the best accuracy seen."""
    self.is_train = False
    metric = Accuracy()
    metric.reset()
    for data, target in self.test_loader:
        data = data.as_in_context(self.ctx[0])
        target = target.as_in_context(self.ctx[0])
        # the network also returns features, which are not needed here
        predictions, _ = self.net(data)
        metric.update([target], [predictions])
    accuracy = metric.get()[1]
    logging.info('Test - Epoch {}, Iter {}, Acc {:.2f} %'.format(
        self.cur_epoch, self.cur_iter, accuracy * 100))
    # save parameters only when this evaluation beats the best so far
    if accuracy > self.eval_tracker['Acc']:
        self.eval_tracker.update({
            'Epoch': self.cur_epoch,
            'Iter': self.cur_iter,
            'Acc': accuracy
        })
        self.net.save_parameters('{}_{}_{}_{:.2f}.params'.format(
            self.cfg.META.CKPT_PATH, self.cur_epoch, self.cur_iter, accuracy))
def run_training(net, trainer, train_dataloader, val_dataloader, epochs,
                 model_path, context):
    """Train ``net`` for ``epochs`` epochs, printing loss/accuracy per epoch.

    :param net: gluon network producing per-token logits over 3 classes
    :param trainer: gluon Trainer wrapping the network parameters
    :param train_dataloader: training batches of ``(data, label)``
    :param val_dataloader: validation batches of ``(data, label)``
    :param epochs: number of epochs to run
    :param model_path: where the final parameters are saved
    :param context: MXNet context for data placement
    :return: ``model_path`` after saving the trained parameters
    """
    loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    for e in range(epochs):
        train_acc = Accuracy()
        val_acc = Accuracy()
        train_loss = 0.
        num_batches = 0
        for i, (data, label) in enumerate(train_dataloader):
            items_per_iteration = data.shape[0]
            num_batches += 1
            data = data.as_in_context(context)
            label = label.as_in_context(context)
            with autograd.record():
                output = net(data)
                # flatten (batch, seq, 3) -> (batch*seq, 3) for the loss
                output = output.reshape((-1, 3))
                label = label.reshape((-1, 1))
                loss = loss_fn(output, label)
            loss.backward()
            trainer.step(items_per_iteration)
            train_loss += loss.mean().asscalar()
            train_acc.update(label.flatten(), output.argmax(axis=1).flatten())
        for i, (data, label) in enumerate(val_dataloader):
            data = data.as_in_context(context)
            label = label.as_in_context(context)
            output = net(data)
            output = output.reshape((-1, 3))
            val_acc.update(
                label.reshape(-1, 1).flatten(),
                output.argmax(axis=1).flatten())
        # BUG FIX: the running loss accumulates per-batch *means*, so average
        # it over the number of batches; dividing by the total item count (as
        # before) under-reported the loss by roughly a batch-size factor.
        print(
            "Epoch {}. Current Loss: {:.5f}. Train accuracy: {:.3f}, Validation accuracy: {:.3f}."
            .format(e, train_loss / max(num_batches, 1),
                    train_acc.get()[1], val_acc.get()[1]))
    net.save_parameters(model_path)
    return model_path
def validate(net, val_loader, gpu_id, train_index2words, val_index2words):
    """Score ``net`` on the validation set.

    :param net: captioning network returning ``(predictions, alphas)``
    :param val_loader: validation batches of ``(image, label, label_len)``
    :param gpu_id: GPU on which evaluation is run
    :param train_index2words: index-to-word map for predictions
    :param val_index2words: index-to-word map for labels
    :return: ``(BLEU, accuracy)``
    """
    bleu = BleuMetric(pred_index2words=train_index2words,
                      label_index2words=val_index2words)
    acc = Accuracy()
    acc.reset()
    bleu.reset()
    device = mx.gpu(gpu_id)
    for batch in tqdm.tqdm(val_loader):
        image, label, label_len = [x.as_in_context(device) for x in batch]
        predictions, _ = net(image, None, None)
        for idx, cur_len in enumerate(label_len):
            cur_len = int(cur_len.asscalar())
            # per-sample slices: labels from position 1 up to the true length
            gold = label[idx, 1:cur_len]
            bleu.update(gold, predictions[idx, :])
            acc.update(gold, predictions[idx, :(cur_len - 1)])
    return bleu.get()[1], acc.get()[1]
def train_block(self, data_iter: DataLoader, docs: Sequence[Document]) -> float:
    """Train the classifier for one pass over ``data_iter``.

    :param data_iter: batches of ``(inputs, labels)``.
    :param docs: unused here; kept for interface parity with other trainers.
    :return: accuracy accumulated over this pass.
    """
    acc = Accuracy()
    for data, label in tqdm(data_iter, leave=False):
        data = data.as_in_context(self.ctx)
        label = label.as_in_context(self.ctx)
        with autograd.record():
            output = self.model(data)
            l = self.loss(output, label)
        l.backward()
        # IMPROVEMENT: one batched metric update instead of a Python-level
        # per-sample loop over zip(nd.argmax(...), label) — same counts,
        # no per-element NDArray overhead.
        acc.update(labels=label, preds=nd.argmax(output, axis=1))
        self.trainer.step(data.shape[0])
    return float(acc.get()[1])
def validate(net, val_loader, gpu_id, train_index2words, val_index2words):
    """Score a PyTorch captioning model with MXNet metrics.

    :param net: network returning ``(predictions, alphas)``
    :param val_loader: MXNet loader; batches are converted to CUDA tensors
    :param gpu_id: unused here — ``.cuda()`` places tensors on the default device
    :param train_index2words: index-to-word map for predictions
    :param val_index2words: index-to-word map for labels
    :return: ``(BLEU, accuracy)``
    """
    bleu = BleuMetric(pred_index2words=train_index2words,
                      label_index2words=val_index2words)
    acc = Accuracy()
    acc.reset()
    bleu.reset()
    for batch in tqdm.tqdm(val_loader):
        # bridge MXNet NDArrays -> CUDA torch tensors
        image, label, label_len = [
            Variable(torch.from_numpy(x.asnumpy()).cuda()) for x in batch
        ]
        label = label.long()
        label_len = label_len.long()
        predictions, _ = net(image, None, None)
        for idx, raw_len in enumerate(label_len):
            cur_len = int(raw_len.data.cpu().numpy().squeeze().tolist())
            gold = label[idx, 1:cur_len].data.cpu().numpy()
            bleu.update(gold, predictions[idx, :].data.cpu().numpy())
            # accuracy is an MXNet metric, so convert back to NDArrays
            acc.update(
                mx.nd.array(gold),
                mx.nd.array(predictions[idx, :(cur_len - 1)].data.cpu().numpy()))
    return bleu.get()[1], acc.get()[1]
def evaluation(self, x, y_true):
    """Score predictions on ``(x, y_true)`` and return several metrics.

    :param x: input data
    :param y_true: one-hot encoded labels
    :return: ``(accuracy, cross_entropy, recall, precision)``
    """
    # recover sparse class ids from the one-hot labels
    labels = nd.argmax(y_true, axis=1, keepdims=False)  # type: nd.NDArray
    # predict() is not guaranteed to be softmax-normalized; renormalize each
    # row to sum to 1, otherwise cross-entropy can produce NaN
    raw_pred = self.predict(x)  # type: nd.NDArray
    probs = raw_pred / raw_pred.sum(axis=1, keepdims=True)  # type: nd.NDArray
    pred_ids = probs.argmax(axis=1, keepdims=False)
    # accuracy
    acc = Accuracy()
    acc.update(labels=[labels], preds=[probs])
    acc_val = acc.get()[1]
    # cross-entropy (y vs y_true)
    cro = CrossEntropy()
    cro.update(labels=[labels], preds=[probs])
    cro_val = cro.get()[1]
    # more than two classes -> multi-class, use macro averaging; else binary
    average = "macro" if len(raw_pred[0]) > 2 else "binary"
    # recall
    recall = recall_score(labels.asnumpy(),
                          pred_ids.asnumpy(),
                          average=average,
                          pos_label=self.pos_label)
    # precision
    precision = precision_score(labels.asnumpy(),
                                pred_ids.asnumpy(),
                                average=average,
                                pos_label=self.pos_label)
    return acc_val, cro_val, recall, precision
losses = [] with ag.record(): for x, y in zip(data, label): z = model(x) # computes softmax cross entropy loss l = loss_fn(z, y) output.append(z) losses.append(l) # backpropagate the error for one iteration. for l in losses: l.backward() # Update network weights trainer.step(BATCH_SIZE) # Update metric metric.update(label, output) str1 = 'Epoch [{}], Accuracy {:.4f}'.format(epoch, metric.get()[1]) str2 = '~Samples/Sec {:.4f}'.format(BATCH_SIZE * (i + 1) / (time.time() - tick_0)) print('%s %s' % (str1, str2)) metric.reset() elapsed = time.perf_counter() - start print('elapsed: {:0.3f}'.format(elapsed)) # use Accuracy as the evaluation metric metric = Accuracy() for data, label in test_data: data = split_and_load(data, ctx_list=ctx, batch_axis=0) label = split_and_load(label, ctx_list=ctx, batch_axis=0) outputs = [] for x in data:
def main(train_list, val_list, model, exp, saved_model, batch_size, optimizer,
         nb_epochs, augment, max_lr, min_lr, loss_function, train_all,
         nb_frames, eager, params=None, **kwargs):
    """Full training driver: set up the experiment folder, data loaders and
    model, then run ``nb_epochs`` "super epochs" of training with validation,
    checkpointing and learning-rate decay.

    :param train_list: training list file; its first path component names the dataset
    :param val_list: validation list file
    :param model: model architecture name (used by ResearchModels)
    :param exp: experiment tag used in folder/checkpoint names
    :param saved_model: optional pre-trained weights to load
    :param batch_size: per-iteration batch size
    :param optimizer: gluon optimizer name
    :param nb_epochs: number of super epochs
    :param augment: whether to apply video augmentation to training data
    :param max_lr: initial learning rate
    :param min_lr: lower bound for learning rate (logged only — not enforced here)
    :param loss_function: loss name; only 'categorical_crossentropy' is supported
    :param train_all: train all layers instead of only the head
    :param nb_frames: frames per sample
    :param eager: load arrays eagerly in the dataset
    :param params: optional dict of parameters saved to params.json
    :param kwargs: unused extra arguments (printed for visibility)
    """
    print("Unused arguments:", kwargs)
    setname = train_list.split(os.sep)[0]
    # Timestamp to name experiment folder
    xptime = strftime("%Y-%m-%d_%Hh%Mm%Ss", gmtime())
    xp_folder = "experiments/%s-%s-%s_%s" % (setname, model, exp, xptime)
    # Make folder
    mkdir_p(xp_folder)
    mkdir_p(os.path.join(xp_folder, 'checkpoints'))
    mkdir_p(os.path.join(xp_folder, 'tb'))
    print("\nSaving experiment data to:", xp_folder)
    # Save command (as well as possible)
    with open(os.path.join(xp_folder, 'command.sh'), "w") as f:
        command = " ".join(sys.argv[:]) + "\n"
        f.write(command)
    # Save employed parameters for future reference
    if params is not None:
        write_params(os.path.join(xp_folder, 'params.json'), params)
    #############
    # Callbacks #
    #############
    # Helper: Save the model.
    ckpt_fmt = os.path.join(
        xp_folder, 'checkpoints',
        model + '-' + exp + '.{epoch:03d}-loss{val_loss:.3f}-acc{val_acc:.3f}.hdf5')
    checkpointer = ModelCheckpoint(filepath=ckpt_fmt,
                                   verbose=1,
                                   save_best_only=True,
                                   monitor='val_acc')
    # Helper: TensorBoard
    tb = HistoryKeeper(logdir=os.path.join(xp_folder),
                       keys=['val_acc', 'val_loss', 'train_time', 'val_time'])
    # Helper: Stop when we stop learning.
    # early_stopper = EarlyStopper(patience=15)
    # Helper: Terminate when finding a NaN loss
    nan_term = TerminateOnNaN()
    callbacks = [tb, checkpointer, nan_term]
    #############
    #############
    # Loading #
    #############
    if augment:
        augmenter = default_augmenter_vid(strip_size=4)
    else:
        augment = False
        augmenter = None
    # Dataset classes
    train_data = ArrayData(train_list,
                           nb_frames=nb_frames,
                           augmenter=augmenter,
                           eager=eager)
    val_data = ArrayData(val_list,
                         nb_frames=nb_frames,
                         augmenter=None,
                         eager=eager,
                         encoder=train_data.get_encoder())
    # Saving encoder
    with open(os.path.join(xp_folder, 'encoder.pkl'), 'wb') as f:
        pickle.dump(train_data.get_encoder(), f)
    # Train loader
    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              last_batch='keep',
                              num_workers=10)
    nb_samples = len(train_data)  # loader should provide the number of sampĺes
    # Validation loader
    val_loader = DataLoader(val_data,
                            batch_size=batch_size,
                            shuffle=False,
                            last_batch='keep',
                            num_workers=10)
    nb_validation = len(
        val_data)  # loader should provide the number of sampĺes
    # Compute number of steps
    steps_per_epoch = math.ceil(nb_samples / batch_size)
    validation_steps = math.ceil(nb_validation / batch_size)
    # The model
    net = ResearchModels(train_data.nb_classes,
                         model,
                         saved_model,
                         input_shape=train_data.shape,
                         train_all=train_all).model
    # A little more verbosity
    print("************************************")
    if train_all:
        print("Train all layers.")
    print("Max lr:", max_lr, " Min lr:", min_lr)
    print("Batch size:", batch_size)
    print(nb_samples, "training samples,", steps_per_epoch, "steps per epoch")
    print(nb_validation, "validation samples,", validation_steps,
          "validation steps")
    print("Optimizer:", optimizer)
    if augment:
        print("Using data augmentation")
    else:
        print("WARNING: Not using data augmentation")
    print("************************************")
    ############################
    # Loss and Optimization #
    ############################
    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {'learning_rate': max_lr})
    # NOTE(review): loss_fn is only bound for 'categorical_crossentropy';
    # any other loss_function value raises NameError in the training loop.
    if loss_function == 'categorical_crossentropy':
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        loss_fn.hybridize()
    ############
    # Training #
    ############
    progress_desc = "Super epoch %03d - acc %.3f - loss %.3f "
    acc = Accuracy()
    start_time = time()
    super_epoch_size = 250
    # Learning rate decay
    iteration = 1
    decay_alpha = 0.01**0.25
    lr = max_lr
    for epoch in range(1, nb_epochs + 1):
        train_loss, val_loss = 0., 0.
        nb_batches = 0
        tic = time()
        acc.reset()
        start_training = time()
        # one "super epoch" = super_epoch_size passes over the train loader
        t = tqdm(range(super_epoch_size), unit='epochs')
        for _ in t:
            for data, label in train_loader:
                # Learning rate decay: multiply by decay_alpha every 10k iterations
                if iteration % 10000 == 0:
                    lr *= decay_alpha
                    trainer.set_learning_rate(lr)
                    print("Learning rate updated to", lr)
                iteration += 1
                current_batch_size = data.shape[0]
                data = data.copyto(mx.gpu(0))
                label = label.copyto(mx.gpu(0))
                with autograd.record():
                    output = net(data)
                    loss = loss_fn(output, label)
                loss.backward()
                # print(mx.nd.log_softmax(output[0], axis=-1), label[0])
                # update parameters
                trainer.step(current_batch_size)
                # calculate training metrics
                train_loss += loss.mean().asscalar()
                # accuracy(output, label)
                acc.update(preds=output, labels=label)
                nb_batches += 1
                t.set_description(progress_desc %
                                  (epoch, acc.get()[1],
                                   train_loss / nb_batches))
        train_time = time() - start_training
        train_loss /= steps_per_epoch * super_epoch_size
        train_acc = acc.get()[1]
        acc.reset()
        start_val = time()
        # calculate validation accuracy
        tval = tqdm(val_loader,
                    leave=False,
                    desc='Running validation',
                    unit='batch')
        for data, label in tval:
            data = data.copyto(mx.gpu(0))
            label = label.copyto(mx.gpu(0))
            # Compute outputs
            output = net(data)
            loss = loss_fn(output, label)
            # Compute metrics
            val_loss += loss.mean().asscalar()
            # val_acc += accuracy(output, label)
            acc.update(preds=output, labels=label)
        val_time = time() - start_val
        val_loss /= validation_steps
        val_acc = acc.get()[1]
        print(
            "Epoch %d: loss %.3f, acc %.3f, val_loss %.3f, val_acc %.3f, in %.1f sec"
            % (epoch, train_loss, train_acc, val_loss, val_acc, time() - tic))
        print(
            "--------------------------------------------------------------------------------"
        )
        stop = False
        train_info = {
            'epoch': epoch,
            'loss': train_loss,
            'acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'train_time': train_time,
            'val_time': val_time
        }
        # callbacks may request training to stop (e.g. NaN loss)
        for cb in callbacks:
            if cb(net, train_info):
                stop = True
        if stop:
            break
    print()
    hours, rem = divmod(time() - start_time, 3600)
    days, hours = divmod(hours, 24)
    minutes, seconds = divmod(rem, 60)
    print("%d training epochs in %dd, %dh%dm%.2fs." %
          (nb_epochs, int(days), int(hours), int(minutes), seconds))
def run_training(net, trainer, train_dataloader, val_dataloader, intents_count,
                 epochs, model_path, context):
    """Jointly train intent classification and CRF-based slot filling.

    The loss is a weighted sum: 0.1 * intent cross-entropy + 0.9 * CRF
    negative log-likelihood. The checkpoint with the best slot validation
    accuracy is saved and its path returned.

    :param net: network with ``elmo_container`` and ``crf`` submodules,
        returning ``(intents, slots)`` given data, hidden state and mask
    :param trainer: gluon Trainer for the network parameters
    :param train_dataloader: batches of (data, valid_lengths, entities, intent)
    :param val_dataloader: validation batches with the same structure
    :param intents_count: number of intent classes
    :param epochs: number of epochs to train
    :param model_path: prefix for checkpoint files
    :param context: MXNet context
    :return: path of the best checkpoint (empty string if none improved)
    """
    intent_loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    max_val_accuracy = 0
    best_model_path = ''
    for e in range(epochs):
        intent_train_acc = Accuracy()
        slot_train_acc = Accuracy()
        intent_val_acc = Accuracy()
        slot_val_acc = Accuracy()
        train_loss = 0.
        total_items = 0
        for i, (data, valid_lengths, entities,
                intent) in enumerate(train_dataloader):
            length = data.shape[1]
            items_per_iteration = data.shape[0]
            total_items += items_per_iteration
            data = data.as_in_context(context)
            intent = intent.as_in_context(context)
            entities = entities.as_in_context(context)
            # fresh zero-initialized ELMo state for every batch
            hidden_state = net.elmo_container[0].begin_state(
                mx.nd.zeros, batch_size=items_per_iteration, ctx=context)
            # mask out padded positions beyond each sequence's valid length
            mask = get_data_mask(length, valid_lengths, items_per_iteration,
                                 context)
            with autograd.record():
                intents, slots = net(data, hidden_state, mask)
                intents = intents.reshape((-1, intents_count))
                intent = intent.reshape((-1, 1))
                loss_intent = intent_loss_fn(intents, intent)
                # crf accepts seq_len x bs x channels
                score, slots_seq = net.crf(slots.transpose(axes=(1, 0, 2)))
                neg_log_likelihood = net.crf.neg_log_likelihood(
                    slots.transpose(axes=(1, 0, 2)), entities)
                # weighted joint objective: slots dominate (0.9 vs 0.1)
                loss = 0.1 * loss_intent.mean(
                ) + 0.9 * neg_log_likelihood.mean()
            loss.backward()
            # NOTE(review): step(1) — the loss is already a mean, so no
            # further batch-size rescaling is applied here; confirm intended.
            trainer.step(1)
            train_loss += loss.mean().asscalar()
            intent_train_acc.update(intent.flatten(),
                                    intents.argmax(axis=1).flatten())
            slot_train_acc.update(entities, slots_seq)
        for i, (data, valid_lengths, entities,
                intent) in enumerate(val_dataloader):
            items_per_iteration = data.shape[0]
            length = data.shape[1]
            data = data.as_in_context(context)
            intent = intent.as_in_context(context)
            entities = entities.as_in_context(context)
            hidden_state = net.elmo_container[0].begin_state(
                mx.nd.zeros, batch_size=items_per_iteration, ctx=context)
            mask = get_data_mask(length, valid_lengths, items_per_iteration,
                                 context)
            intents, slots = net(data, hidden_state, mask)
            intents = intents.reshape((-1, intents_count))
            intent = intent.reshape((-1, 1))
            # CRF Viterbi decode for slot predictions
            score, slots_seq = net.crf(slots.transpose(axes=(1, 0, 2)))
            intent_val_acc.update(intent.flatten(),
                                  intents.argmax(axis=1).flatten())
            slot_val_acc.update(entities, slots_seq)
        print(
            "Epoch {}. Current Loss: {:.5f}. \n"
            "Intent train accuracy: {:.3f}, Slots train accuracy: {:.3f}, \n"
            "Intent valid accuracy: {:.3f}, Slot val accuracy: {:.3f}".format(
                e, train_loss / total_items, intent_train_acc.get()[1],
                slot_train_acc.get()[1], intent_val_acc.get()[1],
                slot_val_acc.get()[1]))
        # checkpoint on best slot validation accuracy
        if max_val_accuracy < slot_val_acc.get()[1]:
            max_val_accuracy = slot_val_acc.get()[1]
            best_model_path = model_path + '_{:04d}.params'.format(e)
            net.save_parameters(best_model_path)
            print("Improvement observed")
        else:
            print("No improvement")
    return best_model_path
def load_net(param_file="net.params", ctx=cpu(0)):
    """Build a SimpleNet and load trained weights from ``param_file``.

    :param param_file: path to the saved parameters
    :param ctx: context to load the parameters onto
    :return: the network with weights loaded
    """
    net = SimpleNet()
    net.load_parameters(param_file, ctx=ctx)
    return net


def get_val_data(transformer, batch_size=128):
    """Return a DataLoader over the FashionMNIST test split.

    :param transformer: transform applied to the image (first element)
    :param batch_size: batch size of the loader
    :return: gluon DataLoader over the validation set
    """
    mnist_valid = gluon.data.vision.FashionMNIST(train=False)
    valid_data = gluon.data.DataLoader(
        mnist_valid.transform_first(transformer),
        batch_size=batch_size,
        num_workers=4)
    return valid_data


if __name__ == "__main__":
    # prefer GPU when one is available
    ctx = gpu(0) if context.num_gpus() else cpu(0)
    net = load_net("net.params", ctx=ctx)
    # NOTE(review): `transformer` must be defined elsewhere in this module —
    # it is not created here; confirm it is in scope at run time.
    valid_data = get_val_data(transformer)
    val_acc = Accuracy()
    for data, label in valid_data:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        # inference only — disable autograd recording semantics
        with autograd.predict_mode():
            out = net(data)
        val_acc.update(label, out)
    # BUG FIX: corrected misspelled output label ("Accuray" -> "Accuracy")
    print("Accuracy: ", val_acc.get()[1])