def _train_validate_RF_Classifier(self, trees, max_depth):
    """Train a Random Forest of the given size and score it on the validation set.

    The classifier is fitted on ``self.x_train`` / ``self.y_train``; metrics
    are computed from its predictions on ``self.x_val`` against
    ``self.y_val``. Every call appends one tuple to
    ``self.results_train_val`` laid out as ``(train accuracy, validation
    accuracy, trees, max_depth, TP, FP, FN, TN, precision, recall, f1)``.

    Args:
        trees: Number of trees used to build the classifier; coerced with
            ``int()`` before use.
        max_depth: Maximum allowed depth; coerced with ``int()`` before use.

    Returns:
        The ``'accuracy'`` entry of the validation metrics, i.e. the value
        the caller optimises.
    """
    n_trees = int(trees)
    depth_limit = int(max_depth)

    # Build and fit the classifier.
    forest = RandomForestClassifier(n_estimators=n_trees,
                                    max_depth=depth_limit,
                                    random_state=self.random_seed)
    forest.fit(self.x_train, self.y_train)

    # Score on the training samples themselves.
    train_accuracy = forest.score(self.x_train, self.y_train)

    # Metrics on the validation set.
    val_predictions = forest.predict(self.x_val)
    val_metrics = metrics.calculate_metrics(val_predictions, self.y_val)
    val_accuracy = val_metrics['accuracy']

    # Store one row per evaluated configuration.
    detail_keys = ('TP', 'FP', 'FN', 'TN', 'precision', 'recall', 'f1')
    record = (train_accuracy, val_accuracy, n_trees, depth_limit)
    record += tuple(val_metrics[key] for key in detail_keys)
    self.results_train_val.append(record)

    # The optimiser maximises validation accuracy.
    return val_accuracy
def one_feature_pipeline_cross_val(train: Data, test: Data,
                                   params: SingleBaseParams) -> None:
    """Cross-validate a single-feature pipeline, score it on the test split and record the outcome.

    Prints the 10-fold cross-validation accuracy, the test score and the
    metrics report, then appends the same figures to
    ``base_features_hyperparams/<feature name>.txt``.

    Args:
        train: Training data; queried through ``get_x_y``.
        test: Test data; queried through ``get_x_y``.
        params: Supplies the feature name, ``method`` and ``pca``, and is
            passed to ``get_feature_pipeline``.
    """
    feature_name = params.get_feature_name()
    feature = get_f(feature_name)
    X_train, y_train = train.get_x_y(feature)
    X_test, y_test = test.get_x_y(feature)
    pipeline = get_feature_pipeline(params)

    from utils.metrics import calculate_metrics
    from sklearn.model_selection import cross_val_score

    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10)
    # Mean and spread of the fold scores, expressed as percentages.
    cv_summary = (cv_scores.mean() * 100, cv_scores.std() * 100)
    print(params.method)
    print("Cross validated accuracy score: %.2f%% (%.2f%%)" % cv_summary)

    pipeline.fit(X_train, y_train)
    test_score = pipeline.score(X_test, y_test)
    predictions = pipeline.predict(X_test)
    report = calculate_metrics(y_test, predictions).to_string()
    print(report)
    print("test score = %0.4f" % test_score)
    print(feature_name)

    log_path = "base_features_hyperparams/" + feature_name + '.txt'
    # Append mode: every run adds one delimited record to the feature's log.
    with open(log_path, 'a') as log_file:
        log_file.write("-------\n")
        log_file.write("score: %.2f%% (%.2f%%)\n" % cv_summary)
        log_file.write("test score = %0.4f\n" % test_score)
        log_file.write(report + '\n')
        log_file.write("method %s\n" % params.method)
        log_file.write("pca" + str(params.pca) + '\n')
        log_file.write("-------\n")
def get_state_dict(self) -> Dict:
    """Summarise the tracked state as a dictionary.

    Returns:
        A dict holding ``'loss'`` (accumulated loss divided by the number of
        processed batches, or ``0.0`` when none were processed),
        ``'learning_rate'`` when ``self.lr`` is non-zero, and every entry
        produced by ``calculate_metrics(self.statistics)``.
    """
    if self.batch_processed > 0:
        mean_loss = self.loss / self.batch_processed
    else:
        mean_loss = 0.0

    summary = {'loss': mean_loss}
    if self.lr != 0:
        summary['learning_rate'] = self.lr

    summary.update(calculate_metrics(self.statistics))
    return summary
def test_calculating_zero_metrics(self):
    """Every metric derived from all-zero counts must equal 0.0."""
    count_names = ('true_positive', 'false_positive', 'false_negative')
    zero_counts = dict.fromkeys(count_names, 0)

    computed = calculate_metrics(zero_counts)

    for value in computed.values():
        self.assertEqual(0.0, value)
def __init__(self, x: List[List[float]], y: List[str], feature: BaseParams,
             verbose, with_scaling):
    """Store the inputs and compute the initial accuracy.

    Args:
        x: Per-sample values; stored as given and passed to
            ``calculate_metrics`` together with the mapped labels.
        y: String labels; converted with ``map_y_array_to_int``.
        feature: Feature descriptor; its ``get_features_names_str()`` result
            is kept as ``feature_name``.
        verbose: Stored unchanged as ``self.verbose``.
        with_scaling: Stored unchanged as ``self.with_scaling``.
    """
    from utils.mapping import map_y_array_to_int
    from utils.metrics import calculate_metrics

    self.x = x
    self.y = map_y_array_to_int(y)

    self.feature = feature
    self.feature_name = feature.get_features_names_str()

    # Ranking bookkeeping starts out empty.
    self.updated_ranking = {}
    self.updated_ranking_changed: int = 0

    self.verbose = verbose
    initial_metrics = calculate_metrics(self.y, x)
    self.accuracy = initial_metrics.accuracy
    self.with_scaling = with_scaling
def __init__(self, x: List[List[float]], y: List[str], feature: BaseParams,
             verbose, with_scaling):
    """Store the inputs, compute the initial accuracy and print it.

    Args:
        x: Per-sample values; stored as given and passed to
            ``calculate_metrics`` together with the mapped labels.
        y: String labels; converted with ``map_y_array_to_int``.
        feature: Feature descriptor; its ``get_features_names_str()`` result
            is kept as ``feature_name``.
        verbose: Stored unchanged as ``self.verbose``.
        with_scaling: Stored unchanged as ``self.with_scaling``.
    """
    self.x = x
    self._i = 0  # internal position counter, starts at zero
    self.y = map_y_array_to_int(y)

    self.feature = feature
    self.feature_name = feature.get_features_names_str()
    self.verbose = verbose

    initial_metrics = calculate_metrics(self.y, x)
    self.accuracy = initial_metrics.accuracy
    print(self.feature_name, self.accuracy)

    self.with_scaling = with_scaling
def train_one_batch(self, model, vocab, src, trg, src_percentages, src_lengths,
                    trg_lengths, smoothing, loss_type):
    """Run the model on one batch and return its loss and character-error totals.

    Args:
        model: Callable invoked as ``model(src, src_lengths, trg,
            verbose=False)`` and returning ``(pred, gold, hyp)``.
        vocab: Provides ``id2label``, ``PAD_ID`` and ``special_token_list``.
        src, trg: Batch inputs and targets forwarded to the model.
        src_percentages: Scaled in place (``mul_``) by the prediction length
            to obtain the input lengths passed to ``calculate_metrics``.
        src_lengths, trg_lengths: Source and target lengths.
        smoothing, loss_type: Forwarded to ``calculate_metrics``.

    Returns:
        ``(loss, total_cer, total_char)``: the batch loss, the summed
        character error and the number of gold characters with spaces
        removed.
    """
    pred, gold, hyp = model(src, src_lengths, trg, verbose=False)

    def _to_text(token_ids):
        # Map every id to its label and glue the labels together.
        return "".join([vocab.id2label[int(token)] for token in token_ids])

    strs_golds = [_to_text(gold[k]) for k in range(len(gold))]
    strs_hyps = [_to_text(hyp[k]) for k in range(len(hyp))]

    # handling the last batch
    seq_length = pred.size(1)
    sizes = src_percentages.mul_(int(seq_length)).int()

    loss, num_correct = calculate_metrics(pred,
                                          gold,
                                          vocab.PAD_ID,
                                          input_lengths=sizes,
                                          target_lengths=trg_lengths,
                                          smoothing=smoothing,
                                          loss_type=loss_type)

    if loss is None:
        # NOTE(review): a None loss is only reported here; the `.item()`
        # call below still raises AttributeError in that case.
        print("loss is None")

    if loss.item() == float('Inf'):
        logging.info("Found infinity loss, masking")
        print("Found infinity loss, masking")
        # NaN masking (NaN is the only value that differs from itself)
        loss = torch.where(loss != loss, torch.zeros_like(loss), loss)

    total_cer = total_wer = total_char = total_word = 0
    for idx, raw_hyp in enumerate(strs_hyps):
        hyp_text = post_process(raw_hyp, vocab.special_token_list)
        strs_hyps[idx] = hyp_text
        gold_text = post_process(strs_golds[idx], vocab.special_token_list)
        strs_golds[idx] = gold_text

        gold_chars = gold_text.replace(' ', '')
        total_cer += calculate_cer(hyp_text.replace(' ', ''), gold_chars)
        total_wer += calculate_wer(hyp_text, gold_text)
        total_char += len(gold_chars)
        total_word += len(gold_text.split(" "))

    return loss, total_cer, total_char
def play(self):
    """Play one small round per sample and score the collected winners.

    The number of rounds is the length of the first agent's ``x``. The
    outcome of ``small_round()`` is recorded as the prediction for each
    round, and ``upp_i()`` is called after every round.

    Returns:
        The result of ``calculate_metrics`` for the first agent's ``y``
        against the collected predictions.
    """
    predictions = []
    total_rounds = len(self.agents[0].x)

    for round_no in range(total_rounds):
        if self.verbose:
            print("\n----------------------")
            print("round:", round_no)
            print("----------------------\n")

        winner = self.small_round()

        if self.verbose:
            print(winner, "TRUE:", self.agents[0].y[round_no])

        predictions.append(winner)
        self.upp_i()

    true_labels = self.agents[0].y
    from utils.metrics import calculate_metrics
    return calculate_metrics(true_labels, predictions)
def cross_validate(data, n, metric, **kwargs):
    """Run shuffled n-fold cross-validation of a fastText supervised model.

    Each fold is written to the fixed training/testing text files, a model is
    trained on the training file, and ``metric`` is read from the
    ``'weighted avg'`` entry of ``calculate_metrics`` for both files.

    Args:
        data: Object indexed with ``.iloc``; split with ``KFold``.
        n: Number of folds.
        metric: Key looked up inside the ``'weighted avg'`` entry.
        **kwargs: Forwarded to ``fasttext.train_supervised``.

    Returns:
        ``(mean train metric, mean test metric)`` across the folds.
    """
    train_path = "fasttext_data/captions/training_data.txt"
    test_path = "fasttext_data/captions/testing_data.txt"

    train_scores, test_scores = [], []
    splitter = KFold(n_splits=n, shuffle=True)

    for train_rows, test_rows in splitter.split(data):
        fold_train = data.iloc[train_rows, :]
        fold_test = data.iloc[test_rows, :]
        write_to_file(fold_train, train_path)
        write_to_file(fold_test, test_path)

        model = fasttext.train_supervised(input=train_path,
                                          verbose=0,
                                          **kwargs)

        train_report = calculate_metrics(train_path, model)
        fold_train_score = train_report['weighted avg'][metric]
        test_report = calculate_metrics(test_path, model)
        fold_test_score = test_report['weighted avg'][metric]

        train_scores.append(fold_train_score)
        test_scores.append(fold_test_score)

    print(test_scores)
    print("mean %s: %f" % (metric, np.mean(test_scores)))
    return (np.mean(train_scores), np.mean(test_scores))
def optimize_hyperparameters(data):
    """Autotune a fastText supervised model on an 80/20 split and save it.

    The split is written to two text files, fastText's autotune is run for
    600 seconds against the validation file, and the resulting model is saved
    to ``models/captions/optimized_model.bin``.

    Args:
        data: Dataset passed to ``train_test_split``.

    Returns:
        The result of ``calculate_metrics`` on the validation file for the
        tuned model.
    """
    train_path = 'fasttext_data/captions/optimized_training_data.txt'
    valid_path = 'fasttext_data/captions/optimized_validation_data.txt'

    train_part, valid_part = train_test_split(data, test_size=0.2)
    write_to_file(train_part, train_path)
    write_to_file(valid_part, valid_path)

    print("starting automatic hyperparameter optimization")
    tuned_model = fasttext.train_supervised(input=train_path,
                                            autotuneValidationFile=valid_path,
                                            autotuneDuration=600,
                                            verbose=3)

    print("finished optimization, saving model")
    tuned_model.save_model("models/captions/optimized_model.bin")

    return calculate_metrics(valid_path, tuned_model)
def read(feature: BaseParams) -> Tuple[Optional[BaseParams], float]:
    """Load one feature's train and test outputs and score them together.

    The train and test parts are concatenated, the values are converted with
    ``proba_to_letters`` and compared to the labels with
    ``calculate_metrics``. The feature name and accuracy are printed.

    Args:
        feature: Feature descriptor passed to ``read_one_feature``.

    Returns:
        ``(feature, accuracy)`` on success. On any failure the error is
        logged with its traceback and ``(None, 0.0)`` is returned, so callers
        can keep processing the remaining features.
    """
    # Local import keeps the block self-contained; only the failure path
    # needs it.
    import logging

    try:
        _, x_train, y_train = read_one_feature(feature,
                                               train_vs_test="train",
                                               only_ab=False,
                                               to_binary=False)
        _, x_test, y_test = read_one_feature(feature,
                                             train_vs_test="test",
                                             only_ab=False,
                                             to_binary=False)
        x = list(x_train) + list(x_test)
        y = list(y_train) + list(y_test)
        metrics = calculate_metrics(y, proba_to_letters(x))
        print(feature.get_features_names_str(), metrics.accuracy)
        return (feature, metrics.accuracy)
    except Exception:
        # Best-effort by design: a broken feature must not abort the run.
        # Record why it was skipped instead of discarding the error.
        logging.getLogger(__name__).warning(
            "could not read feature %r; skipping it", feature, exc_info=True)
        return (None, 0.)
def play(self):
    """Play rounds until each sample has a winner, then score the winners.

    For every sample, ``small_round()`` is repeated while more than one agent
    remains in the round. The first non-``None`` result becomes the winner;
    if no small round produced one, ``get_winner()`` decides. ``upp_i()`` is
    called after every sample.

    Returns:
        The result of ``calculate_metrics`` for the first agent's ``y``
        against the collected predictions.
    """
    predictions = []
    for i in range(len(self.agents[0].x)):
        if self.verbose:
            print("\n----------------------")
            print("round:", i)
            print("----------------------\n")

        winner: Optional[int] = None
        while len(self.agents) > len(self.removed_from_round) + 1:
            round_winner = self.small_round()
            if round_winner is not None:
                winner = round_winner
                break

        # Compare against None explicitly: a winner of 0 is falsy and would
        # otherwise be discarded in favour of get_winner().
        if winner is None:
            winner = self.get_winner()

        if self.verbose:
            print(winner, "TRUE:", self.agents[0].y[i])
        predictions.append(winner)
        self.upp_i()

    y = self.agents[0].y
    from utils.metrics import calculate_metrics
    return calculate_metrics(y, predictions)
def run_grid_search(train: Data, test: Data, params: BaseParams,
                    parameters: Dict[str, Any]) -> None:
    """Grid-search the pipeline for ``params`` and store the best result.

    A 10-fold ``GridSearchCV`` is fitted on the training data, the fitted
    search predicts on the test data, and the scores, metrics report and best
    parameters are printed and saved under
    ``out/<dirname>_hyperparams/<feature names>.txt``.

    Args:
        train: Training data passed to ``get_Xy``.
        test: Test data passed to ``get_Xy``.
        params: Supplies the classifier, the pipeline configuration, the
            output directory name and the feature names.
        parameters: Parameter grid handed to ``GridSearchCV``.
    """
    X_train, y_train = get_Xy(train, params)
    X_test, y_test = get_Xy(test, params)
    classifier = params.classifier

    search = GridSearchCV(get_pipeline(params), param_grid=parameters, cv=10)
    search.fit(X_train, y_train)
    y_predicted = search.predict(X_test)

    print("best_score", search.best_score_)
    best = search.best_index_
    best_mean = search.cv_results_['mean_test_score'][best]
    best_std = search.cv_results_['std_test_score'][best]
    print(best_mean, best_std)

    from utils.metrics import calculate_metrics
    report = calculate_metrics(y_test, y_predicted).to_string()
    print(report)
    # print(feature_name)
    print("best_params", search.best_params_)

    from utils.file_management import save_data_with_ultimate_dir_creation
    template = ("clf: %s\n"
                "mean_test_score = %3.4f\n"
                "std_test_score %3.4f\n"
                "metrics:\n%s\n"
                "best_params: %s\n"
                "------------------\n")
    # Scores are written as percentages.
    summary = template % (classifier, best_mean * 100, best_std * 100,
                          report, str(search.best_params_))

    out_path = ("out/" + params.dirname + "_hyperparams/" +
                params.get_features_names_str() + '.txt')
    save_data_with_ultimate_dir_creation(out_path, [summary])
def play(self):
    """Play every sample to completion and score the winners.

    For sample ``i``, ``small_round(i)`` is repeated while more than one
    agent remains in the round, then ``get_winner(i)`` supplies the
    prediction and ``reset()`` is called before moving on.

    Returns:
        The metrics object returned by ``calculate_metrics`` for the first
        agent's ``y`` against the collected predictions.
    """
    predictions = []
    sample_count = len(self.agents[0].x)

    for sample_idx in range(sample_count):
        if self.verbose:
            print("\n----------------------")
            print("round:", sample_idx, "SOR: ", self.agents[0].y[sample_idx])
            print("----------------------\n")

        # Keep eliminating until at most one agent is left in the round.
        while len(self.agents) - len(self.removed_from_round) > 1:
            self.small_round(sample_idx)

        winner = self.get_winner(sample_idx)
        if self.verbose:
            print(winner, "TRUE:", self.agents[0].y[sample_idx])

        predictions.append(winner)
        self.reset()

    true_labels = self.agents[0].y
    from utils.metrics import calculate_metrics
    result = calculate_metrics(true_labels, predictions)
    if self.verbose:
        print(result.accuracy, result.aed_score, self.C)
    return result
# Load the raw table. The last column is the label, the one before it the
# timestamp, and columns 1..-3 hold encoded sequences.
df = pandas.read_csv('senia.csv').values

# Build real lists rather than `map` objects: on Python 3 `map` is lazy, so
# `[timestamp] + x_i` below would raise TypeError and `numpy.array(y)` would
# wrap the iterator instead of the values. On Python 2 the result is the same
# as before.
x = [[generateIntSequence(cell) for cell in row] for row in df[:, 1:-2]]
y = numpy.array([int(label) for label in df[:, -1]])

# One metrics row per sample: the timestamp is prepended to the sequences and
# the last element returned by calculate_metrics is dropped.
x_metrics = []
for i, x_i in enumerate(x):
    timestamp = int(df[i, -2])
    data = [timestamp, ] + x_i
    x_metrics.append(calculate_metrics(data)[:-1])
x_metrics = numpy.array(x_metrics)
# Optional post-processing, currently disabled:
#x_metrics = normalize(x_metrics)
#x_metrics = scale(x_metrics)


def f6(x):
    """Remap a value: 7 becomes 2, values above 7 shift down by one.

    Values below 7 are returned unchanged.
    """
    if x == 7:
        return 2
    elif x > 7:
        return x - 1
    else:
        return x
def forward_one_batch(self,
                      model,
                      vocab,
                      src,
                      trg,
                      src_percentages,
                      src_lengths,
                      trg_lengths,
                      smoothing,
                      loss_type,
                      verbose=False,
                      discriminator=None,
                      accent_id=None,
                      multi_task=False):
    """Run one batch through the model, optionally with an accent discriminator.

    Without a discriminator the model is called directly. With one, the
    encoder output is summed over dimension 1 and fed to the discriminator
    before decoding, and a multi-task loss or an adversarial loss pair is
    computed from its prediction.

    Args:
        model: Called directly, or through ``encode`` / ``decode`` when a
            discriminator is given.
        vocab: Provides ``id2label``, ``PAD_ID`` and ``special_token_list``.
        src, trg: Batch inputs and targets.
        src_percentages: Scaled in place (``mul_``) by the prediction length
            to obtain the input lengths passed to ``calculate_metrics``.
        src_lengths, trg_lengths: Source and target lengths.
        smoothing, loss_type: Forwarded to ``calculate_metrics``.
        verbose: When true, print totals and the decoded strings.
        discriminator: Optional callable applied to the summed encoder output.
        accent_id: Target passed to the discriminator loss functions.
        multi_task: Selects ``calculate_multi_task`` instead of
            ``calculate_adversarial``.

    Returns:
        ``(loss, total_cer, total_char)`` without a discriminator; with one,
        the same tuple extended by ``disc_loss`` (multi-task) or by
        ``disc_loss, enc_loss`` (adversarial).
    """
    use_discriminator = discriminator is not None

    if use_discriminator:
        enc_output = model.encode(src, src_lengths)
        accent_pred = discriminator(torch.sum(enc_output, dim=1))
        pred, gold, hyp = model.decode(enc_output, src_lengths, trg)
        if multi_task:
            # calculate multi
            disc_loss = calculate_multi_task(accent_pred, accent_id)
        else:
            # calculate discriminator loss and encoder loss
            disc_loss, enc_loss = calculate_adversarial(
                accent_pred, accent_id)
    else:
        pred, gold, hyp = model(src, src_lengths, trg, verbose=False)

    def _to_text(token_ids):
        # Map every id to its label and glue the labels together.
        return "".join([vocab.id2label[int(token)] for token in token_ids])

    strs_golds = [_to_text(gold[k]) for k in range(len(gold))]
    strs_hyps = [_to_text(hyp[k]) for k in range(len(hyp))]

    # handling the last batch
    seq_length = pred.size(1)
    sizes = src_percentages.mul_(int(seq_length)).int()

    loss, _ = calculate_metrics(pred,
                                gold,
                                vocab.PAD_ID,
                                input_lengths=sizes,
                                target_lengths=trg_lengths,
                                smoothing=smoothing,
                                loss_type=loss_type)

    if loss is None:
        # NOTE(review): a None loss is only reported here; the `.item()`
        # call below still raises AttributeError in that case.
        print("loss is None")

    if loss.item() == float('Inf'):
        logging.info("Found infinity loss, masking")
        print("Found infinity loss, masking")
        # NaN masking (NaN is the only value that differs from itself)
        loss = torch.where(loss != loss, torch.zeros_like(loss), loss)

    total_cer = total_wer = total_char = total_word = 0
    for idx, raw_hyp in enumerate(strs_hyps):
        # The lists are updated in place so the verbose output below shows
        # the post-processed strings.
        hyp_text = post_process(raw_hyp, vocab.special_token_list)
        strs_hyps[idx] = hyp_text
        gold_text = post_process(strs_golds[idx], vocab.special_token_list)
        strs_golds[idx] = gold_text

        gold_chars = gold_text.replace(' ', '')
        total_cer += calculate_cer(hyp_text.replace(' ', ''), gold_chars)
        total_wer += calculate_wer(hyp_text, gold_text)
        total_char += len(gold_chars)
        total_word += len(gold_text.split(" "))

    if verbose:
        print('Total CER', total_cer)
        print('Total char', total_char)
        print("PRED:", strs_hyps)
        print("GOLD:", strs_golds, flush=True)

    if not use_discriminator:
        return loss, total_cer, total_char
    if multi_task:
        return loss, total_cer, total_char, disc_loss
    return loss, total_cer, total_char, disc_loss, enc_loss
def run(self):
    """Tune a Random Forest with Bayesian optimisation and score it on the test set.

    Steps:
      1. Load the train, validation and test pickles from ``dir_data`` and
         keep their images/labels on ``self``.
      2. Let ``BayesianOptimization`` maximise the validation accuracy
         returned by ``self._train_validate_RF_Classifier`` over ``trees``
         and ``max_depth`` (both searched in 5..200).
      3. Refit a forest with the best configuration on train + validation
         data and compute metrics on the test set.

    Returns:
        A dict with ``'final_classifier'`` (the refitted forest),
        ``'results_train_val_set'`` (array with one row per evaluated
        configuration) and ``'metrics_test_set'`` (test-set metrics).
    """
    # ---------------------------------------------------------------
    # Load training, validation and test samples
    # ---------------------------------------------------------------
    train_data_file_path = os.path.join(dir_data, 'train_dataset.pickle')
    train_ds = PixForceDataset(train_data_file_path,
                               transformations=self.transformations_train)
    samples_train = train_ds[0:len(train_ds)]
    self.x_train = samples_train['images']
    self.y_train = samples_train['labels']

    val_data_file_path = os.path.join(dir_data, 'validation_dataset.pickle')
    val_ds = PixForceDataset(val_data_file_path, transformations=[Flatten()])
    # Slice with the dataset's own length; using the training length would
    # truncate or overrun whenever the split sizes differ.
    samples_val = val_ds[0:len(val_ds)]
    self.x_val = samples_val['images']
    self.y_val = samples_val['labels']

    test_data_file_path = os.path.join(dir_data, 'test_dataset.pickle')
    test_ds = PixForceDataset(test_data_file_path,
                              transformations=[Flatten()])
    samples_test = test_ds[0:len(test_ds)]
    self.x_test = samples_test['images']
    self.y_test = samples_test['labels']

    # ---------------------------------------------------------------
    # Configure and run the Bayesian optimiser
    # ---------------------------------------------------------------
    # Search bounds for the Random Forest parameters.
    pbounds = {'trees': (5, 200), 'max_depth': (5, 200)}

    optimizer = BayesianOptimization(
        f=self._train_validate_RF_Classifier,
        pbounds=pbounds,
        random_state=self.random_seed,
    )

    # Filled by _train_validate_RF_Classifier, one tuple per evaluation.
    self.results_train_val = []
    optimizer.maximize(init_points=15, n_iter=self.iterations_bo)

    # ---------------------------------------------------------------
    # Evaluate the best configuration on the test set
    # ---------------------------------------------------------------
    self.results_train_val = np.array(self.results_train_val)
    # Column 1 holds the validation accuracy; columns 2 and 3 the
    # configuration that produced it.
    best_cls = self.results_train_val[np.argmax(self.results_train_val[:, 1])]
    trees = int(best_cls[2])
    max_depth = int(best_cls[3])

    rf_final = RandomForestClassifier(n_estimators=trees,
                                      max_depth=max_depth,
                                      random_state=self.random_seed)
    # Concatenate along the sample axis. `+` only joins plain lists; on
    # ndarrays it adds element-wise, which would corrupt the training data.
    x_train_val = np.concatenate([self.x_train, self.x_val])
    y_train_val = np.concatenate([self.y_train, self.y_val])
    rf_final.fit(x_train_val, y_train_val)

    y_pred_test = rf_final.predict(self.x_test)
    self.results_test = metrics.calculate_metrics(y_pred_test, self.y_test)

    # Show the result
    print(f'Acurácia na base de teste: {self.results_test["accuracy"]}\n')

    return {
        'final_classifier': rf_final,
        'results_train_val_set': self.results_train_val,
        'metrics_test_set': self.results_test
    }
def get_state_dict(self) -> Dict:
    """Summarise the tracked state as a dictionary.

    Returns:
        A dict holding ``'loss'`` (accumulated loss divided by the number of
        processed batches, or ``0.0`` when none were processed) plus every
        entry produced by ``calculate_metrics(self.statistics)``.
    """
    if self.batch_processed > 0:
        mean_loss = self.loss / self.batch_processed
    else:
        mean_loss = 0.0

    summary = {'loss': mean_loss}
    summary.update(calculate_metrics(self.statistics))
    return summary
def train(
    model: GraphConvolutionalNetwork,
    train_data: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]],
    validation_data: Optional[List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]] = None,
    criterion: Callable = nn.CrossEntropyLoss(),
    num_epochs: int = 10,
    learning_rate: float = 1e-3,
    metrics_to_log: Optional[List[str]] = None,
    model_path: Optional[str] = None
) -> None:
    """Train the model with Adam, logging loss and optional metrics per epoch.

    Args:
        model: Network to train; moved to ``DEVICE`` first.
        train_data: ``(input, adjacency, target)`` triples, one optimisation
            step each.
        validation_data: Optional triples used only for metric logging.
        criterion: Loss applied to the prediction and target after each gets
            a leading dimension via ``unsqueeze(0)``.
        num_epochs: Number of passes over ``train_data``.
        learning_rate: Learning rate for the Adam optimiser.
        metrics_to_log: When given, training metrics (and validation metrics
            if ``validation_data`` is set) are logged after every epoch.
        model_path: When given, the model is saved there after training.
    """
    # Send model to device and initialize optimizer
    model = model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    logger.info("training model...")
    for epoch in range(num_epochs):
        steps = 0
        running_loss = 0

        for features, adjacency, target in train_data:
            # Send data to device
            features = features.to(DEVICE)
            adjacency = adjacency.to(DEVICE)
            target = target.to(DEVICE)

            # Compute prediction and loss
            predicted = model(input=features, adjacency=adjacency).to(DEVICE)
            batched_prediction = predicted.unsqueeze(0)
            batched_target = target.unsqueeze(0)
            loss = criterion(batched_prediction, batched_target).to(DEVICE)

            # Perform gradient step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Track progress
            steps += 1
            running_loss += loss.cpu().item()

        # Log progress
        mean_loss = '{0:.3f}'.format(running_loss / steps)
        logger.info(f"epochs completed: \t {epoch + 1}/{num_epochs}")
        logger.info(f"mean loss: \t {mean_loss}")

        if metrics_to_log:
            logger.info("calculating training metrics...")
            training_metrics = calculate_metrics(model=model, data=train_data)
            log_metrics(metrics=training_metrics, metrics_to_log=metrics_to_log)

            if validation_data:
                logger.info("calculating validation metrics...")
                validation_metrics = calculate_metrics(model=model,
                                                       data=validation_data)
                log_metrics(metrics=validation_metrics,
                            metrics_to_log=metrics_to_log)

        logger.info("-" * 50)

    if model_path:
        logger.info("saving model...")
        model.save(path=model_path)
def train(self, model, train_loader, train_sampler, valid_loader_list, opt, loss_type, start_epoch, num_epochs, label2id, id2label, last_metrics=None):
    """Train the model epoch by epoch, validate after each epoch and save checkpoints.

    Args:
        model: Model object; called as ``model(src, src_lengths, tgt,
            verbose=False)`` and returning ``(pred, gold, hyp_seq, gold_seq)``.
        train_loader: DataLoader object of the training set.
        train_sampler: Sampler whose ``shuffle(epoch)`` is called at the end
            of an epoch when ``constant.args.shuffle`` is set.
        valid_loader_list: A list of validation DataLoader objects.
        opt: Optimizer object; ``zero_grad()``, ``step()`` and ``_rate`` are
            used.
        loss_type: Forwarded to ``calculate_metrics``.
        start_epoch: Start epoch (> 0 if you resume the process).
        num_epochs: Last epoch (exclusive upper bound of the epoch range).
        label2id, id2label: Label mappings; ``id2label`` decodes sequences and
            both are passed to ``save_model``.
        last_metrics: Metrics of a previous run (if resuming); its
            ``'valid_loss'`` seeds the best validation loss.
    """
    history = []
    start_time = time.time()
    # Large sentinel so that the first epoch always counts as an improvement
    # unless a previous best is supplied.
    best_valid_loss = 1000000000 if last_metrics is None else last_metrics[
        'valid_loss']
    smoothing = constant.args.label_smoothing

    logging.info("name " + constant.args.name)

    for epoch in range(start_epoch, num_epochs):
        sys.stdout.flush()
        total_loss, total_cer, total_wer, total_char, total_word = 0, 0, 0, 0, 0

        start_iter = 0

        logging.info("TRAIN")
        model.train()
        pbar = tqdm(iter(train_loader), leave=True, total=len(train_loader))
        for i, (data) in enumerate(pbar, start=start_iter):
            src, tgt, src_percentages, src_lengths, tgt_lengths = data

            if constant.USE_CUDA:
                src = src.cuda()
                tgt = tgt.cuda()

            opt.zero_grad()

            pred, gold, hyp_seq, gold_seq = model(src,
                                                  src_lengths,
                                                  tgt,
                                                  verbose=False)

            # Decode gold and hypothesis id sequences into strings, stopping
            # at the first PAD token. Any failure skips the batch.
            try:  # handle case for CTC
                strs_gold, strs_hyps = [], []
                for ut_gold in gold_seq:
                    str_gold = ""
                    for x in ut_gold:
                        if int(x) == constant.PAD_TOKEN:
                            break
                        str_gold = str_gold + id2label[int(x)]
                    strs_gold.append(str_gold)
                for ut_hyp in hyp_seq:
                    str_hyp = ""
                    for x in ut_hyp:
                        if int(x) == constant.PAD_TOKEN:
                            break
                        str_hyp = str_hyp + id2label[int(x)]
                    strs_hyps.append(str_hyp)
            except Exception as e:
                print(e)
                logging.info("NaN predictions")
                continue

            seq_length = pred.size(1)
            # mul_ scales src_percentages in place to obtain input lengths.
            sizes = Variable(src_percentages.mul_(int(seq_length)).int(),
                             requires_grad=False)

            loss, num_correct = calculate_metrics(
                pred,
                gold,
                input_lengths=sizes,
                target_lengths=tgt_lengths,
                smoothing=smoothing,
                loss_type=loss_type)

            # An infinite loss skips the batch entirely (no backward pass, no
            # optimizer step); the masked value is discarded by `continue`.
            if loss.item() == float('Inf'):
                logging.info("Found infinity loss, masking")
                loss = torch.where(loss != loss, torch.zeros_like(loss),
                                   loss)  # NaN masking
                continue

            # if constant.args.verbose:
            #     logging.info("GOLD", strs_gold)
            #     logging.info("HYP", strs_hyps)

            # Strip SOS/EOS markers, then accumulate error and length totals.
            for j in range(len(strs_hyps)):
                strs_hyps[j] = strs_hyps[j].replace(
                    constant.SOS_CHAR, '').replace(constant.EOS_CHAR, '')
                strs_gold[j] = strs_gold[j].replace(
                    constant.SOS_CHAR, '').replace(constant.EOS_CHAR, '')
                cer = calculate_cer(strs_hyps[j].replace(' ', ''),
                                    strs_gold[j].replace(' ', ''))
                wer = calculate_wer(strs_hyps[j], strs_gold[j])
                total_cer += cer
                total_wer += wer
                total_char += len(strs_gold[j].replace(' ', ''))
                total_word += len(strs_gold[j].split(" "))

            loss.backward()

            if constant.args.clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               constant.args.max_norm)

            opt.step()

            total_loss += loss.item()
            non_pad_mask = gold.ne(constant.PAD_TOKEN)
            num_word = non_pad_mask.sum().item()

            # Running averages: loss per batch seen so far, CER as a
            # percentage of gold characters.
            pbar.set_description(
                "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% LR:{:.7f}".
                format((epoch + 1), total_loss / (i + 1),
                       total_cer * 100 / total_char, opt._rate))
        logging.info(
            "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% LR:{:.7f}".format(
                (epoch + 1), total_loss / (len(train_loader)),
                total_cer * 100 / total_char, opt._rate))

        # evaluate
        print("")
        logging.info("VALID")
        model.eval()

        for ind in range(len(valid_loader_list)):
            valid_loader = valid_loader_list[ind]

            # Totals restart for every validation set.
            total_valid_loss, total_valid_cer, total_valid_wer, total_valid_char, total_valid_word = 0, 0, 0, 0, 0
            valid_pbar = tqdm(iter(valid_loader),
                              leave=True,
                              total=len(valid_loader))
            for i, (data) in enumerate(valid_pbar):
                src, tgt, src_percentages, src_lengths, tgt_lengths = data

                if constant.USE_CUDA:
                    src = src.cuda()
                    tgt = tgt.cuda()

                pred, gold, hyp_seq, gold_seq = model(src,
                                                      src_lengths,
                                                      tgt,
                                                      verbose=False)

                seq_length = pred.size(1)
                sizes = Variable(src_percentages.mul_(
                    int(seq_length)).int(),
                                 requires_grad=False)

                loss, num_correct = calculate_metrics(
                    pred,
                    gold,
                    input_lengths=sizes,
                    target_lengths=tgt_lengths,
                    smoothing=smoothing,
                    loss_type=loss_type)

                # As in training, batches with an infinite loss are skipped.
                if loss.item() == float('Inf'):
                    logging.info("Found infinity loss, masking")
                    loss = torch.where(loss != loss, torch.zeros_like(loss),
                                       loss)  # NaN masking
                    continue

                try:  # handle case for CTC
                    strs_gold, strs_hyps = [], []
                    for ut_gold in gold_seq:
                        str_gold = ""
                        for x in ut_gold:
                            if int(x) == constant.PAD_TOKEN:
                                break
                            str_gold = str_gold + id2label[int(x)]
                        strs_gold.append(str_gold)
                    for ut_hyp in hyp_seq:
                        str_hyp = ""
                        for x in ut_hyp:
                            if int(x) == constant.PAD_TOKEN:
                                break
                            str_hyp = str_hyp + id2label[int(x)]
                        strs_hyps.append(str_hyp)
                except Exception as e:
                    print(e)
                    logging.info("NaN predictions")
                    continue

                for j in range(len(strs_hyps)):
                    strs_hyps[j] = strs_hyps[j].replace(
                        constant.SOS_CHAR, '').replace(constant.EOS_CHAR, '')
                    strs_gold[j] = strs_gold[j].replace(
                        constant.SOS_CHAR, '').replace(constant.EOS_CHAR, '')
                    cer = calculate_cer(strs_hyps[j].replace(' ', ''),
                                        strs_gold[j].replace(' ', ''))
                    wer = calculate_wer(strs_hyps[j], strs_gold[j])
                    total_valid_cer += cer
                    total_valid_wer += wer
                    total_valid_char += len(strs_gold[j].replace(' ', ''))
                    total_valid_word += len(strs_gold[j].split(" "))

                total_valid_loss += loss.item()
                valid_pbar.set_description(
                    "VALID SET {} LOSS:{:.4f} CER:{:.2f}%".format(
                        ind, total_valid_loss / (i + 1),
                        total_valid_cer * 100 / total_valid_char))

            logging.info("VALID SET {} LOSS:{:.4f} CER:{:.2f}%".format(
                ind, total_valid_loss / (len(valid_loader)),
                total_valid_cer * 100 / total_valid_char))

        # NOTE(review): the validation figures below come from the last
        # loader in valid_loader_list only, since the totals are reset per
        # loader -- confirm this is intended.
        metrics = {}
        metrics["train_loss"] = total_loss / len(train_loader)
        metrics["valid_loss"] = total_valid_loss / (len(valid_loader))
        metrics["train_cer"] = total_cer
        metrics["train_wer"] = total_wer
        metrics["valid_cer"] = total_valid_cer
        metrics["valid_wer"] = total_valid_wer
        # The history list is stored inside each entry and then receives that
        # entry, so every metrics dict references the full history.
        metrics["history"] = history
        history.append(metrics)

        if epoch % constant.args.save_every == 0:
            save_model(model, (epoch + 1),
                       opt,
                       metrics,
                       label2id,
                       id2label,
                       best_model=False)

        # save the best model
        if best_valid_loss > total_valid_loss / len(valid_loader):
            best_valid_loss = total_valid_loss / len(valid_loader)
            save_model(model, (epoch + 1),
                       opt,
                       metrics,
                       label2id,
                       id2label,
                       best_model=True)

        if constant.args.shuffle:
            logging.info("SHUFFLE")
            print("SHUFFLE")
            train_sampler.shuffle(epoch)
def train(self, model, train_loader, train_sampler, valid_loaders, opt, loss_type, start_epoch, num_epochs, label2id, id2label, last_metrics=None, logger=None):
    """Train the model, validate on every task after each epoch and save checkpoints.

    Args:
        model: Model object; called as ``model(src, input_lengths=...,
            padded_target=..., verbose=...)`` and returning
            ``(pred, gold, hyp_seq, gold_seq)``.
        train_loader: DataLoader object of the training set.
        train_sampler: Sampler whose ``shuffle(epoch)`` is called at the end
            of an epoch when ``constant.args.shuffle`` is set.
        valid_loaders: List of DataLoader objects of the validation sets, one
            per task.
        opt: Optimizer wrapper; ``opt.optimizer`` performs ``zero_grad()``
            and ``step()``.
        loss_type: Forwarded to ``calculate_metrics``.
        start_epoch: Start epoch (> 0 if you resume the process).
        num_epochs: Last epoch (exclusive upper bound of the epoch range).
        label2id, id2label: Label mappings; ``id2label`` decodes sequences and
            both are passed to ``save_model``.
        last_metrics: Metrics of a previous run (if resuming); its
            ``'valid_loss'`` seeds the best validation loss.
        logger: Optional object with a ``flush()`` method. When given it is
            stored as ``sys.out`` and flushed during training; otherwise
            standard output is flushed.
    """
    if logger is not None:
        # NOTE(review): `sys.out` is not a standard attribute; this was
        # probably meant to be `sys.stdout`. Kept as-is so callers that pass
        # a logger see no change -- confirm the intent.
        sys.out = logger

    def _flush_output():
        # Falling back to stdout avoids the AttributeError the bare
        # `sys.out.flush()` raised when no logger had been installed.
        getattr(sys, "out", sys.stdout).flush()

    # Large sentinel so that the first epoch always counts as an improvement
    # unless a previous best is supplied.
    best_valid_loss = 1000000000 if last_metrics is None else last_metrics[
        'valid_loss']
    smoothing = constant.args.label_smoothing

    history = []

    for epoch in range(start_epoch, num_epochs):
        _flush_output()
        total_loss, total_cer, total_wer, total_char, total_word = 0, 0, 0, 0, 0

        start_iter = 0

        print("TRAIN")
        model.train()
        pbar = tqdm(iter(train_loader), leave=True, total=len(train_loader))
        for i, (data) in enumerate(pbar, start=start_iter):
            src, tgt, src_percentages, src_lengths, tgt_lengths = data

            if constant.USE_CUDA:
                src = src.cuda()
                tgt = tgt.cuda()

            opt.optimizer.zero_grad()

            pred, gold, hyp_seq, gold_seq = model(
                src,
                input_lengths=src_lengths,
                padded_target=tgt,
                verbose=constant.args.verbose)

            # Decode id sequences to strings for the error-rate computation.
            strs_gold = [
                "".join([id2label[int(x)] for x in seq]) for seq in gold_seq
            ]
            strs_hyps = [
                "".join([id2label[int(x)] for x in seq]) for seq in hyp_seq
            ]

            loss, num_correct = calculate_metrics(
                pred,
                gold,
                smoothing=smoothing,
                loss_type=loss_type,
                input_lengths=src_lengths,
                target_lengths=tgt_lengths)

            if constant.args.verbose:
                print("GOLD", strs_gold)
                print("HYP", strs_hyps)

            for j in range(len(strs_hyps)):
                cer = calculate_cer(strs_hyps[j], strs_gold[j])
                wer = calculate_wer(strs_hyps[j], strs_gold[j])
                total_cer += cer
                total_wer += wer
                total_char += len(strs_gold[j])
                total_word += len(strs_gold[j].split(" "))

            loss.backward()
            opt.optimizer.step()
            total_loss += loss.detach().item()

            # Running averages: loss per batch seen so far, error rates as
            # percentages of gold characters / words.
            pbar.set_description(
                "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%".
                format((epoch + 1), total_loss / (i + 1),
                       total_cer * 100 / total_char,
                       total_wer * 100 / total_word))
        print(
            "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%".format(
                (epoch + 1), total_loss / (len(train_loader)),
                total_cer * 100 / total_char, total_wer * 100 / total_word))

        print("VALID")
        all_valid_loss = []
        for valid_task_id in range(len(valid_loaders)):
            model.eval()
            _flush_output()

            valid_loader = valid_loaders[valid_task_id]

            # Totals restart for every validation task.
            total_valid_loss, total_valid_cer, total_valid_wer, total_valid_char, total_valid_word = 0, 0, 0, 0, 0
            valid_pbar = tqdm(iter(valid_loader),
                              leave=True,
                              total=len(valid_loader))
            for i, (data) in enumerate(valid_pbar):
                src, tgt, src_percentages, src_lengths, tgt_lengths = data

                if constant.USE_CUDA:
                    src = src.cuda()
                    tgt = tgt.cuda()

                pred, gold, hyp_seq, gold_seq = model(
                    src,
                    input_lengths=src_lengths,
                    padded_target=tgt,
                    verbose=constant.args.verbose)
                loss, num_correct = calculate_metrics(
                    pred,
                    gold,
                    smoothing=smoothing,
                    loss_type=loss_type,
                    input_lengths=src_lengths,
                    target_lengths=tgt_lengths)

                strs_gold = [
                    "".join([id2label[int(x)] for x in seq])
                    for seq in gold_seq
                ]
                strs_hyps = [
                    "".join([id2label[int(x)] for x in seq])
                    for seq in hyp_seq
                ]

                for j in range(len(strs_hyps)):
                    cer = calculate_cer(strs_hyps[j], strs_gold[j])
                    wer = calculate_wer(strs_hyps[j], strs_gold[j])
                    total_valid_cer += cer
                    total_valid_wer += wer
                    total_valid_char += len(strs_gold[j])
                    total_valid_word += len(strs_gold[j].split(" "))

                total_valid_loss += loss.detach().item()
                valid_pbar.set_description(
                    "(Epoch {}) TASK:{} VALID LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%"
                    .format((epoch + 1), valid_task_id,
                            total_valid_loss / (i + 1),
                            total_valid_cer * 100 / total_valid_char,
                            total_valid_wer * 100 / total_valid_word))

            all_valid_loss.append(total_valid_loss / len(valid_pbar))
            print(
                "(Epoch {}) TASK:{} VALID LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%"
                .format((epoch + 1), valid_task_id,
                        total_valid_loss / (len(valid_loader)),
                        total_valid_cer * 100 / total_valid_char,
                        total_valid_wer * 100 / total_valid_word))

        metrics = {}
        metrics["train_loss"] = total_loss / len(train_loader)
        metrics["valid_loss"] = np.mean(np.array(all_valid_loss))
        metrics["valid_losses"] = all_valid_loss
        metrics["train_cer"] = total_cer
        metrics["train_wer"] = total_wer
        # NOTE(review): the CER/WER totals below come from the last
        # validation task only -- confirm this is intended.
        metrics["valid_cer"] = total_valid_cer
        metrics["valid_wer"] = total_valid_wer
        # The history list is stored inside each entry and then receives that
        # entry, so every metrics dict references the full history.
        metrics["history"] = history
        history.append(metrics)

        if epoch % constant.args.save_every == 0:
            save_model(model, (epoch + 1),
                       opt,
                       metrics,
                       label2id,
                       id2label,
                       best_model=False)

        # save the best model
        # NOTE(review): the comparison uses the last task's loss, not the
        # mean stored in metrics["valid_loss"] -- confirm this is intended.
        if best_valid_loss > total_valid_loss / len(valid_loader):
            best_valid_loss = total_valid_loss / len(valid_loader)
            save_model(model, (epoch + 1),
                       opt,
                       metrics,
                       label2id,
                       id2label,
                       best_model=True)

        if constant.args.shuffle:
            print("SHUFFLE")
            train_sampler.shuffle(epoch)
def train_and_test(training, testing, output_path, **kwargs):
    """Train a fastText supervised model, evaluate it and save it.

    Args:
        training: Training input passed to ``fasttext.train_supervised``.
        testing: Test input passed to ``calculate_metrics``.
        output_path: Where the trained model is saved.
        **kwargs: Extra options forwarded to ``fasttext.train_supervised``.

    Returns:
        The result of ``calculate_metrics(testing, model)``, computed before
        the model is saved.
    """
    classifier = fasttext.train_supervised(training, **kwargs)
    evaluation = calculate_metrics(testing, classifier)
    classifier.save_model(output_path)
    return evaluation
def count_accuracy(self):
    """Return the accuracy of the labels derived from the stored Xs against ``self.ys``."""
    from utils.metrics import calculate_metrics

    derived_labels = self.conv_Xs_to_label()
    report = calculate_metrics(derived_labels, self.ys)
    return report.accuracy
def main():
    """Compare detected objects with ground truth and plot classification diagnostics.

    Reads ``actual.csv`` and ``detected.csv`` from the ``example`` directory,
    matches them with ``calculate_metrics`` and writes the matched table to
    ``result_df.csv``. From the rows that have a ground-truth box it then
    prints the accuracy and a classification report and shows the confusion
    matrix, per-class precision/recall curves and per-class ROC curves.
    """
    # Local import: builds the example paths portably. On Windows the result
    # is identical to the former "example\\..." literals.
    import os

    plt.rcParams['figure.dpi'] = 300
    plt.rcParams['font.size'] = 7

    # Classes
    classes = ["dog", "cat", "Null"]
    # classes = ["dog", "cat"]

    # DataFrames
    actual_df = pd.read_csv(os.path.join("example", "actual.csv"))
    actual_df = preprocess_df(actual_df)

    detected_df = pd.read_csv(os.path.join("example", "detected.csv"))
    detected_df = preprocess_df(detected_df)
    detected_df = remove_overlapping_objects(detected_df)

    # Calculating
    df = calculate_metrics(actual_df,
                           detected_df,
                           prob_thresh=0,
                           iou_thresh=0.0)
    df.to_csv(os.path.join("example", "result_df.csv"), index=False)

    # ============ Collect data for sklearn =============
    y_true = []
    y_pred = []
    y_score = []

    # Only rows with a ground-truth box take part in the evaluation.
    for _, row in df[df['a_xmin'] != 'Null'].iterrows():
        y_true.append(row['a_label'])
        y_pred.append(row['d_label'])

        prob = row['d_prob']
        # A missing detection is scored as probability 0.
        y_score.append(0 if prob == "Null" else float(prob))

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_score = np.array(y_score)

    print("Accuracy ", 100 * (y_true == y_pred).sum() / len(y_true))

    # ========= Confusion Matrix ===========
    cm = sm.confusion_matrix(y_true, y_pred, labels=sorted(classes))
    plot_confusion_matrix(cm, classes=sorted(classes))
    plt.show()

    sm.ConfusionMatrixDisplay(cm, display_labels=sorted(classes)).plot()
    plt.show()

    # ========= Classification Report ===========
    cp = sm.classification_report(y_true,
                                  y_pred,
                                  labels=sorted(classes),
                                  output_dict=False)
    print(cp)

    # ========= PR Curve ===========
    precision = {}
    recall = {}
    thresh = {}
    for i in classes:
        precision[i], recall[i], thresh[i] = sm.precision_recall_curve(
            y_true, y_score, pos_label=i)
        plt.plot(recall[i], precision[i], lw=2, label=f'{i}')

    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.legend(loc="best")
    plt.title("precision vs. recall curve")
    plt.show()

    print("PR Curve")

    # ========= ROC Curve ===========
    fpr = {}
    tpr = {}
    thresh = {}
    roc_auc = {}
    for i in classes:
        fpr[i], tpr[i], thresh[i] = sm.roc_curve(y_true, y_score, pos_label=i)
        roc_auc[i] = sm.auc(fpr[i], tpr[i])
        plt.plot(fpr[i], tpr[i], lw=2, label=f'{i} (area = {roc_auc[i]:0.2f})')

    # The no-skill baseline is the diagonal. It was previously derived from
    # roc_curve with pos_label="nolines", a label that is not in `classes`,
    # so there were no positive samples to build the curve from.
    plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')

    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.legend(loc="best")
    plt.title("ROC curve")
    plt.show()

    print("ROC Curve")
fc_hidden_sizes=fc_hs, add_residual_connection=False) model_desc = "_gc_" + str(gc_hs) + "_fc_" + str( fc_hs) + "_lr_" + str(lr) + "_epochs_" + str(epochs) train(model=gcn_model, train_data=train_data, validation_data=valid_data, num_epochs=epochs, learning_rate=lr, metrics_to_log=metrics, model_path=args.model_dir + args.model_prefix + ".pt") valid_metrics_result = calculate_metrics(model=gcn_model, data=valid_data) test_metrics_result = calculate_metrics(model=gcn_model, data=test_data) result = { "gc_hidden_layers": str(gc_hs), "fc_hidden_layers": str(fc_hs), "learning_rate": lr } for m in metrics: result["test_" + m] = test_metrics_result[m] result["valid_" + m] = valid_metrics_result[m] results = results.append(result, ignore_index=True)
val = [float(x) for x in valString.split("_")] return val df = pandas.read_csv('senia.csv').values x = map(lambda x: map(lambda y: generateIntSequence(y), x), df[:, 1:-2]) #print x[0], x[-1] y = map(lambda y: int(y), df[:,-1]) y = numpy.array(y) #print y[0], y[-1] x_metrics = [] for i, x_i in enumerate(x): timestamp = int(df[i, -2]) data = [timestamp, ] + x_i #if i % 1000 == 0: # print data x_metrics.append(calculate_metrics(data)[:-1]) #print x_metrics[0] x_metrics = numpy.array(x_metrics) #x_metrics = normalize(x_metrics) #x_metrics = scale(x_metrics) def f6(x): if x == 7: return 2 elif x > 7: return x - 1 else: return x