# Fold evaluation on a TPU / distributed strategy: streams the validation
# fold from TFRecords and scores predictions with Spearman correlation.
# Assumed standard imports below; prepare_tpu, tfrecord_dataset,
# DualRobertaModel, SpearmanCorr and ROBERTA_CONFIG are repo-local.
from pathlib import Path

import numpy as np
import tensorflow as tf
from tqdm import tqdm
from transformers import RobertaConfig


def eval_fold(
        valid_path: str,
        model_path: str = "cache/roberta-base-fold-0.h5",
        batch_size: int = 8):
    strategy, tpu = prepare_tpu()
    if tpu:
        # Scale the per-replica batch size up to the global batch size.
        batch_size *= strategy.num_replicas_in_sync
    valid_ds, valid_steps = tfrecord_dataset(
        valid_path, batch_size, strategy, is_train=False)
    valid_dist_ds = strategy.experimental_distribute_dataset(valid_ds)
    model_name = Path(model_path).name
    if model_name.lower().startswith("roberta-base"):
        config = RobertaConfig.from_dict(ROBERTA_CONFIG)
        model = DualRobertaModel(
            model_name="roberta-base", config=config, pretrained=False)
        # Build the model with one forward pass, then load the weights.
        model(next(iter(valid_ds))[0], training=False)
        model.load_weights(model_path)
    else:
        raise ValueError("Unknown model.")
    spearman = SpearmanCorr()

    @tf.function
    def predict_batch(inputs):
        return model(inputs, training=False)[0]

    preds, labels = [], []
    for batch_, labels_ in tqdm(valid_dist_ds, total=valid_steps, ncols=100):
        # TF 2.0/2.1-era API (renamed strategy.run in later releases).
        # Note: .values assumes per-replica results; under the default
        # single-device strategy this call returns a plain tensor instead.
        tmp = strategy.experimental_run_v2(
            predict_batch, args=(batch_,)).values
        preds.append(tf.concat(tmp, axis=0).numpy())
        labels.append(tf.concat(
            strategy.experimental_local_results(labels_), axis=0).numpy())
    preds = np.concatenate(preds)
    labels = np.concatenate(labels)
    # SpearmanCorr appears to return a negated correlation (a loss term),
    # so flip the sign back to get the raw score.
    score = spearman(labels, preds)[0] * -1
    print(f"Raw Spearman: {score * 100:.2f}")
    return labels, preds
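# A minimal sketch of what `prepare_tpu` presumably does (it is defined
# elsewhere in the repo; this version is an assumption, not the original):
# detect and initialize a TPU cluster, falling back to the default
# strategy on CPU/GPU machines.
def prepare_tpu():
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    except ValueError:
        # No TPU found; use the default (single-device) strategy.
        tpu = None
        strategy = tf.distribute.get_strategy()
    return strategy, tpu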
# In-memory variant of eval_fold: tokenizes the validation fold on the fly
# instead of reading TFRecords, and predicts on a single device.
# Additionally assumes pandas as pd, joblib, and transformers'
# AutoTokenizer; Preprocessor, get_batch, INPUT_COLUMNS and OUTPUT_COLUMNS
# are repo-local.
def eval_fold(
        input_path: str = "data/",
        fold_path: str = "cache/tfrecords/fold_0.jl",
        model_path: str = "cache/roberta-base-fold-0.h5",
        tokenizer_path: str = "cache/tfrecords/tokenizer_roberta-base/",
        batch_size: int = 8):
    df_train = pd.read_csv(input_path + 'train.csv')
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    processor = Preprocessor(tokenizer)
    labels = df_train.loc[:, OUTPUT_COLUMNS].values
    inputs = df_train.loc[:, INPUT_COLUMNS].values
    _, valid_idx = joblib.load(fold_path)
    # valid_idx = valid_idx[:100]  # For faster debug
    labels, inputs = labels[valid_idx], inputs[valid_idx]
    tmp = []
    for i in tqdm(range(labels.shape[0]), ncols=100):
        tmp.append(processor.process_one_example(
            inputs[i, 0], inputs[i, 1], inputs[i, 2]))
    processed_inputs = np.array(tmp)
    del tmp, inputs
    model_name = Path(model_path).name
    if model_name.lower().startswith("roberta-base"):
        config = RobertaConfig.from_dict(ROBERTA_CONFIG)
        model = DualRobertaModel(
            model_name="roberta-base", config=config, pretrained=False)
        # Build the model with a tiny forward pass, then load the weights.
        model(get_batch(processed_inputs[:2]), training=False)
        model.load_weights(model_path)
    else:
        raise ValueError("Unknown model.")
    spearman = SpearmanCorr()

    @tf.function
    def predict_batch(inputs):
        return model(inputs, training=False)[0]

    preds = []
    for i in tqdm(range(0, len(labels), batch_size), ncols=100):
        input_dicts = processed_inputs[i:i + batch_size]
        preds.append(predict_batch(get_batch(input_dicts)).numpy())
    preds = np.concatenate(preds)
    score = spearman(labels, preds)[0] * -1
    print(f"Raw Spearman: {score * 100:.2f}")
    return labels, preds
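# A minimal sketch of the assumed `get_batch` helper (the real one lives
# elsewhere in the repo): it collates a slice of per-example feature dicts
# into a single dict of batched tensors that the model can consume.
def get_batch(input_dicts):
    keys = input_dicts[0].keys()
    return {
        key: tf.convert_to_tensor(np.stack([d[key] for d in input_dicts]))
        for key in keys
    }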
parser.add_argument('--weight_decay', type=float, default=0.0)
parser.add_argument('--adam_epsilon', type=float, default=1e-8)
parser.add_argument('--max_grad_norm', type=float, default=1.0)
parser.add_argument('--warmup_steps', type=int, default=0)
# Note: argparse's type=bool treats any non-empty string (even "False")
# as True; only an empty string disables this flag.
parser.add_argument('--output_hidden_states', type=bool, default=True)
parser.add_argument('--num_train_epochs', type=int, default=5)
parser.add_argument('--save_steps', type=int, default=60)
parser.add_argument('--device', type=str, default='cpu')
parser.add_argument('--seed', type=int, default=27)
parser.add_argument('--patience', type=int, default=20)
args = parser.parse_args()

bpe = fastBPE(args)

# Rebuild the config with the device option so training can target
# another CUDA device.
config = RobertaConfig.from_pretrained(args.config_path)
config = config.to_dict()
config.update({"device": args.device})
config.update({"output_hidden_states": args.output_hidden_states})
config = RobertaConfig.from_dict(config)

# Vietnamese word segmenter and BPE vocabulary for PhoBERT inputs.
rdrsegmenter = VnCoreNLP(args.rdrsegmenter_path, annotators='wseg',
                         max_heap_size='-Xmx500m')
vocab = Dictionary()
vocab.add_from_file(args.dict_path)

model = PhoBERT.from_pretrained(args.folder_model, config=config)
model = model.to(args.device)

train_qa(args, rdrsegmenter, bpe, vocab, model)
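# The fragment above starts mid-script; its elided preamble presumably
# holds the imports and the earlier argument definitions. A hedged sketch,
# inferred only from the attribute accesses in the code (PhoBERT and
# train_qa are repo-local; original defaults are unknown and omitted):
import argparse

from fairseq.data import Dictionary
from fairseq.data.encoders.fastbpe import fastBPE
from transformers import RobertaConfig
from vncorenlp import VnCoreNLP

parser = argparse.ArgumentParser()
parser.add_argument('--config_path', type=str)
parser.add_argument('--rdrsegmenter_path', type=str)
parser.add_argument('--dict_path', type=str)
parser.add_argument('--folder_model', type=str)
# fairseq's fastBPE(args) reads args.bpe_codes, so a --bpe-codes flag
# is presumably defined as well.
parser.add_argument('--bpe-codes', type=str)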
# Ensemble inference over all trained folds: tokenizes the test set once,
# averages per-fold predictions (optionally sigmoid- or rank-transformed),
# applies the binning post-process, and writes submission.csv.
# Additionally assumes glob, scipy.special.expit, and the repo-local
# prevent_nan helper.
def main(input_path: str = "data/",
         tokenizer_path: str = "cache/tfrecords/tokenizer_roberta-base/",
         model_path_pattern: str = "cache/roberta-base-fold-*",
         best_bins_path: str = "cache/best_bins.jl",
         batch_size: int = 8,
         progress_bar: bool = True,
         add_sigmoid: bool = False,
         rank: bool = False):
    df_valid = pd.read_csv(input_path + 'test.csv')
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    processor = Preprocessor(tokenizer)
    inputs = df_valid.loc[:, INPUT_COLUMNS].values
    tmp = []
    for i in tqdm(range(inputs.shape[0]), ncols=100,
                  disable=not progress_bar):
        tmp.append(processor.process_one_example(
            inputs[i, 0], inputs[i, 1], inputs[i, 2]))
    processed_inputs = np.array(tmp)
    del tmp, inputs

    buffer = []
    for model_path in glob.glob(model_path_pattern):
        model_name = Path(model_path).name
        print(model_path, model_name)
        if model_name.lower().startswith("roberta-base"):
            config = RobertaConfig.from_dict(ROBERTA_CONFIG)
            model = DualRobertaModel(
                model_name="roberta-base", config=config, pretrained=False)
            # Build the model with a tiny forward pass, then load weights.
            model(get_batch(processed_inputs[:2]), training=False)
            model.load_weights(model_path)
        else:
            raise ValueError("Unknown model.")

        @tf.function
        def predict_batch(inputs):
            return model(inputs, training=False)[0]

        preds = []
        for i in tqdm(range(0, len(processed_inputs), batch_size),
                      ncols=100, disable=not progress_bar):
            input_dicts = processed_inputs[i:i + batch_size]
            preds.append(predict_batch(get_batch(input_dicts)).numpy())
        if add_sigmoid and not rank:
            # Map raw logits to probabilities before averaging.
            buffer.append(expit(np.concatenate(preds)))
        elif rank:
            # Replace scores with their normalized column-wise ranks.
            tmp = np.concatenate(preds)
            buffer.append(
                tmp.argsort(axis=0).argsort(axis=0) / tmp.shape[0])
        else:
            buffer.append(np.concatenate(preds))

    final_preds = np.mean(buffer, axis=0)
    if add_sigmoid and not rank:
        best_bins, scaler = joblib.load(best_bins_path)
        best_bins = np.array(best_bins)[None, :]
        # Post-process: rescale to [0, 1], then snap each column onto its
        # tuned bin grid (multiples of 1 / best_bins).
        final_preds = np.clip(scaler.transform(final_preds), 0., 1.)
        final_preds = prevent_nan(
            np.round(final_preds * best_bins) / best_bins)
    df_sub = pd.DataFrame(final_preds, columns=OUTPUT_COLUMNS)
    df_sub["qa_id"] = df_valid["qa_id"].values
    df_sub.to_csv("submission.csv", index=False)
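# Hypothetical entry point, not in the original excerpt: main() takes only
# keyword arguments, so a thin CLI wrapper such as python-fire would expose
# every parameter as a flag (the repo may use a different wrapper).
if __name__ == "__main__":
    import fire

    fire.Fire(main)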