Example #1
# Common imports for these examples. prepare_tpu, tfrecord_dataset,
# DualRobertaModel, SpearmanCorr, get_batch, Preprocessor, prevent_nan,
# ROBERTA_CONFIG, INPUT_COLUMNS and OUTPUT_COLUMNS are project-local
# helpers assumed to be defined elsewhere in the repository.
import glob
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.special import expit
from tqdm import tqdm
from transformers import AutoTokenizer, RobertaConfig


def eval_fold(
    valid_path: str,
    model_path: str = "cache/roberta-base-fold-0.h5",
    batch_size: int = 8
):
    strategy, tpu = prepare_tpu()
    if tpu:
        batch_size *= strategy.num_replicas_in_sync
    valid_ds, valid_steps = tfrecord_dataset(
        valid_path, batch_size, strategy, is_train=False)
    valid_dist_ds = strategy.experimental_distribute_dataset(
        valid_ds)

    model_name = Path(model_path).name
    if model_name.lower().startswith("roberta-base"):
        config = RobertaConfig.from_dict(
            ROBERTA_CONFIG)
        model = DualRobertaModel(
            model_name="roberta-base", config=config, pretrained=False
        )
        # Run one forward pass so the model's variables are built
        # before loading the checkpoint weights.
        model(next(iter(valid_ds))[0], training=False)
        model.load_weights(model_path)
    else:
        raise ValueError("Unknown model.")
    spearman = SpearmanCorr()

    @tf.function
    def predict_batch(inputs):
        return model(inputs, training=False)[0]

    preds, labels = [], []
    for batch_, labels_ in tqdm(valid_dist_ds, total=valid_steps, ncols=100):
        # strategy.run superseded experimental_run_v2 in TF >= 2.2;
        # experimental_local_results also handles the single-replica case,
        # where the result is a plain tensor with no .values attribute.
        per_replica = strategy.run(predict_batch, args=(batch_,))
        preds.append(tf.concat(
            strategy.experimental_local_results(per_replica), axis=0
        ).numpy())
        labels.append(tf.concat(
            strategy.experimental_local_results(labels_), axis=0
        ).numpy())
    preds = np.concatenate(preds)
    labels = np.concatenate(labels)

    # SpearmanCorr appears to return a negated correlation, so flip the sign.
    score = spearman(labels, preds)[0] * -1
    print(f"Raw Spearman: {score * 100:.2f}")
    return labels, preds
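The sign flip above suggests SpearmanCorr returns a negated correlation (loss-style, so lower is better). A minimal sketch of such a metric, assuming scipy is available; the class name matches the snippet, but the implementation here is an assumption, not the project's actual code:

import numpy as np
from scipy.stats import spearmanr

class SpearmanCorr:
    """Hypothetical stand-in: returns (-mean_corr,) so lower is better."""
    def __call__(self, labels, preds):
        # Average the per-column Spearman correlation, then negate it.
        corrs = [
            spearmanr(labels[:, i], preds[:, i]).correlation
            for i in range(labels.shape[1])
        ]
        return (-float(np.mean(corrs)),)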
Example #2
# Imports as in Example #1.
def eval_fold(
    input_path: str = "data/",
    fold_path: str = "cache/tfrecords/fold_0.jl",
    model_path: str = "cache/roberta-base-fold-0.h5",
    tokenizer_path: str = "cache/tfrecords/tokenizer_roberta-base/",
    batch_size: int = 8
):
    df_train = pd.read_csv(input_path + 'train.csv')
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    processor = Preprocessor(tokenizer)
    labels = df_train.loc[:, OUTPUT_COLUMNS].values
    inputs = df_train.loc[:, INPUT_COLUMNS].values
    _, valid_idx = joblib.load(fold_path)
    # valid_idx = valid_idx[:100] # For faster debug
    labels, inputs = labels[valid_idx], inputs[valid_idx]
    tmp = []
    for i in tqdm(range(labels.shape[0]), ncols=100):
        tmp.append(processor.process_one_example(
            inputs[i, 0],
            inputs[i, 1],
            inputs[i, 2])
        )
    processed_inputs = np.array(tmp)
    del tmp, inputs

    model_name = Path(model_path).name
    if model_name.lower().startswith("roberta-base"):
        config = RobertaConfig.from_dict(
            ROBERTA_CONFIG)
        model = DualRobertaModel(
            model_name="roberta-base", config=config, pretrained=False
        )
        # Run one forward pass so the model's variables are built
        # before loading the checkpoint weights.
        model(get_batch(processed_inputs[:2]), training=False)
        model.load_weights(model_path)
    else:
        raise ValueError("Unknown model.")
    spearman = SpearmanCorr()

    @tf.function
    def predict_batch(inputs):
        return model(inputs, training=False)[0]

    preds = []
    for i in tqdm(range(0, len(labels), batch_size), ncols=100):
        input_dicts = processed_inputs[i:i+batch_size]
        preds.append(predict_batch(get_batch(input_dicts)).numpy())
    preds = np.concatenate(preds)

    # SpearmanCorr appears to return a negated correlation, so flip the sign.
    score = spearman(labels, preds)[0] * -1
    print(f"Raw Spearman: {score * 100:.2f}")
    return labels, preds
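get_batch is a project helper that isn't shown in these snippets. A minimal sketch under the assumption that process_one_example returns a dict of equal-length token-id arrays per example (the keys and dtypes are assumptions):

import numpy as np
import tensorflow as tf

def get_batch(input_dicts):
    # Hypothetical helper: stack a sequence of per-example dicts into
    # a single dict of batched tensors keyed the same way.
    keys = input_dicts[0].keys()
    return {
        key: tf.constant(np.stack([d[key] for d in input_dicts]))
        for key in keys
    }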
Example #3
# Excerpt from a PhoBERT training script. The earlier argparse setup
# (the ArgumentParser construction and arguments such as --config_path,
# --rdrsegmenter_path, --dict_path and --folder_model) is elided in the
# original. Assumed imports for this excerpt:
# from fairseq.data.encoders.fastbpe import fastBPE
# from fairseq.data import Dictionary
# from vncorenlp import VnCoreNLP
# from transformers import RobertaConfig
parser.add_argument('--weight_decay', type=float, default=0.0)
parser.add_argument('--adam_epsilon', type=float, default=1e-8)
parser.add_argument('--max_grad_norm', type=float, default=1.0)
parser.add_argument('--warmup_steps', type=int, default=0)
# Caution: argparse's type=bool treats any non-empty string (including
# "False") as True; see the str2bool sketch after this example.
parser.add_argument('--output_hidden_states', type=bool, default=True)
parser.add_argument('--num_train_epochs', type=int, default=5)
parser.add_argument('--save_steps', type=int, default=60)
parser.add_argument('--device', type=str, default='cpu')
parser.add_argument('--seed', type=int, default=27)
parser.add_argument('--patience', type=int, default=20)

args = parser.parse_args()
bpe = fastBPE(args)

# Rebuild the config with a device entry so training can target a
# different CUDA device.
config = RobertaConfig.from_pretrained(args.config_path)
config = config.to_dict()
config.update({"device": args.device})
config.update({"output_hidden_states": args.output_hidden_states})
config = RobertaConfig.from_dict(config)

rdrsegmenter = VnCoreNLP(args.rdrsegmenter_path,
                         annotators='wseg',
                         max_heap_size='-Xmx500m')

vocab = Dictionary()
vocab.add_from_file(args.dict_path)

model = PhoBERT.from_pretrained(args.folder_model, config=config)
model = model.to(args.device)
train_qa(args, rdrsegmenter, bpe, vocab, model)
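As noted above, argparse's type=bool is a pitfall: bool("False") is True, so any value passed on the command line enables the flag. A conventional workaround (a sketch, not from the original source) is an explicit converter:

import argparse

def str2bool(value: str) -> bool:
    # Parse the usual truthy/falsy spellings for argparse flags.
    if value.lower() in ("yes", "true", "t", "1"):
        return True
    if value.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError(f"boolean value expected, got {value!r}")

# Usage: parser.add_argument('--output_hidden_states', type=str2bool, default=True)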
Example #4
# Imports as in Example #1.
def main(input_path: str = "data/",
         tokenizer_path: str = "cache/tfrecords/tokenizer_roberta-base/",
         model_path_pattern: str = "cache/roberta-base-fold-*",
         best_bins_path: str = "cache/best_bins.jl",
         batch_size: int = 8,
         progress_bar: bool = True,
         add_sigmoid: bool = False,
         rank: bool = False):
    df_valid = pd.read_csv(input_path + 'test.csv')
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    processor = Preprocessor(tokenizer)
    inputs = df_valid.loc[:, INPUT_COLUMNS].values
    tmp = []
    for i in tqdm(range(inputs.shape[0]), ncols=100, disable=not progress_bar):
        tmp.append(
            processor.process_one_example(inputs[i, 0], inputs[i, 1],
                                          inputs[i, 2]))
    processed_inputs = np.array(tmp)
    del tmp, inputs

    buffer = []
    for model_path in glob.glob(model_path_pattern):
        model_name = Path(model_path).name
        print(model_path, model_name)
        if model_name.lower().startswith("roberta-base"):
            config = RobertaConfig.from_dict(ROBERTA_CONFIG)
            model = DualRobertaModel(model_name="roberta-base",
                                     config=config,
                                     pretrained=False)
            # Run one forward pass so the model's variables are built
            # before loading the checkpoint weights.
            model(get_batch(processed_inputs[:2]), training=False)
            model.load_weights(model_path)
        else:
            raise ValueError("Unknown model.")

        @tf.function
        def predict_batch(inputs):
            return model(inputs, training=False)[0]

        preds = []
        for i in tqdm(range(0, len(processed_inputs), batch_size),
                      ncols=100,
                      disable=not progress_bar):
            input_dicts = processed_inputs[i:i + batch_size]
            preds.append(predict_batch(get_batch(input_dicts)).numpy())
        if add_sigmoid and not rank:
            buffer.append(expit(np.concatenate(preds)))
        elif rank:
            tmp = np.concatenate(preds)
            buffer.append(tmp.argsort(axis=0).argsort(axis=0) / tmp.shape[0])
        else:
            buffer.append(np.concatenate(preds))

    final_preds = np.mean(buffer, axis=0)
    if add_sigmoid and not rank:
        best_bins, scaler = joblib.load(best_bins_path)
        best_bins = np.array(best_bins)[None, :]
        # post-process
        final_preds = np.clip(scaler.transform(final_preds), 0., 1.)
        final_preds = prevent_nan(
            np.round(final_preds * best_bins) / best_bins)

    df_sub = pd.DataFrame(final_preds, columns=OUTPUT_COLUMNS)
    df_sub["qa_id"] = df_valid["qa_id"].values
    df_sub.to_csv("submission.csv", index=False)
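The rank branch above blends the folds on normalized ranks rather than raw scores; the double argsort converts each column of scores to ranks. A tiny standalone check of that transform:

import numpy as np

scores = np.array([[0.9], [0.1], [0.5]])
ranks = scores.argsort(axis=0).argsort(axis=0) / scores.shape[0]
print(ranks.ravel())  # [0.667, 0.0, 0.333]: rank / n per row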