Example No. 1
def test_vectorize():
    dataset_vectorize = TwitterDataset(df.iloc[0:1], sp)
    # vector : [CLS, 'sooo', 'high', SEP, 'MASK']
    # noinspection PyArgumentList
    observed_v_1 = dataset_vectorize.vectorize("sooo high £¤", "neutral")
    assert dataset.sentence_piece.decode(
        observed_v_1[0].tolist()) == 'neutral sooo high  ⁇ '
Example No. 2
def test_get_tokens():
    err = "these two lists must contain the same tokens"
    t_dataset = TwitterDataset(df.iloc[7:8], sp)
    t_dataset.max_seq_len = 3
    expected_sentence = "soooo high"
    vector = t_dataset.vectorize(expected_sentence, 'neutral')
    observed = t_dataset.get_tokens(vector[0])
    assert 'neutral  ' + expected_sentence == observed, err
Example No. 3
def test_generate_masked_lm():
    t_dataset = TwitterDataset(df_test.iloc[0:1], sp)
    t_dataset.max_seq_len = 16
    # Check that masking changes the input (mask_prob=1, no random or unchanged tokens)
    observed = generate_masked_lm(t_dataset[0]['words_embedding'],
                                  t_dataset,
                                  mask_prob=1,
                                  rnd_t_prob=0,
                                  unchanged_prob=0)
    assert not t_dataset[0]['words_embedding'].allclose(observed)
    assert len(observed) == t_dataset.max_seq_len + 5
    assert observed.unique(return_counts=True)[0].allclose(
        torch.LongTensor([0, 2, 3, 8000]))
    assert observed.unique(return_counts=True)[1].allclose(
        torch.LongTensor([6, 1, 2, 12]))

    # Check that unchanged_prob=1 leaves the embedding unchanged
    observed = generate_masked_lm(t_dataset[0]['words_embedding'],
                                  t_dataset,
                                  mask_prob=1.,
                                  rnd_t_prob=0,
                                  unchanged_prob=1.)
    assert t_dataset[0]['words_embedding'].allclose(observed)
Example No. 4
def test_replace_by_another_token():
    t_dataset = TwitterDataset(df_test.iloc[0:1], sp)

    replaced_ids = [replace_by_another_id(4, t_dataset) for _ in range(30)]
    replaced_tokens = t_dataset.get_tokens(torch.LongTensor(replaced_ids))

    assert 4 not in replaced_ids
    assert t_dataset.get_pad() not in replaced_ids
    assert t_dataset.get_mask() not in replaced_ids
    assert t_dataset.get_cls() not in replaced_ids
    assert t_dataset.get_sep() not in replaced_ids
    assert replaced_tokens
    print(replaced_tokens)
Example No. 5
def pretrain_bert_model(train_path, test_path, pretrain_path, checkpoint_path,
                        sp_path, neptune_api_token, stack_size, bert_dim_model,
                        head_size, pt_lr, batch_size, epochs, corpus_size):
    # select the CUDA device if available, otherwise fall back to the CPU
    current_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # initialize corpus
    nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
    if not Path(pretrain_path).is_file():
        train_csv = pd.read_csv(train_path, dtype={'text': 'string'})
        del train_csv['selected_text']
        test_csv = pd.read_csv(test_path, dtype={'text': 'string'})
        pretrain_dt = processing_df(
            pd.concat([train_csv, test_csv], ignore_index=True), nlp)
        pretrain_dt.to_csv(pretrain_path, index=False)
    else:
        pretrain_dt = pd.read_csv(pretrain_path, dtype={'text': 'string'})

    sentence_piece = create_sp_model(pretrain_dt, sp_path, spm)
    sentence_piece.Load(sp_path + '.model')

    # Set the corpus size
    if int(corpus_size) >= 0:
        pretrain_dt = pretrain_dt[:int(corpus_size)]
    # set the length of the different entries and remove certain cases
    set_seq_length(pretrain_dt, sentence_piece)
    print("size of the dataset : {0}".format(len(pretrain_dt)))
    # set parameters
    parameters = {
        "stack_size": int(stack_size),
        "vocabulary_size": sentence_piece.vocab_size() + 1,
        "bert_dim_model": int(bert_dim_model),
        "multi_heads": int(head_size),
        "learning_rate": float(pt_lr),
        "batch_size": int(batch_size),
        "epochs": int(epochs),
        "device": current_device,
        "corpus train size": len(pretrain_dt),
    }
    loss = nn.CrossEntropyLoss(
        ignore_index=sentence_piece.pad_id()).to(current_device)
    neptune.init('smeoni/bert-impl', api_token=neptune_api_token)
    neptune.create_experiment(name='bert_impl-experiment', params=parameters)
    # set the model
    bert = Bert(stack_size=parameters["stack_size"],
                voc_size=parameters["vocabulary_size"],
                dim_model=parameters["bert_dim_model"],
                mh_size=parameters["multi_heads"],
                padding_idx=sentence_piece.pad_id()).to(current_device)
    parameters['model'] = bert
    parameters['optimizer'] = optim.Adam(bert.parameters(),
                                         lr=parameters['learning_rate'],
                                         weight_decay=0.01,
                                         betas=(0.9, 0.999))
    parameters['loss'] = loss
    twitter_dt = TwitterDataset(pretrain_dt, sentence_piece)
    # pre-training loop
    pre_train_loop(neptune,
                   twitter_dt,
                   checkpoint_path,
                   train=True,
                   **parameters)
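
For orientation, a minimal invocation sketch of pretrain_bert_model. Every path and hyperparameter below is an illustrative placeholder, not a value taken from the source.

# Hypothetical call; all paths and hyperparameters are placeholders.
pretrain_bert_model(
    train_path="data/train.csv",
    test_path="data/test.csv",
    pretrain_path="data/pretrain.csv",
    checkpoint_path="checkpoints/",
    sp_path="resources/sp_model",          # loaded internally as sp_path + '.model'
    neptune_api_token="<NEPTUNE_API_TOKEN>",
    stack_size=4,
    bert_dim_model=256,
    head_size=8,
    pt_lr=1e-4,
    batch_size=32,
    epochs=5,
    corpus_size=-1,                        # a negative value keeps the full corpus
)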
Example No. 6
def fine_tune_bert_model(train_path,
                         ft_train_path,
                         pretrain_model_path,
                         save_model_path,
                         sp_path,
                         neptune_api_token,
                         st_lr,
                         batch_size,
                         epochs,
                         folds,
                         corpus_size):
    # select the CUDA device if available, otherwise fall back to the CPU
    current_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    sentence_piece = spm.SentencePieceProcessor()
    sentence_piece.Load(sp_path + '.model')

    # initialize corpus

    if Path(ft_train_path).is_file():
        train_csv = pd.read_pickle(ft_train_path)
        train_csv = train_csv.astype(object)
    else:
        train_csv = pd.read_csv(train_path, dtype={'text': 'string'})
        nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
        train_csv = processing_df(train_csv, nlp)
        train_csv = train_csv[train_csv['sentiment'] != 'neutral']
        train_csv = train_csv.reset_index(drop=True)
        train_csv = filter_selected_text_df(train_csv, nlp, sentence_piece)
        train_csv.to_pickle(ft_train_path)
    # Set the corpus size
    if int(corpus_size) >= 0:
        train_csv = train_csv[:int(corpus_size)]
    # set the length of the different entries and remove certain cases
    set_seq_length(train_csv, sentence_piece)
    print("size of the dataset : {0}".format(len(train_csv)))
    # set parameters
    parameters = {
        "vocabulary_size": sentence_piece.vocab_size() + 1,
        "st_learning_rate": float(st_lr),
        "batch_size": int(batch_size),
        "epochs": int(epochs),
        "device": current_device,
        "corpus train size": len(train_csv),
        "folds": int(folds)
    }
    loss = nn.CrossEntropyLoss().to(current_device)
    neptune.init('smeoni/bert-impl', api_token=neptune_api_token)
    neptune.create_experiment(name='bert_impl-experiment', params=parameters)
    kfold = KFold(n_splits=parameters['folds'], shuffle=False)
    cv_ft_loss = []
    cv_ft_jaccard = []
    loaded_model = torch.load(pretrain_model_path)
    for id_fold, fold in enumerate(kfold.split(train_csv)):
        # set the model
        bert = parameters['model'] = loaded_model['model'].to(current_device)
        parameters['stack_size'] = loaded_model['stack_size']
        parameters["bert_dim_model"] = loaded_model['bert_dim_model']
        parameters["multi_heads"] = loaded_model["multi_heads"]
        parameters['st_optimizer'] = optim.Adam(bert.parameters(),
                                                lr=parameters['st_learning_rate'])
        parameters['loss'] = loss
        train_fold, eval_fold = fold
        train_dt = TwitterDataset(train_csv.iloc[train_fold], sentence_piece)
        eval_dt = TwitterDataset(train_csv.iloc[eval_fold], sentence_piece)
        # fine tuning loop
        fine_tuning_loop(neptune, train_dt, True, **parameters)
        cv_score_ft, jaccard_score = fine_tuning_loop(neptune, eval_dt, train=False, **parameters)
        # logging and metrics
        torch.save(bert, get_checkpoint_filename(prefix="ft_", id_fold=id_fold,
                                                 path=save_model_path))
        neptune.log_metric('fine tuning loss cross validation', cv_score_ft)
        neptune.log_metric('fine tuning jaccard score cross validation', jaccard_score)
        cv_ft_loss.append(cv_score_ft)
        cv_ft_jaccard.append(jaccard_score)
    mean_cv_ft_loss = mean(cv_ft_loss)
    mean_cv_ft_jaccard = mean(cv_ft_jaccard)
    print("""
loss cross validation score mean :
* fine-tuning :  {0}
loss cross validation scores :
* fine-tuning :  {1}
jaccard cross validation score mean :
* fine-tuning :  {2}
jaccard cross validation scores :
* fine-tuning :  {3}
    """.format(mean_cv_ft_loss, cv_ft_loss, mean_cv_ft_jaccard, cv_ft_jaccard))

    neptune.log_metric('mean fine tuning cross validation', mean_cv_ft_loss)
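
Similarly, a hypothetical call to fine_tune_bert_model; the paths and hyperparameters are placeholders chosen only to illustrate the expected argument types.

# Hypothetical call; all values are placeholders.
fine_tune_bert_model(
    train_path="data/train.csv",
    ft_train_path="data/ft_train.pkl",
    pretrain_model_path="checkpoints/pretrained_bert.pt",
    save_model_path="checkpoints/",
    sp_path="resources/sp_model",
    neptune_api_token="<NEPTUNE_API_TOKEN>",
    st_lr=3e-5,
    batch_size=16,
    epochs=3,
    folds=5,
    corpus_size=-1,   # a negative value keeps the full corpus
)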
Example No. 7
def test_generate_batched_masked_lm():
    t_dataset = TwitterDataset(df_test.iloc[0:10], sp)
    batch = next(generate_batches(t_dataset, 4))
    batch_masked_lm = generate_batched_masked_lm(batch['words_embedding'],
                                                 t_dataset)
    assert batch['words_embedding'].shape == batch_masked_lm.shape
Example No. 8
def test_generate_batches():
    test_dataset = TwitterDataset(df.iloc[0:10], sp)
    print(test_dataset.max_seq_len)
    batch = next(generate_batches(test_dataset, 10))
    assert len(batch) == 5
Example No. 9
import pandas as pd
import sentencepiece as spm

from src.bert_impl.dataset.bert_twitter_dataset import TwitterDataset
from src.bert_impl.utils.utils import generate_batches

df = pd.read_csv('./resources/test_data.csv')
sp = spm.SentencePieceProcessor()
sp.Load("./resources/test.model")
dataset = TwitterDataset(df.iloc[:4], sp)
expected_sentiment_list = ['negative', 'neutral']


def test___init_sentiment_vocab():
    err = "these two lists must be equal"
    assert sorted(dataset.st_voc) == expected_sentiment_list, err


def test_vectorize():
    dataset_vectorize = TwitterDataset(df.iloc[0:1], sp)
    # vector : [CLS, 'sooo', 'high', SEP, 'MASK']
    # noinspection PyArgumentList
    observed_v_1 = dataset_vectorize.vectorize("sooo high £¤", "neutral")
    assert dataset.sentence_piece.decode(
        observed_v_1[0].tolist()) == 'neutral sooo high  ⁇ '


def test_generate_batches():
    test_dataset = TwitterDataset(df.iloc[0:10], sp)
    print(test_dataset.max_seq_len)
    batch = next(generate_batches(test_dataset, 10))