def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
    """Build an NSP head from *config*, run one forward pass, and verify outputs."""
    nsp_model = BertForNextSentencePrediction(config=config)
    nsp_model.eval()
    loss, seq_relationship_score = nsp_model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        next_sentence_label=sequence_labels,
    )
    result = {"loss": loss, "seq_relationship_score": seq_relationship_score}
    # The NSP head emits a binary (is-next / not-next) logit pair per example.
    self.parent.assertListEqual(
        list(result["seq_relationship_score"].size()), [self.batch_size, 2])
    self.check_loss_output(result)
def add_sc(data):
    """Annotate each example in *data* with a semantic-coherence score.

    For every summary, each adjacent sentence pair is scored with BERT's
    next-sentence-prediction head; ``ex["coherence"]`` becomes the fraction
    of pairs whose is-next probability exceeds 0.5.  Single-sentence
    summaries are trivially coherent (score 1).  *data* is mutated in place.
    """
    print("Computing Semantic Coherence")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
    softmax = torch.nn.Softmax(dim=1)
    model.eval()
    for ex in tqdm(data):
        sentences = sent_tokenize(ex["summary"])
        if len(sentences) <= 1:
            ex["coherence"] = 1
            continue
        numerator = 0
        denominator = len(sentences) - 1
        for prev, curr in zip(sentences, sentences[1:]):
            s = "[CLS] " + prev + " [SEP] " + curr + " [SEP]"
            tokenized_text = tokenizer.tokenize(s)
            boundary = tokenized_text.index("[SEP]")
            # BERT convention: segment A includes its trailing [SEP], so the
            # first boundary + 1 tokens get segment id 0.  (The original code
            # assigned the first [SEP] to segment 1 — off by one.)
            segment_ids = ([0] * (boundary + 1)
                           + [1] * (len(tokenized_text) - boundary - 1))
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segment_ids])
            with torch.no_grad():
                prediction = model(tokens_tensor,
                                   token_type_ids=segments_tensors)[0]
            # Index 0 of the NSP logits is the "is next sentence" class.
            prediction_sm = softmax(prediction)[0].tolist()
            if prediction_sm[0] > 0.5:
                numerator += 1
        ex["coherence"] = numerator / denominator
Пример #3
0
def main():
    """Print the probability that the second sentence follows the first."""
    model = BertForNextSentencePrediction.from_pretrained('bert-base-cased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    first_sentence = 'I like cookies !'
    candidate_followup = 'Do you like them ?'
    print(bert_seq(model, tokenizer, first_sentence, candidate_followup))
Пример #4
0
 def __init__(self, bert_device):
     """Load a pretrained NSP BERT onto the given CUDA device, plus its tokenizer."""
     # NOTE: earlier experiments loaded a fine-tuned local checkpoint instead:
     #   BertTokenizer('./models/vocab.txt', do_lower_case=True)
     #   BertForSequenceClassification.from_pretrained('./models/', ...).to("cuda:0")
     self.bert_device = bert_device
     self.model = BertForNextSentencePrediction.from_pretrained(
         'bert-base-uncased').cuda(self.bert_device)
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     self.max_seq_len = 128  # TODO: make configurable instead of hard-coding
Пример #5
0
    def load(self, fname=None):
        """(Re)load the NSP model: from a hub identifier or from a config file.

        Raises ConfigError when neither source is available.
        """
        if fname is not None:
            self.load_path = fname

        # A non-empty pretrained_bert that is NOT a local file is treated as
        # a hub model identifier.
        if self.pretrained_bert and not Path(self.pretrained_bert).is_file():
            self.model = BertForNextSentencePrediction.from_pretrained(
                self.pretrained_bert, output_attentions=False, output_hidden_states=False)
        elif self.bert_config_file and Path(self.bert_config_file).is_file():
            # Build an (untrained) model from an explicit JSON config,
            # converting keep-probabilities into dropout probabilities.
            self.bert_config = BertConfig.from_json_file(str(expand_path(self.bert_config_file)))
            if self.attention_probs_keep_prob is not None:
                self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
            if self.hidden_keep_prob is not None:
                self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
            self.model = BertForNextSentencePrediction(config=self.bert_config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        self.model.to(self.device)
    def __init__(self, categories: list):
        """Store the category list and load the pretrained NSP BERT + tokenizer."""
        self.categories = categories

        # Tokenizer and model weights come from the same checkpoint name.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForNextSentencePrediction.from_pretrained(
            'bert-base-uncased')
        self.model.eval()  # inference only — disables dropout
Пример #7
0
def main():
    """Rank answer candidates for TED-Ed quiz questions with BERT NSP.

    NOTE(review): depends on module-level names not visible in this file
    (text_analysis, get_question_hint_sentence_x, softmax, argmax, torch).
    """
    path = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'
    path_new = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'

    analysis = text_analysis()
    # analysis.read_realtion(path)
    analysis.read_videoinfo(path_new)
    # questions=analysis.gather_question()
    question = analysis.video_question
    # for item in question:
    #    print(question[item]['quizzes'][0].keys())
    """
    self.video_question[title]: video_link', 'video_title_length', 'video_description', 'quizzes', 'video_youtube_link
    quizzes: quiz_description', 'question_type', 'quiz_options', 'hint', 'answer'
    multiple-choices open-ended
    """
    scripts = analysis.gather_transcripts(path)
    temp_dic = analysis.build_question_transcripts(path_new)

    # Keep only videos that contain at least one multiple-choice question.
    temp = []
    for item in temp_dic:
        print(item)
        for quiz in temp_dic[item]['questions']:
            if quiz['question_type'] == 'multiple-choices':
                temp.append(temp_dic[item])
                break
    new_element = get_question_hint_sentence_x(temp[1])
    #for question in temp[-1]['questions']:
    #    print(question)
    #
    #print(temp[-1])
    #print(len(temp))
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    model = BertForNextSentencePrediction.from_pretrained('bert-large-cased')
    #tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
    #model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
    print(temp[1])
    for i in range(len(new_element['question'])):
        first_sentence = new_element['question'][i]['quiz_description'].strip(
            ' ')
        next_sentences = new_element['question'][i]['responding_candidate']

        # Score each candidate as a continuation of the quiz description.
        x = []
        for sentence in next_sentences:
            encoding = tokenizer(first_sentence, sentence, return_tensors='pt')
            outputs = model(**encoding, labels=torch.LongTensor([1]))
            logits = outputs.logits
            # Probability of the "is next sentence" class (index 0).
            probs = softmax(logits, dim=1)[0][0].item()
            x.append(probs)
        a = argmax(x)
        #print(next_sentences)
        print(first_sentence)
        print(next_sentences)
        print(new_element['question'][i]['video_answer_hinted'])
        break  # NOTE(review): only the first question is processed — confirm intended
Пример #8
0
 def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
     """Download (if needed) and load the Danish BotXO BERT and its tokenizer."""
     from transformers import BertForNextSentencePrediction, BertTokenizer

     # Fetch/unzip the model archive into the cache directory.
     self.path_model = download_model('bert.botxo.pytorch', cache_dir, process_func=_unzip_process_func, verbose=verbose)
     # Tokenizer and weights both come from the downloaded model directory.
     self.tokenizer = BertTokenizer.from_pretrained(self.path_model)
     # output_hidden_states=True: the model returns all hidden states,
     # not only the final layer.
     self.model = BertForNextSentencePrediction.from_pretrained(
         self.path_model, output_hidden_states=True)
Пример #9
0
    def test_get_probability_of_next_sentence(self):
        """A true continuation scores near 1; an unrelated sentence near 0."""
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        model = BertForNextSentencePrediction.from_pretrained('bert-base-cased')

        question = "How old are you?"
        unrelated = "The Eiffel Tower is in Paris"
        answer = "I am 22 years old"
        prob_unrelated = get_probability_of_next_sentence(tokenizer, model, question, unrelated)
        prob_answer = get_probability_of_next_sentence(tokenizer, model, question, answer)

        assert_almost_equal(prob_unrelated, 0.0149559)
        assert_almost_equal(prob_answer, 0.9997911)
Пример #10
0
    def test_get_probability_of_next_sentence_portuguesee(self):
        """Portuguese BERT: a real answer outranks an unrelated continuation."""
        tokenizer = BertTokenizer.from_pretrained('models/neuralmind/bert-base-portuguese-cased')
        model = BertForNextSentencePrediction.from_pretrained('models/neuralmind/bert-base-portuguese-cased')

        question = "Quantos anos você tem?"
        unrelated = "A Torre Eiffel fica em Paris"
        answer = "Eu tenho 22 anos"
        prob_unrelated = get_probability_of_next_sentence(tokenizer, model, question, unrelated)
        prob_answer = get_probability_of_next_sentence(tokenizer, model, question, answer)

        assert_almost_equal(prob_unrelated, 0.5229671)
        assert_almost_equal(prob_answer, 0.9979677)
def get_nsp_model_and_optimizer(params, device):
    """Build a BERT NSP model on *device* plus its weighted AdamW optimizer.

    Args:
        params: dict-like config; must contain "model_name" (a BERT checkpoint).
        device: torch device the model is moved to.

    Returns:
        (nsp_model, optimizer) tuple.

    Raises:
        AssertionError: if "model_name" is not a BERT checkpoint.
    """
    model_name = params["model_name"]

    # str.startswith is clearer and safer than the slice model_name[:4].
    assert model_name.startswith("bert"), f"Non-BERT models not supported for NSP. Supplied model name was {model_name}"

    nsp_model = BertForNextSentencePrediction.from_pretrained(model_name)
    nsp_model = nsp_model.to(device)

    optimizer = get_weighted_adam_optimizer(nsp_model, params)

    return nsp_model, optimizer
Пример #12
0
    def __init__(self, **kwargs):
        """Initialize the BERT NSP scorer.

        :param batch_size: [int] batch size to used for bert
        """
        super().__init__()
        self.batch_size = kwargs['batch_size']
        # Pretrained tokenizer + NSP head from the same checkpoint.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
        self.model.eval()  # inference mode
        if torch.cuda.is_available():
            self.model.cuda()  # move to GPU when one is present
Пример #13
0
 def create_and_check_for_next_sequence_prediction(
     self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     """Forward an NSP model and verify the logits have shape (batch, 2)."""
     nsp = BertForNextSentencePrediction(config=config)
     nsp.to(torch_device)
     nsp.eval()
     result = nsp(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         next_sentence_label=sequence_labels,
     )
     self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
Пример #14
0
    def __init__(self,
                 surprise_weight: float = 2.0,
                 sentence_weight: float = 1.0):
        """Load NSP and masked-LM BERT heads and record the scoring weights."""
        self.tokenizer = BertTokenizer.from_pretrained(self.PRETRAINED_MODEL_NAME)

        # Sentence-level scorer: next-sentence-prediction head.
        self.sentence_model = BertForNextSentencePrediction.from_pretrained(self.PRETRAINED_MODEL_NAME)
        self.sentence_model.eval()

        # Token-level scorer: masked-language-model head.
        self.language_model = BertForMaskedLM.from_pretrained(self.PRETRAINED_MODEL_NAME)
        self.language_model.eval()

        self.surprise_weight = surprise_weight
        self.sentence_weight = sentence_weight
Пример #15
0
    def __init__(self):
        """Initialize chatbot state and load the BERT NSP response scorer."""
        # Conversation / bookkeeping state.
        self.filename = 'name'
        self.verbose_response = True
        self.output = ""
        # self.kernel = aiml_std.Kernel()
        self.tree = None
        self.root = None
        self.l = []
        self.score = []
        self.memory = {}
        self.index = -1
        self.incomplete = False

        # BERT next-sentence prediction used for scoring candidate replies.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
Пример #16
0
def prep_for_training(num_training_steps):
    """Instantiate the requested model/tokenizer and build AdamW + warmup scheduler."""
    if args.model == "Albert":
        model = AlbertForDebateSequenceClassification.from_pretrained(
            "albert-base-v2", newly_added_config=args)
        tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    elif args.model == "BertNSP":
        model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    else:
        raise ValueError("request model is not available")

    model.to(DEVICE)

    # Apply weight decay to everything except biases and LayerNorm parameters.
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    optimizer_grouped_parameters = [
        {"params": decay_params, "weight_decay": 0.01},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    # Linear warmup for warmup_ratio of the run, then linear decay.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_training_steps * args.warmup_ratio),
        num_training_steps=num_training_steps,
    )
    return model, optimizer, scheduler, tokenizer
Пример #17
0
def task_2():
    """Task 2: next-sentence prediction on two Chinese sentence pairs."""
    sample_1 = ("今天天气怎么样", "今天天气很好")
    sample_2 = ("小明今年几岁了", "我不喜欢学习")

    tokenizer = BertTokenizer.from_pretrained(bert_path)
    # Sentence pairs can be passed directly as List[Tuple[str, str]].
    sen_code = tokenizer.batch_encode_plus([sample_1, sample_2])
    input_ids = torch.tensor(sen_code["input_ids"])

    model = BertForNextSentencePrediction.from_pretrained(bert_path)
    model.eval()

    outputs = model(input_ids)
    seq_relationship_scores = outputs.logits  # torch.Size([batch, 2])

    # Class 0 = "is the next sentence", class 1 = "is not".
    # (equivalent: seq_relationship_scores.max(dim=1).indices)
    pred_lst = seq_relationship_scores.argmax(axis=1)
    for pred in pred_lst:
        print(f"预测结果:{pred}")
Пример #18
0
 def create_and_check_bert_for_next_sequence_prediction(
         self, config, input_ids, token_type_ids, input_mask,
         sequence_labels, token_labels, choice_labels):
     """Run an NSP forward pass, then validate logits shape and the loss."""
     nsp = BertForNextSentencePrediction(config=config)
     nsp.to(torch_device)
     nsp.eval()
     result = nsp(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         next_sentence_label=sequence_labels,
     )
     # Binary is-next / not-next logits per example.
     self.parent.assertListEqual(
         list(result["logits"].size()), [self.batch_size, 2])
     self.check_loss_output(result)
Пример #19
0
    def test_get_probability_of_next_sentence(self):
        """analyse_single_wsc_bert prefers the correct continuation either way round."""
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        model = BertForNextSentencePrediction.from_pretrained('bert-base-cased')

        wrong_sentence = "How old are you? The Eiffel Tower is in Paris"
        correct_sentence = "How old are you? I am 22 years old"

        # Correct candidate first: full credit, no partial score.
        full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                correct_sentence, wrong_sentence)
        assert full
        assert partial == 0

        # Order swapped: the wrong candidate must not win.
        full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                wrong_sentence, correct_sentence)
        assert not full
        assert partial == 0
Пример #20
0
    def test_get_probability_of_next_sentence_multilingual(self):
        """Multilingual BERT separates plausible vs implausible continuations in EN and PT."""
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        model = BertForNextSentencePrediction.from_pretrained('bert-base-multilingual-cased')

        cases = [
            ("How old are you?", "The Eiffel Tower is in Paris",
             "I am 22 years old", 0.5525756, 0.9784408),
            ("Quantos anos você tem?", "A Torre Eiffel fica em Paris",
             "Eu tenho 22 anos", 0.8567284, 0.9410717),
        ]
        for question, unrelated, answer, expect_unrelated, expect_answer in cases:
            prob1 = get_probability_of_next_sentence(tokenizer, model, question, unrelated)
            prob2 = get_probability_of_next_sentence(tokenizer, model, question, answer)
            assert_almost_equal(prob1, expect_unrelated)
            assert_almost_equal(prob2, expect_answer)
Пример #21
0
    def __init__(self, extractor, config):
        """Build the Birch reranker: a top-k score combiner plus a BERT NSP backbone.

        config keys used: "hidden" (0 = weighted average combiner, >0 = small
        MLP), "topk", "pretrained" (Capreolus checkpoint suffix), "finetune".
        """
        super().__init__()

        self.config = config

        if config["hidden"] == 0:
            # Linear combiner initialized to a plain average of the top-k scores.
            self.combine = nn.Linear(config["topk"], 1, bias=False)
            with torch.no_grad():
                self.combine.weight = nn.Parameter(
                    torch.ones_like(self.combine.weight) / config["topk"])
        else:
            assert config["hidden"] > 0
            self.combine = nn.Sequential(
                nn.Linear(config["topk"], config["hidden"]), nn.ReLU(),
                nn.Linear(config["hidden"], 1))

        # original model file (requires apex):
        # state = torch.load("/GW/NeuralIR/nobackup/birch-emnlp_bert4ir_v2/models/saved.msmarco_mb_1", map_location="cpu")
        # self.bert = state["model"]

        # saved.msmarco_mb_1 weights exported from the official apex model:
        # self.bert = BertForNextSentencePrediction.from_pretrained("bert-large-uncased")
        # self.bert.load_state_dict(torch.load("/GW/NeuralIR/nobackup/birch-emnlp_bert4ir_v2/models/converted"))
        # converted_weights.msmarco_mb

        # kevin's base model:
        # self.bert = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
        # saved_bert = torch.load("/GW/NeuralIR/nobackup/birch/models/saved.tmp_1")["model"]
        # self.bert.load_state_dict(saved_bert.state_dict())

        # also /GW/NeuralIR/nobackup/birch-emnlp_bert4ir_v2/models/export/birch-bert-base-kevin
        self.bert = BertForNextSentencePrediction.from_pretrained(
            f"Capreolus/birch-bert-large-{config['pretrained']}")

        if not config["finetune"]:
            # BUG FIX: the original `self.bert.requires_grad = False` only set a
            # plain attribute on the Module and froze nothing; requires_grad
            # must be cleared on each parameter tensor.
            for param in self.bert.parameters():
                param.requires_grad = False
            self.bert_context = torch.no_grad
        else:
            self.bert_context = contextlib.nullcontext
Пример #22
0
    def test_get_probability_of_next_sentence_multilingual(self):
        """WSC-style check in English and Portuguese with multilingual BERT."""
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased')
        model = BertForNextSentencePrediction.from_pretrained(
            'bert-base-multilingual-cased')

        sentence_pairs = [
            ("How old are you? The Eiffel Tower is in Paris",
             "How old are you? I am 22 years old"),
            ("Quantos anos você tem? A Torre Eiffel fica em Paris",
             "Quantos anos você tem? Eu tenho 22 anos"),
        ]
        for wrong_sentence, correct_sentence in sentence_pairs:
            # Correct candidate first → full credit expected.
            full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                    correct_sentence,
                                                    wrong_sentence)
            assert full
            assert partial == 0

            # Wrong candidate first → must not be preferred.
            full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                    wrong_sentence,
                                                    correct_sentence)
            assert not full
            assert partial == 0
Пример #23
0
    def test_get_probability_of_next_sentence_portuguesee(self):
        """WSC-style check with the local Portuguese BERT checkpoint."""
        tokenizer = BertTokenizer.from_pretrained(
            'models/neuralmind/bert-base-portuguese-cased')
        model = BertForNextSentencePrediction.from_pretrained(
            'models/neuralmind/bert-base-portuguese-cased')

        wrong_sentence = "Quantos anos você tem? A Torre Eiffel fica em Paris"
        correct_sentence = "Quantos anos você tem? Eu tenho 22 anos"

        # Correct candidate listed first: expect full credit.
        full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                correct_sentence, wrong_sentence)
        assert full
        assert partial == 0

        # Swapped order: the wrong candidate must not be chosen.
        full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                wrong_sentence, correct_sentence)
        assert not full
        assert partial == 0
Пример #24
0
def main():
    """Sample TED-Ed questions per subject and print NSP statistics.

    NOTE(review): depends on module-level helpers not visible here
    (text_analysis, gather_subjects, print_stats, random).
    """
    path_new = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'
    correct_path = '/home/shuo/Documents/AI_learning/LearningQ/code/teded/video_hint/question_corrected.txt'
    analysis = text_analysis()
    analysis.read_relation(path_new)
    analysis.read_videoinfo(path_new)
    # questions=analysis.gather_question()
    question = analysis.video_question
    analysis.read_video_questions_from_JSON(correct_path)

    # for item in question:
    #    print(question[item]['quizzes'][0].keys())
    """
    self.video_question[title]: video_link', 'video_title_length', 'video_description', 'quizzes', 'video_youtube_link
    quizzes: quiz_description', 'question_type', 'quiz_options', 'hint', 'answer'
    multiple-choices open-ended
    """
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    model = BertForNextSentencePrediction.from_pretrained('bert-large-cased')
    scripts = analysis.gather_transcripts(path_new)
    temp_dic = analysis.build_question_transcripts(path_new)
    temp_dic = analysis.align_subject(temp_dic)
    cateloged = gather_subjects(temp_dic)
    stats = []

    # Small subjects are kept whole; larger ones are subsampled to 10.
    for c in cateloged:

        if len(cateloged[c]) <= 20:
            temp = cateloged[c]
        else:
            temp = random.sample(cateloged[c], 10)
            print(c + '\n')
    #

        # NOTE(review): the trailing `break` means only window size 2 is
        # evaluated per subject, despite range(2, 5) — confirm intended.
        for i in range(2, 5):
            print_stats(temp, model, tokenizer, i, add_answer=False)

            break
Пример #25
0
    def __init__(self,
                 number_of_sentence,
                 adjust_weight,
                 trained_baseline_model=None,
                 transform=True):
        """Multi-BERT model: an NSP scorer plus a (possibly pre-trained) SP head."""
        super(MultiBERTsModel, self).__init__()
        self.number_of_sentence = number_of_sentence
        self.adjust_weight = adjust_weight

        # Next-sentence-prediction scorer over Chinese BERT.
        self.bertNSP = BertForNextSentencePrediction.from_pretrained(
            'bert-base-chinese')
        self.softmax = nn.Softmax(dim=1)

        # Reuse the baseline's encoder and linear head when provided,
        # otherwise start from the pretrained checkpoint.
        if trained_baseline_model:
            self.bert = trained_baseline_model.bert
            self.sp_linear = trained_baseline_model.linear
        else:
            self.bert = BertModel.from_pretrained('bert-base-chinese')
            self.sp_linear = nn.Linear(768, 1)

        if transform:
            self.transform = nn.Linear(768, 768, bias=False)
Пример #26
0
def predict_next(sen1, sen2, tokenizer, model_path):
    """Print whether *sen2* is predicted to follow *sen1* under an NSP model.

    NOTE(review): no [CLS]/[SEP] special tokens are added before scoring,
    which deviates from the usual BERT NSP input format — confirm intended.
    NOTE(review): everything from `partition = int(sys.argv[1])` onward
    references names never defined in this function (test_book_ids,
    model_dir, bert_tok_dir, pred_scores_dir, process_book) and appears to
    be a different snippet accidentally merged into this body; as written,
    `test_book_ids = test_book_ids[...]` would raise UnboundLocalError.
    """
    model = BertForNextSentencePrediction.from_pretrained(model_path)

    tokenized_sen1 = tokenizer.tokenize(sen1)
    tokenized_sen2 = tokenizer.tokenize(sen2)
    tokenized_text = tokenized_sen1 + tokenized_sen2
    print(tokenized_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Segment ids: 0 for the first sentence's tokens, 1 for the second's.
    segments_ids_1 = [0] * len(tokenized_sen1)
    segments_ids_2 = [1] * len(tokenized_sen2)
    segments_ids = segments_ids_1 + segments_ids_2
    segments_tensors = torch.tensor([segments_ids])
    print(segments_tensors)
    with torch.no_grad():
        outputs = model(torch.tensor([indexed_tokens]),
                        token_type_ids=segments_tensors)

    predictions = outputs[0].cpu().numpy()

    # True when the "not next" logit (index 1) beats the "is next" logit (index 0).
    result = predictions[0][1] > predictions[0][0]

    print(predictions)
    print(result)
    partition = int(sys.argv[1])
    from_idx = partition * 1000
    to_idx = (partition + 1) * 1000

    test_book_ids = test_book_ids[from_idx:to_idx]
    print(len(test_book_ids), 'books')
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    cls, sep = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])
    
    
    device = torch.device("cuda:" + str(partition + 1) if torch.cuda.is_available() else "cpu")
    print(device)
    
    print(torch.cuda.device_count(), "GPUs")

    model = BertForNextSentencePrediction.from_pretrained(model_dir)
    model = model.to(device)
    
    
    for book_id in test_book_ids:
        print(book_id)
        try:
            process_book(bert_tok_dir, pred_scores_dir, model, device, cls, sep, book_id)
        except Exception as e:
            print(book_id, e)
    
    print('Done!')
Пример #28
0
 def _get_next_sentence_prediction(self):
     """Load the NSP head for ``self.model`` and switch it to eval mode."""
     # ``self.model`` holds the checkpoint name/path; ``self.nsp`` the head.
     self.nsp = BertForNextSentencePrediction.from_pretrained(self.model)
     self.nsp.eval()
Пример #29
0
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, BertForNextSentencePrediction, BertTokenizer
# import gpt_2_simple as gpt2

# Extractive QA model: BERT-large fine-tuned on SQuAD 2.0.
# NOTE(review): these from_pretrained calls download weights at import time.
tokenizer_qa = AutoTokenizer.from_pretrained(
    "deepset/bert-large-uncased-whole-word-masking-squad2")
model_qa = AutoModelForQuestionAnswering.from_pretrained(
    "deepset/bert-large-uncased-whole-word-masking-squad2")

print("BERT For Q/A downloaded")

# Next-sentence-prediction model, presumably for response ranking — confirm.
model_nsp = BertForNextSentencePrediction.from_pretrained('bert-base-cased')
tokenizer_nsp = BertTokenizer.from_pretrained('bert-base-cased')

print("BERT NSP downloaded")

# sess = gpt2.start_tf_sess()
# gpt2.load_gpt2(sess, run_name='run1_topical_token')

# print("GPT2 loaded")
    # model-related
    # NOTE(review): this fragment is the interior of a function whose `def`
    # line and `parser` construction lie outside the visible chunk.
    parser.add_argument("--model",
                        default="albert-base-v2",
                        type=str,
                        help="Model Name")

    parser.add_argument("--batch_size",
                        default=2,
                        type=int,
                        help="Training batch size")

    # data-related
    parser.add_argument("--data_path",
                        default="./data/augmented_data.csv",
                        type=str)

    parser.add_argument("--train_size",
                        default=0.5,
                        type=float,
                        help="Training Size (ratio)")

    args = parser.parse_args()

    from transformers import BertForNextSentencePrediction, BertTokenizer

    # NOTE(review): args.model (the CLI string) is unconditionally replaced
    # with a loaded BERT NSP model object, ignoring the --model value —
    # confirm this overwrite is intended.
    args.model = BertForNextSentencePrediction.from_pretrained(
        "bert-base-uncased")
    args.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    seed()
    main(args)