Example #1
from transformers import BertForNextSentencePrediction, BertTokenizer


def main():
    model = BertForNextSentencePrediction.from_pretrained('bert-base-cased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    seq_A = 'I like cookies !'
    seq_B = 'Do you like them ?'
    probs = bert_seq(model, tokenizer, seq_A, seq_B)
    print(probs)
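bert_seq is not defined in this snippet; a minimal sketch of what it might do, assuming it simply returns the softmaxed NSP probabilities for the pair (index 0 of the head is the "seq_B follows seq_A" class):

import torch

def bert_seq(model, tokenizer, seq_A, seq_B):
    # the tokenizer inserts [CLS]/[SEP] and builds token_type_ids for the pair
    encoding = tokenizer(seq_A, seq_B, return_tensors='pt')
    with torch.no_grad():
        logits = model(**encoding).logits  # shape (1, 2)
    return torch.softmax(logits, dim=1)[0].tolist()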
Example #2
import torch
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from transformers import BertForNextSentencePrediction, BertTokenizer


def add_sc(data):
    print("Computing Semantic Coherence")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
    softmax = torch.nn.Softmax(dim=1)
    model.eval()
    output = []
    for ex in tqdm(data):
        summary = ex["summary"]
        scores = []
        sentences = sent_tokenize(summary)
        if len(sentences) <= 1:
            ex["coherence"] = 1
        else:
            numerator = 0
            denominator = len(sentences) - 1
            for i in range(len(sentences) - 1):
                prev = sentences[i]
                curr = sentences[i + 1]
                s = "[CLS] " + prev + " [SEP] " + curr + " [SEP]"
                tokenized_text = tokenizer.tokenize(s)
                # the first [SEP] conventionally still belongs to segment A
                boundary = tokenized_text.index("[SEP]") + 1
                segment_ids = [0] * boundary + [1] * (len(tokenized_text) -
                                                      boundary)
                indexed_tokens = tokenizer.convert_tokens_to_ids(
                    tokenized_text)
                tokens_tensor = torch.tensor([indexed_tokens])
                segments_tensors = torch.tensor([segment_ids])
                with torch.no_grad():
                    prediction = model(tokens_tensor,
                                       token_type_ids=segments_tensors)[0]
                prediction_sm = softmax(prediction)[0].tolist()
                if prediction_sm[0] > 0.5:
                    numerator += 1
            ex["coherence"] = numerator / denominator
Example #3
 def __init__(self, bert_device):
     #self.tokenizer = tokenizer = BertTokenizer('./models/vocab.txt', do_lower_case=True)
     #self.model = BertForSequenceClassification.from_pretrained('./models/', cache_dir=None, from_tf=False, state_dict=None).to("cuda:0")
     self.bert_device = bert_device
     self.model = BertForNextSentencePrediction.from_pretrained(
         'bert-base-uncased').cuda(self.bert_device)
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     self.max_seq_len = 128  # TODO: don't hard-code this
Example #4
    def __init__(self, categories: list):
        # load pretrained BERT
        self.categories = categories

        # Load pre-trained model tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForNextSentencePrediction.from_pretrained(
            'bert-base-uncased')

        self.model.eval()
Example #5
def main():
    path = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'
    path_new = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'

    analysis = text_analysis()
    # analysis.read_relation(path)
    analysis.read_videoinfo(path_new)
    # questions=analysis.gather_question()
    question = analysis.video_question
    # for item in question:
    #    print(question[item]['quizzes'][0].keys())
    """
    self.video_question[title]: video_link', 'video_title_length', 'video_description', 'quizzes', 'video_youtube_link
    quizzes: quiz_description', 'question_type', 'quiz_options', 'hint', 'answer'
    multiple-choices open-ended
    """
    scripts = analysis.gather_transcripts(path)
    temp_dic = analysis.build_question_transcripts(path_new)

    temp = []
    for item in temp_dic:
        print(item)
        for quiz in temp_dic[item]['questions']:
            if quiz['question_type'] == 'multiple-choices':
                temp.append(temp_dic[item])
                break
    new_element = get_question_hint_sentence_x(temp[1])
    #for question in temp[-1]['questions']:
    #    print(question)
    #
    #print(temp[-1])
    #print(len(temp))
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    model = BertForNextSentencePrediction.from_pretrained('bert-large-cased')
    #tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
    #model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
    print(temp[1])
    for i in range(len(new_element['question'])):
        first_sentence = new_element['question'][i]['quiz_description'].strip(
            ' ')
        next_sentences = new_element['question'][i]['responding_candidate']

        x = []
        for sentence in next_sentences:
            encoding = tokenizer(first_sentence, sentence, return_tensors='pt')
            outputs = model(**encoding, labels=torch.LongTensor([1]))
            logits = outputs.logits
            probs = softmax(logits, dim=1)[0][0].item()
            x.append(probs)
        a = argmax(x)
        #print(next_sentences)
        print(first_sentence)
        print(next_sentences)
        print(new_element['question'][i]['video_answer_hinted'])
        break
Example #6
 def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
     
     from transformers import BertForNextSentencePrediction, BertTokenizer
     # download model
     self.path_model = download_model('bert.botxo.pytorch', cache_dir, process_func=_unzip_process_func, verbose=verbose)
     # Load pre-trained model tokenizer
     self.tokenizer = BertTokenizer.from_pretrained(self.path_model)
     # Load pre-trained model (weights)
     self.model = BertForNextSentencePrediction.from_pretrained(self.path_model,
                                       output_hidden_states = True, # Whether the model returns all hidden-states.
                                       )
Example #7
    def test_get_probability_of_next_sentence_portuguese(self):
        tokenizer = BertTokenizer.from_pretrained('models/neuralmind/bert-base-portuguese-cased')
        model = BertForNextSentencePrediction.from_pretrained('models/neuralmind/bert-base-portuguese-cased')

        text1 = "Quantos anos você tem?"
        text2 = "A Torre Eiffel fica em Paris"
        text3 = "Eu tenho 22 anos"
        prob1 = get_probability_of_next_sentence(tokenizer, model, text1, text2)
        prob2 = get_probability_of_next_sentence(tokenizer, model, text1, text3)

        assert_almost_equal(prob1, 0.5229671)
        assert_almost_equal(prob2, 0.9979677)
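get_probability_of_next_sentence is exercised by these tests but never shown; a minimal sketch consistent with how the tests use it, assuming it returns the softmax probability that text2 follows text1:

import torch

def get_probability_of_next_sentence(tokenizer, model, text1, text2):
    encoding = tokenizer(text1, text2, return_tensors='pt')
    with torch.no_grad():
        logits = model(**encoding).logits
    # class 0 of the NSP head means "text2 is the actual next sentence"
    return torch.softmax(logits, dim=1)[0, 0].item()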
Example #8
    def test_get_probability_of_next_sentence(self):
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        model = BertForNextSentencePrediction.from_pretrained('bert-base-cased')

        text1 = "How old are you?"
        text2 = "The Eiffel Tower is in Paris"
        text3 = "I am 22 years old"
        prob1 = get_probability_of_next_sentence(tokenizer, model, text1, text2)
        prob2 = get_probability_of_next_sentence(tokenizer, model, text1, text3)

        assert_almost_equal(prob1, 0.0149559)
        assert_almost_equal(prob2, 0.9997911)
Example #9
def get_nsp_model_and_optimizer(params, device):

    model_name = params["model_name"]

    assert model_name[:4] == "bert", f"Non-BERT models not supported for NSP. Supplied model name was {model_name}"
    
    nsp_model = BertForNextSentencePrediction.from_pretrained(model_name)

    nsp_model = nsp_model.to(device)

    optimizer = get_weighted_adam_optimizer(nsp_model, params)

    return nsp_model, optimizer
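get_weighted_adam_optimizer is not shown; a plausible sketch, following the usual no-decay grouping for bias and LayerNorm parameters (the same pattern appears in Example #14 below) and assuming params carries a "learning_rate" key:

from transformers import AdamW

def get_weighted_adam_optimizer(model, params):
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    grouped_parameters = [
        {"params": [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         "weight_decay": 0.01},
        {"params": [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    return AdamW(grouped_parameters, lr=params["learning_rate"])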
Example #10
    def __init__(self, **kwargs):
        """
		Initialized the BERT model
		:param batch_size: [int] batch size to used for bert
		"""
        super().__init__()
        self.batch_size = kwargs['batch_size']
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForNextSentencePrediction.from_pretrained(
            'bert-base-uncased')
        self.model.eval()
        if torch.cuda.is_available():
            self.model.cuda()
Example #11
    def __init__(self):
        self.filename = 'name'
        self.verbose_response = True
        self.output = ""
        #self.kernel = aiml_std.Kernel()
        self.tree = None
        self.root = None
        self.l = []
        self.score = []
        self.memory = {}
        self.index = -1
        self.incomplete = False

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForNextSentencePrediction.from_pretrained(
            'bert-base-uncased')
Example #12
    def __init__(self,
                 surprise_weight: float = 2.0,
                 sentence_weight: float = 1.0):
        self.tokenizer = BertTokenizer.from_pretrained(
            self.PRETRAINED_MODEL_NAME)

        self.sentence_model = BertForNextSentencePrediction.from_pretrained(
            self.PRETRAINED_MODEL_NAME)
        self.sentence_model.eval()

        self.language_model = BertForMaskedLM.from_pretrained(
            self.PRETRAINED_MODEL_NAME)
        self.language_model.eval()

        self.surprise_weight = surprise_weight
        self.sentence_weight = sentence_weight
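The class's scoring method is not part of this excerpt; purely as an illustration of how the two weights might combine the NSP head with masked-LM surprisal (not necessarily the author's actual method):

import torch

def score_pair(self, sent_a: str, sent_b: str) -> float:
    # NSP term: probability that sent_b follows sent_a
    encoding = self.tokenizer(sent_a, sent_b, return_tensors='pt')
    with torch.no_grad():
        nsp_logits = self.sentence_model(**encoding).logits
    nsp_prob = torch.softmax(nsp_logits, dim=1)[0, 0].item()

    # surprise term: mean token negative log-likelihood of sent_b under the
    # masked LM (scored in one pass, a cheap approximation to true masking)
    ids = self.tokenizer(sent_b, return_tensors='pt')['input_ids']
    with torch.no_grad():
        lm_logits = self.language_model(ids).logits
    log_probs = torch.log_softmax(lm_logits, dim=-1)
    nll = -log_probs[0, torch.arange(ids.size(1)), ids[0]].mean().item()

    return self.sentence_weight * nsp_prob - self.surprise_weight * nll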
Example #13
    def load(self, fname=None):
        if fname is not None:
            self.load_path = fname

        if self.pretrained_bert and not Path(self.pretrained_bert).is_file():
            self.model = BertForNextSentencePrediction.from_pretrained(
                self.pretrained_bert, output_attentions=False, output_hidden_states=False)
        elif self.bert_config_file and Path(self.bert_config_file).is_file():
            self.bert_config = BertConfig.from_json_file(str(expand_path(self.bert_config_file)))

            if self.attention_probs_keep_prob is not None:
                self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
            if self.hidden_keep_prob is not None:
                self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
            self.model = BertForNextSentencePrediction(config=self.bert_config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        self.model.to(self.device)
Example #14
def prep_for_training(num_training_steps):

    if args.model == "Albert":
        model = AlbertForDebateSequenceClassification.from_pretrained(
            "albert-base-v2", newly_added_config=args)
        tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    elif args.model == "BertNSP":
        model = BertForNextSentencePrediction.from_pretrained(
            "bert-base-uncased")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    else:
        raise ValueError("request model is not available")

    model.to(DEVICE)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.01,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_training_steps * args.warmup_ratio),
        num_training_steps=num_training_steps,
    )
    return model, optimizer, scheduler, tokenizer
Example #15
def task_2():
    # Task 2: next sentence prediction
    sample_1 = ("今天天气怎么样", "今天天气很好")
    sample_2 = ("小明今年几岁了", "我不喜欢学习")

    tokenizer = BertTokenizer.from_pretrained(bert_path)
    sen_code = tokenizer.batch_encode_plus(
        [sample_1, sample_2])  # sentence pairs can be passed like this: List[Tuple[str, str]]
    input_ids = torch.tensor(sen_code["input_ids"])

    model = BertForNextSentencePrediction.from_pretrained(bert_path)

    model.eval()
    outputs = model(input_ids)
    seq_relationship_scores = outputs.logits  # torch.Size([batch, 2])

    # pred_lst = seq_relationship_scores.max(dim=1).indices  # torch.Size([batch])
    pred_lst = seq_relationship_scores.argmax(axis=1)  # torch.Size([batch])
    for pred in pred_lst:
        print(f"Prediction: {pred}")  # 0 = consecutive sentences, 1 = not (the second pair is clearly not consecutive, so it is unclear why this prints 0; see the note below)
Example #16
    def test_get_probability_of_next_sentence(self):
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        model = BertForNextSentencePrediction.from_pretrained(
            'bert-base-cased')

        wrong_sentence = "How old are you? The Eiffel Tower is in Paris"
        correct_sentence = "How old are you? I am 22 years old"

        full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                correct_sentence,
                                                wrong_sentence)

        assert full
        assert partial == 0

        full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                wrong_sentence,
                                                correct_sentence)

        assert not full
        assert partial == 0
Example #17
    def test_get_probability_of_next_sentence_multilingual(self):
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        model = BertForNextSentencePrediction.from_pretrained('bert-base-multilingual-cased')

        text1 = "How old are you?"
        text2 = "The Eiffel Tower is in Paris"
        text3 = "I am 22 years old"
        prob1 = get_probability_of_next_sentence(tokenizer, model, text1, text2)
        prob2 = get_probability_of_next_sentence(tokenizer, model, text1, text3)

        assert_almost_equal(prob1, 0.5525756)
        assert_almost_equal(prob2, 0.9784408)

        text1 = "Quantos anos você tem?"
        text2 = "A Torre Eiffel fica em Paris"
        text3 = "Eu tenho 22 anos"
        prob1 = get_probability_of_next_sentence(tokenizer, model, text1, text2)
        prob2 = get_probability_of_next_sentence(tokenizer, model, text1, text3)

        assert_almost_equal(prob1, 0.8567284)
        assert_almost_equal(prob2, 0.9410717)
Example #18
    def __init__(self, extractor, config):
        super().__init__()

        self.config = config

        if config["hidden"] == 0:
            self.combine = nn.Linear(config["topk"], 1, bias=False)
            with torch.no_grad():
                self.combine.weight = nn.Parameter(
                    torch.ones_like(self.combine.weight) / config["topk"])
        else:
            assert config["hidden"] > 0
            self.combine = nn.Sequential(
                nn.Linear(config["topk"], config["hidden"]), nn.ReLU(),
                nn.Linear(config["hidden"], 1))

        # original model file (requires apex):
        # state = torch.load("/GW/NeuralIR/nobackup/birch-emnlp_bert4ir_v2/models/saved.msmarco_mb_1", map_location="cpu")
        # self.bert = state["model"]

        # saved.msmarco_mb_1 weights exported from the official apex model:
        # self.bert = BertForNextSentencePrediction.from_pretrained("bert-large-uncased")
        # self.bert.load_state_dict(torch.load("/GW/NeuralIR/nobackup/birch-emnlp_bert4ir_v2/models/converted"))
        # converted_weights.msmarco_mb

        # kevin's base model:
        # self.bert = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
        # saved_bert = torch.load("/GW/NeuralIR/nobackup/birch/models/saved.tmp_1")["model"]
        # self.bert.load_state_dict(saved_bert.state_dict())

        # also /GW/NeuralIR/nobackup/birch-emnlp_bert4ir_v2/models/export/birch-bert-base-kevin
        self.bert = BertForNextSentencePrediction.from_pretrained(
            f"Capreolus/birch-bert-large-{config['pretrained']}")

        if not config["finetune"]:
            self.bert.requires_grad = False
            self.bert_context = torch.no_grad
        else:
            self.bert_context = contextlib.nullcontext
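The forward pass is not part of this excerpt; a sketch of how bert_context would typically wrap it so gradients are skipped when finetune is off (the reshape to config["topk"] scores per query is an assumption):

def forward(self, input_ids, attention_mask, token_type_ids):
    with self.bert_context():
        logits = self.bert(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids).logits
    # index 0 of the NSP head scores "passage follows query"
    scores = logits[:, 0].view(-1, self.config["topk"])
    return self.combine(scores)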
Example #19
    def test_get_probability_of_next_sentence_multilingual(self):
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased')
        model = BertForNextSentencePrediction.from_pretrained(
            'bert-base-multilingual-cased')

        wrong_sentence = "How old are you? The Eiffel Tower is in Paris"
        correct_sentence = "How old are you? I am 22 years old"

        full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                correct_sentence,
                                                wrong_sentence)

        assert full
        assert partial == 0

        full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                wrong_sentence,
                                                correct_sentence)

        assert not full
        assert partial == 0

        wrong_sentence = "Quantos anos você tem? A Torre Eiffel fica em Paris"
        correct_sentence = "Quantos anos você tem? Eu tenho 22 anos"

        full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                correct_sentence,
                                                wrong_sentence)

        assert full
        assert partial == 0

        full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                wrong_sentence,
                                                correct_sentence)

        assert not full
        assert partial == 0
Example #20
    def test_get_probability_of_next_sentence_portuguese(self):
        tokenizer = BertTokenizer.from_pretrained(
            'models/neuralmind/bert-base-portuguese-cased')
        model = BertForNextSentencePrediction.from_pretrained(
            'models/neuralmind/bert-base-portuguese-cased')

        wrong_sentence = "Quantos anos você tem? A Torre Eiffel fica em Paris"
        correct_sentence = "Quantos anos você tem? Eu tenho 22 anos"

        full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                correct_sentence,
                                                wrong_sentence)

        assert full
        assert partial == 0

        full, partial = analyse_single_wsc_bert(model, tokenizer,
                                                wrong_sentence,
                                                correct_sentence)

        assert not full
        assert partial == 0
Example #21
def main():
    path_new = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'
    correct_path = '/home/shuo/Documents/AI_learning/LearningQ/code/teded/video_hint/question_corrected.txt'
    analysis = text_analysis()
    analysis.read_relation(path_new)
    analysis.read_videoinfo(path_new)
    # questions=analysis.gather_question()
    question = analysis.video_question
    analysis.read_video_questions_from_JSON(correct_path)

    # for item in question:
    #    print(question[item]['quizzes'][0].keys())
    """
    self.video_question[title]: video_link', 'video_title_length', 'video_description', 'quizzes', 'video_youtube_link
    quizzes: quiz_description', 'question_type', 'quiz_options', 'hint', 'answer'
    multiple-choices open-ended
    """
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    model = BertForNextSentencePrediction.from_pretrained('bert-large-cased')
    scripts = analysis.gather_transcripts(path_new)
    temp_dic = analysis.build_question_transcripts(path_new)
    temp_dic = analysis.align_subject(temp_dic)
    cateloged = gather_subjects(temp_dic)
    stats = []

    for c in cateloged:

        if len(cateloged[c]) <= 20:
            temp = cateloged[c]
        else:
            temp = random.sample(cateloged[c], 10)
            print(c + '\n')
    #

        for i in range(2, 5):
            print_stats(temp, model, tokenizer, i, add_answer=False)

            break
Example #22
    def __init__(self,
                 number_of_sentence,
                 adjust_weight,
                 trained_baseline_model=None,
                 transform=True):
        super(MultiBERTsModel, self).__init__()
        self.number_of_sentence = number_of_sentence
        self.adjust_weight = adjust_weight
        self.bertNSP = BertForNextSentencePrediction.from_pretrained(
            'bert-base-chinese')
        self.softmax = nn.Softmax(dim=1)
        #self.linear = nn.Linear(768 * self.number_of_sentence, 1)
        #self.bert = BertModel.from_pretrained('bert-base-chinese')

        if trained_baseline_model:
            self.bert = trained_baseline_model.bert
            self.sp_linear = trained_baseline_model.linear
        else:
            self.bert = BertModel.from_pretrained('bert-base-chinese')
            self.sp_linear = nn.Linear(768, 1)

        if transform:
            self.transform = nn.Linear(768, 768, bias=False)
Example #23
def predict_next(sen1, sen2, tokenizer, model_path):
    model = BertForNextSentencePrediction.from_pretrained(model_path)

    # NSP inputs need the [CLS]/[SEP] special tokens; the first [SEP] belongs to segment A
    tokenized_sen1 = ["[CLS]"] + tokenizer.tokenize(sen1) + ["[SEP]"]
    tokenized_sen2 = tokenizer.tokenize(sen2) + ["[SEP]"]
    tokenized_text = tokenized_sen1 + tokenized_sen2
    print(tokenized_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    segments_ids_1 = [0] * len(tokenized_sen1)
    segments_ids_2 = [1] * len(tokenized_sen2)
    segments_ids = segments_ids_1 + segments_ids_2
    segments_tensors = torch.tensor([segments_ids])
    print(segments_tensors)
    with torch.no_grad():
        outputs = model(torch.tensor([indexed_tokens]),
                        token_type_ids=segments_tensors)

    predictions = outputs[0].cpu().numpy()

    # index 0 of the NSP head is the "is next sentence" class
    result = predictions[0][0] > predictions[0][1]

    print(predictions)
    print(result)
    return result
Example #24
    partition = int(sys.argv[1])
    from_idx = partition * 1000
    to_idx = (partition + 1) * 1000

    test_book_ids = test_book_ids[from_idx:to_idx]
    print(len(test_book_ids), 'books')
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    cls, sep = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])
    
    
    device = torch.device("cuda:" + str(partition + 1) if torch.cuda.is_available() else "cpu")
    print(device)
    
    print(torch.cuda.device_count(), "GPUs")

    model = BertForNextSentencePrediction.from_pretrained(model_dir)
    model = model.to(device)
    
    
    for book_id in test_book_ids:
        print(book_id)
        try:
            process_book(bert_tok_dir, pred_scores_dir, model, device, cls, sep, book_id)
        except Exception as e:
            print(book_id, e)
    
    print('Done!')
Example #25
 def _get_next_sentence_prediction(self):
     """
     Initializes the BertForNextSentencePrediction transformer
     """
     self.nsp = BertForNextSentencePrediction.from_pretrained(self.model)
     self.nsp.eval()
Example #26
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, BertForNextSentencePrediction, BertTokenizer
# import gpt_2_simple as gpt2

tokenizer_qa = AutoTokenizer.from_pretrained(
    "deepset/bert-large-uncased-whole-word-masking-squad2")
model_qa = AutoModelForQuestionAnswering.from_pretrained(
    "deepset/bert-large-uncased-whole-word-masking-squad2")

print("BERT For Q/A downloaded")

model_nsp = BertForNextSentencePrediction.from_pretrained('bert-base-cased')
tokenizer_nsp = BertTokenizer.from_pretrained('bert-base-cased')

print("BERT NSP downloaded")

# sess = gpt2.start_tf_sess()
# gpt2.load_gpt2(sess, run_name='run1_topical_token')

# print("GPT2 loaded")
    # model-related
    parser.add_argument("--model",
                        default="albert-base-v2",
                        type=str,
                        help="Model Name")

    parser.add_argument("--batch_size",
                        default=2,
                        type=int,
                        help="Training batch size")

    # data-related
    parser.add_argument("--data_path",
                        default="./data/augmented_data.csv",
                        type=str)

    parser.add_argument("--train_size",
                        default=0.5,
                        type=float,
                        help="Training Size (ratio)")

    args = parser.parse_args()

    from transformers import BertForNextSentencePrediction, BertTokenizer

    args.model = BertForNextSentencePrediction.from_pretrained(
        "bert-base-uncased")
    args.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    seed()
    main(args)
Example #28
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument("--trained_model_dir",
                        default="",
                        type=str,
                        help="Where is the fine-tuned BERT model?")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        #raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
        print("WARNING: Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_dataset = BERTDataset(args.data_dir,
                                    tokenizer,
                                    seq_len=args.max_seq_length)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    if args.trained_model_dir:
        if os.path.exists(os.path.join(args.output_dir, WEIGHTS_NAME)):
            previous_state_dict = torch.load(
                os.path.join(args.output_dir, WEIGHTS_NAME))
        else:
            from collections import OrderedDict
            previous_state_dict = OrderedDict()
        distant_state_dict = torch.load(
            os.path.join(args.trained_model_dir, WEIGHTS_NAME))
        previous_state_dict.update(
            distant_state_dict
        )  # note that the final layers of previous model and distant model must have different attribute names!
        model = BertForNextSentencePrediction.from_pretrained(
            args.trained_model_dir, state_dict=previous_state_dict)
    else:
        model = BertForNextSentencePrediction.from_pretrained(args.bert_model)
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FusedAdam
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False)

        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, attention_masks, token_type_ids, next_sentence_labels = batch
                output = model(input_ids,
                               attention_mask=attention_masks,
                               token_type_ids=token_type_ids,
                               labels=next_sentence_labels)
                loss = output['loss']
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
Example #29
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_data_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input training data file (a text file).")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        help=
        "An optional input evaluation data file to evaluate the perplexity on (a text file)."
    )
    parser.add_argument(
        "--model_name_or_path",
        default="bert-base-cased",
        type=str,
        help="The model checkpoint for weights initialization.")
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help=
        "Optional pretrained config name or path if not the same as model_name_or_path"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "Optional pretrained tokenizer name or path if not the same as model_name_or_path"
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)"
    )
    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens)."
    )
    # ====== Training ======
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    # Validation
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    # ====== Training options ======

    parser.add_argument("--method",
                        default="method1",
                        type=str,
                        help="NSP method.")
    parser.add_argument("--nsp_swap_ratio",
                        default=0.5,
                        type=float,
                        help="random Swap ratio of next sntences.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    # Weight decay
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        '--save_total_limit',
        type=int,
        default=None,
        help=
        'Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default'
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    args = parser.parse_args()

    if args.eval_data_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    # device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    device = torch.device('cuda:0')
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        -1, device, args.n_gpu, bool(False), args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer

    # config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = BertConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None)
    bert_tokenizer = BertTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    jp_tokenizer = JumanTokenizer()

    if args.block_size <= 0:
        args.block_size = bert_tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
    else:
        args.block_size = min(args.block_size,
                              bert_tokenizer.max_len_single_sentence)

    if args.method == "method1" or args.method == "method3":
        model = BertForNextSentencePrediction.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir if args.cache_dir else None)

        # ====== Freeze part of BERT's parameters ======
        # - keep only the last encoder layer and the pooler trainable
        bert_last_layer = copy.deepcopy(model.bert.encoder.layer[-1])
        bert_pooler = copy.deepcopy(model.bert.pooler)
        # - freeze all of BERT
        for param in model.bert.parameters():
            param.requires_grad = False
        # - swap the unfrozen copies back in
        model.bert.encoder.layer[-1] = bert_last_layer
        model.bert.pooler = bert_pooler
        # ==============================================

    else:
        # incomplete
        model = BertSepInputNSPModel.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir if args.cache_dir else None)
        # ====== Freeze part of BERT's parameters ======
        # - keep only the last encoder layer and the pooler trainable
        bert_last_layer = copy.deepcopy(
            model.bert_for_double.bert_model.encoder.layer[-1])
        bert_pooler = copy.deepcopy(model.bert_for_double.bert_model.pooler)
        # - freeze all of BERT
        for param in model.bert_for_double.bert_model.parameters():
            param.requires_grad = False
        # - swap the unfrozen copies back in
        model.bert_for_double.bert_model.encoder.layer[-1] = bert_last_layer
        model.bert_for_double.bert_model.pooler = bert_pooler
        # ==============================================

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                bert_tokenizer,
                                                jp_tokenizer,
                                                evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model,
                                     bert_tokenizer, jp_tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        bert_tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        bert_tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)

        if args.method == "method1" or args.method == "method3":
            model = BertForNextSentencePrediction.from_pretrained(
                args.output_dir,  # reload the fine-tuned weights just saved, not the original checkpoint
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None)

            # ====== Freeze part of BERT's parameters ======
            # - keep only the last encoder layer and the pooler trainable
            bert_last_layer = copy.deepcopy(model.bert.encoder.layer[-1])
            bert_pooler = copy.deepcopy(model.bert.pooler)
            # - freeze all of BERT
            for param in model.bert.parameters():
                param.requires_grad = False
            # - swap the unfrozen copies back in
            model.bert.encoder.layer[-1] = bert_last_layer
            model.bert.pooler = bert_pooler
            # ==============================================

        else:
            # method2 (incomplete)
            model = BertSepInputNSPModel.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None)
            # ====== Freeze part of BERT's parameters ======
            # - keep only the last encoder layer and the pooler trainable
            bert_last_layer = copy.deepcopy(
                model.bert_for_double.bert_model.encoder.layer[-1])
            bert_pooler = copy.deepcopy(
                model.bert_for_double.bert_model.pooler)
            # - freeze all of BERT
            for param in model.bert_for_double.bert_model.parameters():
                param.requires_grad = False
            # - swap the unfrozen copies back in
            model.bert_for_double.bert_model.encoder.layer[
                -1] = bert_last_layer
            model.bert_for_double.bert_model.pooler = bert_pooler
            # ==============================================

        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            if args.method == "method1" or args.method == "method3":
                model = BertForNextSentencePrediction.from_pretrained(
                    checkpoint,  # evaluate this checkpoint's weights
                    config=config,
                    cache_dir=args.cache_dir if args.cache_dir else None)

                # ====== Freeze part of BERT's parameters ======
                # - keep only the last encoder layer and the pooler trainable
                bert_last_layer = copy.deepcopy(model.bert.encoder.layer[-1])
                bert_pooler = copy.deepcopy(model.bert.pooler)
                # - freeze all of BERT
                for param in model.bert.parameters():
                    param.requires_grad = False
                # - swap the unfrozen copies back in
                model.bert.encoder.layer[-1] = bert_last_layer
                model.bert.pooler = bert_pooler
                # ==============================================

            else:
                # method2 (incomplete)
                model = BertSepInputNSPModel.from_pretrained(
                    args.model_name_or_path,
                    from_tf=bool('.ckpt' in args.model_name_or_path),
                    config=config,
                    cache_dir=args.cache_dir if args.cache_dir else None)
                # ====== Freeze part of BERT's parameters ======
                # - keep only the last encoder layer and the pooler trainable
                bert_last_layer = copy.deepcopy(
                    model.bert_for_double.bert_model.encoder.layer[-1])
                bert_pooler = copy.deepcopy(
                    model.bert_for_double.bert_model.pooler)
                # - freeze all of BERT
                for param in model.bert_for_double.bert_model.parameters():
                    param.requires_grad = False
                # - swap the unfrozen copies back in
                model.bert_for_double.bert_model.encoder.layer[
                    -1] = bert_last_layer
                model.bert_for_double.bert_model.pooler = bert_pooler
                # ==============================================

            model.to(args.device)
            result = evaluate(args,
                              model,
                              bert_tokenizer,
                              jp_tokenizer,
                              prefix=prefix)
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
Example #30
    print('Reading training data file...')

    df = pd.read_csv(training_data_loc,
                     usecols=[
                         'para1_tokens', 'para2_tokens', 'para1_len',
                         'para2_len', 'label'
                     ])

    df = df[(df['para1_len'] > 0) & (df['para2_len'] > 0)]

    df['para1_tokens'] = df['para1_tokens'].apply(literal_eval)
    df['para2_tokens'] = df['para2_tokens'].apply(literal_eval)

    print('Loading tokenizer and BertNSP...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    BertNSP = BertForNextSentencePrediction.from_pretrained(
        'bert-base-uncased')

    cls, sep = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])

    # Use appropriate locations
    try:
        with open(os.path.join(output_loc, 'input_tokens.pkl'), 'rb') as f:
            input_tokens = pickle.load(f)
        with open(os.path.join(output_loc, 'input_seg_ids.pkl'), 'rb') as f:
            input_seg_ids = pickle.load(f)
        with open(os.path.join(output_loc, 'labels.pkl'), 'rb') as f:
            labels = pickle.load(f)

    except FileNotFoundError:
        print('Generating training input...')
        input_tokens = list()