def convert_data_to_feature():
    # load the question dataset
    q = open('Dataset/Query_Train/Final_question.txt', "r", encoding="utf-8")
    questions = q.readlines()
    q.close()
    # load the answer dataset
    a = open('Dataset/Train_Label/FinalDomainLabel.txt', "r", encoding="utf-8")
    answers = a.readlines()
    a.close()
    assert len(answers) == len(questions)
    # ans_dic holds the answer (label) categories
    ans_dic = make_ans_dic(answers)
    # question_dic holds the question categories
    question_dic = make_question_dic(questions)

    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')
    q_tokens = []
    max_seq_len = 0

    for q in question_dic.data:
        bert_ids = tokenizer.build_inputs_with_special_tokens(
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))
        if (len(bert_ids) > max_seq_len):
            max_seq_len = len(bert_ids)
        q_tokens.append(bert_ids)

    print("最長問句長度:", max_seq_len)
    assert max_seq_len <= 512  # 小於BERT-base長度限制
    # 補齊長度
    for q in q_tokens:
        while len(q) < max_seq_len:
            q.append(0)
    a_labels = []
    for a in ans_dic.data:
        a_labels.append(ans_dic.to_id(a))
    # BERT input embedding
    answer_lables = a_labels
    input_ids = q_tokens
    input_masks = [[1] * max_seq_len for i in range(len(question_dic))]
    input_segment_ids = [[0] * max_seq_len for i in range(len(question_dic))]
    assert len(input_ids) == len(question_dic) and len(input_ids) == len(
        input_masks) and len(input_ids) == len(input_segment_ids)

    data_features = {
        'input_ids': input_ids,
        'input_masks': input_masks,
        'input_segment_ids': input_segment_ids,
        'answer_lables': answer_lables,
        'question_dic': question_dic,
        'answer_dic': ans_dic
    }

    with open('Dataset/data_features_domain.pkl', 'wb') as output:
        pickle.dump(data_features, output)
    return data_features
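

# A minimal usage sketch (not in the original): wrap the cached features into a
# PyTorch DataLoader for fine-tuning. `make_dataloader` is a hypothetical helper
# name and assumes torch is available alongside this code.
import torch
from torch.utils.data import TensorDataset, DataLoader

def make_dataloader(data_features, batch_size=32):
    # every field in data_features is a list of equal length, so the fields can
    # be stacked into parallel tensors
    dataset = TensorDataset(
        torch.tensor(data_features['input_ids'], dtype=torch.long),
        torch.tensor(data_features['input_masks'], dtype=torch.long),
        torch.tensor(data_features['input_segment_ids'], dtype=torch.long),
        torch.tensor(data_features['answer_lables'], dtype=torch.long))
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)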
Example #2
    def __init__(self,
                 bert_tokenizer: BertTokenizer,
                 jp_tokenizer: JumanTokenizer,
                 args,
                 file_path='train',
                 block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, 'cached_lm_' + str(block_size) + '_' + filename)

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            with open(cached_features_file, 'rb') as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                docs = f.readlines()

            for line in docs:
                text = line.rstrip(os.linesep)

                # separate text into tokens
                tokenized_text = bert_tokenizer.convert_tokens_to_ids(
                    bert_tokenizer.tokenize(" ".join(
                        jp_tokenizer.tokenize(text))))

                # add special tokens: [CLS] and [SEP]
                added_special = bert_tokenizer.build_inputs_with_special_tokens(
                    tokenized_text)

                # pad (or truncate) up to the sequence length
                diff = block_size - len(added_special)
                if diff < 0:
                    added_special = added_special[:diff]
                else:
                    # padding value changed from 0 to -1
                    padding = [-1] * (block_size - len(added_special))
                    added_special += padding

                assert len(added_special) == block_size

                self.examples.append(added_special)

            logger.info("Saving features into cached file %s",
                        cached_features_file)
            with open(cached_features_file, 'wb') as handle:
                pickle.dump(self.examples,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
Example #3
    def __init__(self,
                 bert_tokenizer: BertTokenizer,
                 jp_tokenizer: JumanTokenizer,
                 args,
                 file_path='train',
                 block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, 'dialogue_for_nsp' + '_cached_lm_' + str(block_size) +
            '_' + filename)

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            with open(cached_features_file, 'rb') as handle:
                self.examples, \
                self.token_type_ids, \
                self.attention_mask, \
                self.next_sentence_label = pickle.load(handle)
        else:
            # if there is no cached data file, build the dataset from the text file
            logger.info("Creating features from dataset file at %s", directory)

            # examples:            [CLS] A A A [SEP] B B B [SEP]
            self.examples = []
            # token_type_ids:        0   0 0 0   0   1 1 1   1
            self.token_type_ids = []
            # attention_mask:        1   1 1 1   1   1 1 1   1   0 0 0 ...
            self.attention_mask = []
            # next_sentence_label: 0 = isNext, 1 = notNext
            self.next_sentence_label = []
            with open(file_path, encoding="utf-8") as f:
                docs = f.readlines()

            ZEN = "".join(chr(0xff01 + i) for i in range(94))
            HAN = "".join(chr(0x21 + i) for i in range(94))

            HAN2ZEN = str.maketrans(HAN, ZEN)

            num_doc = len(docs)
            for idx, line in enumerate(docs):

                text = line.rstrip(os.linesep)

                if text == "":
                    continue
                try:
                    next_text = docs[idx + 1].rstrip(os.linesep)
                except IndexError:
                    continue
                if next_text == "":
                    continue

                if random.random() > args.nsp_swap_ratio:
                    while True:
                        rand_idx = random.randrange(0, num_doc)
                        next_text = docs[rand_idx].rstrip(os.linesep)
                        if (not next_text == "") and (rand_idx != idx + 1):
                            break
                    nsp_label = 1
                    # random sequence
                else:
                    nsp_label = 0
                    # continuation sequence
                # workaround for Juman++ errors: replace half-width spaces with
                # full-width spaces before tokenization
                text = text.replace(' ', '\u3000')
                next_text = next_text.replace(' ', '\u3000')
                text = mojimoji.han_to_zen(text,
                                           kana=False,
                                           digit=True,
                                           ascii=True)
                next_text = mojimoji.han_to_zen(next_text,
                                                kana=False,
                                                digit=True,
                                                ascii=True)
                text = text.translate(HAN2ZEN)
                next_text = next_text.translate(HAN2ZEN)
                # segment the original texts into tokens

                # skip lines that are too long (over 4096 bytes)
                if len(text.encode('utf-8')) > 4096 or len(
                        next_text.encode('utf-8')) > 4096:
                    continue

                first_tokenized_text = bert_tokenizer.convert_tokens_to_ids(
                    bert_tokenizer.tokenize(" ".join(
                        jp_tokenizer.tokenize(text))))
                second_tokenized_text = bert_tokenizer.convert_tokens_to_ids(
                    bert_tokenizer.tokenize(" ".join(
                        jp_tokenizer.tokenize(next_text))))

                fst_len = len(first_tokenized_text)
                scd_len = len(second_tokenized_text)
                # for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
                #    self.examples.append(bert_tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                # add special tokens
                # A A A (B B B) ->  [CLS] A A A [SEP] (B B B [SEP])
                added_special = bert_tokenizer.build_inputs_with_special_tokens(
                    token_ids_0=first_tokenized_text,
                    token_ids_1=second_tokenized_text)
                # token type ids: 0 for [CLS] + first sentence + [SEP],
                # 1 for second sentence + trailing [SEP]
                type_ids = [0] * (2 + fst_len)
                scd_type = [1] * (1 + scd_len)
                type_ids += scd_type

                attention_mask = [1] * len(added_special)

                # Zero-pad up to the sequence length.
                diff = block_size - len(added_special)
                if diff < 0:
                    added_special = added_special[:diff]
                    type_ids = type_ids[:diff]
                    attention_mask = attention_mask[:diff]
                else:
                    padding = [0] * diff
                    added_special += padding
                    type_ids += padding
                    attention_mask += padding

                assert len(added_special) == block_size
                assert len(type_ids) == block_size
                assert len(attention_mask) == block_size

                self.examples.append(added_special)
                self.token_type_ids.append(type_ids)
                self.attention_mask.append(attention_mask)
                self.next_sentence_label.append(nsp_label)

            logger.info("Saving features into cached file %s",
                        cached_features_file)
            with open(cached_features_file, 'wb') as handle:
                pickle.dump([
                    self.examples, self.token_type_ids, self.attention_mask,
                    self.next_sentence_label
                ],
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
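
    # Minimal sketch (not in the original): the snippet stops at __init__, but a
    # Dataset wrapper like this would typically also expose __len__/__getitem__,
    # e.g. as below (assumes `import torch` at module level).
    def __len__(self):
        # number of cached NSP examples
        return len(self.examples)

    def __getitem__(self, idx):
        # one example as parallel tensors: token ids, segment ids,
        # attention mask, and the next-sentence label
        return (torch.tensor(self.examples[idx]),
                torch.tensor(self.token_type_ids[idx]),
                torch.tensor(self.attention_mask[idx]),
                torch.tensor(self.next_sentence_label[idx]))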
Example #4
def convert_data_to_feature():
    with open('Taipei_QA_new.txt', 'r', encoding='utf-8') as f:
        data = f.read()
    qa_pairs = data.split("\n")

    questions = []
    answers = []
    for qa_pair in qa_pairs:
        qa_pair = qa_pair.split()
        try:
            a, q = qa_pair
            questions.append(q)
            answers.append(a)
        except ValueError:
            # skip malformed lines
            continue

    assert len(answers) == len(questions)

    ans_dic = make_ans_dic(answers)
    question_dic = make_question_dic(questions)

    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')

    q_tokens = []
    max_seq_len = 0
    for q in question_dic.data:
        bert_ids = tokenizer.build_inputs_with_special_tokens(
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))
        if (len(bert_ids) > max_seq_len):
            max_seq_len = len(bert_ids)
        q_tokens.append(bert_ids)
        # print(tokenizer.convert_ids_to_tokens(tokenizer.build_inputs_with_special_tokens(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))))

    print("最長問句長度:", max_seq_len)
    assert max_seq_len <= 512  # 小於BERT-base長度限制

    # 補齊長度
    for q in q_tokens:
        while len(q) < max_seq_len:
            q.append(0)

    a_labels = []
    for a in ans_dic.data:
        a_labels.append(ans_dic.to_id(a))
        # print (ans_dic.to_id(a))

    # BERT input embedding
    answer_lables = a_labels
    input_ids = q_tokens
    input_masks = [[1] * max_seq_len for i in range(len(question_dic))]
    input_segment_ids = [[0] * max_seq_len for i in range(len(question_dic))]
    assert len(input_ids) == len(question_dic) and len(input_ids) == len(
        input_masks) and len(input_ids) == len(input_segment_ids)

    data_features = {
        'input_ids': input_ids,
        'input_masks': input_masks,
        'input_segment_ids': input_segment_ids,
        'answer_lables': answer_lables,
        'question_dic': question_dic,
        'answer_dic': ans_dic
    }

    with open('trained_model/data_features.pkl', 'wb') as output:
        pickle.dump(data_features, output)
    return data_features
Example #5
def toBertIds(q_input):
    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')
    return tokenizer.build_inputs_with_special_tokens(
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q_input)))
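
# Usage sketch (not in the original; the example sentence and variable name are
# illustrative): toBertIds returns the sentence's vocabulary ids wrapped with
# the [CLS] and [SEP] special tokens.
example_ids = toBertIds("今天天氣很好")
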
def convert_data_to_feature(FileName):
    # load the tokenizer vocabulary
    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')

    # load the data
    Labels = []
    Sentences = []
    with open(FileName, 'r', encoding='utf-8') as f:
        data = f.read()
    LS_pairs = data.split("\n")

    for LS_pair in LS_pairs:
        if LS_pair != "":
            try:
                L = LS_pair[:1]
                S = LS_pair[2:]
                Labels.append(int(L))
                Sentences.append(S)
            except ValueError:
                # skip lines whose label is not an integer
                continue
    
    assert len(Labels) == len(Sentences)

    # BERT input embedding
    max_seq_len = 0         # track the maximum sequence length
    input_ids = []
    original_length = []    # record the original (unpadded) length of each sentence
    for S in Sentences:
        # split the sentence into word-piece tokens
        word_piece_list = tokenizer.tokenize(S)
        # convert the tokens to vocabulary ids
        input_id = tokenizer.convert_tokens_to_ids(word_piece_list)
        # add the [CLS] and [SEP] special tokens
        input_id = tokenizer.build_inputs_with_special_tokens(input_id)

        if len(input_id) > max_seq_len:
            max_seq_len = len(input_id)
        input_ids.append(input_id)

    print("最長句子長度:",max_seq_len)
    assert max_seq_len <= 512 # 小於BERT-base長度限制

    # pad every sequence to max_seq_len
    for c in input_ids:
        # record the original length before padding
        length = len(c)
        original_length.append(length)
        while len(c) < max_seq_len:
            c.append(0)
    
    # segment_ids (token_type_ids): 0 marks the first sentence, 1 would mark a second sentence
    segment_ids = [[0] * max_seq_len for i in range(len(Sentences))]
    # position_ids (used here as the attention mask): 1 marks a real token, 0 marks padding
    position_ids = []
    for i in range(len(Sentences)):
        position_id = []
        for j in range(original_length[i]):
            position_id.append(1)
        while len(position_id) < max_seq_len:
            position_id.append(0)
        position_ids.append(position_id)

    assert len(input_ids) == len(segment_ids) and len(input_ids) == len(position_ids) and len(input_ids) == len(Labels)

    data_features = {
        'input_ids': input_ids,
        'segment_ids': segment_ids,
        'position_ids': position_ids,
        'labels': Labels
    }

    return data_features
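
# Side note (not in the original): newer releases of the transformers library can
# build all three fields in a single call via encode_plus; keyword names vary
# across versions, so treat this as a sketch rather than a drop-in replacement.
def encode_with_tokenizer(tokenizer, sentence, max_seq_len):
    # encode_plus returns input_ids, token_type_ids and attention_mask, which
    # correspond to input_ids, segment_ids and position_ids built manually above
    enc = tokenizer.encode_plus(sentence,
                                max_length=max_seq_len,
                                pad_to_max_length=True)
    return enc['input_ids'], enc['token_type_ids'], enc['attention_mask']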
Example #7
print_tokenizer_special(tokenizer)

x = IncrementalDataset(idr.dataset,
                       transform=transforms.Compose([
                           Padding(max_length=idr.max_length),
                           ToTensor(tokenizer)
                       ]))
a = DataLoader(x, batch_size=10, sampler=SubsetRandomSampler(x.sampler))
for i, s in enumerate(a):
    print(i)
    print(s)

# interactive exploration of the tokenizer's attributes and helpers
tokenizer.ids_to_tokens[0]
tokenizer.convert_ids_to_tokens
tokenizer.vocab.keys()
tokenizer.build_inputs_with_special_tokens([95, 209], [95, 209])
tokenizer.pretrained_vocab_files_map

tokenizer.encode("<BOS> I like tea")
s = idr.dataset.tokens[0]
tokenizer.decode(tokenizer.encode(" ".join(s)))
encoded_tensor = torch.Tensor([
    tokenizer.encode(idr.dataset.partial[j])
    for j in range(len(idr.dataset.partial))
])
encoded_tensor = torch.Tensor(tokenizer.encode(idr.dataset.partial[0]))

model = BertModel.from_pretrained("bert-base-uncased",
                                  output_hidden_states=True)
model.eval()
with torch.no_grad():
Example #8
def convert_data_to_feature(FileName):
    with open(FileName, 'r', encoding='utf-8') as f:
        data = f.read()
    qa_pairs = data.split("\n")

    questions = []
    answers = []
    for qa_pair in qa_pairs:
        qa_pair = qa_pair.split()
        try:
            a, q = qa_pair
            questions.append(q)
            answers.append(a)
        except ValueError:
            # skip malformed lines
            continue

    assert len(answers) == len(questions)

    ans_dic = make_ans_dic(answers)
    question_dic = make_question_dic(questions)

    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')

    q_tokens = []
    max_seq_len = 0
    for q in question_dic.data:
        # print(tokenizer.tokenize(q))
        bert_ids = tokenizer.build_inputs_with_special_tokens(
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))
        if (len(bert_ids) > max_seq_len):
            max_seq_len = len(bert_ids)
        q_tokens.append(bert_ids)
        # print(bert_ids)
        # print(tokenizer.convert_ids_to_tokens(tokenizer.build_inputs_with_special_tokens(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))))

    print("最長問句長度:", max_seq_len)
    assert max_seq_len <= 512  # 小於BERT-base長度限制

    original_length = []
    # pad every sequence to max_seq_len
    for q in q_tokens:
        # record the original length before padding
        length = len(q)
        original_length.append(length)
        while len(q) < max_seq_len:
            q.append(0)

    a_labels = []
    for a in ans_dic.data:
        a_labels.append(ans_dic.to_id(a))
        # print (ans_dic.to_id(a))

    # BERT input embedding
    input_ids = q_tokens
    input_segment_ids = [
        [0] * max_seq_len for i in range(len(question_dic))
    ]  # token_type_ids: 0 marks the first sentence, 1 would mark a second sentence
    input_masks = []  # attention mask: 1 marks a real token, 0 marks padding
    for i in range(len(question_dic)):
        position_ids = []
        for j in range(original_length[i]):
            position_ids.append(1)
        while len(position_ids) < max_seq_len:
            position_ids.append(0)
        input_masks.append(position_ids)
    answer_lables = a_labels
    assert len(input_ids) == len(question_dic) and len(input_ids) == len(
        input_masks) and len(input_ids) == len(input_segment_ids)
    data_features = {
        'input_ids': input_ids,
        'input_segment_ids': input_segment_ids,
        'input_masks': input_masks,
        'answer_lables': answer_lables,
        'question_dic': question_dic,
        'answer_dic': ans_dic
    }

    # only train_data.txt is guaranteed to cover every class, so cache its features
    # so that predictions can later be converted back to the correct labels
    if FileName == 'train_data.txt':
        fp = open('trained_model/data_features.pkl', 'wb')
        pickle.dump(data_features, fp)
        fp.close()

    return data_features