def paragraphs2batch(paragraphs: List[str], tokenizer: BertTokenizerFast) -> \
        Tuple[List[List[str]], Dict]:
    """
    Turn raw paragraphs into a padded, length-capped model batch.

    Steps:
        1. Tokenize every paragraph and map the tokens to vocabulary ids.
        2. Right-pad the id sequences with 0 so they share one length, then
           cut every row down to at most ``max_input`` columns.
        3. Build the matching attention masks (1.0 for real tokens, 0.0 for
           padding), padded and cut the same way.

    ``max_input`` is read from module scope; it is not a parameter of this
    function. The returned token lists are the full tokenizations and are
    NOT shortened to ``max_input``.

    :param paragraphs: List of paragraphs.
    :param tokenizer: The BERT tokenizer.
    :return: Tokenized paragraphs and the batch that can be used as model inputs.
    """
    token_lists = [tokenizer.tokenize(text) for text in paragraphs]

    id_tensors = [
        torch.tensor(tokenizer.convert_tokens_to_ids(tokens), dtype=torch.long)
        for tokens in token_lists
    ]
    # One float mask entry per real token; padding positions are filled below.
    mask_tensors = [
        torch.ones_like(ids, dtype=torch.float) for ids in id_tensors
    ]

    padded_ids = pad_sequence(id_tensors, batch_first=True, padding_value=0)
    padded_mask = pad_sequence(mask_tensors, batch_first=True, padding_value=0.)

    batch = {
        'input_ids': padded_ids[:, :max_input],
        'attention_mask': padded_mask[:, :max_input],
    }
    return token_lists, batch
def _collate_xtokens(xtoken_df: pd.DataFrame, xtokenizer: BertTokenizerFast, pad) -> pd.DataFrame:
    """
    Pad every sentence in ``xtoken_df`` to the length of the longest sentence.

    Rows are grouped by ``sent_id``. For each sentence the ``token_id``,
    ``token`` and ``xtoken`` columns are read, the xtokens are converted to
    vocabulary ids, and all sequences are right-padded up to the longest
    sentence in the frame:

    * ``sent_idx``  -- padded by repeating the sentence's own id
    * ``token_idx`` -- padded with ``-1``
    * ``token``     -- padded with ``pad``
    * ``xtoken``    -- padded with ``xtokenizer.pad_token``
    * ``xtoken_id`` -- padded with ``xtokenizer.pad_token_id``

    :param xtoken_df: Frame with ``sent_id``, ``token_id``, ``token`` and
        ``xtoken`` columns, one row per xtoken.
    :param xtokenizer: Tokenizer used for id conversion and pad symbols.
    :param pad: Value used to pad the ``token`` column.
    :return: A frame with columns ``sent_idx``, ``token_idx``, ``token``,
        ``xtoken`` and ``xtoken_id``; empty (but with those columns) when
        ``xtoken_df`` contains no sentences.
    """
    columns = ['sent_idx', 'token_idx', 'token', 'xtoken', 'xtoken_id']
    sent_groups = xtoken_df.groupby(xtoken_df.sent_id)
    num_sentences = len(sent_groups)
    # default=0 keeps a frame without any sentence from raising ValueError.
    max_sent_len = max((len(sent_df) for _, sent_df in sent_groups), default=0)

    data_rows = []
    # The context manager closes the progress bar even if a sentence fails.
    with tqdm(total=num_sentences, desc="Sentence") as tq:
        for _, sent_df in sent_groups:
            sent_index = list(sent_df.sent_id)
            sent_token_index = list(sent_df.token_id)
            sent_tokens = list(sent_df.token)
            sent_xtokens = list(sent_df.xtoken)
            sent_xtoken_ids = xtokenizer.convert_tokens_to_ids(sent_xtokens)

            pad_len = max_sent_len - len(sent_index)
            # Padding rows keep the sentence id so they stay in their group.
            sent_index.extend(sent_index[-1:] * pad_len)
            sent_tokens.extend([pad] * pad_len)
            sent_token_index.extend([-1] * pad_len)
            sent_xtokens.extend([xtokenizer.pad_token] * pad_len)
            sent_xtoken_ids.extend([xtokenizer.pad_token_id] * pad_len)

            data_rows.extend(
                list(row)
                for row in zip(sent_index, sent_token_index, sent_tokens,
                               sent_xtokens, sent_xtoken_ids)
            )
            tq.update(1)

    return pd.DataFrame(data_rows, columns=columns)
def _generate_response(model, tokenizer, input_ids, args):
    """Sample a reply token by token and return it as a list of token ids.

    Generation stops after ``args.max_len`` tokens or as soon as ``[SEP]`` is
    sampled (the ``[SEP]`` itself is not part of the returned list).

    Arguments:
        model -- Language model; called as ``model(input_ids=...)`` and
            expected to expose ``.logits`` on its output.
        tokenizer -- Tokenizer providing ``sep_token_id`` and the ``[UNK]`` id.
        input_ids -- Context tensor, indexed as (batch, sequence) with batch 1.
        args -- Parsed options; reads ``max_len``, ``repetition_penalty``,
            ``temperature``, ``topk`` and ``topp``.

    Returns:
        The generated token ids.
    """
    response = []
    unk_id = tokenizer.convert_tokens_to_ids('[UNK]')
    for _ in range(args.max_len):
        outputs = model(input_ids=input_ids)
        next_token_logits = outputs.logits[0, -1, :]
        # Penalise every token that was already generated in this reply.
        # NOTE(review): dividing a negative logit by a penalty > 1 raises it
        # instead of lowering it; left as-is to keep sampling unchanged.
        for token_id in set(response):
            next_token_logits[token_id] /= args.repetition_penalty
        next_token_logits = next_token_logits / args.temperature
        # Never let the model emit [UNK].
        next_token_logits[unk_id] = -float('Inf')
        filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                top_k=args.topk,
                                                top_p=args.topp)
        # Draw one token index, weighted by the filtered probabilities.
        next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1),
                                       num_samples=1)
        if next_token == tokenizer.sep_token_id:
            # [SEP] marks the end of the reply.
            break
        response.append(next_token.item())
        input_ids = torch.cat((input_ids, next_token.unsqueeze(0)), dim=1)
    return response


def main():
    """Run an interactive chat loop against a pretrained GPT-2 model.

    Reads user input from stdin, builds the model context from the last
    ``args.max_history_len`` utterances, samples a reply and prints it.
    When ``args.save_samples_path`` is set, the conversation is appended to
    ``samples.txt`` in that directory. The loop ends on Ctrl+C or on
    end-of-input (Ctrl+Z / Ctrl+D); the samples file is closed on every exit
    path.
    """
    args = set_args()
    logger = create_logger(args)

    # Select the visible GPUs before the first CUDA query so the setting is
    # in place when CUDA is first touched.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    # Use the GPU only when the user allows it and one is available.
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))

    tokenizer = BertTokenizerFast(vocab_file=args.vocab_path,
                                  sep_token="[SEP]",
                                  pad_token="[PAD]",
                                  cls_token="[CLS]")
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model = model.to(device)
    model.eval()

    samples_file = None
    if args.save_samples_path:
        os.makedirs(args.save_samples_path, exist_ok=True)
        samples_file = open(args.save_samples_path + '/samples.txt', 'a',
                            encoding='utf8')
        samples_file.write("聊天记录{}:\n".format(datetime.now()))

    # Chat history; every utterance is stored as a list of token ids.
    history = []
    print('开始和chatbot聊天,输入CTRL + Z以退出')

    try:
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            while True:
                text = input("user:")
                if samples_file is not None:
                    samples_file.write("user:{}\n".format(text))
                text_ids = tokenizer.encode(text, add_special_tokens=False)
                history.append(text_ids)

                # Context layout: [CLS] utt_1 [SEP] utt_2 [SEP] ...
                input_ids = [tokenizer.cls_token_id]
                for history_utr in history[-args.max_history_len:]:
                    input_ids.extend(history_utr)
                    input_ids.append(tokenizer.sep_token_id)
                input_ids = torch.tensor(input_ids).long().to(device)
                input_ids = input_ids.unsqueeze(0)

                response = _generate_response(model, tokenizer, input_ids,
                                              args)
                history.append(response)
                text = tokenizer.convert_ids_to_tokens(response)
                print("chatbot:" + "".join(text))
                if samples_file is not None:
                    samples_file.write("chatbot:{}\n".format("".join(text)))
    except (KeyboardInterrupt, EOFError):
        # Ctrl+C, or end-of-input as advertised in the banner: leave quietly.
        pass
    finally:
        if samples_file is not None:
            samples_file.close()
def squad_features(
        context: str, question: str, answer: Union[str, None],
        start_char_pos: Union[int, None],
        tokenizer: BertTokenizerFast) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor

    Builds ``[CLS] question [SEP] context [SEP]`` and converts the
    character-based answer start into token indices. When an answer is given,
    the context is tokenized in three pieces (text before the answer, the
    answer itself, text after the answer) so the answer always occupies whole
    tokens, even if it starts or ends inside a word.

    Note: input_ids and token_type_ids follow the transformers library
    documentation https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. Only used when start_char_pos is not None.
    start_char_pos -- Character index which the answer starts from in the
                      context. None means "no answer".
    tokenizer -- Tokenizer to encode text strings.

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids (0 for [CLS] + question + [SEP],
                      1 for context + final [SEP])
    start_token_pos -- Token index which the answer starts from in the
                       input_ids list. None if no answer is given.
    end_token_pos -- Token index of the last answer token (inclusive) in the
                     input_ids list. None if no answer is given.
    """
    tokens = ["[CLS]"] + tokenizer.tokenize(question) + ["[SEP]"]
    token_type_ids = [0] * len(tokens)

    if start_char_pos is None:
        # No answer: tokenize the context as a whole.
        token_context = tokenizer.tokenize(context)
        tokens = tokens + token_context + ["[SEP]"]
        token_type_ids = token_type_ids + [1] * (len(token_context) + 1)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        return input_ids, token_type_ids, None, None

    token_answer = tokenizer.tokenize(answer)
    answer_end = start_char_pos + len(answer)
    # Tokens of everything from the answer start to the end of the context.
    tail_tokens = tokenizer.tokenize(context[start_char_pos:])

    # An answer that ends the context has no following character to inspect.
    glued_to_next = (answer_end < len(context)
                     and not _is_whitespace(context[answer_end]))
    # Tokens are compared by value: identity (`is not`) is effectively always
    # true for separately produced strings and would force the re-tokenizing
    # branch even when the answer already ends on a token boundary.
    if glued_to_next and tail_tokens[len(token_answer) - 1] != token_answer[-1]:
        # The answer ends inside a word: tokenize the remainder on its own
        # and mark its first piece as a continuation.
        back_context = tokenizer.tokenize(context[answer_end:])
        if back_context:
            back_context[0] = "##" + back_context[0]
    else:
        back_context = tail_tokens[len(token_answer):]

    if start_char_pos == 0:
        front_context = []
    else:
        front_context = tokenizer.tokenize(context[:start_char_pos])
        if not _is_whitespace(context[start_char_pos - 1]):
            # The answer starts right after a non-whitespace character, so
            # its first token is treated as a continuation piece.
            token_answer[0] = "##" + token_answer[0]

    start_token_pos = len(tokens) + len(front_context)
    end_token_pos = start_token_pos + len(token_answer) - 1

    token_context = front_context + token_answer + back_context
    token_type_ids = token_type_ids + [1] * (len(token_context) + 1)
    tokens = tokens + token_context + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    return input_ids, token_type_ids, start_token_pos, end_token_pos
def squad_features(
        context: str, question: str, answer: Union[str, None],
        start_char_pos: Union[int, None],
        tokenizer: BertTokenizerFast) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor

    Encodes the pair as ``[CLS] question [SEP] context [SEP]`` with
    ``tokenizer.encode_plus`` and converts the character-based answer start
    into token indices.

    Note: input_ids and token_type_ids follow the transformers library
    documentation https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string, or None.
    start_char_pos -- Character index which the answer starts from in the
                      context, or None.
    tokenizer -- Tokenizer to encode text strings.

    Returns:
    input_ids -- Input ids. If the answer is only the leading part of a
                 single context token, that token is replaced by two pieces
                 (answer part + "##" remainder).
    token_type_ids -- Token type ids, extended by one entry when the split
                      above happens.
    start_token_pos -- Token index which the answer starts from in the
                       input_ids list. None when both answer and
                       start_char_pos are None.
    end_token_pos -- Token index of the last answer token (inclusive).
                     None when both answer and start_char_pos are None.
    """
    encoding = tokenizer.encode_plus(question, context)
    input_ids = encoding["input_ids"]
    token_type_ids = encoding["token_type_ids"]

    if answer is None and start_char_pos is None:
        return input_ids, token_type_ids, None, None

    # Segment 0 covers [CLS] + question + [SEP]; the context starts after it.
    context_offset = token_type_ids.count(0)
    tokens_before_answer = len(tokenizer.tokenize(context[:start_char_pos]))
    start_token_pos = context_offset + tokens_before_answer
    end_token_pos = start_token_pos + len(tokenizer.tokenize(answer)) - 1

    # Text of the tokens currently sitting at the computed answer span.
    span_tokens = tokenizer.convert_ids_to_tokens(
        input_ids[start_token_pos:end_token_pos + 1])
    span_text = " ".join(span_tokens)
    leading_prefix = "##" if "##" in span_text else ""
    span_text = span_text.replace('#', '')

    covers_one_token = start_token_pos == end_token_pos
    if span_text != answer.lower() and covers_one_token and answer in span_text:
        # The answer is only part of one token: split that token into the
        # answer piece and a "##"-prefixed remainder.
        head = span_text[:len(answer)]
        remainder = span_text[len(answer):]
        replacement_ids = tokenizer.convert_tokens_to_ids(
            [leading_prefix + head, "##" + remainder])
        input_ids = (input_ids[:start_token_pos]
                     + replacement_ids
                     + input_ids[end_token_pos + 1:])
        # One token became two, so the context segment grows by one.
        token_type_ids.append(1)

    assert len(input_ids) == len(token_type_ids)

    return input_ids, token_type_ids, start_token_pos, end_token_pos
def squad_features(
    context: str,
    question: str,
    answer: Union[str, None],
    start_char_pos: Union[int, None],
    tokenizer: BertTokenizerFast
) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor

    Encodes the pair as ``[CLS] question [SEP] context [SEP]`` and converts
    the character-based answer position into token indices by locating each
    context token inside the lower-cased context string.

    Note: input_ids and token_type_ids follow the transformers library
    documentation https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, None is returned for
              start_token_pos and end_token_pos.
    start_char_pos -- Character index which the answer starts from in the
                      context. If the answer is None, this argument is also None.
    tokenizer -- Tokenizer to encode text strings.

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_token_pos -- Token index which the answer starts from in the
                       input_ids list. None if no answer is given.
    end_token_pos -- Token index of the last answer token (inclusive) in the
                     input_ids list. None if no answer is given.

    Raises:
    ValueError -- If an answer is given but no context token starts at or
                  after start_char_pos.
    """
    encoded = tokenizer.encode_plus(question, context)
    input_ids = encoded['input_ids']
    token_type_ids = encoded['token_type_ids']
    tokens = tokenizer.tokenize(context)

    # The context starts right after the first [SEP]. The id is taken from
    # the tokenizer instead of assuming a particular vocabulary (102).
    context_offset = input_ids.index(tokenizer.sep_token_id) + 1

    # NOTE(review): hard-coded special case kept from the original -- the
    # piece '##words' is split into '##word' + '##s' so that an answer ending
    # in "word" can end on a token boundary. Only the first occurrence is
    # handled.
    if '##words' in tokens:
        words_idx = tokens.index('##words')
        tokens[words_idx:words_idx + 1] = ['##word', '##s']
        input_ids[context_offset + words_idx] = \
            tokenizer.convert_tokens_to_ids('##word')
        input_ids.insert(context_offset + words_idx + 1,
                         tokenizer.convert_tokens_to_ids('##s'))
        token_type_ids.insert(context_offset + words_idx + 1, 1)

    if answer is None:
        return input_ids, token_type_ids, None, None

    # Find the character offset at which every context token starts.
    lowered = context.lower()
    token_starts = []
    cursor = 0
    for token in tokens:
        # Drop the leading '#' markers but always keep at least one character
        # (so a token made only of '#' still matches a literal '#').
        piece = token.lstrip('#') or token[-1:]
        found = lowered.find(piece, cursor)
        if found == -1:
            # The token text is not literally present (e.g. an unknown-token
            # placeholder): pin it to the cursor and do not advance, so the
            # following tokens are still searched from the right place.
            found = cursor
        else:
            cursor = found + len(piece)
        token_starts.append(found)

    start_idx = next(
        (i for i, begin in enumerate(token_starts) if begin >= start_char_pos),
        None)
    if start_idx is None:
        raise ValueError(
            'no context token starts at or after start_char_pos={}'.format(
                start_char_pos))

    # The answer runs up to the token before the first one that starts at or
    # beyond the answer's end; by default it runs to the last context token.
    answer_end_char = start_char_pos + len(answer)
    end_idx = len(tokens) - 1
    for i in range(start_idx, len(tokens)):
        if token_starts[i] >= answer_end_char:
            end_idx = i - 1
            break

    start_token_pos = start_idx + context_offset
    end_token_pos = end_idx + context_offset
    return input_ids, token_type_ids, start_token_pos, end_token_pos
def create_from_document(doc_idx, doc, all_docs, max_seq_length,
                         tokenizer: BertTokenizerFast):
    """
    Build next-sentence-prediction instances from one tokenized document.

    Closely follows the reference BERT data generator
    (github.com/google-research/bert/blob/master/create_pretraining_data.py)
    with two differences:
        - short sequences are never kept on purpose;
        - no tokens are masked here; masking is left to training time
          (dynamic masking, as in the RoBERTa paper).

    ``doc`` and every entry of ``all_docs`` are sequences of segments, each
    segment being a list of WordPiece token strings.

    Each returned instance is a dict with ``input_ids``, ``token_type_ids``
    and ``is_random_next``.
    """
    # Room for the tokens only: one [CLS] and two [SEP] are added later.
    target_seq_length = max_seq_length - 3

    instances = []
    chunk = []       # segments collected for the instance being built
    chunk_len = 0    # number of tokens in `chunk`
    i = 0            # position in `doc`; may move backwards, see below

    while i < len(doc):
        chunk.append(doc[i])
        chunk_len += len(doc[i])

        # Keep collecting until the chunk is long enough or the doc ends.
        if i != len(doc) - 1 and chunk_len < target_seq_length:
            i += 1
            continue

        # Sentence A takes the first `a_end` segments of the chunk.
        # NOTE(review): assumes randint has inclusive bounds, like
        # random.randint -- confirm against the file's imports.
        a_end = randint(1, max(1, len(chunk) - 1))
        sentence_a = [tok for seg in chunk[:a_end] for tok in seg]
        sentence_b = []

        chance = random()
        # A random B is used about half of the time, and always when the
        # chunk has a single segment -- provided another doc can exist.
        is_random_next = len(all_docs) > 1 and \
            (len(chunk) == 1 or chance < 0.5)

        if is_random_next:
            b_target_len = target_seq_length - len(sentence_a)

            # Try a few times to pick a document other than this one.
            other_idx = -1
            for _ in range(10):
                other_idx = randint(0, len(all_docs) - 1)
                if other_idx != doc_idx:
                    break

            # Start somewhere in the first half of that document to leave
            # room for a bigger sentence.
            other_doc = all_docs[other_idx]
            start_at = randint(0, len(other_doc) // 2)
            for j in range(start_at, len(other_doc)):
                sentence_b.extend(other_doc[j])
                if len(sentence_b) >= b_target_len:
                    break

            # Segments not used for sentence A go back to the pool: rewind
            # so they are visited again by the next chunk.
            i -= len(chunk) - a_end
        else:
            # Real next sentence: the remainder of the chunk.
            sentence_b = [tok for seg in chunk[a_end:] for tok in seg]

        truncate_seq_pair(sentence_a, sentence_b, target_seq_length)
        assert len(sentence_a) >= 1
        assert len(sentence_b) >= 1

        final_seq = ['[CLS]', *sentence_a, '[SEP]', *sentence_b, '[SEP]']
        # Segment 0: [CLS] + A + [SEP]; segment 1: B + [SEP].
        segment_ids = [0] * (len(sentence_a) + 2) + \
            [1] * (len(sentence_b) + 1)

        instances.append({
            'input_ids': tokenizer.convert_tokens_to_ids(final_seq),
            'token_type_ids': segment_ids,
            'is_random_next': is_random_next
        })

        chunk = []
        chunk_len = 0
        i += 1

    return instances