def collate_fn(batch_data):
    """Collate a batch of ``(text, label)`` pairs into padded tensors.

    Both the text and the label of every sample are encoded with the same
    tokenizer and padded/truncated to one common length, derived from the
    longest text in the batch.

    Args:
        batch_data: Sequence of ``(text, label)`` pairs. Both elements are
            passed to the tokenizer with ``is_pretokenized=True``.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_mask, labels)``;
        ``attention_mask`` is a float tensor, the others are long tensors.

    Raises:
        ValueError: If ``batch_data`` is empty (``max`` of an empty sequence).
    """
    # Reading the vocabulary file is identical for every batch, so build the
    # tokenizer once and keep it on the function object instead of reloading
    # it on every call.
    tokenizer = getattr(collate_fn, '_tokenizer', None)
    if tokenizer is None:
        tokenizer = BertTokenizer('./data/bert/nezha-base-www/vocab.txt')
        collate_fn._tokenizer = tokenizer

    # NOTE(review): the +2 presumably leaves room for the special tokens
    # added by encode_plus -- confirm.
    max_len = max(len(sample[0]) for sample in batch_data) + 2

    input_ids, token_type_ids, attention_mask, labels = [], [], [], []
    for text, label in batch_data:
        inputs = tokenizer.encode_plus(text=text,
                                       max_length=max_len,
                                       pad_to_max_length=True,
                                       is_pretokenized=True,
                                       return_token_type_ids=True,
                                       return_attention_mask=True,
                                       truncation=True)
        label = tokenizer.encode_plus(text=label,
                                      max_length=max_len,
                                      pad_to_max_length=True,
                                      is_pretokenized=True,
                                      return_token_type_ids=False,
                                      return_attention_mask=False,
                                      truncation=True)
        input_ids.append(inputs['input_ids'])
        token_type_ids.append(inputs['token_type_ids'])
        attention_mask.append(inputs['attention_mask'])
        labels.append(label['input_ids'])

    input_ids = torch.tensor(input_ids).long()
    token_type_ids = torch.tensor(token_type_ids).long()
    attention_mask = torch.tensor(attention_mask).float()
    labels = torch.tensor(labels).long()
    return input_ids, token_type_ids, attention_mask, labels
def _iter_row(df,
              inputs: dict,
              task: str,
              tokenizer: BertTokenizer,
              train: bool,
              train_val_split_ratio: float,
              label2id: Optional[dict] = None) -> Tuple[dict, bool]:
    """Tokenize every row of ``df`` and append the features to ``inputs``.

    Column 1 holds the first text. For the ``'ocnli'`` task column 2 holds
    the second text and column 3 the target; for every other task column 2
    holds the target.

    Args:
        df: Frame whose rows are iterated with ``iterrows``.
        inputs: Mapping of lists that receives ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` per row, and finally
            the converted ``targets``.
        task: Task name; only ``'ocnli'`` is treated as a sentence pair.
        tokenizer: Tokenizer used to encode the texts.
        train: Returned unchanged unless ``label2id`` is ``None``, in which
            case ``True`` is returned.
        train_val_split_ratio: When ``None`` the target column is not read
            and the first key of ``label2id`` is used as a placeholder.
        label2id: Existing label mapping, or ``None`` to build one.

    Returns:
        ``(label2id, train)``.
    """
    is_pair_task = task == 'ocnli'
    raw_targets = []
    progress = tqdm(df.iterrows(), total=len(df), desc=f'Preprocess {task}')
    for _, record in progress:
        first_text = record[1]
        if is_pair_task:
            label_column = 3
            second_text = record[2]
            encoded = tokenizer.encode_plus(first_text,
                                            second_text,
                                            add_special_tokens=True,
                                            return_token_type_ids=True,
                                            return_attention_mask=True)
        else:
            label_column = 2
            encoded = tokenizer.encode_plus(first_text,
                                            add_special_tokens=True,
                                            return_token_type_ids=True,
                                            return_attention_mask=True)

        for field in ('input_ids', 'token_type_ids', 'attention_mask'):
            inputs[field].append(encoded[field])

        if train_val_split_ratio is None:
            # No real target available: use the first known label.
            raw_targets.append(list(label2id.keys())[0])
        else:
            raw_targets.append(record[label_column])

    targets_series = pd.Series(raw_targets)
    if label2id is not None:
        # Single-element unpacking, exactly as the original call site does.
        targets, = convert_label_to_id(targets_series, label2id)
    else:
        train = True
        targets, label2id = convert_label_to_id(targets_series)

    inputs['targets'] = targets
    return label2id, train
def convert_input_example(example: InputExample, tokenizer: BertTokenizer,
                          max_seq_len):
    """Encode ``example.text`` into a ``BertFeature`` of length ``max_seq_len``.

    The tokenizer output is right-padded with zeros whenever it is shorter
    than ``max_seq_len``.

    Args:
        example: Example whose ``text`` attribute is encoded.
        tokenizer: Tokenizer used for encoding.
        max_seq_len: Target length of every returned sequence.

    Returns:
        ``BertFeature`` holding ``token_ids``, ``attention_masks`` and
        ``token_type_ids``.
    """
    encode_dict = tokenizer.encode_plus(text=example.text,
                                        max_length=max_seq_len,
                                        pad_to_max_length=True,
                                        return_token_type_ids=True,
                                        return_attention_mask=True,
                                        truncation=True,
                                        padding=True)

    # Top up with zeros to the fixed length; the same filler is used for all
    # three sequences.
    pad_len = max_seq_len - len(encode_dict['input_ids'])
    filler = [0] * pad_len

    feature = BertFeature(
        # bert inputs
        token_ids=encode_dict['input_ids'] + filler,
        attention_masks=encode_dict['attention_mask'] + filler,
        token_type_ids=encode_dict['token_type_ids'] + filler,
    )
    return feature
def trans_title_ent_to_bert_ipt(title: str, ent_info,
                                tokenizer: BertTokenizer, attr2max_len: Dict,
                                used_attrs: List[str]):
    """Encode ``title`` paired with each requested entity attribute.

    :param title: first sequence of every encoded pair
    :param ent_info: mapping of attribute name to value; ``"company"`` and
        ``"place"`` are always read in addition to ``used_attrs``
    :param tokenizer: tokenizer used for encoding
    :param attr2max_len: maximum encoded length per attribute name
    :param used_attrs: attribute names to encode
    :return: Dict[attr_name: tokenizer output with "pt" tensors]
    """

    def _with_context(value):
        # Company is prepended first and place second, so a value with both
        # reads "place。company。value".
        for context_key in ("company", "place"):
            if not DataUtil.is_null(ent_info[context_key]):
                value = ent_info[context_key] + "。" + value
        return value

    return {
        attr_name: tokenizer.encode_plus(title,
                                         _with_context(ent_info[attr_name]),
                                         max_length=attr2max_len[attr_name],
                                         padding="max_length",
                                         truncation=True,
                                         return_tensors="pt")
        for attr_name in used_attrs
    }
def find_answer(tokenizer: BertTokenizer,
                answer_model: BertForQuestionAnswering, query: str,
                text: str) -> str:
    """Extract the answer span for ``query`` from ``text``.

    The pair is encoded (truncated to 256 tokens) and fed to
    ``answer_model``. The arg-max start/end positions are used directly when
    start < end; otherwise lower-ranked candidates are searched for a
    consistent replacement of either the start or the end.

    Args:
        tokenizer: Tokenizer used for encoding and decoding.
        answer_model: Model returning ``(start_logits, end_logits)``.
        query: The question.
        text: The passage searched for the answer.

    Returns:
        The decoded tokens in ``[start_pos, end_pos)``; empty when no
        consistent span is found.
    """
    inputs = tokenizer.encode_plus(query,
                                   text,
                                   max_length=256,
                                   truncation=True,
                                   return_tensors="pt")
    with torch.no_grad():
        start, end = answer_model(**inputs)

    start_pos = torch.argmax(start).item()
    end_pos = torch.argmax(end).item()

    if start_pos >= end_pos:
        start = torch.softmax(start, dim=1)
        end = torch.softmax(end, dim=1)
        # argsort is ascending, so index -1 is the arg-max that was already
        # rejected; the search starts at the second-best candidate (-2).
        start_args = torch.argsort(start).tolist()[0]
        end_args = torch.argsort(end).tolist()[0]

        def calc_score(s, e):
            return start[0][s] * end[0][e]

        s_score, e_score = 0, 0
        s_pos, e_pos = start_pos, end_pos
        k = -2
        num_candidates = len(start_args)
        # Stop when the candidates are exhausted instead of indexing past
        # the front of the ranking lists.
        while (s_score == 0 or e_score == 0) and -k <= num_candidates:
            s_pos = start_args[k]
            e_pos = end_args[k]
            s_score = 0 if s_pos > end_pos else calc_score(s_pos, end_pos)
            e_score = 0 if e_pos < start_pos else calc_score(start_pos, e_pos)
            k -= 1

        if s_score > e_score:
            start_pos = s_pos
        else:
            end_pos = e_pos

    # Decode from the ids the model actually saw. Re-tokenizing without
    # truncation can produce different positions once the pair is truncated.
    token_ids = inputs["input_ids"][0].tolist()
    return tokenizer.decode(token_ids[start_pos:end_pos])
def preprocess(self, data):
    """Convert the first request of ``data`` into a tokenizer encoding.

    The text is read from the ``"data"`` field of ``data[0]``, falling back
    to ``"body"`` when ``"data"`` is missing.

    :param data: list of request mappings; only ``data[0]`` is read
    :return: encoding padded/truncated to 32 tokens, with ``input_ids`` and
        ``attention_mask`` as PyTorch tensors
    """
    text = data[0].get("data")
    if text is None:
        text = data[0].get("body")
    # Only byte payloads need decoding; a str payload has no ``.decode`` and
    # previously raised AttributeError.
    if isinstance(text, (bytes, bytearray)):
        text = text.decode("utf-8")

    # Built from the local vocabulary file rather than
    # ``from_pretrained("bert-base-cased")``.
    tokenizer = BertTokenizer(self.VOCAB_FILE)
    encoding = tokenizer.encode_plus(
        text,
        max_length=32,
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        return_token_type_ids=False,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="pt",  # Return PyTorch tensors
        truncation=True,
    )
    return encoding
def read_data(config: dict, tokenizer: BertTokenizer, debug=False) -> dict:
    """Tokenize the sentence-pair train and test TSV files.

    Columns 0 and 1 of each file are encoded as a pair. Column 2 is the
    label for train rows; test rows get the placeholder label ``0``.

    Args:
        config: Not read by this function.
        tokenizer: Tokenizer used to encode each pair.
        debug: When true, only the first 1000 rows of each file are used.

    Returns:
        ``{'train': ..., 'test': ...}``; each value maps ``input_ids``,
        ``token_type_ids``, ``attention_mask`` and ``labels`` to per-row
        lists.
    """
    train_file_path = '../tcdata/nlp_round2_data/pretrain_data.tsv'
    test_file_path = (
        '../tcdata/nlp_round1_data/gaiic_track3_round1_testB_20210317.tsv')

    train_df = pd.read_csv(train_file_path, header=None, sep='\t')
    test_df = pd.read_csv(test_file_path, header=None, sep='\t')
    if debug:
        train_df = train_df.head(1000)
        test_df = test_df.head(1000)

    processed_data = {}
    for data_type, df in {'train': train_df, 'test': test_df}.items():
        inputs = defaultdict(list)
        for _, row in tqdm(df.iterrows(),
                           desc=f'Preprocessing {data_type} data',
                           total=len(df)):
            label = 0 if data_type == 'test' else row[2]
            sentence_a, sentence_b = row[0], row[1]
            inputs_dict = tokenizer.encode_plus(sentence_a,
                                                sentence_b,
                                                add_special_tokens=True,
                                                return_token_type_ids=True,
                                                return_attention_mask=True)
            inputs['input_ids'].append(inputs_dict['input_ids'])
            inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
            inputs['attention_mask'].append(inputs_dict['attention_mask'])
            inputs['labels'].append(label)
        processed_data[data_type] = inputs
    return processed_data
def preprocess_for_finbert(data, vocab_file, max_length=MAX_SEQ_LENGTH):
    """Encode every sentence of ``data`` to fixed-length tensors.

    Args:
        data: Iterable of sentences.
        vocab_file: Vocabulary file used to build a lower-casing tokenizer.
        max_length: Length every sentence is padded/truncated to.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_masks)`` of tensors
        with one row per sentence.
    """
    tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=True)

    encoded_sentences = [
        tokenizer.encode_plus(text=sentence,
                              add_special_tokens=True,
                              max_length=max_length,
                              truncation=True,
                              padding='max_length',
                              return_token_type_ids=True,
                              return_attention_mask=True)
        for sentence in data
    ]

    def _stack(field):
        # One row per sentence, converted to a tensor in a single step.
        return torch.tensor([enc.get(field) for enc in encoded_sentences])

    return (_stack('input_ids'), _stack('token_type_ids'),
            _stack('attention_mask'))
def get_single_loader_from_raw_text(src_lines, trg_lines,
                                    tokenizer: BertTokenizer, batch_size):
    """Build a shuffled ``DataLoader`` from parallel source/target lines.

    Every source line supplies ``input_ids`` and ``attention_mask``; the
    matching target line supplies the ``labels`` (its ``input_ids``). All
    sequences are encoded with ``max_length=512`` and
    ``pad_to_max_length=True``.

    Args:
        src_lines: Source texts.
        trg_lines: Target texts; must have the same length as ``src_lines``.
        tokenizer: Tokenizer used for both sides.
        batch_size: Batch size of the returned loader.

    Returns:
        ``DataLoader`` over a ``TensorDataset`` of
        ``(input_ids, attention_mask, labels)``.
    """
    assert len(src_lines) == len(trg_lines)

    total = len(src_lines)
    columns = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for position, (src, trg) in enumerate(zip(src_lines, trg_lines)):
        if position % 100 == 0:
            logger.info('%d/%d processed.' % (position, total))

        source_encoding = tokenizer.encode_plus(src,
                                                max_length=512,
                                                pad_to_max_length=True)
        columns['input_ids'].append(source_encoding['input_ids'])
        columns['attention_mask'].append(source_encoding['attention_mask'])

        target_encoding = tokenizer.encode_plus(trg,
                                                max_length=512,
                                                pad_to_max_length=True)
        columns['labels'].append(target_encoding['input_ids'])

    # Every example is used for training; nothing is held out.
    train_tensors = [
        torch.LongTensor(columns[name])
        for name in ('input_ids', 'attention_mask', 'labels')
    ]
    train_data_set = TensorDataset(*train_tensors)
    return DataLoader(train_data_set, batch_size=batch_size, shuffle=True)
def convert_attribution_example(ex_idx, example: AttributionExample,
                                max_seq_len, tokenizer: BertTokenizer,
                                polarity2id, tense2id):
    """Convert an attribution example into an ``AttributionFeature``.

    Args:
        ex_idx: Index of the example; the first three train examples are
            logged.
        example: Example providing ``set_type``, ``text``, ``label`` and
            ``trigger``. ``trigger[0]`` is used for its length and
            ``trigger[1]`` as an offset.
        max_seq_len: Length of the encoded sequences and of the pooling mask.
        tokenizer: Tokenizer used for encoding.
        polarity2id: Maps ``label[1]`` to an id.
        tense2id: Maps ``label[0]`` to an id.

    Returns:
        ``AttributionFeature`` with token ids, masks, trigger location,
        pooling mask and ``[tense_id, polarity_id]`` labels.
    """
    set_type = example.set_type
    raw_text = example.text
    raw_label = example.label
    trigger_text, trigger_offset = example.trigger[0], example.trigger[1]

    tokens = fine_grade_tokenize(raw_text, tokenizer)

    # Positions are shifted by one relative to the raw offset.
    # NOTE(review): presumably to account for a leading [CLS] -- confirm.
    trigger_start = trigger_offset + 1
    trigger_end = trigger_offset + len(trigger_text)
    trigger_loc = (trigger_start, trigger_end)

    labels = [tense2id[raw_label[0]], polarity2id[raw_label[1]]]

    encode_dict = tokenizer.encode_plus(text=tokens,
                                        max_length=max_seq_len,
                                        pad_to_max_length=True,
                                        is_pretokenized=True,
                                        return_token_type_ids=True,
                                        return_attention_mask=True)
    token_ids = encode_dict['input_ids']
    attention_masks = encode_dict['attention_mask']
    token_type_ids = encode_dict['token_type_ids']

    # Take a window of 20 positions on each side of the trigger as its
    # context.
    window_size = 20
    window_first = max(1, trigger_start - window_size)
    window_stop = min(min(1 + len(raw_text), max_seq_len - 1),
                      trigger_end + window_size)

    pooling_masks = [0] * max_seq_len
    for position in range(window_first, window_stop):
        pooling_masks[position] = 1
    # The trigger itself is excluded from the pooled context.
    for position in range(trigger_start, trigger_end + 1):
        pooling_masks[position] = 0

    if ex_idx < 3 and set_type == 'train':
        for message in (
                f"*** {set_type}_example-{ex_idx} ***",
                f'text: {" ".join(tokens)}',
                f"token_ids: {token_ids}",
                f"attention_masks: {attention_masks}",
                f"token_type_ids: {token_type_ids}",
                f'trigger loc: {trigger_loc}',
                f'labels: {labels}',
        ):
            logger.info(message)

    return AttributionFeature(token_ids=token_ids,
                              attention_masks=attention_masks,
                              token_type_ids=token_type_ids,
                              trigger_loc=trigger_loc,
                              pooling_masks=pooling_masks,
                              labels=labels)
def find_answer(tokenizer: BertTokenizer, model: BertForQuestionAnswering,
                context: str, question: str):
    """Return the answer span for ``question`` found in ``context``.

    Args:
        tokenizer: Tokenizer used for encoding and decoding.
        model: Question-answering model; its first two outputs are taken as
            start and end scores.
        context: Passage searched for the answer.
        question: The question.

    Returns:
        Decoded tokens in ``[start, end)`` of the encoded
        question/context pair.
    """
    input_data = tokenizer.encode_plus(question, context, return_tensors="pt")
    with torch.no_grad():
        out = model(**input_data)

    start = torch.argmax(out[0]).item()
    end = torch.argmax(out[1]).item()

    # Reuse the ids that were fed to the model instead of tokenizing the
    # pair a second time.
    token_ids = input_data["input_ids"][0].tolist()
    return tokenizer.decode(token_ids[start:end])
def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:
    """Encode one multiple-choice sample as ``(stem, option)`` pairs.

    Each option of ``x["options"]`` is paired with ``x["stem"]``, encoded,
    and padded to ``MAX_LEN``.

    Args:
        tokenizer: Tokenizer used for encoding.
        x: Sample with ``"id"``, ``"stem"``, ``"options"`` (mapping whose
            values are the option texts) and ``"answer_key"``.

    Returns:
        Dict with ``id``, ``label`` (``-1`` when the answer key is not in
        ``label_map``) and ``input_ids``/``attention_mask``/
        ``token_type_ids`` tensors with one row per option.
    """
    pad_token_id = tokenizer.pad_token_id
    # Segment ids have their own padding value; the vocabulary pad id is only
    # valid for ``input_ids``.
    pad_token_type_id = tokenizer.pad_token_type_id

    choices_features = []
    for option in x["options"].values():
        inputs = tokenizer.encode_plus(
            x["stem"],
            option,
            add_special_tokens=True,
            max_length=MAX_LEN,
        )
        input_ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        attention_mask = [1] * len(input_ids)

        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + (
            [pad_token_type_id] * padding_length)

        for sequence in (input_ids, attention_mask, token_type_ids):
            assert len(sequence) == MAX_LEN, \
                "Error with input length {} vs {}".format(
                    len(sequence), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        })

    label = label_map.get(x["answer_key"], -1)
    label = torch.tensor(label).long()

    return {
        "id": x["id"],
        "label": label,
        "input_ids":
        torch.tensor([cf["input_ids"] for cf in choices_features]),
        "attention_mask":
        torch.tensor([cf["attention_mask"] for cf in choices_features]),
        "token_type_ids":
        torch.tensor([cf["token_type_ids"] for cf in choices_features]),
    }
def find_yesno_answer(tokenizer: BertTokenizer,
                      question_model: BertForSequenceClassification,
                      question: str, text: str) -> str:
    """Classify a yes/no question about ``text``.

    The model's first output is soft-maxed into three probabilities read in
    the order ``(no, yes, none)``.

    Args:
        tokenizer: Tokenizer used to encode the pair.
        question_model: Three-class sequence classifier.
        question: The question.
        text: The passage the question refers to.

    Returns:
        ``"не знаю"`` when the ``none`` probability exceeds 0.5, otherwise
        ``"нет"`` if ``no`` is more likely than ``yes``, else ``"да"``.
    """
    encoded_pair = tokenizer.encode_plus(question, text, return_tensors="pt")
    with torch.no_grad():
        logits = question_model(**encoded_pair)[0]

    no, yes, none = torch.softmax(logits, dim=1).tolist()[0]
    if none > 0.5:
        return "не знаю"
    return "нет" if no > yes else "да"
def tokenize(tokenizer: BertTokenizer, sentence: str, max_len: int = 50):
    """Encode a single sentence padded to ``max_len``.

    For single sequences:
        tokens:   [CLS] I am a boy . [SEP]
        type_ids:   0   0  0 0  0  0   0

    Args:
        tokenizer: Tokenizer used for encoding.
        sentence: Sentence to encode.
        max_len: Required length of every returned sequence.

    Returns:
        The tokenizer output; ``input_ids``, ``token_type_ids`` and
        ``attention_mask`` are asserted to have length ``max_len``.
    """
    encoded = tokenizer.encode_plus(sentence,
                                    max_length=max_len,
                                    pad_to_max_length=True)
    for field in ('input_ids', 'token_type_ids', 'attention_mask'):
        assert len(encoded[field]) == max_len
    return encoded
def _read_data(self, pretrain_csv_path, tokenizer: BertTokenizer) -> dict:
    """Tokenize column 0 of a tab-separated pre-training file.

    Every run of characters from ``punctuation`` is replaced with the literal
    text ``[SEP]`` before encoding.

    Args:
        pretrain_csv_path: Path of the header-less, tab-separated file.
        tokenizer: Tokenizer used for encoding.

    Returns:
        Mapping of ``input_ids``, ``token_type_ids`` and ``attention_mask``
        to per-row lists.
    """
    frame = pd.read_csv(pretrain_csv_path, header=None, sep='\t')
    features = defaultdict(list)
    for _, record in tqdm(frame.iterrows(), desc='', total=len(frame)):
        stripped = record[0].strip()
        sentence = re.sub(r"[%s]+" % punctuation, '[SEP]', stripped)
        encoded = tokenizer.encode_plus(sentence,
                                        add_special_tokens=True,
                                        return_token_type_ids=True,
                                        return_attention_mask=True)
        for field in ('input_ids', 'token_type_ids', 'attention_mask'):
            features[field].append(encoded[field])
    return features
def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:
    """Encode one cloze/multiple-choice sample as ``(article, text_b)`` pairs.

    For each option, ``text_b`` is the question with every ``_`` replaced by
    the option, or the question followed by a space and the option when it
    has no ``_``. Each pair is encoded and padded to ``MAX_LEN``.

    Args:
        tokenizer: Tokenizer used for encoding.
        x: Sample with ``"id"``, ``"article"``, ``"question"``,
            ``"options"`` (iterable of strings) and ``"answer"``.

    Returns:
        Dict with ``id``, ``label`` (``-1`` when the answer is not in
        ``label_map``) and ``input_ids``/``attention_mask``/
        ``token_type_ids`` tensors with one row per option.
    """
    pad_token_id = tokenizer.pad_token_id
    # Segment ids have their own padding value; the vocabulary pad id is only
    # valid for ``input_ids``.
    pad_token_type_id = tokenizer.pad_token_type_id

    choices_features = []
    option: str
    for option in x["options"]:
        text_a = x["article"]
        if x["question"].find("_") != -1:
            text_b = x["question"].replace("_", option)
        else:
            text_b = x["question"] + " " + option

        inputs = tokenizer.encode_plus(
            text_a, text_b,
            add_special_tokens=True,
            max_length=MAX_LEN
        )
        input_ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        attention_mask = [1] * len(input_ids)

        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_type_id] * padding_length)

        for sequence in (input_ids, attention_mask, token_type_ids):
            assert len(sequence) == MAX_LEN, "Error with input length {} vs {}".format(len(sequence), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        })

    labels = label_map.get(x["answer"], -1)
    label = torch.tensor(labels).long()

    return {
        "id": x["id"],
        "label": label,
        "input_ids": torch.tensor([cf["input_ids"] for cf in choices_features]),
        "attention_mask": torch.tensor([cf["attention_mask"] for cf in choices_features]),
        "token_type_ids": torch.tensor([cf["token_type_ids"] for cf in choices_features]),
    }
def read_data(train_file_path, tokenizer: BertTokenizer) -> dict:
    """Tokenize every line of a UTF-8 text file.

    Args:
        train_file_path: Path of the file; each line is one sentence.
        tokenizer: Tokenizer used for encoding.

    Returns:
        Mapping of ``input_ids``, ``token_type_ids`` and ``attention_mask``
        to per-line lists.
    """
    # The context manager closes the file handle once the lines are read.
    with open(train_file_path, 'r', encoding='utf8') as train_file:
        train_data = train_file.readlines()

    inputs = defaultdict(list)
    for row in tqdm(train_data,
                    desc='Preprocessing train data',
                    total=len(train_data)):
        sentence = row.strip()
        inputs_dict = tokenizer.encode_plus(sentence,
                                            add_special_tokens=True,
                                            return_token_type_ids=True,
                                            return_attention_mask=True)
        inputs['input_ids'].append(inputs_dict['input_ids'])
        inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
        inputs['attention_mask'].append(inputs_dict['attention_mask'])
    return inputs
def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:
    """Turn the sentence pair of ``x`` into BERT-ready tensors.

    Args:
        tokenizer: Tokenizer used for encoding.
        x: Sample with ``"string1"``, ``"string2"`` and ``"quality"`` (the
            label, converted with ``int``).

    Returns:
        Dict with ``label`` and ``input_ids``/``attention_mask``/
        ``token_type_ids`` tensors of length ``MAX_LEN``.
    """
    inputs = tokenizer.encode_plus(
        x["string1"],
        x["string2"],
        add_special_tokens=True,
        max_length=MAX_LEN,
    )

    # `input_ids` is the id representation of the input pair;
    # `token_type_ids` marks which positions belong to "string1" and which
    # to "string2".
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_mask = [1] * len(input_ids)

    # Sequences in one batch must share a length, so pad up to MAX_LEN.
    padding_length = MAX_LEN - len(input_ids)
    pad_id = tokenizer.pad_token_id
    input_ids = input_ids + ([pad_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    # Segment ids have their own padding value; the vocabulary pad id is only
    # valid for ``input_ids``.
    token_type_ids = token_type_ids + (
        [tokenizer.pad_token_type_id] * padding_length)

    # Simple length validation.
    for sequence in (input_ids, attention_mask, token_type_ids):
        assert len(sequence) == MAX_LEN, \
            "Error with input length {} vs {}".format(len(sequence), MAX_LEN)

    # Convert everything to PyTorch tensors.
    return {
        "label": torch.tensor(int(x["quality"])).long(),
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
        "token_type_ids": torch.tensor(token_type_ids)
    }
def explain_handle(self, model_wraper, text, target=1):
    """Captum explanations handler.

    Computes Integrated Gradients attributions for ``self.text`` using a
    fresh ``AGNewsmodelWrapper`` around ``self.model``.

    Args:
        model_wraper: Not read by this method.
        text: Not read by this method; the explained text is ``self.text``.
        target (int): Not read by this method; attributions are always
            computed for target ``1``.

    Returns:
        list: A single-element list holding a dict with the ``words``, their
        ``importances`` and the convergence ``delta``.
    """
    # NOTE(review): the three parameters above are ignored -- confirm whether
    # `text` and `target` were meant to be used.
    visualizer_records = []
    wrapper = AGNewsmodelWrapper(self.model)
    tokenizer = BertTokenizer(self.VOCAB_FILE)
    wrapper.eval()
    wrapper.zero_grad()

    encoding = tokenizer.encode_plus(self.text,
                                     return_attention_mask=True,
                                     return_tensors="pt",
                                     add_special_tokens=False)
    input_ids = encoding["input_ids"].to(self.device)
    attention_mask = encoding["attention_mask"].to(self.device)

    # The wrapper is called on embeddings rather than token ids, so the
    # embeddings are also what gets attributed.
    embedded_input = wrapper.model.bert_model.embeddings(input_ids)
    preds = wrapper(embedded_input, attention_mask)
    predicted_class = np.argmax(preds.cpu().detach(), axis=1).item()

    integrated_gradients = IntegratedGradients(wrapper)
    attributions, delta = integrated_gradients.attribute(  # pylint: disable=no-member
        embedded_input,
        n_steps=500,
        return_convergence_delta=True,
        target=1,
    )

    tokens = tokenizer.convert_ids_to_tokens(
        input_ids[0].cpu().numpy().tolist())
    attributions_sum = self.summarize_attributions(attributions)
    feature_imp_dict = {
        "words": tokens,
        "importances": attributions_sum.tolist(),
        "delta": delta[0].tolist(),
    }

    self.add_attributions_to_visualizer(attributions, tokens,
                                        self.score_func(preds),
                                        predicted_class, 2, 1, delta,
                                        visualizer_records)
    return [feature_imp_dict]
def read_data(train_file_path, tokenizer: BertTokenizer) -> dict:
    """Tokenize column 0 of a header-less, comma-separated file.

    Args:
        train_file_path: Path of the CSV file (default separator).
        tokenizer: Tokenizer used for encoding.

    Returns:
        Mapping of ``input_ids``, ``token_type_ids`` and ``attention_mask``
        to per-row lists.
    """
    # Single-sentence input read with the default separator; the file is not
    # treated as a tab-separated sentence pair.
    frame = pd.read_csv(train_file_path, header=None)
    features = defaultdict(list)
    progress = tqdm(frame.iterrows(),
                    desc=f'Preprocessing train data',
                    total=len(frame))
    for _, record in progress:
        encoded = tokenizer.encode_plus(record[0],
                                        add_special_tokens=True,
                                        return_token_type_ids=True,
                                        return_attention_mask=True)
        for field in ('input_ids', 'token_type_ids', 'attention_mask'):
            features[field].append(encoded[field])
    return features
def read_data(train_file_path, tokenizer: BertTokenizer, debug=False) -> dict:
    """Tokenize the sentence pairs of a header-less, tab-separated file.

    Args:
        train_file_path: Path of the TSV file; columns 0 and 1 hold the pair.
        tokenizer: Tokenizer used for encoding.
        debug: When true, only the first 1000 rows are processed.

    Returns:
        Mapping of ``input_ids``, ``token_type_ids`` and ``attention_mask``
        to per-row lists.
    """
    frame = pd.read_csv(train_file_path, header=None, sep='\t')
    if debug:
        frame = frame.head(1000)

    features = defaultdict(list)
    progress = tqdm(frame.iterrows(),
                    desc=f'Preprocessing train data',
                    total=frame.shape[0])
    for _, record in progress:
        first, second = record[0], record[1]
        encoded = tokenizer.encode_plus(first,
                                        second,
                                        add_special_tokens=True,
                                        return_token_type_ids=True,
                                        return_attention_mask=True)
        for field in ('input_ids', 'token_type_ids', 'attention_mask'):
            features[field].append(encoded[field])
    return features
class DS(Dataset):
    """Dataset that tokenizes one text line per item on access."""

    def __init__(self, lines, vocab_path="vocab/vocab.txt", max_length=1024):
        """Store the lines and build the tokenizer from ``vocab_path``."""
        self.data = lines
        self.tok = BertTokenizer(vocab_file=vocab_path)
        self.max_length = max_length

    def __len__(self):
        """Return the number of stored lines."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``"pt"`` encoding of line ``index``.

        The line is padded/truncated to ``self.max_length``.
        """
        return self.tok.encode_plus(
            self.data[index],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
class BertVector:
    """Wraps a tokenizer and a model to turn a sentence into model outputs."""

    def __init__(self):
        """Build the tokenizer from ``vocab_file`` and create the model."""
        self.tokenizer = BertTokenizer(vocab_file)
        self.model = car_aq_model()

    def encode(self, sentence, max_sentence_len):
        """Encode ``sentence`` and return the model output as a NumPy array.

        Args:
            sentence: Text to encode.
            max_sentence_len: Exact number of tokens fed to the model; shorter
                sentences are padded and longer ones truncated.

        Returns:
            The model output for a batch of one, converted with ``.numpy()``.
        """
        bert_input = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_sentence_len,
            padding='max_length',
            # With ``padding='max_length'`` truncation must be requested
            # explicitly, otherwise an over-long sentence exceeds
            # ``max_sentence_len``.
            truncation=True,
        )
        # Wrap each sequence in a list to add the batch dimension.
        input_ids = tf.convert_to_tensor([bert_input['input_ids']])
        token_type_ids = tf.convert_to_tensor([bert_input['token_type_ids']])
        attention_mask = tf.convert_to_tensor([bert_input['attention_mask']])
        outputs = self.model(input_ids, token_type_ids, attention_mask)
        return outputs.numpy()
def get_features(text: str, tokenizer: BertTokenizer,
                 max_length: int) -> Dict[str, np.array]:
    """Encode lower-cased ``text`` as zero-padded NumPy arrays.

    The text is encoded as a pair whose second element is the empty string.

    Args:
        text: Text to encode; lower-cased first.
        tokenizer: Tokenizer used for encoding.
        max_length: Length every array is zero-padded to.

    Returns:
        Dict with ``input_ids``, ``token_type_ids`` and ``attention_mask``
        arrays.
    """
    encoded = tokenizer.encode_plus(text.lower(),
                                    "",
                                    add_special_tokens=True,
                                    max_length=max_length)
    ids = encoded["input_ids"]
    type_ids = encoded["token_type_ids"]

    real_length = len(ids)
    # The same zero filler pads all three sequences.
    zeros = [0] * (max_length - real_length)

    return {
        "input_ids": np.array(ids + zeros),
        "token_type_ids": np.array(type_ids + zeros),
        "attention_mask": np.array([1] * real_length + zeros),
    }
def encode(self, tokenizer: BertTokenizer) -> Tuple[np.ndarray, ...]:
    """Encode this example as BERT input.

    ``self.question`` and ``self.answer`` are encoded as a pair with
    ``max_length=MAX_SEQUENCE_LENGTH`` and ``pad_to_max_length=True``.

    Args:
        tokenizer: BERT tokenizer to use for encoding

    Returns:
        Tuple of encoded BERT inputs: label, input_ids, attention_mask,
        token_type_ids
    """
    encoded = tokenizer.encode_plus(
        self.question,
        self.answer,
        add_special_tokens=True,
        max_length=MAX_SEQUENCE_LENGTH,
        pad_to_max_length=True
    )
    return (
        np.array(self.label),
        np.array(encoded.input_ids),
        np.array(encoded.attention_mask),
        np.array(encoded.token_type_ids),
    )
def convert_ocnli_example(ex_idx, example: OcnliExample, max_seq_len,
                          tokenizer: BertTokenizer):
    """Convert an OCNLI sentence-pair example into an ``OcnliFeature``.

    Args:
        ex_idx: Index of the example; the first three train examples are
            logged.
        example: Example providing ``set_type``, ``text_a``, ``text_b`` and
            ``label``.
        max_seq_len: Maximum length passed to the tokenizer.
        tokenizer: Tokenizer used for tokenizing and encoding.

    Returns:
        ``OcnliFeature`` with token ids, masks and the unchanged label.
    """
    set_type = example.set_type
    tokens_a = fine_grade_tokenize(example.text_a, tokenizer)
    tokens_b = fine_grade_tokenize(example.text_b, tokenizer)

    encode_dict = tokenizer.encode_plus(text=tokens_a,
                                        text_pair=tokens_b,
                                        max_length=max_seq_len,
                                        pad_to_max_length=True,
                                        is_pretokenized=True,
                                        return_token_type_ids=True,
                                        return_attention_mask=True)
    token_ids = encode_dict['input_ids']
    attention_masks = encode_dict['attention_mask']
    token_type_ids = encode_dict['token_type_ids']

    if ex_idx < 3 and set_type == 'train':
        for message in (
                f"*** {set_type}_example-{ex_idx} ***",
                f'text_a: {" ".join(tokens_a)}',
                f"token_ids: {token_ids}",
                f"attention_masks: {attention_masks}",
                f"token_type_ids: {token_type_ids}",
        ):
            logger.info(message)

    return OcnliFeature(token_ids=token_ids,
                        attention_masks=attention_masks,
                        token_type_ids=token_type_ids,
                        labels=example.label)
def main():
    """Generate text samples from a GPT-2 checkpoint and print them.

    Command-line options select the device, vocabulary, model config,
    checkpoint, prompt prefix and sampling parameters. ``--nsamples`` sets
    how many samples are printed (default 10) and ``--length`` how many
    tokens are generated; the default ``-1`` (or any non-positive value)
    falls back to 512.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", default="0", type=str, required=False, help="生成设备")
    parser.add_argument("--length", default=-1, type=int, required=False, help="生成长度")
    parser.add_argument("--batch_size", default=1, type=int, required=False, help="生成的batch size")
    parser.add_argument("--nsamples", default=10, type=int, required=False, help="生成几个样本")
    parser.add_argument("--temperature", default=1, type=float, required=False, help="生成温度")
    parser.add_argument("--topk", default=8, type=int, required=False, help="最高几选一")
    parser.add_argument("--topp", default=0, type=float, required=False, help="最高积累概率")
    parser.add_argument(
        "--model_config",
        default="config/model_config.json",
        type=str,
        required=False,
        help="模型参数",
    )
    parser.add_argument(
        "--tokenizer_path",
        default="vocab/vocab.txt",
        type=str,
        required=False,
        help="词表路径",
    )
    parser.add_argument(
        "--model_path",
        default="model/epoch=0-step=99.ckpt",
        type=str,
        required=False,
        help="模型路径",
    )
    parser.add_argument("--prefix", default="我", type=str, required=False, help="生成文章的开头")
    parser.add_argument("--no_wordpiece", action="store_true", help="不做word piece切词")
    parser.add_argument("--segment", action="store_true", help="中文以词为单位")
    parser.add_argument("--fast_pattern", action="store_true", help="采用更加快的方式生成文本")
    parser.add_argument("--save_samples", action="store_true", help="保存产生的样本")
    parser.add_argument("--save_samples_path", default=".", type=str, required=False, help="保存样本的路径")
    parser.add_argument("--repetition_penalty", default=1.0, type=float, required=False)

    args = parser.parse_args()
    print("args:\n" + repr(args))

    # Select which GPUs the program may use.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # Honour --length and --nsamples; their defaults reproduce the previous
    # hard-coded values (512 tokens, 10 samples).
    length = args.length if args.length > 0 else 512
    nsamples = args.nsamples

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = BertTokenizer(vocab_file=args.tokenizer_path)
    model_config = GPT2Config.from_json_file(args.model_config)
    model = GPT2LMHeadModel(config=model_config)

    # Drop the first six characters of every checkpoint key.
    # NOTE(review): presumably a "model." prefix added by the training
    # wrapper -- confirm against the checkpoint.
    checkpoint = torch.load(args.model_path, map_location="cpu")
    state_dict = {
        key[6:]: value
        for key, value in checkpoint["state_dict"].items()
    }
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    for _ in range(nsamples):
        raw_text = args.prefix
        encoded = tokenizer.encode_plus(raw_text)["input_ids"]
        out = sample_sequence(
            model,
            encoded,
            length=length,
            n_ctx=1024,
            tokenizer=tokenizer,
            temperature=args.temperature,
            top_k=args.topk,
            top_p=args.topp,
            # Keyword spelling kept as in the original call to
            # ``sample_sequence``.
            repitition_penalty=args.repetition_penalty,
            device=device,
        )
        print(tokenizer.decode(out))
# Measure the longest tokenized sentence (special tokens included).
for sent in sentences:
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

MAX_LEN = 64

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=MAX_LEN,
        # Without explicit truncation a sentence longer than MAX_LEN yields
        # a longer tensor and the `torch.cat` below fails.
        truncation=True,
        padding='max_length',
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)
def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:
    """Encode one cloze/multiple-choice sample as ``(article, text_b)`` pairs.

    For each option, ``text_b`` is the question with every ``_`` replaced by
    the option, or the question followed by a space and the option when it
    has no ``_``. Each pair is encoded and padded to ``MAX_LEN``.

    Args:
        tokenizer: Tokenizer used for encoding.
        x: Sample with ``"id"``, ``"article"``, ``"question"``,
            ``"options"`` (iterable of strings) and ``"answer"``.

    Returns:
        One batch instance: ``id``, ``label`` (``-1`` when the answer is not
        in ``label_map``) and ``input_ids``/``attention_mask``/
        ``token_type_ids`` tensors with one row per option.
    """
    pad_token_id = tokenizer.pad_token_id
    # Segment ids have their own padding value; the vocabulary pad id is only
    # valid for ``input_ids``.
    pad_token_type_id = tokenizer.pad_token_type_id

    choices_features = []
    option: str
    for option in x["options"]:
        text_a = x["article"]
        if x["question"].find("_") != -1:
            text_b = x["question"].replace("_", option)
        else:
            text_b = x["question"] + " " + option

        # encode_plus tokenizes the raw text, maps tokens to ids and inserts
        # the special tokens; its output is a dictionary.
        inputs = tokenizer.encode_plus(text_a,
                                       text_b,
                                       add_special_tokens=True,
                                       max_length=MAX_LEN)
        input_ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]

        # The attention mask is built by hand: 1 for real tokens, 0 for
        # padding.
        attention_mask = [1] * len(input_ids)

        # Pad sequences.
        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + (
            [pad_token_type_id] * padding_length)

        for sequence in (input_ids, attention_mask, token_type_ids):
            assert len(sequence) == MAX_LEN, \
                "Error with input length {} vs {}".format(
                    len(sequence), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        })

    labels = label_map.get(x["answer"], -1)
    label = torch.tensor(labels).long()

    return {
        "id": x["id"],
        "label": label,
        "input_ids":
        torch.tensor([cf["input_ids"] for cf in choices_features]),
        "attention_mask":
        torch.tensor([cf["attention_mask"] for cf in choices_features]),
        "token_type_ids":
        torch.tensor([cf["token_type_ids"] for cf in choices_features]),
    }
def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:
    """Encode a sentence pair and add masked-LM targets.

    ``x`` is one sample with the keys read below, e.g.::

        {"string1": "...", "string2": "...", "quality": "1"}

    The pair is encoded and padded to ``MAX_LEN``, then passed through
    ``create_masked_lm_predictions``; the masked positions/ids/weights are
    padded to ``max_predictions_per_seq``. ``masked_lm_prob``,
    ``max_predictions_per_seq`` and ``rng`` are read from module scope.

    Args:
        tokenizer: Tokenizer used for encoding and passed to the masking
            helper.
        x: Sample with ``"string1"``, ``"string2"`` and ``"quality"``.

    Returns:
        One batch instance with ``label``, ``input_ids``,
        ``attention_mask``, ``token_type_ids``, ``masked_lm_weights``,
        ``masked_lm_positions`` and ``masked_lm_ids`` tensors.
    """
    # Use BertTokenizer to encode (tokenize / indexize) two sentences.
    inputs = tokenizer.encode_plus(
        x["string1"],
        x["string2"],
        add_special_tokens=True,
        max_length=MAX_LEN,
    )
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

    # The attention mask is built by hand: 1 for real tokens, 0 for padding.
    attention_mask = [1] * len(input_ids)

    # Pad sequences to MAX_LEN.
    padding_length = MAX_LEN - len(input_ids)
    pad_id = tokenizer.pad_token_id
    input_ids = input_ids + ([pad_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    # Segment ids have their own padding value; the vocabulary pad id is only
    # valid for ``input_ids``.
    token_type_ids = token_type_ids + (
        [tokenizer.pad_token_type_id] * padding_length)

    # NOTE(review): masking runs on the already padded ids -- confirm that
    # the helper skips padding positions.
    input_ids, masked_lm_positions, masked_lm_ids = create_masked_lm_predictions(
        input_ids, masked_lm_prob, max_predictions_per_seq, tokenizer, rng)
    masked_lm_weights = [1.0] * len(masked_lm_ids)

    # Pad the masked-LM targets; padded slots get weight 0.0.
    padding_length = max_predictions_per_seq - len(masked_lm_positions)
    masked_lm_positions = masked_lm_positions + ([0] * padding_length)
    masked_lm_ids = masked_lm_ids + ([pad_id] * padding_length)
    masked_lm_weights = masked_lm_weights + ([0.0] * padding_length)

    for sequence in (input_ids, attention_mask, token_type_ids):
        assert len(sequence) == MAX_LEN, \
            "Error with input length {} vs {}".format(len(sequence), MAX_LEN)
    for sequence in (masked_lm_positions, masked_lm_ids):
        assert len(sequence) == max_predictions_per_seq, \
            "Error with input length {} vs {}".format(
                len(sequence), max_predictions_per_seq)

    # Convert the Python lists to tensors.
    return {
        "label": torch.tensor(int(x["quality"])).long(),
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
        "token_type_ids": torch.tensor(token_type_ids),
        "masked_lm_weights": torch.tensor(masked_lm_weights),
        "masked_lm_positions": torch.tensor(masked_lm_positions),
        "masked_lm_ids": torch.tensor(masked_lm_ids)
    }