def str2id(tokenizer: BertTokenizer, sys_utter: str, usr_utter: str,
           source: str) -> Tuple[List[int], List[int]]:
    """Convert system utterance, user utterance and source tokens to ids
    based on BertTokenizer.

    Args:
        tokenizer: BertTokenizer
        sys_utter: system utterance
        usr_utter: user utterance
        source: slot + value

    Returns:
        input_ids and token_type_ids
    """
    sys_utter_tokens = tokenizer.tokenize(sys_utter)
    usr_utter_tokens = tokenizer.tokenize(usr_utter)
    source_tokens = tokenizer.tokenize(source)
    sys_utter_ids = tokenizer.convert_tokens_to_ids(sys_utter_tokens)
    usr_utter_ids = tokenizer.convert_tokens_to_ids(usr_utter_tokens)
    source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
    input_ids = ([tokenizer.cls_token_id] + sys_utter_ids +
                 [tokenizer.sep_token_id] + usr_utter_ids +
                 [tokenizer.sep_token_id] + source_ids +
                 [tokenizer.sep_token_id])
    token_type_ids = ([0] + [0] * (len(sys_utter_ids) + 1) +
                      [1] * (len(usr_utter_ids) + 1) +
                      [0] * (len(source_ids) + 1))
    return input_ids, token_type_ids
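# Usage sketch for str2id (not part of the original source): the checkpoint
# name and the utterances below are illustrative assumptions.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_ids, token_type_ids = str2id(
    tokenizer,
    sys_utter='what price range are you looking for ?',
    usr_utter='something cheap please',
    source='pricerange cheap')
# Both sequences cover [CLS] sys [SEP] usr [SEP] source [SEP], so they align.
assert len(input_ids) == len(token_type_ids)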
class MLMModel:
    def __init__(self):
        self.model: BertForMaskedLM = BertForMaskedLM.from_pretrained(
            pretrained_model_name_or_path='Foodbert/foodbert/data/mlm_output/checkpoint-final')
        with open('Foodbert/foodbert/data/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        self.tokenizer = BertTokenizer(
            vocab_file='Foodbert/foodbert/data/bert-base-cased-vocab.txt',
            do_lower_case=False,
            max_len=128,
            never_split=used_ingredients)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)

    def predict_substitutes(self, sentence, ingredient_name, with_masking=True):
        search_id = self.tokenizer.mask_token_id if with_masking else \
            self.tokenizer.convert_tokens_to_ids([ingredient_name])[0]
        sentence = sentence.replace('!', ' !').replace('?', ' ?').replace(
            '.', ' .').replace(':', ' :').replace(',', ' ,')
        sentence = ' ' + sentence + ' '
        all_ordered_substitutes = []
        masked_sentence = sentence.replace(f' {ingredient_name} ', ' [MASK] ')
        input_ids = torch.tensor(
            self.tokenizer.encode(masked_sentence, add_special_tokens=True)
        ).unsqueeze(0).to(device=self.device)
        prediction_scores = self.model(input_ids, masked_lm_labels=input_ids)[1][0]
        ingredient_scores = prediction_scores[input_ids[0] == search_id]
        for i in range(len(ingredient_scores)):
            ingredient_score = ingredient_scores[i]
            softmax_scores = ingredient_score.softmax(dim=0)
            indices = torch.sort(ingredient_score, descending=True).indices
            ordered_substitutes = self.tokenizer.convert_ids_to_tokens(indices)
            softmax_scores = softmax_scores[indices].tolist()
            all_ordered_substitutes.append((ordered_substitutes, softmax_scores))

        return all_ordered_substitutes
def generate_embedding(
    self,
    model: transformers.BertModel,
    tokenizer: transformers.BertTokenizer,
    product: pd.Series,
    feature_columns: List[str],
) -> torch.Tensor:
    model.eval()

    if (Project.exported_objects_dir / f"{product['product_id']}.obj").exists():
        return self.load_already_geneated_embedding(product=product)

    product_description = self.generate_product_description(
        product=product, feature_columns=feature_columns)
    marked_text = "[CLS] " + product_description + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]

    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    torch.save(
        sentence_embedding,
        Project.exported_objects_dir / f"{product['product_id']}.obj",
    )
    return sentence_embedding
def _tokenize_bert_sentence(text: str, tokenizer: BertTokenizer) -> Tuple:
    """
    Given a sentence and a BertTokenizer, tokenizes the text, maps it to
    BERT vocab indices, and makes the segment IDs for the tokens before
    returning the tensors on GPU (flag to disable GPU coming soon).

    :param text: The sentence being tokenized. A single sentence as str.
    :param tokenizer: The instantiated BertTokenizer object.
    :return token_tensor, segments_tensor: The tensors containing the tokens
        themselves and the segment IDs. On GPU.
    """
    # Split the sentence into tokens and add the [CLS]/[SEP] special tokens.
    # (The original called tokenizer.encode(), which already returns ids and
    # would break the convert_tokens_to_ids() call below.)
    tokenized_text = [tokenizer.cls_token] + tokenizer.tokenize(text) + [tokenizer.sep_token]

    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Mark each of the tokens as belonging to sentence "1" (single sentence).
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors, place on GPU.
    token_tensor = torch.tensor([indexed_tokens]).to('cuda:0')
    segments_tensor = torch.tensor([segments_ids]).to('cuda:0')

    return token_tensor, segments_tensor
def main():
    args = config_parse()
    device = 'cuda' if torch.cuda.is_available() and args.use_cuda else 'cpu'
    if not os.path.exists(args.output_path):
        os.mkdir(args.output_path)
    model, n_ctx = load_pretrained_model(args)
    if args.vocab_path:
        tokenizer = BertTokenizer(vocab_file=args.vocab_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.pretrained_tokenizer_model)
    global pad_id
    pad_id = tokenizer.convert_tokens_to_ids('[PAD]')
    if args.seed:
        set_random_seed(args)
    if args.raw_file_path:
        logger.info("start processing raw data....")
        process_raw_data(args, tokenizer, n_ctx)
    check_model_parameters(model)
    raw_token = load_train_data(args)
    train_data, dev_data = train_test_split(raw_token, test_size=.2)
    logger.info(
        f"raw data: {len(raw_token)}, train_data: {len(train_data)}, dev_data: {len(dev_data)}"
    )
    if args.do_train:
        train(model, device, train_data, args)
    if args.do_eval:
        evaluate(model, device, dev_data, args)
def bert_pretraining(dataset, config):
    bert_tokenizer = BertTokenizer('./bert-base-chinese' + '/vocab.txt')
    model = BertModel.from_pretrained('./bert-base-chinese')
    model.eval()
    model.to(config.device)

    for batch in batch_slice(dataset, config.train_batch_size):
        tokens_tensor = []
        for instance in batch:
            instance.ids = bert_tokenizer.convert_tokens_to_ids(instance.chars)
            tokens_tensor.append(torch.tensor(instance.ids))
        tokens_tensor = pad_sequence(tokens_tensor).T
        attention_mask = torch.ne(tokens_tensor, torch.zeros_like(tokens_tensor))
        tokens_tensor = tokens_tensor.to(config.device)
        attention_mask = attention_mask.to(config.device)
        with torch.no_grad():
            outputs = model(tokens_tensor, attention_mask=attention_mask)
            encoded_layers = outputs[0]
        for index, instance in enumerate(batch):
            instance.embeddings = encoded_layers[index, 0:len(instance.ids), :].cpu().numpy()
def load_and_cache_examples(
    task: str,
    tokenizer: BertTokenizer,
    model_path: str,
    data_dir: str,
    overwrite_cache: bool,
    max_seq_length: int,
    model_type: str,
    cache_root: str = "../../data/preprocessed",
    product: bool = False,
):
    processor = processors[task]()
    output_mode = output_modes[task]
    # if args.active_learning:
    # Load data features from cache or dataset file
    # if active learning, the train data will be saved inside each learning iteration directory
    cached_features_file = os.path.join(
        cache_root,
        "cached_{}_{}_{}".format(
            "inference",
            list(filter(None, model_path.split("/"))).pop(),
            str(task),
        ),
    )
    # if os.path.exists(cached_features_file) and not overwrite_cache:
    #     logger.info("Loading features from cached file %s", cached_features_file)
    #     features = torch.load(cached_features_file)
    #     examples = None
    # else:
    logger.info("Creating features from dataset file at %s", data_dir)
    label_list = processor.get_labels()
    examples = processor.get_examples(data_dir=data_dir, product=product)
    log_param(" Num examples training", len(examples))
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=max_seq_length,
        output_mode=output_mode,
        pad_on_left=bool(model_type in ["xlnet"]),
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if model_type in ["xlnet"] else 0,
    )
    logger.info("Saving features into cached file %s", cached_features_file)
    torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.as_tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.as_tensor(
        [f.attention_mask for f in features], dtype=torch.long
    )
    all_token_type_ids = torch.as_tensor(
        [f.token_type_ids for f in features], dtype=torch.long
    )
    # if output_mode == "classification":
    #     all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    # elif output_mode == "regression":
    #     all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
    return dataset, examples
def data_augment(tokenizer: BertTokenizer, augument_size=3):
    assert Text_Example
    aug_data = []
    entity_data = pickle.load(open(statist_entity_data_path, "rb"))
    all_train_data = pickle.load(open(pickle_all_train_data_path, "rb"))
    for data in all_train_data:
        text = data.text
        anns = data.anns
        if len(anns) < 3:
            aug_data.append(data)
            continue
        for aug in range(augument_size):
            # start from the original text for each augmented copy
            new_text = text
            sample_anns_idx = random.sample(range(len(anns)), 2)
            for idx in sample_anns_idx:
                _tag, ann = anns[idx]
                ann_replace_list = entity_data[_tag][len(ann)]
                if len(ann_replace_list) < 2:
                    break
                replace_ann = random.choice(ann_replace_list)
                while replace_ann == ann:
                    replace_ann = random.choice(ann_replace_list)
                # replace on new_text so both sampled entities end up swapped
                new_text = new_text.replace(ann, replace_ann, 1)
            new_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in new_text]
            # copy the example so the original is not mutated (the source
            # aliased `data` here, which would overwrite it in place)
            new_text_example = copy.deepcopy(data)
            new_text_example.text = new_text
            new_text_example.token_ids = new_token_ids
            aug_data.append(new_text_example)
def featurize(self, df):
    bert_model = BertModel.from_pretrained(self.data_path)
    bert_tokenizer = BertTokenizer(self.data_path + "/vocab.txt",
                                   do_lower_case=False,
                                   do_basic_tokenize=False)
    mecab = MeCab.Tagger('-Ochasen')
    data_list = df.rdd.collect()
    label_list = []
    vec_list = []
    for data in data_list:
        tmp_list = []
        node_list = data[1]
        for word in node_list:
            tmp_list.append(word)
        if len(tmp_list) != 0:
            label_list.append(float(data[0]))
            bert_tokens = bert_tokenizer.tokenize(
                " ".join(["[CLS]"] + tmp_list + ["[SEP]"]))
            token_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
            tokens_tensor = torch.tensor(token_ids).unsqueeze(0)
            all_outputs = bert_model(tokens_tensor)
            embedding = all_outputs[-2].detach().numpy()[0]
            vec = np.mean(embedding, axis=0).tolist()
            vec_list.append(Vectors.dense(vec))
    zip_list = zip(label_list, vec_list)
    new_df = self.spark.createDataFrame(zip_list, ("label", "features"))
    return new_df
def get_indices_and_masks(sent_tokens: List[str],
                          in_sent_start: int,
                          in_sent_end: int,
                          tokenizer: BertTokenizer,
                          mask_mention: bool = False) \
        -> Tuple[List[int], List[float], int, int]:
    if in_sent_start not in range(len(sent_tokens)) or \
            in_sent_end not in range(1, len(sent_tokens) + 1):
        raise ValueError(f'wrong input: tokens {sent_tokens} don\'t contain pos'
                         f' ({in_sent_start}, {in_sent_end}).')
    if mask_mention:
        for n in range(in_sent_start, in_sent_end):
            sent_tokens[n] = tokenizer.mask_token
    sent_subword_idxs = []
    sent_subwords = []
    sent_hypo_mask = []
    new_in_sent_start, new_in_sent_end = None, None
    for n, tok in enumerate(sent_tokens):
        if n == in_sent_start:
            new_in_sent_start = len(sent_subwords)
        subtokens = tokenizer.tokenize(tok)
        sent_subwords.extend(subtokens)
        subtok_idxs = tokenizer.convert_tokens_to_ids(subtokens)
        sent_subword_idxs.extend(subtok_idxs)
        # NOTE: no + 1 here because there is no [CLS] token at the beginning
        mask_value = float(in_sent_start <= n < in_sent_end)
        sent_hypo_mask.extend([mask_value] * len(subtok_idxs))
        if n == in_sent_end - 1:
            new_in_sent_end = len(sent_subwords) + 1
    return sent_subword_idxs, sent_hypo_mask, new_in_sent_start, new_in_sent_end
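# Illustrative call of get_indices_and_masks (an assumption, not from the
# source): marks the span 'quick brown' inside a toy sentence.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sent_tokens = ['the', 'quick', 'brown', 'fox', 'jumps']
subword_idxs, hypo_mask, start, end = get_indices_and_masks(
    sent_tokens, in_sent_start=1, in_sent_end=3, tokenizer=tokenizer)
# hypo_mask holds 1.0 for every subword of tokens 1..2 and 0.0 elsewhere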
def get_embedding(phrases: List[str],
                  emb_mat: torch.Tensor,
                  tokenizer: BertTokenizer,
                  debug: bool = False) -> torch.Tensor:
    # emb_mat: [vocab_size, emb_size]
    # returns: [num_phrases, emb_size]
    subtok_ids, subtok_masks = [], []
    max_len = 0
    for w in phrases:
        subtok_toks = tokenizer.tokenize(w)
        subtok_ids.append(tokenizer.convert_tokens_to_ids(subtok_toks))
        num_subtoks = len(subtok_ids[-1])
        subtok_masks.append([1.] * num_subtoks)
        if debug:
            print(f"subtok_ids('{w}') = {subtok_ids[-1]}")
            print(f'{[tokenizer._convert_id_to_token(s) for s in subtok_ids[-1]]}')
        max_len = max_len if max_len > num_subtoks else num_subtoks
    # subtok_ids, subtok_masks: [num_phrases, max_len]
    subtok_ids = torch.tensor(
        [sw_list + [-1] * (max_len - len(sw_list)) for sw_list in subtok_ids])
    subtok_masks = torch.tensor(
        [m + [0.] * (max_len - len(m)) for m in subtok_masks])
    # subtok_sizes: [num_phrases]
    subtok_sizes = torch.sum(subtok_masks, 1)
    if debug:
        print(subtok_sizes)
    # emb_mat[subtok_ids]: [num_phrases, max_len, emb_size]
    return torch.sum(emb_mat[subtok_ids] * subtok_masks.unsqueeze(2), axis=1) \
        / subtok_sizes.unsqueeze(1)
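# Minimal sketch of calling get_embedding with BERT's input embedding table
# as emb_mat; the checkpoint and phrases are assumptions for illustration.
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
emb_mat = model.get_input_embeddings().weight.detach()  # [vocab_size, emb_size]
phrase_vecs = get_embedding(['hot dog', 'sandwich'], emb_mat, tokenizer)
print(phrase_vecs.shape)  # torch.Size([2, 768]) for bert-base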
def tokenize_and_pad_samples(genes, labels):
    k = len(genes[0][0])
    if k == 4:
        kmer_filepath = '/home/brian/Downloads/fourmers.txt'
    elif k == 6:
        kmer_filepath = '/home/brian/Downloads/hexamers.txt'
    elif k == 8:
        kmer_filepath = '/home/brian/Downloads/octamers.txt'

    formatted_samples = [['[CLS]'] + sample + ['[SEP]'] for sample in genes]
    formatted_labels = [[0] + l + [0] for l in labels]

    tokenizer = BertTokenizer(kmer_filepath, max_len=MAX_LEN)
    print("TOKENIZER LENGTH", len(tokenizer))
    attention_masks = [
        np.concatenate([np.ones(len(l)), np.zeros(MAX_LEN - len(l))])
        for l in formatted_labels
    ]
    # seq_ids = tokenizer.convert_tokens_to_ids(formatted_samples)
    seq_ids = [
        tokenizer.convert_tokens_to_ids(sample) for sample in formatted_samples
    ]
    seq_ids = pad_sequences(seq_ids, maxlen=MAX_LEN, truncating='post', padding='post')
    return seq_ids, attention_masks, formatted_labels
def chat(folder_bert, voc, testing=False):
    tf.random.set_seed(1)
    tokenizer = BertTokenizer(vocab_file=folder_bert + voc)
    if testing:
        tokens = tokenizer.tokenize("jeg tror det skal regne")
        print(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        print(ids)
        print("Vocab size:", len(tokenizer.vocab))
    config = BertConfig.from_json_file(folder_bert + "/config.json")
    model = BertLMHeadModel.from_pretrained(folder_bert, config=config)
    while (1):
        text = input(">>User: ")
        # NOTE: the generation step was garbled in the source; the lines below
        # are a plausible reconstruction using model.generate().
        input_ids = torch.tensor(
            tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)
        sample_output = model.generate(input_ids, do_sample=True, max_length=128)
        # print("Bot: {}".format(tokenizer.decode(sample_output[0])))
        print("Bot: {}".format(
            tokenizer.decode(sample_output[:, input_ids.shape[-1]:][0],
                             skip_special_tokens=True)))
class JapaneseWorker:
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_basic_tokenize=False)
        self.cls_id = self.bert_tokenizer.vocab['[CLS]']
        self.mask_id = self.bert_tokenizer.vocab['[MASK]']
        self.bert_model = 'model/Japanese/'

        self.cp = 'checkpoint/jp/cp_step_1200000.pt'
        self.opt = 'checkpoint/jp/opt_step_1200000.pt'

    @staticmethod
    def linesplit(src):
        """
        :param src: type str, the article as a string
        :return: type list, punctuation-separated sentences
        """
        def remove_newline(x):
            x = x.replace('\n', '')
            return x

        def remove_blank(x):
            x = x.replace(' ', '')
            return x

        def remove_unknown(x):
            unknown = ['\u3000']
            for h in unknown:
                x = x.replace(h, '')
            return x

        src = remove_blank(src)
        src = remove_newline(src)
        src = remove_unknown(src)
        # split on full-width sentence enders not followed/preceded by 「」quotes
        src_line = re.split('。(?<!」)|！(?<!」)|？(?!」)', src)
        src_line = [x for x in src_line if x != '']
        return src_line

    def tokenizer(self, src):
        """
        :param src: type list, punctuation-separated sentences
        :return: token: type list, tokens
                 token_id: type list, numberized tokens
        """
        token = []
        token_id = []

        def _preprocess_text(text):
            return text.replace(" ", "")  # for Juman

        for sentence in src:
            preprocessed_text = _preprocess_text(sentence)
            juman_tokens = self.juman_tokenizer(preprocessed_text)
            tokens = self.bert_tokenizer.tokenize(" ".join(juman_tokens))
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
            token += tokens
            token_id += ids
        return token, token_id
class NemoBertTokenizer(TokenizerSpec):
    def __init__(
        self,
        pretrained_model=None,
        vocab_file=None,
        do_lower_case=True,
        max_len=None,
        do_basic_tokenize=True,
        never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"),
    ):
        if pretrained_model:
            self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
            if "uncased" not in pretrained_model:
                self.tokenizer.basic_tokenizer.do_lower_case = False
        else:
            self.tokenizer = BertTokenizer(vocab_file, do_lower_case, do_basic_tokenize)
        self.vocab_size = len(self.tokenizer.vocab)
        self.never_split = never_split

    def text_to_tokens(self, text):
        tokens = self.tokenizer.tokenize(text)
        return tokens

    def tokens_to_text(self, tokens):
        text = self.tokenizer.convert_tokens_to_string(tokens)
        return remove_spaces(handle_quotes(text.strip()))

    def token_to_id(self, token):
        return self.tokens_to_ids([token])[0]

    def tokens_to_ids(self, tokens):
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        return ids

    def ids_to_tokens(self, ids):
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        return tokens

    def text_to_ids(self, text):
        tokens = self.text_to_tokens(text)
        ids = self.tokens_to_ids(tokens)
        return ids

    def ids_to_text(self, ids):
        tokens = self.ids_to_tokens(ids)
        tokens_clean = [t for t in tokens if t not in self.never_split]
        text = self.tokens_to_text(tokens_clean)
        return text

    def pad_id(self):
        return self.tokens_to_ids(["[PAD]"])[0]

    def bos_id(self):
        return self.tokens_to_ids(["[CLS]"])[0]

    def eos_id(self):
        return self.tokens_to_ids(["[SEP]"])[0]
def main():
    args = setup_train_args()
    # Log to both a file and the console
    global logger
    logger = create_logger(args)
    # Use the GPU when the user requests it and one is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    # Seed the RNGs so results are deterministic. This seeds the current GPU;
    # with multiple GPUs, use torch.cuda.manual_seed_all() to seed them all.
    # We usually want a good result to be reproducible.
    if args.seed:
        set_random_seed(args)
    # Select which GPUs to train on
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    # Initialize the tokenizer
    tokenizer = BertTokenizer(vocab_file=args.vocab_path)
    # Vocabulary size of the tokenizer
    vocab_size = len(tokenizer)

    global pad_id
    pad_id = tokenizer.convert_tokens_to_ids(PAD)

    # Create the model output directory
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # Load the dialogue GPT2 model
    model, n_ctx = create_model(args, vocab_size)
    model.to(device)
    # Preprocess the raw corpus into the corresponding token ids
    if args.raw:
        preprocess_raw_data(args, tokenizer, n_ctx)
    # Whether to run in parallel on multiple GPUs
    multi_gpu = False
    if args.cuda and torch.cuda.device_count() > 1:
        logger.info("Let's use GPUs to train")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    # Log the number of model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))

    # Load the data
    logger.info("loading training data")
    with open(args.train_tokenized_path, "r", encoding="utf8") as f:
        data = f.read()
    data_list = data.split("\n")
    train_list, test_list = train_test_split(data_list, test_size=0.2, random_state=1)
    # Start training
    train(model, device, train_list, multi_gpu, args)
    # Evaluate the model
    evaluate(model, device, test_list, multi_gpu, args)
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # Use the GPU when the user requests it and one is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    # args.cuda = False
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()
    print('***********************Summary model start************************')

    while True:
        try:
            text = input()
            for i in range(5):
                if len(text):
                    text = text[:1000]
                input_ids = [tokenizer.cls_token_id]  # every input starts with [CLS]
                input_ids.extend(tokenizer.encode(text))
                input_ids.append(tokenizer.sep_token_id)
                curr_input_tensor = torch.tensor(input_ids).long().to(device)
                generated = []
                # Generate at most max_len tokens
                for _ in range(args.max_len):
                    outputs = model(input_ids=curr_input_tensor)
                    next_token_logits = outputs[0][-1, :]
                    # Apply a repetition penalty to every token already in
                    # `generated`, lowering its generation probability
                    for id in set(generated):
                        next_token_logits[id] /= args.repetition_penalty
                    next_token_logits = next_token_logits / args.temperature
                    # Set the [UNK] logit to -inf so the model can never
                    # predict the [UNK] token
                    next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
                    filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                            top_k=args.topk,
                                                            top_p=args.topp)
                    # torch.multinomial samples num_samples elements from the
                    # candidates without replacement; the higher the weight,
                    # the likelier it is drawn. Returns the element indices.
                    next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1),
                                                   num_samples=1)
                    if next_token == tokenizer.sep_token_id:
                        # A [SEP] marks the end of the generated response
                        break
                    generated.append(next_token.item())
                    curr_input_tensor = torch.cat((curr_input_tensor, next_token), dim=0)
                text = tokenizer.convert_ids_to_tokens(generated)
                print("summary:" + "".join(text))
        except KeyboardInterrupt:
            break
def example_to_input(lemma_list: List[str], tags_list: List[int],
                     tok: BertTokenizer):
    subword_list, tags_map = tok.convert_tokens_to_ids(tok.tokenize('[CLS]')), []
    for w in lemma_list:
        tags_map.append(len(subword_list))
        subword_list += tok.convert_tokens_to_ids(tok.tokenize(w))
    subword_list += tok.convert_tokens_to_ids(tok.tokenize('[SEP]'))
    mapped_tags = [0] * len(subword_list)
    # mapped_pos = [0] * len(subword_list)
    # mapped_lemmas = ["[UNK]"] * len(subword_list)
    # mapped_altern = [[]] * len(subword_list)
    for i, j in enumerate(tags_map):  # was `tag_map`, a NameError in the source
        mapped_tags[j] = tags_list[i]
        # mapped_pos[j] = example['pos'][i]
        # mapped_lemmas[j] = example['lemmas'][i]
        # mapped_altern[j] = example['alternatives'][i]
    return subword_list, mapped_tags
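# Hypothetical call of example_to_input; the lemmas and tag ids are dummies
# and 'bert-base-cased' is an assumed checkpoint.
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained('bert-base-cased')
subwords, mapped_tags = example_to_input(['she', 'runs', 'quickly'], [1, 2, 3], tok)
# mapped_tags carries each word's tag on its first subword, 0 elsewhere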
def create_model(model_class: BertPreTrainedModel,
                 encoder_config: BertConfig,
                 tokenizer: BertTokenizer,
                 encoder_path=None,
                 entity_types: dict = None,
                 relation_types: dict = None,
                 prop_drop: float = 0.1,
                 meta_embedding_size: int = 25,
                 size_embeddings_count: int = 10,
                 ed_embeddings_count: int = 300,
                 token_dist_embeddings_count: int = 700,
                 sentence_dist_embeddings_count: int = 50,
                 mention_threshold: float = 0.5,
                 coref_threshold: float = 0.5,
                 rel_threshold: float = 0.5,
                 position_embeddings_count: int = 700,
                 cache_path=None):
    params = dict(
        config=encoder_config,
        # JEREX model parameters
        cls_token=tokenizer.convert_tokens_to_ids('[CLS]'),
        entity_types=len(entity_types),
        relation_types=len(relation_types),
        prop_drop=prop_drop,
        meta_embedding_size=meta_embedding_size,
        size_embeddings_count=size_embeddings_count,
        ed_embeddings_count=ed_embeddings_count,
        token_dist_embeddings_count=token_dist_embeddings_count,
        sentence_dist_embeddings_count=sentence_dist_embeddings_count,
        mention_threshold=mention_threshold,
        coref_threshold=coref_threshold,
        rel_threshold=rel_threshold,
        tokenizer=tokenizer,
        cache_dir=cache_path,
    )

    if encoder_path is not None:
        model = model_class.from_pretrained(encoder_path, **params)
    else:
        model = model_class(**params)

    # conditionally increase position embedding count
    if encoder_config.max_position_embeddings < position_embeddings_count:
        old = model.bert.embeddings.position_embeddings
        new = nn.Embedding(position_embeddings_count, encoder_config.hidden_size)
        new.weight.data[:encoder_config.max_position_embeddings, :] = old.weight.data
        model.bert.embeddings.position_embeddings = new
        model.bert.embeddings.register_buffer(
            "position_ids",
            torch.arange(position_embeddings_count).expand((1, -1)))
        encoder_config.max_position_embeddings = position_embeddings_count

    return model
def transformer_preprocess(src_path, tgt_path, tokenized_file,
                           vocab_file='./config/vocab_en.txt', ctx=200):
    '''
    Tokenize the dataset for NLG (GPT2) and write the token ids into
    tokenized_file. More details can be found in
    https://github.com/yangjianxin1/GPT2-chitchat
    '''
    def clean_inside(s):
        s = s.replace('<user0>', '')
        s = s.replace('<user1>', '')
        s = s.strip()
        s = clean(s)
        return s

    # create the Bert tokenizer of the GPT2 model
    tokenizer = BertTokenizer(vocab_file=vocab_file)
    src_data, tgt_data = read_file(src_path), read_file(tgt_path)
    src_data = [' '.join(i) for i in src_data]
    tgt_data = [' '.join(i) for i in tgt_data]
    assert len(src_data) == len(tgt_data), \
        f'[!] length of src and tgt: {len(src_data)}/{len(tgt_data)}'

    # combine them
    corpus = []
    longest = 0
    for s, t in tqdm(list(zip(src_data, tgt_data))):
        item = [tokenizer.cls_token_id]  # [CLS] at the beginning of each dialogue
        s = s + ' __eou__ ' + t
        s = clean_inside(s)
        utterances = s.split('__eou__')
        for utterance in utterances:
            words = nltk.word_tokenize(utterance)
            item.extend([tokenizer.convert_tokens_to_ids(word) for word in words])
            item.append(tokenizer.sep_token_id)
        if len(item) > longest:
            longest = len(item)
        item = item[:ctx]
        corpus.append(item)

    # write into the file
    with open(tokenized_file, 'w') as f:
        for i in range(len(corpus)):
            words = [str(word) for word in corpus[i]]
            f.write(f'{" ".join(words)}')
            if i < len(corpus) - 1:
                f.write('\n')

    print(f'[!] Preprocessed the data for the transformers (GPT2); the longest'
          f' sentence has {longest} tokens. Wrote the data into {tokenized_file}.')
def convert_data_to_feature():
    # Load the question dataset
    q = open('Dataset/Query_Train/Final_question.txt', "r", encoding="utf-8")
    questions = q.readlines()
    q.close()
    # Load the answer dataset
    a = open('Dataset/Train_Label/FinalDomainLabel.txt', "r", encoding="utf-8")
    answers = a.readlines()
    a.close()
    assert len(answers) == len(questions)

    # ans_dic holds the answer classes
    ans_dic = make_ans_dic(answers)
    # question_dic holds the question classes
    question_dic = make_question_dic(questions)

    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')
    q_tokens = []
    max_seq_len = 0
    for q in question_dic.data:
        bert_ids = tokenizer.build_inputs_with_special_tokens(
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))
        if (len(bert_ids) > max_seq_len):
            max_seq_len = len(bert_ids)
        q_tokens.append(bert_ids)
    print("Longest question length:", max_seq_len)
    assert max_seq_len <= 512  # within the BERT-base length limit

    # Pad every question to max_seq_len
    for q in q_tokens:
        while len(q) < max_seq_len:
            q.append(0)

    a_labels = []
    for a in ans_dic.data:
        a_labels.append(ans_dic.to_id(a))

    # BERT input embedding
    answer_lables = a_labels
    input_ids = q_tokens
    input_masks = [[1] * max_seq_len for i in range(len(question_dic))]
    input_segment_ids = [[0] * max_seq_len for i in range(len(question_dic))]
    assert len(input_ids) == len(question_dic) and len(input_ids) == len(
        input_masks) and len(input_ids) == len(input_segment_ids)

    data_features = {
        'input_ids': input_ids,
        'input_masks': input_masks,
        'input_segment_ids': input_segment_ids,
        'answer_lables': answer_lables,
        'question_dic': question_dic,
        'answer_dic': ans_dic
    }

    output = open('Dataset/data_features_domain.pkl', 'wb')
    pickle.dump(data_features, output)
    return data_features
def __init__(self, bert_tokenizer: BertTokenizer, jp_tokenizer: JumanTokenizer,
             args, file_path='train', block_size=512):
    assert os.path.isfile(file_path)
    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory, 'cached_lm_' + str(block_size) + '_' + filename)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, 'rb') as handle:
            self.examples = pickle.load(handle)
    else:
        logger.info("Creating features from dataset file at %s", directory)
        self.examples = []
        with open(file_path, encoding="utf-8") as f:
            docs = f.readlines()
        exsamples = []
        for _, line in enumerate(docs):
            text = line.rstrip(os.linesep)
            # separate the text into tokens
            tokenized_text = bert_tokenizer.convert_tokens_to_ids(
                bert_tokenizer.tokenize(" ".join(jp_tokenizer.tokenize(text))))
            # add the special tokens: [CLS] and [SEP]
            added_special = bert_tokenizer.build_inputs_with_special_tokens(tokenized_text)
            # Zero-pad up to the sequence length.
            diff = block_size - len(added_special)
            if diff < 0:
                added_special = added_special[:diff]
            else:
                # padding value changed from 0 to -1
                padding = [-1] * (block_size - len(added_special))
                added_special += padding
            assert len(added_special) == block_size
            self.examples.append(added_special)
        logger.info("Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, 'wb') as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
def bert_text_preparation(text: str, tokenizer: BertTokenizer):
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(indexed_tokens)

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segments_ids])
    return tokenized_text, tokens_tensor, segments_tensor
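# Sketch (assumed, not from the source): feeding the prepared tensors to a
# BertModel with hidden states enabled; 'bert-base-uncased' is an assumption.
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model.eval()
tokens, tokens_tensor, segments_tensor = bert_text_preparation('hello world', tokenizer)
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensor)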
def build_feature(tokenizer: transformers.BertTokenizer, examples: list,
                  max_length: int = None):
    '''
    @param tokenizer (transformers.BertTokenizer): tokenizer used to convert tokens to ids
    @param examples (list): input examples
    @param max_length (int): max length at which to cut off example sequences
    @return examples (list): new examples with input features
    '''
    # slice indices must be integers, so use an int cap when max_length is unset
    # (the source used the float 1e3 here)
    length = max_length if max_length is not None else 1000
    for example in examples:
        context = tokenizer.convert_tokens_to_ids(
            example['context'][:min(length, len(example['context']))])
        question = tokenizer.convert_tokens_to_ids(
            example['question'][:min(length, len(example['question']))])
        out = tokenizer.prepare_for_model(context,
                                          question,
                                          return_token_type_ids=True,
                                          return_attention_mask=True)
        example['input_feature'] = out['input_ids']
        example['token_type_ids'] = out['token_type_ids']
        example['attention_mask'] = out['attention_mask']
    return examples
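# Hypothetical input for build_feature: each example holds pre-tokenized
# 'context' and 'question' lists, as the slicing above implies; the
# checkpoint and texts are illustrative.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
examples = [{
    'context': tokenizer.tokenize('The cat sat on the mat.'),
    'question': tokenizer.tokenize('Where did the cat sit?'),
}]
examples = build_feature(tokenizer, examples, max_length=64)
print(examples[0]['input_feature'])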
class testAnswerGeneration():
    def __init__(self):
        self.tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')
        self.config = BertConfig.from_pretrained('trained_model/1/config.json')
        self.model = BertForMaskedLM.from_pretrained(
            'trained_model/1/pytorch_model.bin',
            from_tf=bool('.ckpt' in 'bert-base-chinese'),
            config=self.config)
        self.model.eval()

    def to_input_id(self, sentence_input):
        return self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.tokenize(sentence_input))

    def getAnswer(self, context, question):
        input_id = self.to_input_id("[CLS] " + context + " [SEP] " + question + " [SEP]")
        count = 0
        answer = ""
        maskpos = len(input_id)  # position where the answer token is predicted
        input_id.append(103)  # 103 is the [MASK] token id
        # pad up to the maximum length
        while len(input_id) < 512:
            input_id.append(0)
        # limit the answer to at most 10 tokens
        while (count < 10):
            input_id_tensor = torch.LongTensor([input_id])
            outputs = self.model(input_id_tensor)
            predictions = outputs[0]
            # take the most likely token id
            predicted_index = torch.argmax(predictions[0, maskpos]).item()
            # convert the id back to a token
            predicted_token = self.tokenizer.convert_ids_to_tokens(predicted_index)
            # stop generating once [SEP] is predicted
            if predicted_token == '[SEP]':
                break
            answer = answer + predicted_token  # append the generated token
            # replace the current [MASK] id with the generated token id
            input_id[maskpos] = predicted_index
            maskpos += 1
            if maskpos < 512:
                input_id[maskpos] = 103  # place the next [MASK] to predict
            else:
                break
            count += 1
        return answer
def tensorize_example(example: dict, config: dict, tokenizer: BertTokenizer,
                      genres: dict) -> CoNLLCorefResolution:
    clusters = example["clusters"]
    gold_mentions = sorted(tuple(m) for m in util.flatten(clusters))
    gold_mention_map = {m: i for i, m in enumerate(gold_mentions)}
    cluster_ids = [0] * len(gold_mentions)
    for cluster_id, cluster in enumerate(clusters):
        for mention in cluster:
            cluster_ids[gold_mention_map[tuple(mention)]] = cluster_id + 1
    cluster_ids = torch.tensor(cluster_ids, dtype=torch.int64)

    sentences = example["sentences"]
    num_words = sum(len(s) + 2 for s in sentences)
    speakers = example["speakers"]
    speaker_dict = util.get_speaker_dict(util.flatten(speakers), config['max_num_speakers'])

    max_sentence_length = config['max_segment_len']
    text_len = torch.tensor([len(s) for s in sentences], dtype=torch.int64)

    input_ids, input_mask, speaker_ids = [], [], []
    for i, (sentence, speaker) in enumerate(zip(sentences, speakers)):
        sentence = ['[CLS]'] + sentence + ['[SEP]']
        sent_input_ids = tokenizer.convert_tokens_to_ids(sentence)
        sent_input_mask = [-1] + [1] * (len(sent_input_ids) - 2) + [-1]
        sent_speaker_ids = [1] + [speaker_dict.get(s, 3) for s in speaker] + [1]
        while len(sent_input_ids) < max_sentence_length:
            sent_input_ids.append(0)
            sent_input_mask.append(0)
            sent_speaker_ids.append(0)
        input_ids.append(sent_input_ids)
        speaker_ids.append(sent_speaker_ids)
        input_mask.append(sent_input_mask)
    input_ids = torch.tensor(input_ids, dtype=torch.int64)
    input_mask = torch.tensor(input_mask, dtype=torch.int64)
    speaker_ids = torch.tensor(speaker_ids, dtype=torch.int64)
    assert num_words == torch.sum(torch.abs(input_mask)), \
        (num_words, torch.sum(torch.abs(input_mask)))

    doc_key = example["doc_key"]
    subtoken_map = torch.tensor(example.get("subtoken_map", None), dtype=torch.int64)
    sentence_map = torch.tensor(example['sentence_map'], dtype=torch.int64)
    genre = genres.get(doc_key[:2], 0)
    genre = torch.tensor([genre], dtype=torch.int64)
    gold_starts, gold_ends = tensorize_mentions(gold_mentions)

    return CoNLLCorefResolution(doc_key, input_ids, input_mask, text_len,
                                speaker_ids, genre, gold_starts, gold_ends,
                                cluster_ids, sentence_map, subtoken_map)
class CustomBertVocab(object):
    def __init__(self, lang='en'):
        """Basic vocabulary object"""
        self.lang = lang
        self.vocab_size = 0
        self.tokenizer = None

    def load(self, bert_vocab_path):
        """Load the vocabulary."""
        self.tokenizer = BertTokenizer(
            vocab_file=bert_vocab_path,
            never_split=['<num>', '<url>', '<img>', '</s>'])
        self.vocab_size = self.tokenizer.vocab_size

    def encode(self, words: list):
        """Encode words to ids."""
        ids = []
        for word in words:
            ids.append(self.tokenizer.convert_tokens_to_ids(word))
        return ids

    def decode(self, ids, decode_type: str):
        """Decode ids to a sentence."""
        sentence = []
        for id in ids:
            if isinstance(id, torch.Tensor):
                word = self.tokenizer.convert_ids_to_tokens(id.item())
            else:
                word = self.tokenizer.convert_ids_to_tokens(id)
            if decode_type == 'predict':
                if word not in [EOS_TOKEN, SOS_TOKEN, PAD_TOKEN, IMG_TOKEN, MSP_TOKEN]:
                    sentence.append(word)
                if word == PAD_TOKEN or word == EOS_TOKEN:
                    break
            else:  # context / question
                sentence.append(word)
                if word == PAD_TOKEN:
                    break
        if self.lang == 'zh':
            return ''.join(sentence)
        return ' '.join(sentence)
def __init__(self, conf: GPT2ChatbotConf, tokenizer: BertTokenizer):
    self.conf = conf
    self.tokenizer = tokenizer
    self.speaker_ids = tokenizer.convert_tokens_to_ids(["[speaker1]", "[speaker2]"])
    self.pool = Pool(1)
    # get the whole chat log
    logger.info("read raw data...")
    self.chat_log = self._get_chatlog()
    logger.info("num data:{}".format(len(self.chat_log)))
    self.data_iter = iter(self.chat_log)
    self.steps = len(self.chat_log) // self.conf.batch_size
    # spawn a data-loading worker process
    if self.conf.use_multi_proc:
        batch_examples = self._get_batch_examples()
        self.proc = self.pool.apply_async(
            func=LCCCDataGenerator.get_batch_data,
            args=(batch_examples, tokenizer, self.speaker_ids))
def get_encoder_embedding(phrases: List[str], bert: BertModel,
                          tokenizer: BertTokenizer,
                          embed_wo_special_tokens: bool) -> torch.Tensor:
    subtok_ids_list, hypo_mask_list = [], []
    for phr in phrases:
        subtok_ids_list.append(
            tokenizer.convert_tokens_to_ids(['[CLS]'] + tokenizer.tokenize(phr) + ['[SEP]']))
        hypo_mask_list.append([1.0] * len(subtok_ids_list[-1]))
        if embed_wo_special_tokens:
            hypo_mask_list[-1][0] = 0.0
            hypo_mask_list[-1][-1] = 0.0
    batch = HypoDataset.torchify_and_pad(subtok_ids_list, hypo_mask_list)
    subtok_ids_batch, hypo_mask_batch, attn_mask_batch = to_device(*batch)
    h = bert(subtok_ids_batch, attention_mask=attn_mask_batch)[0]
    m = hypo_mask_batch.unsqueeze(2)
    phrase_representations = torch.sum(h * m, 1) / torch.sum(m, 1)
    return phrase_representations
class PredictionModel:
    def __init__(self):
        self.model: BertModel = BertModel.from_pretrained(
            pretrained_model_name_or_path='foodbert/data/mlm_output/checkpoint-final')
        with open('foodbert/data/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        self.tokenizer = BertTokenizer(
            vocab_file='foodbert/data/bert-base-cased-vocab.txt',
            do_lower_case=False,
            max_len=128,
            never_split=used_ingredients)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)

    def predict_embeddings(self, sentences):
        dataset = InstructionsDataset(tokenizer=self.tokenizer, sentences=sentences)
        dataloader = DataLoader(dataset, batch_size=100, pin_memory=True)

        embeddings = []
        ingredient_ids = []
        for batch in dataloader:
            batch = batch.to(self.device)
            with torch.no_grad():
                embeddings_batch = self.model(batch)
            embeddings.extend(embeddings_batch[0])
            ingredient_ids.extend(batch)
        return torch.stack(embeddings), ingredient_ids

    def compute_embedding_for_ingredient(self, sentence, ingredient_name):
        embeddings, ingredient_ids = self.predict_embeddings([sentence])
        embeddings_flat = embeddings.view((-1, 768))
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = self.tokenizer.convert_tokens_to_ids(ingredient_name)
        food_embedding = embeddings_flat[ingredient_ids_flat == food_id].cpu().numpy()
        return food_embedding[0]