def prepare_config_and_inputs(self):
    """Build a small ``PegasusConfig`` together with random model inputs.

    Returns:
        tuple: ``(config, inputs_dict)`` where ``inputs_dict`` is whatever
        ``prepare_pegasus_inputs_dict`` produces from the config, the
        encoder ids and the decoder ids.
    """
    # Draw the encoder ids once. The lower clamp of 3 presumably keeps the
    # lowest (special-token) ids out of the random input -- TODO confirm.
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3)
    input_ids[:, -1] = self.eos_token_id  # every sequence ends with the EOS token

    decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    # Encoder and decoder share the same depth, head count and FFN width.
    config = PegasusConfig(
        vocab_size=self.vocab_size,
        d_model=self.hidden_size,
        encoder_layers=self.num_hidden_layers,
        decoder_layers=self.num_hidden_layers,
        encoder_attention_heads=self.num_attention_heads,
        decoder_attention_heads=self.num_attention_heads,
        encoder_ffn_dim=self.intermediate_size,
        decoder_ffn_dim=self.intermediate_size,
        dropout=self.hidden_dropout_prob,
        attention_dropout=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        eos_token_id=self.eos_token_id,
        bos_token_id=self.bos_token_id,
        pad_token_id=self.pad_token_id,
    )
    inputs_dict = prepare_pegasus_inputs_dict(config, input_ids, decoder_input_ids)
    return config, inputs_dict
def prepare_config_and_inputs(self):
    """Create a decoder-oriented ``PegasusConfig`` plus random test tensors.

    Returns:
        tuple: ``(config, input_ids, attention_mask, lm_labels)``.
        ``attention_mask`` is ``None`` unless ``self.use_attention_mask`` is
        truthy, and ``lm_labels`` is ``None`` unless ``self.use_labels`` is
        truthy.
    """

    def random_ids(vocab_size):
        # All tensors in this helper share the (batch, decoder_seq) shape.
        return ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size)

    # Draw order (ids, then mask, then labels) is kept as in the original.
    input_ids = random_ids(self.vocab_size)
    # A "vocabulary" of size 2 yields a random 0/1 mask.
    attention_mask = random_ids(2) if self.use_attention_mask else None
    lm_labels = random_ids(self.vocab_size) if self.use_labels else None

    config = PegasusConfig(
        vocab_size=self.vocab_size,
        d_model=self.d_model,
        decoder_layers=self.decoder_layers,
        decoder_ffn_dim=self.decoder_ffn_dim,
        encoder_attention_heads=self.encoder_attention_heads,
        decoder_attention_heads=self.decoder_attention_heads,
        eos_token_id=self.eos_token_id,
        bos_token_id=self.bos_token_id,
        use_cache=self.use_cache,
        pad_token_id=self.pad_token_id,
        decoder_start_token_id=self.decoder_start_token_id,
        max_position_embeddings=self.max_position_embeddings,
        is_encoder_decoder=self.is_encoder_decoder,
    )

    return config, input_ids, attention_mask, lm_labels
def __init__(self, model: str = None):
    """Load the tokenizer and model weights for the requested summarizer.

    Args:
        model: One of ``"t5"``, ``"google/pegasus-newsroom"`` or
            ``"facebook/bart-large-cnn"``. ``None`` selects ``"t5"``.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    log.info(model)
    torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    log.info(torch_device)

    if model is None:
        model = "t5"
    self.modelName = model

    # Paths to the local files used for inference.
    self.path = f"./app/api/{model}/"
    self.model_path = self.path + "pytorch_model.bin"
    self.config_path = self.path + "config.json"

    # Select the model implementation based on the requested name.
    if model == "t5":
        # T5 is built from the local config and weights file.
        self.config = T5Config.from_json_file(self.config_path)
        self.model = T5ForConditionalGeneration(self.config)
        self.tokenizer = T5Tokenizer.from_pretrained(self.path)
        self.model.eval()
        # NOTE(review): unlike the other branches, the T5 model itself is not
        # moved to ``torch_device``; only the loaded state dict is mapped there.
        self.model.load_state_dict(torch.load(self.model_path, map_location=torch_device))
    elif model == "google/pegasus-newsroom":
        # The local config file is read, but model and tokenizer are loaded
        # by name through ``from_pretrained``.
        self.config = PegasusConfig.from_json_file(self.config_path)
        self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
        self.tokenizer = PegasusTokenizer.from_pretrained(model)
    elif model == "facebook/bart-large-cnn":
        # Same loading scheme as the Pegasus branch.
        self.config = BartConfig.from_json_file(self.config_path)
        self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
        self.tokenizer = BartTokenizer.from_pretrained(model)
    else:
        # ValueError is still caught by callers that handle ``Exception``.
        raise ValueError("This model is not supported")

    self.text = ""
def __init__(self, parent):
    """Store a tiny ``PegasusConfig`` on ``self.config``.

    Args:
        parent: Accepted for interface compatibility; not used in this body.
    """
    tiny_hparams = {
        "vocab_size": 99,
        "d_model": 24,
        "encoder_layers": 2,
        "decoder_layers": 2,
        "encoder_attention_heads": 2,
        "decoder_attention_heads": 2,
        "encoder_ffn_dim": 32,
        "decoder_ffn_dim": 32,
        "max_position_embeddings": 48,
        "add_final_layer_norm": True,
    }
    self.config = PegasusConfig(**tiny_hparams)
def get_pipeline_config(self):
    """Return a ``PegasusConfig`` with vocabulary and position limits fixed at 200.

    All other hyper-parameters are taken from this object's attributes; the
    encoder and decoder use the same layer count, head count and FFN width.
    """
    layer_count = self.num_hidden_layers
    head_count = self.num_attention_heads
    ffn_width = self.intermediate_size

    pipeline_config = PegasusConfig(
        vocab_size=200,
        max_position_embeddings=200,
        d_model=self.hidden_size,
        encoder_layers=layer_count,
        decoder_layers=layer_count,
        encoder_attention_heads=head_count,
        decoder_attention_heads=head_count,
        encoder_ffn_dim=ffn_width,
        decoder_ffn_dim=ffn_width,
        dropout=self.hidden_dropout_prob,
        attention_dropout=self.attention_probs_dropout_prob,
        eos_token_id=self.eos_token_id,
        bos_token_id=self.bos_token_id,
        pad_token_id=self.pad_token_id,
    )
    return pipeline_config
def main():
    """Generate predictions for the test set and write them to JSON.

    Loads the tokenizer and config from ``./page_arciv/``, builds the model,
    restores weights from ``./pagesus_section/best_model.hdf5``, runs
    ``just_predict`` over ``./final_test_data_list.json`` and writes the
    result to ``./pred_result.json``.
    """
    pretrain_dir = './page_arciv/'
    max_input_len = 1024
    max_output_len = 256

    tokenizer = PegasusTokenizer.from_pretrained(pretrain_dir)
    pegasus_cfg = PegasusConfig.from_json_file(os.path.join(pretrain_dir, 'config.json'))
    samples = load_data('./final_test_data_list.json')

    model = build_model(pretrain_dir, pegasus_cfg, max_input_len, max_output_len)
    model.load_weights('./pagesus_section/best_model.hdf5')

    # Decoding starts from the pad token and stops at the EOS token.
    decoder = AutoTitle(
        start_id=tokenizer.pad_token_id,
        end_id=tokenizer.eos_token_id,
        maxlen=256,
        max_decode_len=max_output_len,
        model=model,
    )
    predictions = just_predict(decoder, tokenizer, max_input_len, samples)

    with open('./pred_result.json', 'w', encoding='utf-8') as out_file:
        # NpEncoder is the project's JSON encoder class passed via ``cls``.
        serialized = json.dumps(predictions, ensure_ascii=False, cls=NpEncoder)
        out_file.write(serialized)
def convert_pegasus_to_bart(
        tf_weights: dict, cfg_updates: dict) -> PegasusForConditionalGeneration:
    """Build a ``PegasusForConditionalGeneration`` from TensorFlow weights.

    Args:
        tf_weights: Mapping from TensorFlow variable name to weight array.
        cfg_updates: Config overrides applied on top of ``DEFAULTS``.

    Returns:
        The PyTorch model with the converted weights loaded.

    Raises:
        ValueError: If a renamed TF key has no counterpart in the state dict.
        AssertionError: If a converted weight has the wrong shape, or if the
            load reports unexpected keys or missing keys other than the
            position embeddings.
    """
    # Overlay the caller's overrides on the defaults and build the config
    # from the merged result, so the defaults actually take effect.
    cfg_kwargs = DEFAULTS.copy()
    cfg_kwargs.update(cfg_updates)
    cfg = PegasusConfig(**cfg_kwargs)

    torch_model = PegasusForConditionalGeneration(cfg)
    sd = torch_model.model.state_dict()

    mapping = {}
    for k, v in tf_weights.items():
        new_k = rename_state_dict_key(k)
        if new_k not in sd:
            raise ValueError(
                f"could not find new key {new_k} in state dict. (converted from {k})"
            )
        # Dense / projection kernels are transposed before loading.
        if "dense" in k or "proj" in new_k:
            v = v.T
        mapping[new_k] = torch.tensor(v, dtype=sd[new_k].dtype)
        assert v.shape == sd[new_k].shape, f"{new_k}, {k}, {v.shape}, {sd[new_k].shape}"

    # Make sure embedding.padding_idx is respected: zero the pad-token row.
    mapping["shared.weight"][cfg.pad_token_id] = torch.zeros_like(
        mapping["shared.weight"][cfg.pad_token_id + 1])
    # Encoder and decoder token embeddings reuse the shared embedding tensor.
    mapping["encoder.embed_tokens.weight"] = mapping["shared.weight"]
    mapping["decoder.embed_tokens.weight"] = mapping["shared.weight"]

    # Biases with no TF counterpart are initialised to zero.
    empty_biases = {
        k: torch.zeros_like(v)
        for k, v in sd.items()
        if k.endswith("bias") and k not in mapping
    }
    mapping.update(empty_biases)

    missing, extra = torch_model.model.load_state_dict(mapping, strict=False)
    # Position embeddings are the only keys allowed to be absent.
    allowed_missing = ("encoder.embed_positions.weight", "decoder.embed_positions.weight")
    unexpected_missing = [k for k in missing if k not in allowed_missing]
    assert unexpected_missing == [], f"no matches found for the following torch keys {unexpected_missing}"
    assert extra == [], f"no matches found for the following tf keys {extra}"
    return torch_model
def main():
    """Train the model on the abstract-generation data set.

    Loads the tokenizer and config from ``./page_arciv/``, shuffles the data,
    holds out the first five samples for the ``Evaluator`` callback, builds
    the model under a ``MirroredStrategy`` and fits it for 50 epochs.
    """
    pretrain_dir = './page_arciv/'
    tokenizer = PegasusTokenizer.from_pretrained(pretrain_dir)
    pegasus_cfg = PegasusConfig.from_json_file(os.path.join(pretrain_dir, 'config.json'))

    max_input_len = 1920
    max_output_len = 600
    batch_size = 2

    samples = load_data(
        '/home_zyz/abstract_generate/final_abdata/union_add_noabs_cleaned_1920.json'
    )
    random.shuffle(samples)
    print(len(samples))
    first_sample = samples[0]
    print(first_sample[0])
    print(first_sample[1])

    # The first five shuffled samples are used for validation, the rest for training.
    valid_data, train_data = samples[:5], samples[5:]
    train_generator = data_generator(
        train_data, batch_size, max_input_len, max_output_len, tokenizer)

    K.clear_session()
    strategy = tf.distribute.MirroredStrategy()
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    # NOTE(review): the original indentation was lost; only model construction
    # is placed inside the strategy scope here -- confirm against the source.
    with strategy.scope():
        model = build_model(pretrain_dir, pegasus_cfg, max_input_len, max_output_len)

    epochs = 50
    # Decoding starts from the pad token and stops at the EOS token.
    autotitle = AutoTitle(
        start_id=tokenizer.pad_token_id,
        end_id=tokenizer.eos_token_id,
        maxlen=599,
        max_decode_len=max_output_len,
        model=model,
    )
    evaluator = Evaluator(tokenizer, max_input_len, autotitle, valid_data)

    model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator) - 1,
        epochs=epochs,
        callbacks=[evaluator],
    )
return {'Total': total_num, 'Trainable': trainable_num}


# Sample Chinese news passage used as the input text (literal kept verbatim).
text = "四海网讯,近日,有媒体报道称:章子怡真怀孕了!报道还援引知情人士消息称,“章子怡怀孕大概四五个月,预产期是年底前后,现在已经不接工作了。”这到底是怎么回事?消息是真是假?针对此消息,23日晚8时30分," \
       "华西都市报记者迅速联系上了与章子怡家里关系极好的知情人士,这位人士向华西都市报记者证实说:“子怡这次确实怀孕了。她已经36岁了,也该怀孕了。章子怡怀上汪峰的孩子后,子怡的父母亲十分高兴。子怡的母亲," \
       "已开始悉心照料女儿了。子怡的预产期大概是今年12月底。”当晚9时,华西都市报记者为了求证章子怡怀孕消息,又电话联系章子怡的亲哥哥章子男,但电话通了," \
       "一直没有人接听。有关章子怡怀孕的新闻自从2013年9月份章子怡和汪峰恋情以来,就被传N遍了!不过,时间跨入2015年,事情却发生着微妙的变化。2015年3月21日,章子怡担任制片人的电影《从天儿降》开机," \
       "在开机发布会上几张合影,让网友又燃起了好奇心:“章子怡真的怀孕了吗?”但后据证实,章子怡的“大肚照”只是影片宣传的噱头。过了四个月的7月22日,《太平轮》新一轮宣传,章子怡又被发现状态不佳,不时深呼吸," \
       "不自觉想捂住肚子,又觉得不妥。然后在8月的一天,章子怡和朋友吃饭,在酒店门口被风行工作室拍到了,疑似有孕在身!今年7月11日,汪峰本来在上海要举行演唱会,后来因为台风“灿鸿”取消了。而消息人士称," \
       "汪峰原来打算在演唱会上当着章子怡的面宣布重大消息,而且章子怡已经赴上海准备参加演唱会了,怎知遇到台风,只好延期,相信9月26日的演唱会应该还会有惊喜大白天下吧。 "

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# A BertTokenizer is loaded from the 'uer/pegasus-base-chinese-cluecorpussmall' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'uer/pegasus-base-chinese-cluecorpussmall')

# Start from a default PegasusConfig and override individual fields below.
config = PegasusConfig()
# config.activation_dropout = 0.1
config.activation_function = 'relu'
config.d_model = 384
config.decoder_attention_heads = 6
config.decoder_ffn_dim = 1536
# NOTE(review): 101 / 102 are presumably the [CLS] / [SEP] ids of the
# BERT-style vocabulary -- confirm against the tokenizer.
config.decoder_start_token_id = 101
config.decoder_layers = 6
config.dropout = 0.0  # disable dropout so that every prediction gives the same result
config.encoder_attention_heads = 6
config.encoder_ffn_dim = 1536
config.encoder_layers = 6
config.forced_eos_token_id = 102
config.scale_embedding = True
# NOTE(review): presumably matches the tokenizer's vocabulary size -- confirm.
config.vocab_size = 21128