def setUp(self):
    """Run the parent set-up, then save a fixture tokenizer to ``self.tmpdirname``."""
    super().setUp()

    # The tokenizer is built from the SentencePiece fixture used for testing.
    fixture_tokenizer = PegasusTokenizer(
        SAMPLE_VOCAB,
        offset=0,
        mask_token_sent=None,
        mask_token="[MASK]",
    )
    fixture_tokenizer.save_pretrained(self.tmpdirname)
def exec(self, text):
    """Generate output text for ``text`` with the Pegasus checkpoint ``self.model``.

    The tokenizer and model are loaded on every call and moved to CUDA when
    it is available, otherwise to the CPU.

    :param text: a single source text.
    :return: the decoded generation for ``text`` (special tokens skipped).
    """
    model_name = self.model
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)

    # Call the tokenizer directly instead of the deprecated
    # ``prepare_seq2seq_batch``; ``return_tensors='pt'`` is explicit so that
    # ``.to(torch_device)`` always receives tensors.
    batch = tokenizer(
        [text],
        truncation=True,
        padding='longest',
        return_tensors='pt',
    ).to(torch_device)

    result = model.generate(**batch)
    generated = tokenizer.batch_decode(result, skip_special_tokens=True)[0]

    if model_name == "google/pegasus-cnn_dailymail":
        # For this checkpoint, literal "<n>" markers in the output are
        # replaced by spaces.
        generated = generated.replace('<n>', ' ')
    return generated
def get_model_tokenizer(model_name):
    """Load the model and tokenizer classes that match ``model_name``.

    The class pair is chosen by substring match on ``model_name``:
    "pegasus" selects the Pegasus classes, "bart-large" or
    "bart-custom-large" selects the BART classes, and anything else falls
    back to the Auto classes. The model is moved to CUDA when available,
    otherwise to the CPU.

    :param model_name: name or path handed to ``from_pretrained``.
    :return: ``(model, tokenizer)`` tuple.
    """
    import torch

    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

    if "pegasus" in model_name:
        from transformers import PegasusForConditionalGeneration, PegasusTokenizer
        tokenizer_cls = PegasusTokenizer
        model_cls = PegasusForConditionalGeneration
    elif "bart-large" in model_name or "bart-custom-large" in model_name:
        # Both BART spellings load through the same classes.
        from transformers import BartForConditionalGeneration, BartTokenizer
        tokenizer_cls = BartTokenizer
        model_cls = BartForConditionalGeneration
    else:
        # T5 or distilbart.
        # NOTE(review): AutoModelWithLMHead is deprecated in recent
        # transformers releases; AutoModelForSeq2SeqLM looks like the
        # replacement for these models - confirm before switching.
        from transformers import AutoModelWithLMHead, AutoTokenizer
        tokenizer_cls = AutoTokenizer
        model_cls = AutoModelWithLMHead

    tokenizer = tokenizer_cls.from_pretrained(model_name)
    model = model_cls.from_pretrained(model_name).to(torch_device)
    return model, tokenizer
class ParaPhrasing:
    """Class loads pegasus model for text augmentation"""

    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Tokenizer and model are class attributes: they are loaded when the class
    # body executes and are shared by every call to ``paraphrases``.
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)

    @staticmethod
    def paraphrases(input_text, num_return_sequences=10, num_beams=10):
        """
        generates variations for a given sentence/text

        :param input_text: sentence or text
        :param num_return_sequences: Number of variations to be returned
        :param num_beams: Number of beams for beam search. 1 means no beam search
        :return: list of variations of the input text
        """
        if isinstance(input_text, str):
            input_text = [input_text]

        # Call the tokenizer directly instead of the deprecated
        # ``prepare_seq2seq_batch``; ``return_tensors='pt'`` is explicit so
        # that ``.to(...)`` always receives tensors.
        batch = ParaPhrasing.tokenizer(
            input_text,
            truncation=True,
            padding='longest',
            max_length=60,
            return_tensors='pt',
        ).to(ParaPhrasing.torch_device)

        # NOTE(review): ``temperature`` is passed without enabling sampling;
        # it presumably has no effect on pure beam search - confirm.
        translated = ParaPhrasing.model.generate(
            **batch,
            max_length=60,
            num_beams=num_beams,
            num_return_sequences=num_return_sequences,
            temperature=1.5)
        return ParaPhrasing.tokenizer.batch_decode(
            translated, skip_special_tokens=True)
def _test_TFPegasus(self, size, large=False):
    """Load the TF Pegasus checkpoint ``size`` and hand it to ``self.run_test``.

    :param size: name or path passed to ``from_pretrained``.
    :param large: forwarded unchanged to ``self.run_test``.
    """
    from transformers import PegasusTokenizer, TFPegasusModel

    tokenizer = PegasusTokenizer.from_pretrained(size)
    model = TFPegasusModel.from_pretrained(size)

    def encode(sentence):
        # Token ids of ``sentence`` as a TensorFlow tensor.
        return tokenizer(sentence, return_tensors="tf").input_ids

    encoder_ids = encode("Studies have been shown that owning a dog is good for you")
    decoder_ids = encode("Studies show that")
    feed = {"input_ids": encoder_ids, "decoder_input_ids": decoder_ids}

    # These constants originate in TFPegasusEncoder/Decoder:
    #   self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
    # They are meant to come from the config, but TF reports them as model
    # inputs, so they are supplied explicitly. According to the original
    # author's note this may be new as of a 2.4.2 release of a dependency.
    scale = np.array([32.], dtype=np.float32)
    extra_input = {
        "tf_pegasus_model/model/decoder/mul/y:0": scale,
        "tf_pegasus_model/model/encoder/mul/y:0": np.array([32.], dtype=np.float32),
    }

    spec, feed = self.spec_and_pad(feed, max_length=model.config.max_length)
    self.run_test(
        model,
        feed,
        input_signature=spec,
        outputs=["last_hidden_state"],
        large=large,
        extra_input=extra_input,
    )
def __init__(self, model: str = None):
    """Load the model, tokenizer and config for the requested ``model``.

    :param model: one of ``"t5"`` (used when ``None`` is passed),
        ``"google/pegasus-newsroom"`` or ``"facebook/bart-large-cnn"``.
    :raises ValueError: if ``model`` is none of the supported names.
    """
    log.info(model)
    torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    log.info(torch_device)

    if model is None:
        model = "t5"
    self.modelName = model

    # Paths to the files used for inference.
    self.path = f"./app/api/{model}/"
    self.model_path = self.path + "pytorch_model.bin"
    self.config_path = self.path + "config.json"

    # Select the implementation matching the requested model (default: t5).
    if model == "t5":
        # T5 is built from the local config and weights file.
        self.config = T5Config.from_json_file(self.config_path)
        self.model = T5ForConditionalGeneration(self.config)
        self.tokenizer = T5Tokenizer.from_pretrained(self.path)
        self.model.eval()
        self.model.load_state_dict(
            torch.load(self.model_path, map_location=torch_device))
    elif model == "google/pegasus-newsroom":
        # Config is read from the local path; model and tokenizer are loaded
        # by name through ``from_pretrained``.
        self.config = PegasusConfig.from_json_file(self.config_path)
        self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
        self.tokenizer = PegasusTokenizer.from_pretrained(model)
    elif model == "facebook/bart-large-cnn":
        self.config = BartConfig.from_json_file(self.config_path)
        self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
        self.tokenizer = BartTokenizer.from_pretrained(model)
    else:
        # ValueError is a subclass of Exception, so existing
        # ``except Exception`` handlers keep working.
        raise ValueError("This model is not supported")

    self.text = str()
def compute(sm):
    """Summarize ``sm`` section by section with ``google/pegasus-xsum``.

    ``sm`` is split into sections by ``splitText``; every section is
    tokenized first, then each one is summarized independently.

    :param sm: the text to summarize.
    :return: list with one summary string per section, or the empty string
        ``""`` when preparing the sections fails.
    """
    # Import the Pegasus model
    model_name = 'google/pegasus-xsum'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)

    # Get sections to be summarized
    sen_list = splitText(sm, len(sm))

    try:
        # Preparation: tokenize every section up front so that a failure
        # aborts before any generation work is done. The tokenizer is called
        # directly instead of the deprecated ``prepare_seq2seq_batch``.
        batches = [
            tokenizer([section], truncation=True, padding='longest',
                      return_tensors='pt').to(torch_device)
            for section in sen_list
        ]
    except Exception:
        # Best-effort contract kept from the original: signal failure with an
        # empty string. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return ""

    # Summary generation, then decoding of the first sequence per section.
    return [
        tokenizer.batch_decode(model.generate(**batch),
                               skip_special_tokens=True)[0]
        for batch in batches
    ]
def __init__(self, config):
    """Load the ``google/pegasus-reddit_tifu`` tokenizer and model.

    Both are fetched with ``force_download=True``. ``config`` is accepted
    but not read here.
    """
    self.model_name = 'google/pegasus-reddit_tifu'

    cuda_ok = torch.cuda.is_available()
    self.device = 'cuda' if cuda_ok else 'cpu'
    print(f"using device: {self.device}")

    checkpoint = self.model_name
    self.tokenizer = PegasusTokenizer.from_pretrained(
        checkpoint, force_download=True)
    loaded = PegasusForConditionalGeneration.from_pretrained(
        checkpoint, force_download=True)
    self.model = loaded.to(self.device)
def load_model(self):
    """Load the paraphrase model and tokenizer stored under ``settings.BASE_DIR``.

    :return: ``(model, tokenizer, device)``; the model has already been moved
        to ``device`` (CUDA when available, otherwise CPU).
    """
    assets_root = os.path.join(settings.BASE_DIR, 'paraphrase_utils')

    model = PegasusForConditionalGeneration.from_pretrained(
        os.path.join(assets_root, 'model'))
    tokenizer = PegasusTokenizer.from_pretrained(
        os.path.join(assets_root, 'tokenizer'))

    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    return model.to(device), tokenizer, device
def __init__(self, args, device):
    """Initialise the parent, then load the Pegasus model and tokenizer.

    :param args: must expose ``pretrained_model_name``, which has to be one
        of ``self.PRETRAINED_MODEL_NAMES``.
    :param device: forwarded to the parent initialiser.
    """
    super().__init__(args, device)

    requested = args.pretrained_model_name
    assert requested in self.PRETRAINED_MODEL_NAMES
    self.pretrained_model_name = requested

    logging.info(f'Loading Pegasus ({self.pretrained_model_name})')

    model = PegasusForConditionalGeneration.from_pretrained(
        self.pretrained_model_name)
    self.model = model.to(self.device)
    self.tokenizer: PegasusTokenizer = PegasusTokenizer.from_pretrained(
        self.pretrained_model_name)
def generate_summary(text, model_name):
    """Generate a summary of ``text`` with the Pegasus checkpoint ``model_name``.

    :param text: source text handed to the tokenizer.
    :param model_name: name or path passed to ``from_pretrained``.
    :return: the first decoded sequence, with special tokens skipped.
    """
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)

    # The tokenizer is called directly; ``prepare_seq2seq_batch`` is
    # deprecated in transformers.
    batch = tokenizer(
        text,
        truncation=True,
        padding="longest",
        return_tensors="pt",
    ).to(torch_device)

    translated = model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
def generate_summary(context):
    """Summarize ``context`` with the ``google/pegasus-xsum`` checkpoint.

    :param context: source text handed to the tokenizer.
    :return: list of decoded sequences, with special tokens skipped.
    """
    model_name = 'google/pegasus-xsum'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)

    # Tokenize the caller's text itself (previously the literal string
    # 'context' was tokenized). The padding strategy is spelled
    # 'max_length'; 'max-length' is not a valid strategy name.
    batch = tokenizer(
        context,
        truncation=True,
        padding='max_length',
        return_tensors="pt",
    )

    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
def single_document_summarization(self, src_text):
    """Summarize ``src_text`` with the Pegasus checkpoint ``self.model_name``.

    :param src_text: source text (or texts) handed to the tokenizer.
    :return: list of decoded sequences, with special tokens skipped.
    """
    tokenizer = PegasusTokenizer.from_pretrained(self.model_name)
    # Use ``self.torch_device`` for the model as well, so the model and its
    # inputs are guaranteed to live on the same device.
    model = PegasusForConditionalGeneration.from_pretrained(
        self.model_name).to(self.torch_device)

    batch = tokenizer(
        src_text,
        truncation=True,
        padding=True,
        return_tensors='pt',
    ).to(self.torch_device)

    translated = model.generate(**batch)
    generated_summary = tokenizer.batch_decode(translated,
                                               skip_special_tokens=True)
    return generated_summary
def get_summary(text):
    """Summarize ``text`` with the ``google/pegasus-xsum`` checkpoint.

    :param text: source text to summarize.
    :return: the first decoded summary, or the sentinel ``-100`` when loading,
        tokenizing, generating or decoding raises.
    """
    try:
        model_name = 'google/pegasus-xsum'
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
        batch = tokenizer(
            [text],
            truncation=True,
            padding='longest',
            return_tensors="pt",
        ).to(device)
        translated = model.generate(**batch)
        target = tokenizer.batch_decode(translated, skip_special_tokens=True)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed. The sentinel
        # return value is kept for existing callers.
        print("API Error occured")
        return (-100)
    return target[0]
def test_tokenization_pegasus(self):
    """Compare the Rust Pegasus tokenizer against the Python baseline.

    Token ids may differ in order but must form the same multiset; the
    special-tokens masks must be equal.
    """
    # Given
    self.base_tokenizer = PegasusTokenizer.from_pretrained(
        'google/pegasus-cnn_dailymail', cache_dir=self.test_dir)
    spiece_path = get_from_cache(
        'https://cdn.huggingface.co/google/pegasus-cnn_dailymail/spiece.model'
    )
    self.rust_tokenizer = PyPegasusTokenizer(spiece_path, do_lower_case=False)

    output_baseline = [
        self.base_tokenizer.encode_plus(
            example.text_a,
            add_special_tokens=True,
            return_overflowing_tokens=True,
            return_special_tokens_mask=True,
            max_length=128)
        for example in self.examples
    ]

    # When
    # Note: the original sentence piece tokenizer strips trailing spaces
    stripped_inputs = [example.text_a.strip() for example in self.examples]
    output_rust = self.rust_tokenizer.encode_list(
        stripped_inputs,
        max_len=256,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
        if rust.token_ids != baseline['input_ids']:
            # A pure re-ordering (same length, same id counts) is tolerated;
            # a different length or different counts is a failure.
            length_differs = len(rust.token_ids) != len(baseline['input_ids'])
            if length_differs or Counter(rust.token_ids) != Counter(
                    baseline['input_ids']):
                raise AssertionError(
                    f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n '
                    f'Sentence a: {self.examples[idx].text_a} \n'
                    f'Sentence b: {self.examples[idx].text_b} \n'
                    f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
                    f'Rust: {rust.token_ids} \n'
                    f'Python {baseline["input_ids"]}')
        assert (
            rust.special_tokens_mask == baseline['special_tokens_mask'])
def load_BART_or_PEGASUS(mname):
    """Load a BART or PEGASUS model and its tokenizer by name.

    The family is picked by a case-insensitive substring match on ``mname``;
    "bart" is checked before "pegasus".

    :param mname: name or path passed to ``from_pretrained``.
    :return: ``(model, tokenizer)`` tuple.
    :raises NotImplementedError: if ``mname`` contains neither "bart" nor
        "pegasus".
    """
    lowered = mname.lower()
    is_bart = 'bart' in lowered
    is_pegasus = 'pegasus' in lowered

    if not (is_bart or is_pegasus):
        raise NotImplementedError("UNKOWN model name.")

    if is_bart:
        from transformers import BartTokenizer, BartForConditionalGeneration
        model_cls, tokenizer_cls = BartForConditionalGeneration, BartTokenizer
    else:
        from transformers import PegasusTokenizer, PegasusForConditionalGeneration
        model_cls, tokenizer_cls = PegasusForConditionalGeneration, PegasusTokenizer

    model = model_cls.from_pretrained(mname)
    tokenizer = tokenizer_cls.from_pretrained(mname)
    return model, tokenizer
def summarizeP(src_text, variant="xsum", device=None):
    """Summarize ``src_text`` with the ``google/pegasus-<variant>`` checkpoint.

    :param src_text: source text (or texts) handed to the tokenizer.
    :param variant: suffix appended to ``"google/pegasus-"`` to form the
        checkpoint name.
    :param device: device to run on; when ``None``, CUDA is used if
        available, otherwise the CPU.
    :return: list of decoded sequences, with special tokens skipped.
    """
    model_name = "google/pegasus-" + variant

    if device is None:
        torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        torch_device = device

    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)

    # The tokenizer is called directly instead of the deprecated
    # ``prepare_seq2seq_batch``; ``return_tensors='pt'`` is explicit so that
    # ``.to(torch_device)`` always receives tensors.
    batch = tokenizer(
        src_text,
        truncation=True,
        padding='longest',
        return_tensors='pt',
    ).to(torch_device)

    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
def load_data(dataset_dir,
              data_name,
              tokenizer_name='bart-large-cnn',
              batch_size=7,
              split='test',
              max_sample_num=34,
              max_length=500):
    """Yield tokenized (document, summary) batches from a dataset.

    This is a generator: nothing runs until the first item is requested.

    :param dataset_dir: cache directory passed to ``load_dataset`` (xsum only).
    :param data_name: ``'xsum'``, ``'cnndm'`` or ``'cnn_dailymail'``.
    :param tokenizer_name: name containing 'bart', 'gpt' or 'pegasus'; it
        selects the tokenizer class and is passed to ``from_pretrained``.
    :param batch_size: number of examples per yielded batch.
    :param split: dataset split passed to ``load_dataset`` (xsum only).
    :param max_sample_num: iteration stops once more than this many examples
        have been consumed.
    :param max_length: ``max_length`` passed to the tokenizer.
    :raises NotImplementedError: for an unknown dataset or tokenizer name.
    """
    is_xsum = data_name == 'xsum'
    is_cnndm = data_name in ('cnndm', 'cnn_dailymail')

    if is_xsum:
        dataset = load_dataset(data_name, cache_dir=dataset_dir, split=split)
        print("Assume only use one subset of the dataset")
        if len(dataset) > max_sample_num:
            dataset = dataset.shuffle()
    elif is_cnndm:
        dataset = yield_cnndm()
    else:
        raise NotImplementedError("Unkown dataset")

    if 'bart' in tokenizer_name:
        tokenizer = BartTokenizer.from_pretrained(tokenizer_name)
    elif 'gpt' in tokenizer_name:
        from transformers import GPT2Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif 'pegasus' in tokenizer_name:
        from transformers import PegasusTokenizer
        tokenizer = PegasusTokenizer.from_pretrained(tokenizer_name)
        print("Load PEGASUS tokenizer...")
    else:
        raise NotImplementedError

    docs, summaries = [], []
    # ``seen`` is the 1-based count of examples consumed so far.
    for seen, example in enumerate(dataset, start=1):
        if is_xsum:
            doc = example[dataset_meta[data_name]['key_doc']]
            summary = example[dataset_meta[data_name]['key_sum']]
        else:
            # cnndm examples are iterated as (document, summary) pairs.
            doc, summary = example

        docs.append(doc)
        summaries.append(summary)

        if len(docs) == batch_size:
            assert len(docs) == len(summaries)
            yield tokenizer.prepare_seq2seq_batch(docs,
                                                  tgt_texts=summaries,
                                                  max_length=max_length,
                                                  truncation=True,
                                                  padding='longest',
                                                  return_tensors='pt')
            docs, summaries = [], []

        # Examples still buffered when the loop ends are not yielded.
        if seen > max_sample_num:
            break
def convert_pegasus_ckpt_to_pytorch(ckpt_path, save_dir):
    """Convert a TF Pegasus checkpoint and save tokenizer and model to ``save_dir``.

    The dataset name is taken from the name of the checkpoint's parent
    directory and is used to look up the per-dataset settings.

    :param ckpt_path: path to the TF checkpoint.
    :param save_dir: directory that receives the tokenizer and the model.
    """
    dataset = Path(ckpt_path).parent.name

    # Tokenizer first.
    wanted_length = max_model_length[dataset]
    tokenizer = PegasusTokenizer.from_pretrained(
        "sshleifer/pegasus", model_max_length=wanted_length)
    assert tokenizer.model_max_length == wanted_length
    tokenizer.save_pretrained(save_dir)

    # Then the model.
    tf_weights = get_tf_weights_as_numpy(ckpt_path)
    cfg_updates = {
        "max_length": max_gen_length[dataset],
        # 0.8 is the fallback when the dataset has no entry of its own.
        "length_penalty": expected_alpha.get(dataset, 0.8),
    }
    converted = convert_pegasus_to_bart(tf_weights, cfg_updates)
    converted.save_pretrained(save_dir)
def execute_pegasus_augmentation(data, file_path) -> pd.DataFrame:
    """Augment ``data`` with paraphrases of its ``summary`` column.

    ``get_response`` is applied to every summary; the result is exploded to
    one row per paraphrase, rows with missing values are dropped, and the
    frame is written to ``<file_path>-Processed-Summarized-Augmented.csv``.

    :param data: frame with at least ``summary`` and ``sentiment`` columns.
    :param file_path: prefix of the output CSV file name.
    :return: the exploded, NaN-free frame that was written to disk.
    """
    checkpoint = var.PARAPHRASING_MODEL
    tokenizer = PegasusTokenizer.from_pretrained(checkpoint)
    model = PegasusForConditionalGeneration.from_pretrained(checkpoint).to(torch_device)

    # Work on a copy so the caller's frame is left untouched.
    train = data.copy()[['summary', 'sentiment']]

    train['paraphrased text'] = train['summary'].progress_apply(
        get_response,
        num_return_sequences=10,
        tokenizer=tokenizer,
        model=model,
    )

    generated = train.explode('paraphrased text').dropna()
    output_name = '{}-Processed-Summarized-Augmented.csv'.format(file_path)
    generated.to_csv(output_name, index=False)
    return generated
def to_pytorch(ckpt_path, save_path):
    """Convert a TF Pegasus checkpoint and save tokenizer and model to ``save_path``.

    The dataset name is taken from the name of the checkpoint's parent
    directory; its settings are read from ``task_params["sum_<dataset>"]``.

    :param ckpt_path: path to the TF checkpoint.
    :param save_path: directory that receives the tokenizer and the model.
    """
    dataset = Path(ckpt_path).parent.name
    dataset_params = task_params[f"sum_{dataset}"]

    # Tokenizer first.
    desired_max_model_length = dataset_params["n_pos"]
    tok = PegasusTokenizer.from_pretrained(
        "sshleifer/pegasus", model_max_length=desired_max_model_length)
    assert tok.model_max_length == desired_max_model_length
    tok.save_pretrained(save_path)

    # Then the model.
    tf_weights = get_tf_weights_as_numpy(ckpt_path)
    # Shallow copy: adding the "task_params" key below must not write into
    # the shared module-level ``task_params`` entry, which would make that
    # dict contain a reference to itself.
    cfg_updates = dict(dataset_params)
    if dataset == "large":
        cfg_updates["task_params"] = task_params
    torch_model = convert_pegasus(tf_weights, cfg_updates)
    torch_model.save_pretrained(save_path)

    # Re-save the weights without the positional-embedding tensors; this
    # overwrites the file of the same name in ``save_path``.
    sd = torch_model.state_dict()
    sd.pop("model.decoder.embed_positions.weight")
    sd.pop("model.encoder.embed_positions.weight")
    torch.save(sd, Path(save_path) / "pytorch_model.bin")
class ParaPhrasing:
    """Paraphrase generator backed by the ``tuner007/pegasus_paraphrase`` model."""

    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Tokenizer and model are class attributes: they are loaded when the class
    # body executes and are shared by every call to ``paraphrases``.
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    @staticmethod
    def paraphrases(input_text, num_return_sequences=10, num_beams=10):
        """Generate paraphrases of ``input_text``.

        :param input_text: a single string or a list of strings.
        :param num_return_sequences: forwarded to ``generate``.
        :param num_beams: forwarded to ``generate``.
        :return: list of decoded sequences, with special tokens skipped.
        """
        if isinstance(input_text, str):
            input_text = [input_text]

        # Call the tokenizer directly instead of the deprecated
        # ``prepare_seq2seq_batch``; ``return_tensors='pt'`` is explicit so
        # that ``.to(...)`` always receives tensors.
        batch = ParaPhrasing.tokenizer(
            input_text,
            truncation=True,
            padding='longest',
            max_length=60,
            return_tensors='pt',
        ).to(ParaPhrasing.torch_device)

        # NOTE(review): ``temperature`` is passed without enabling sampling;
        # it presumably has no effect on pure beam search - confirm.
        translated = ParaPhrasing.model.generate(
            **batch,
            max_length=60,
            num_beams=num_beams,
            num_return_sequences=num_return_sequences,
            temperature=1.5)
        return ParaPhrasing.tokenizer.batch_decode(translated, skip_special_tokens=True)
def generate_summary(text):
    """Summarize ``text`` with the ``google/pegasus-xsum`` checkpoint.

    :param text: source text handed to the tokenizer.
    :return: the first generated sequence, decoded to a string.
    """
    checkpoint_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    checkpoint_model = PegasusForConditionalGeneration.from_pretrained(
        "google/pegasus-xsum")

    # Text -> token ids.
    encoded = checkpoint_tokenizer(
        text,
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )
    generated = checkpoint_model.generate(**encoded)

    # NOTE(review): decode is called without ``skip_special_tokens``, so the
    # returned string presumably still contains special tokens - confirm
    # whether callers rely on that.
    return checkpoint_tokenizer.decode(generated[0])
def test_pegasus_xsum_summary(self):
    """Check the summaries generated by the Flax ``google/pegasus-xsum`` model.

    Two fixed source texts are tokenized as NumPy arrays, summarized with
    two-beam search, and the decoded output must equal ``tgt_text`` exactly.
    """
    model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

    # Source texts; the expected summaries below depend on this exact text.
    src_text = [
        """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""",
        """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their 
Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """,
    ]

    # Expected summaries, in the same order as ``src_text``.
    tgt_text = [
        "California's largest electricity provider has turned off power to hundreds of thousands of customers.",
        "Pop group N-Dubz have revealed they were surprised to get four nominations for this year's Mobo Awards.",
    ]

    # Inputs are truncated to 512 tokens and padded to a common length.
    inputs = tokenizer(src_text, return_tensors="np", truncation=True, max_length=512, padding=True)
    translated_tokens = model.generate(**inputs, num_beams=2).sequences
    decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    assert tgt_text == decoded
def main():
    """Run prediction over the test data and write the result to JSON.

    Loads the tokenizer and config from ``./page_arciv/``, builds the model,
    restores weights from ``./pagesus_section/best_model.hdf5``, predicts over
    ``./final_test_data_list.json`` and writes ``./pred_result.json``.
    """
    pretrain_dir = './page_arciv/'
    source_max_len = 1024
    target_max_len = 256

    tokenizer = PegasusTokenizer.from_pretrained(pretrain_dir)
    psus_config = PegasusConfig.from_json_file(
        os.path.join(pretrain_dir, 'config.json'))

    data = load_data('./final_test_data_list.json')

    model = build_model(pretrain_dir, psus_config, source_max_len,
                        target_max_len)
    model.load_weights('./pagesus_section/best_model.hdf5')

    # Decoding starts from the pad token id and ends on the EOS token id.
    autotitle = AutoTitle(
        start_id=tokenizer.pad_token_id,
        end_id=tokenizer.eos_token_id,
        maxlen=256,
        max_decode_len=target_max_len,
        model=model,
    )
    result = just_predict(autotitle, tokenizer, source_max_len, data)

    with open('./pred_result.json', 'w', encoding='utf-8') as out_file:
        payload = json.dumps(result, ensure_ascii=False, cls=NpEncoder)
        out_file.write(payload)
def convert_pegasus_ckpt_to_pytorch(ckpt_path: str, save_dir: str):
    """Convert a TF Pegasus checkpoint and save tokenizer and model to ``save_dir``.

    The dataset name is taken from the name of the checkpoint's parent
    directory; its settings are read from
    ``task_specific_params["summarization_<dataset>"]``.

    :param ckpt_path: path to the TF checkpoint.
    :param save_dir: directory that receives the tokenizer and the model.
    """
    dataset = Path(ckpt_path).parent.name
    dataset_params = task_specific_params[f"summarization_{dataset}"]

    # save tokenizer first
    desired_max_model_length = dataset_params["max_position_embeddings"]
    tok = PegasusTokenizer.from_pretrained(
        "sshleifer/pegasus", model_max_length=desired_max_model_length)
    assert tok.model_max_length == desired_max_model_length
    tok.save_pretrained(save_dir)

    # convert model
    tf_weights = get_tf_weights_as_numpy(ckpt_path)
    # Shallow copy: adding the "task_specific_params" key below must not
    # write into the shared module-level ``task_specific_params`` entry,
    # which would make that dict contain a reference to itself.
    cfg_updates = dict(dataset_params)
    if dataset == "large":
        cfg_updates["task_specific_params"] = task_specific_params
    torch_model = convert_pegasus(tf_weights, cfg_updates)
    torch_model.save_pretrained(save_dir)

    # Re-save the weights without the positional-embedding tensors; this
    # overwrites the file of the same name in ``save_dir``.
    sd = torch_model.state_dict()
    sd.pop("model.decoder.embed_positions.weight")
    sd.pop("model.encoder.embed_positions.weight")
    torch.save(sd, Path(save_dir) / "pytorch_model.bin")
def index(request):
    """Render the summary form and, on a valid POST, the requested summary.

    The text comes from the form's ``text`` field or, when that is empty,
    from the uploaded ``file``. ``_type`` selects extractive summarization
    (``summarize``) or abstractive summarization (``google/pegasus-xsum``).

    :param request: the incoming HTTP request.
    :return: the rendered ``summary/summary.html`` page on success, otherwise
        the rendered ``summary/index.html`` form.
    """
    if request.method == 'POST':
        form = textForm(request.POST, request.FILES)
        if form.is_valid():
            _type = form.cleaned_data['_type']
            text = form.cleaned_data['text']
            percent = form.cleaned_data['percent']

            if text == "":
                upload = request.FILES.get('file')
                if upload is None:
                    # Neither text nor a file was submitted: there is nothing
                    # to summarize, so show the form again instead of failing
                    # on the missing key.
                    return render(request, 'summary/index.html',
                                  {'form': textForm()})
                # Join once instead of growing the string line by line.
                text = ''.join(line.decode() for line in upload)

            if _type == 'Extractive':
                # Sentence splitting is only needed by the extractive path.
                tokenized_sentence = sent_tokenize(text)
                summary = summarize(tokenized_sentence, percent)
                return render(request, 'summary/summary.html', {
                    'text': text,
                    'summary': summary,
                    'percent': percent
                })
            elif _type == 'Abstractive':
                import torch

                model_name = 'google/pegasus-xsum'
                # Fall back to the CPU when no GPU is present instead of
                # assuming CUDA.
                torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
                tokenizer = PegasusTokenizer.from_pretrained(model_name)
                model = PegasusForConditionalGeneration.from_pretrained(
                    model_name).to(torch_device)
                # The tokenizer is called directly instead of the deprecated
                # ``prepare_seq2seq_batch``.
                batch = tokenizer(
                    [text],
                    truncation=True,
                    padding='longest',
                    return_tensors='pt',
                ).to(torch_device)
                translated = model.generate(**batch)
                summary = tokenizer.batch_decode(translated,
                                                 skip_special_tokens=True)
                return render(
                    request, 'summary/summary.html', {
                        'text': text,
                        'summary': summary[0],
                        'percent': "Not Applicable"
                    })

    # GET request, invalid form, or unrecognized ``_type``: show a fresh form.
    return render(request, 'summary/index.html', {'form': textForm()})
def main():
    """Train the model on the shuffled dataset with a small validation slice.

    After shuffling, the first five examples are used for validation and the
    rest for training. The model is built inside a
    ``tf.distribute.MirroredStrategy`` scope and trained for 50 epochs.
    """
    pretrain_dir = './page_arciv/'
    source_max_len = 1920
    target_max_len = 600
    batch_size = 2

    tokenizer = PegasusTokenizer.from_pretrained(pretrain_dir)
    psus_config = PegasusConfig.from_json_file(
        os.path.join(pretrain_dir, 'config.json'))

    data = load_data(
        '/home_zyz/abstract_generate/final_abdata/union_add_noabs_cleaned_1920.json'
    )
    random.shuffle(data)
    print(len(data))
    first_example = data[0]
    print(first_example[0])
    print(first_example[1])

    valid_data, train_data = data[:5], data[5:]
    train_generator = data_generator(train_data, batch_size, source_max_len,
                                     target_max_len, tokenizer)

    K.clear_session()
    strategy = tf.distribute.MirroredStrategy()
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
    with strategy.scope():
        model = build_model(pretrain_dir, psus_config, source_max_len,
                            target_max_len)

    # Decoding starts from the pad token id and ends on the EOS token id.
    autotitle = AutoTitle(
        start_id=tokenizer.pad_token_id,
        end_id=tokenizer.eos_token_id,
        maxlen=599,
        max_decode_len=target_max_len,
        model=model,
    )
    evaluator = Evaluator(tokenizer, source_max_len, autotitle, valid_data)

    model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator) - 1,
        epochs=50,
        callbacks=[evaluator],
    )
def run_one_fig(spec, args, num_samples=300):
    """Process a random sample of files for ``spec`` in a worker pool.

    :param spec: sub-directory of ``args.prob_meta_dir`` to read files from.
    :param args: namespace providing ``prob_meta_dir``, ``spec_name`` and
        ``model_name``; ``args.cur_dir`` is set to the directory in use.
    :param num_samples: maximum number of files to process.
    :return: the value returned by ``proceed_data(10, ...)`` on the chained
        per-file results.
    :raises NotImplementedError: if ``args.model_name`` contains none of
        'pegasus', 'gpt' or 'bart'.
    """
    print(f"--{spec}--")
    spec_dir = os.path.join(args.prob_meta_dir, spec)
    args.cur_dir = spec_dir

    files = os.listdir(spec_dir)
    random.shuffle(files)
    files = files[:num_samples]
    print(args.spec_name)

    model_name = args.model_name
    if 'pegasus' in model_name:
        from transformers import PegasusTokenizer
        bpe_tokenizer = PegasusTokenizer.from_pretrained(model_name)
        eos_ids = [106, bpe_tokenizer.eos_token_id, 2]  # <n>
    elif 'gpt' in model_name:
        from transformers import GPT2Tokenizer
        bpe_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        eos_ids = [bpe_tokenizer.eos_token_id]
    elif 'bart' in model_name:
        from transformers import BartTokenizer
        bpe_tokenizer = BartTokenizer.from_pretrained(model_name)
        eos_ids = [bpe_tokenizer.eos_token_id]
    else:
        raise NotImplementedError

    # One task per file; every task gets the same args and EOS id list.
    tasks = [(args, file_name, eos_ids) for file_name in files]
    worker_count = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=worker_count) as pool:
        results = pool.starmap(process_data_single, tasks)

    output = list(itertools.chain.from_iterable(results))
    print(f"Samples: {len(output)}")
    return proceed_data(10, output)
if '## Example' in text: text = re.sub(r'## Example(.*)', '', text) text = re.sub(r"\`\`\`.*?\`\`\`", '', text, flags=re.DOTALL) return text for i, doc in enumerate(docs): markdown_without_example = remove_example_from_description(doc['markdown_description']) docs[i]['markdown_without_example'] = markdown_without_example # LOGGER.debug(markdown_without_example) # Generate 1 sentence summaries for the models if not args.quick_run: from transformers import PegasusTokenizer, PegasusForConditionalGeneration mname = "google/pegasus-large" model = PegasusForConditionalGeneration.from_pretrained(mname) tok = PegasusTokenizer.from_pretrained(mname) def summarise(text): batch = tok.prepare_seq2seq_batch(src_texts=[text]) # don't need tgt_text for inference gen = model.generate(**batch) return tok.batch_decode(gen, skip_special_tokens=True)[0] for i, doc in enumerate(docs): if 'short_description' not in docs[i].keys(): short_description = summarise(doc['description']) docs[i]['short_description'] = short_description # LOGGER.debug(short_description) vi_client = ViClient(os.environ['VH_USERNAME'], os.environ['VH_API_KEY']) ids = vi_client.get_field_across_documents('_id', docs) if args.reset_collection: