import logging
import typing
from typing import Optional

import tensorflow as tf
import torch
import tqdm
import transformers
from transformers import BertTokenizer, PreTrainedModel, PreTrainedTokenizer


def produce_summary_huggingface(source_text: str,
                                model: PreTrainedModel,
                                tokenizer: PreTrainedTokenizer,
                                config: Optional[dict] = None) -> str:
    """
    Generates a summary using a model (Hugging Face transformers implementation).

    :param source_text: Source text/paragraph to produce a summary of
    :param model: Pretrained transformer model
    :param tokenizer: Corresponding tokenizer for the model
    :param config: Configuration for generation/decoding
    :return: Summary produced by the model
    """
    if not config:
        config = DEFAULT_CONFIG
    # Encode the source text and add a batch dimension
    input_ids = torch.tensor(
        tokenizer.encode(source_text, add_special_tokens=True)).unsqueeze(0)
    # Assumes a CUDA-capable GPU and a model already placed on it
    input_ids = input_ids.to('cuda')
    generated = model.generate(input_ids, **config)
    gen_text = tokenizer.batch_decode(generated,
                                      skip_special_tokens=True,
                                      clean_up_tokenization_spaces=True)[0]
    gen_text = gen_text.strip()
    return gen_text
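# `DEFAULT_CONFIG` is referenced above but never defined in this snippet. A minimal
# sketch of the kind of generation kwargs it might hold, plus a usage example; the
# keys, values, and the `facebook/bart-large-cnn` checkpoint are illustrative
# assumptions, not taken from the original code.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

DEFAULT_CONFIG = {
    'num_beams': 4,          # beam-search width (assumed)
    'max_length': 142,       # cap on the generated summary length (assumed)
    'early_stopping': True,  # stop once every beam has emitted EOS
}


def _demo_produce_summary_huggingface():
    demo_tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn')
    # The function moves its inputs to 'cuda', so the model must live there too
    demo_model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn').to('cuda')
    print(produce_summary_huggingface('Long article text ...', demo_model, demo_tokenizer))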
def generate_samples(model: PreTrainedModel,
                     tokenizer: BertTokenizer,
                     prompt_text: str,
                     max_length=args['max_length'],
                     temperature=args['temperature'],
                     top_k=args['k'],
                     top_p=args['p'],
                     repetition_penalty=args['repetition_penalty'],
                     num_return_sequences=args['num_return_sequences'],
                     stop_token=args['stop']):
    encoded_prompt = tokenizer.encode(prompt_text,
                                      add_special_tokens=True,
                                      return_tensors='pt')
    encoded_prompt = encoded_prompt.to(model.device)
    # An empty prompt means unconditional generation
    input_ids = encoded_prompt if encoded_prompt.shape[-1] > 0 else None

    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        num_return_sequences=num_return_sequences,
    )

    # Drop the extra batch dimension when returning multiple sequences
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

        # Remove all text after the stop token
        text = text[: text.find(stop_token) if stop_token else None]

        # Add the prompt at the beginning of the sequence. Remove the excess
        # text that was used for pre-processing
        total_sequence = (
            prompt_text +
            text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)):]
        )

        generated_sequences.append(total_sequence)
        print(total_sequence)

    return generated_sequences
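# The module-level `args` mapping used for the defaults above is not defined in this
# snippet; since Python evaluates default values at definition time, it must exist
# before `generate_samples` in the real module. A sketch of the expected keys with
# illustrative values; the numbers are assumptions, not taken from the original code.
args = {
    'max_length': 100,          # total length of each generated sequence
    'temperature': 1.0,         # softmax temperature; 1.0 leaves logits unscaled
    'k': 50,                    # top-k sampling cutoff (0 disables it)
    'p': 0.95,                  # nucleus (top-p) sampling cutoff
    'repetition_penalty': 1.0,  # values > 1.0 discourage repeated tokens
    'num_return_sequences': 3,  # number of independent samples to draw
    'stop': '\n',               # truncate each sample at this string, if set
}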
def predict(args, p_model: PreTrainedModel, p_tokenizer: PreTrainedTokenizer, questions: list):
    questions_encoded = p_tokenizer.batch_encode_plus(questions,
                                                      pad_to_max_length=True,
                                                      max_length=args.max_length,
                                                      return_tensors='pt')
    input_ids, attention_mask = questions_encoded["input_ids"], questions_encoded["attention_mask"]
    outputs = p_model.generate(input_ids=input_ids,
                               attention_mask=attention_mask,
                               num_beams=4,
                               max_length=20,
                               early_stopping=True)
    # `[1:]` strips the leading character, presumably a space artifact of decoding
    predictions = [
        p_tokenizer.decode(o, skip_special_tokens=True)[1:] for o in outputs
    ]
    return predictions
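# A hypothetical usage sketch for `predict`. The `t5-small` checkpoint, the argparse
# namespace, and the question format are illustrative assumptions, not taken from
# the original code.
import argparse

from transformers import T5ForConditionalGeneration, T5Tokenizer


def _demo_predict():
    demo_args = argparse.Namespace(max_length=64)
    demo_tokenizer = T5Tokenizer.from_pretrained('t5-small')
    demo_model = T5ForConditionalGeneration.from_pretrained('t5-small')
    questions = ['question: What is the capital of France? '
                 'context: Paris is the capital of France.']
    print(predict(demo_args, demo_model, demo_tokenizer, questions))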
def predict_task_split(self,
                       model: transformers.PreTrainedModel,
                       inputs: tf.data.Dataset,
                       task: Task,
                       max_length: int = 140,
                       min_length: int = 55) -> typing.Sequence[typing.Sequence[int]]:
    try:
        outputs = []
        model.to(self.device)
        for batch_inputs in tqdm.tqdm(inputs.as_numpy_iterator(),
                                      desc="Predicting %s" % task,
                                      unit="batch",
                                      leave=False):
            with torch.no_grad():
                model.eval()
                forward_params = self.prepare_forward_inputs(model, batch_inputs)
                batch_outputs = model.generate(
                    forward_params['input_ids'],
                    attention_mask=forward_params['attention_mask'],
                    do_sample=False,
                    max_length=GENERATION_MAX_LENGTHS.get(task.dataset, max_length) + 2,
                    min_length=GENERATION_MIN_LENGTHS.get(task.dataset, min_length) + 1,
                    num_beams=4,
                    length_penalty=2.,
                    no_repeat_ngram_size=3,
                    early_stopping=True)
                batch_outputs = batch_outputs.detach().cpu().numpy()
                outputs.extend(batch_outputs)
        return outputs
    # We can't just `except tf.errors.UnknownError`, because it is thrown as some sort
    # of weird proxy instance of a tf.errors.UnknownError, and Python's pattern
    # matching can't handle the scandal
    except Exception as e:
        if isinstance(e, tf.errors.UnknownError):
            logging.warning('Encountered error: %s on %s: %s', type(e), task, e)
            # Unfortunately, we don't get a more helpful error type, but this usually
            # means the dataset has no labels for a given split (e.g., test
            # evaluation occurs on a server)
            return []
        else:
            # We got a different exception type, so let Python freak out accordingly
            logging.error('Encountered error: %s on %s: %s', type(e), task, e)
            raise e
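# `GENERATION_MAX_LENGTHS` and `GENERATION_MIN_LENGTHS` above are per-dataset token
# budgets keyed by `task.dataset`, with the `max_length`/`min_length` arguments as
# fallbacks; they are not defined in this snippet, and `Task`, `self.device`, and
# `self.prepare_forward_inputs` come from the surrounding project. A sketch with
# illustrative entries; the dataset names and numbers are assumptions, not taken
# from the original code.
GENERATION_MAX_LENGTHS = {
    'cnn_dailymail': 140,  # multi-sentence news summaries run long
    'xsum': 60,            # single-sentence summaries stay short
}
GENERATION_MIN_LENGTHS = {
    'cnn_dailymail': 55,
    'xsum': 10,
}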
def produce_summary(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, source_text: str, cuda: bool):
    """
    Use a summarization model to generate output

    :param model: Pretrained summarization model
    :param tokenizer: Tokenizer for the summarization model
    :param source_text: Source text to summarize
    :param cuda: Whether to move the inputs to the GPU
    :return: The generated summary string
    """
    input_ids = torch.tensor(
        tokenizer.encode(source_text, truncation=True, add_special_tokens=True)).unsqueeze(0)
    if cuda:
        input_ids = input_ids.to('cuda')
    generated = model.generate(input_ids)
    gen_text = tokenizer.batch_decode(generated,
                                      skip_special_tokens=True,
                                      clean_up_tokenization_spaces=True)[0]
    gen_text = gen_text.strip()
    return gen_text
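# A hypothetical usage sketch for `produce_summary`, assuming a BART summarization
# checkpoint; the model name and input text are illustrative, not taken from the
# original code.
from transformers import BartForConditionalGeneration, BartTokenizer


def _demo_produce_summary():
    demo_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    demo_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # The inputs are moved to 'cuda' inside the function, so the model must match
        demo_model = demo_model.to('cuda')
    print(produce_summary(demo_model, demo_tokenizer, 'Long article text ...', cuda=use_cuda))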