Example #1
from typing import Optional

import torch
from transformers import PreTrainedModel, PreTrainedTokenizer

# DEFAULT_CONFIG: module-level dict of generate() keyword arguments, defined elsewhere in the source module.


def produce_summary_huggingface(source_text: str,
                                model: PreTrainedModel,
                                tokenizer: PreTrainedTokenizer,
                                config: Optional[dict] = None) -> str:
    """
    Generates a summary using a model (huggingface's transformers library implementation).
    :param source_text: Source text/paragraph to produce summary on
    :param model: Pretrained transformer model
    :param tokenizer: corresponding tokenizer used for the model
    :param config: Configuration for generation/decoding.
    :return: Produced summary by the model
    """

    if not config:
        config = DEFAULT_CONFIG

    input_ids = torch.tensor(
        tokenizer.encode(source_text, add_special_tokens=True)).unsqueeze(0)
    input_ids = input_ids.to('cuda')
    generated = model.generate(input_ids, **config)
    gen_text = tokenizer.batch_decode(generated,
                                      skip_special_tokens=True,
                                      clean_up_tokenization_spaces=True)[0]

    gen_text = gen_text.strip()
    return gen_text
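
A minimal usage sketch for produce_summary_huggingface, assuming a seq2seq summarization checkpoint and a CUDA device (the function moves its inputs to 'cuda' unconditionally). The model name and the DEFAULT_CONFIG values below are illustrative, not taken from the original module.

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Illustrative generation settings; the original module defines its own DEFAULT_CONFIG.
DEFAULT_CONFIG = {"num_beams": 4, "max_length": 142, "early_stopping": True}

model_name = "facebook/bart-large-cnn"  # assumed checkpoint; any seq2seq summarizer works
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")

print(produce_summary_huggingface("Long article text ...", model, tokenizer))
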
Example #2
def generate_samples(
    model: PreTrainedModel,
    tokenizer: BertTokenizer,
    prompt_text: str,
    max_length=args['max_length'],
    temperature=args['temperature'],
    top_k=args['k'],
    top_p=args['p'],
    repetition_penalty=args['repetition_penalty'],
    num_return_sequences=args['num_return_sequences'],
    stop_token=args['stop']
    ):

    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=True, return_tensors='pt')
    encoded_prompt = encoded_prompt.to(model.device)
    input_ids = encoded_prompt if encoded_prompt.shape[-1] > 0 else None

    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        num_return_sequences=num_return_sequences,
    )

    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []

    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

        # Remove all text after the stop token (if one was configured)
        text = text[: text.find(stop_token) if stop_token else None]

        # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
        total_sequence = (
            prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
        )

        generated_sequences.append(total_sequence)
        print(total_sequence)

    return generated_sequences
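
The defaults in the signature above are read from a module-level args dictionary when the function is defined, so that dictionary has to exist before the def runs. A rough sketch of the surrounding wiring, assuming a GPT-2 checkpoint; the annotation says BertTokenizer, but any tokenizer with encode/decode works here, and every value in args below is illustrative.

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Illustrative settings; the original script builds this dict from its command-line arguments.
args = {
    'max_length': 100,
    'temperature': 1.0,
    'k': 50,
    'p': 0.95,
    'repetition_penalty': 1.0,
    'num_return_sequences': 3,
    'stop': None,  # no stop token: keep the full generated text
}

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

samples = generate_samples(model, tokenizer, "Once upon a time")
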
Example #3
def predict(args, p_model: PreTrainedModel, p_tokenizer: PreTrainedTokenizer,
            questions: list):
    # padding='max_length' + truncation=True replaces the deprecated pad_to_max_length=True
    questions_encoded = p_tokenizer.batch_encode_plus(
        questions,
        padding='max_length',
        truncation=True,
        max_length=args.max_length,
        return_tensors='pt')
    input_ids, attention_mask = questions_encoded[
        "input_ids"], questions_encoded["attention_mask"]
    outputs = p_model.generate(input_ids=input_ids,
                               attention_mask=attention_mask,
                               num_beams=4,
                               max_length=20,
                               early_stopping=True)
    # Decode each output and drop the leading character (typically a stray space)
    predictions = [
        p_tokenizer.decode(o, skip_special_tokens=True)[1:] for o in outputs
    ]
    return predictions
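
A sketch of calling predict, assuming a T5-style seq2seq checkpoint and an argparse-style args object carrying max_length; both the checkpoint and the values are assumptions for illustration.

from argparse import Namespace

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

args = Namespace(max_length=64)  # hypothetical; the original script parses this from the CLI
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

answers = predict(args, model, tokenizer, ["question: Where is the Eiffel Tower?"])
print(answers)
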
Example #4
    def predict_task_split(self,
                           model: transformers.PreTrainedModel,
                           inputs: tf.data.Dataset,
                           task: Task,
                           max_length: int = 140,
                           min_length: int = 55) -> typing.Sequence[typing.Sequence[int]]:

        try:
            outputs = []
            model.to(self.device)
            for batch_inputs in tqdm.tqdm(inputs.as_numpy_iterator(),
                                          desc="Predicting %s" % task,
                                          unit="batch", leave=False):
                with torch.no_grad():
                    model.eval()
                    forward_params = self.prepare_forward_inputs(model, batch_inputs)
                    batch_outputs = model.generate(forward_params['input_ids'],
                                                   attention_mask=forward_params['attention_mask'],
                                                   do_sample=False,
                                                   max_length=GENERATION_MAX_LENGTHS.get(task.dataset, max_length) + 2,
                                                   min_length=GENERATION_MIN_LENGTHS.get(task.dataset, min_length) + 1,
                                                   num_beams=4,
                                                   length_penalty=2.,
                                                   no_repeat_ngram_size=3,
                                                   early_stopping=True)

                    batch_outputs = batch_outputs.detach().cpu().numpy()
                    outputs.extend(batch_outputs)
            return outputs
        # We can't simply `except tf.errors.UnknownError` here, because TensorFlow raises it through a
        # proxy instance that a plain except clause does not reliably match, so we catch broadly and check.
        except Exception as e:
            if isinstance(e, tf.errors.UnknownError):
                logging.warning('Encountered error: %s on %s: %s', type(e), task, e)
                # Unfortunately, we don't get a more helpful error type, but this usually means
                # that the dataset has no labels for a given split (e.g., test evaluation occurs on a server)
                return []
            else:
                # We got a different exception type so let python freak out accordingly
                logging.error('Encountered error: %s on %s: %s', type(e), task, e)
                raise e
Example #5
def produce_summary(model: PreTrainedModel, tokenizer: PreTrainedTokenizer,
                    source_text: str, cuda: bool):
    """
    Use a summarization model to generate output
    :param model: Pretrained Summarization model
    :param tokenizer: Tokenizer for the summarization model
    :param source_text:
    :param cuda:
    :return:
    """
    input_ids = torch.tensor(
        tokenizer.encode(source_text, truncation=True,
                         add_special_tokens=True)).unsqueeze(0)

    if cuda:
        input_ids = input_ids.to('cuda')

    generated = model.generate(input_ids)
    gen_text = tokenizer.batch_decode(generated,
                                      skip_special_tokens=True,
                                      clean_up_tokenization_spaces=True)[0]

    gen_text = gen_text.strip()
    return gen_text
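
A sketch of calling produce_summary, assuming a distilled BART summarization checkpoint from the Hugging Face hub (any seq2seq summarizer would do); the cuda flag simply follows whether a GPU is available.

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "sshleifer/distilbart-cnn-12-6"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

use_cuda = torch.cuda.is_available()
if use_cuda:
    model = model.to("cuda")

print(produce_summary(model, tokenizer, "Long article text ...", cuda=use_cuda))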