Python PegasusTokenizer示例，transformers.PegasusTokenizer Python示例

示例#1

0

显示文件

    def setUp(self):
        """Save a fixture-backed Pegasus tokenizer into ``self.tmpdirname``."""
        super().setUp()

        # The tokenizer is built from the SentencePiece test fixture.
        fixture_tokenizer = PegasusTokenizer(
            SAMPLE_VOCAB,
            offset=0,
            mask_token_sent=None,
            mask_token="[MASK]",
        )
        fixture_tokenizer.save_pretrained(self.tmpdirname)

示例#2

0

显示文件

文件： pegasus.py 项目： eeic-ai-01/text2slide

    def exec(self, text):
        """Summarize ``text`` with the Pegasus checkpoint named by ``self.model``.

        :param text: source text; it is wrapped into a one-element batch
        :return: the decoded summary string for that single input
        """
        model_name = self.model
        torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        # Call the tokenizer directly with an explicit tensor type:
        # prepare_seq2seq_batch is deprecated and its return_tensors default
        # is not the same across transformers releases.
        batch = tokenizer(
            [text], truncation=True, padding='longest',
            return_tensors='pt').to(torch_device)
        result = model.generate(**batch)
        summary = tokenizer.batch_decode(result, skip_special_tokens=True)[0]
        if model_name == "google/pegasus-cnn_dailymail":
            # For this checkpoint only, literal '<n>' markers become spaces.
            summary = summary.replace('<n>', ' ')

        return summary

示例#3

0

显示文件

文件： base.py 项目： MichaelJanz/benchmarking-and-architectural-analysis-of-state-of-the-art-transformer-models

def get_model_tokenizer(model_name):
    """Load the model and tokenizer classes that match ``model_name``.

    The name is matched by substring: ``"pegasus"`` selects the Pegasus
    classes, ``"bart-large"`` or ``"bart-custom-large"`` the BART classes,
    and anything else the Auto classes (T5 or distilbart).

    :param model_name: checkpoint name or path given to ``from_pretrained``
    :return: ``(model, tokenizer)``; the model is moved to CUDA when available
    """
    import torch

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    if "pegasus" in model_name:
        from transformers import PegasusForConditionalGeneration as model_cls
        from transformers import PegasusTokenizer as tokenizer_cls
    elif "bart-large" in model_name or "bart-custom-large" in model_name:
        from transformers import BartForConditionalGeneration as model_cls
        from transformers import BartTokenizer as tokenizer_cls
    else:
        # T5 or distilbart
        from transformers import AutoModelWithLMHead as model_cls
        from transformers import AutoTokenizer as tokenizer_cls

    tokenizer = tokenizer_cls.from_pretrained(model_name)
    model = model_cls.from_pretrained(model_name).to(device)
    return model, tokenizer

示例#4

0

显示文件

文件： paraphrasing.py 项目： udit-pandey1/kairon

class ParaPhrasing:
    """Class loads pegasus model for text augmentation"""
    # The tokenizer and model are class attributes, so they are loaded once,
    # when this class body is executed.
    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)

    @staticmethod
    def paraphrases(input_text, num_return_sequences=10, num_beams=10):
        """
        generates variations for
        a given sentence/text

        :param input_text: sentence or text (a single string or a list of strings)
        :param num_return_sequences: Number of variations to be returned
        :param num_beams: Number of beams for beam search. 1 means no beam search
        :return: flat list of decoded variations of the input text
        """
        if isinstance(input_text, str):
            input_text = [input_text]
        # Call the tokenizer directly with an explicit tensor type:
        # prepare_seq2seq_batch is deprecated and its return_tensors default
        # is not the same across transformers releases.
        batch = ParaPhrasing.tokenizer(
            input_text, truncation=True, padding='longest', max_length=60,
            return_tensors='pt').to(ParaPhrasing.torch_device)
        # NOTE(review): temperature is passed without do_sample; confirm it
        # has the intended effect together with beam search.
        translated = ParaPhrasing.model.generate(
            **batch,
            max_length=60,
            num_beams=num_beams,
            num_return_sequences=num_return_sequences,
            temperature=1.5)
        return ParaPhrasing.tokenizer.batch_decode(
            translated, skip_special_tokens=True)

示例#5

0

显示文件

文件： huggingface.py 项目： yudai1102jp/tensorflow-onnx

    def _test_TFPegasus(self, size, large=False):
        """Run the conversion test for the TF Pegasus checkpoint ``size``.

        :param size: checkpoint name passed to ``from_pretrained``
        :param large: forwarded unchanged to ``self.run_test``
        """
        from transformers import PegasusTokenizer, TFPegasusModel

        tokenizer = PegasusTokenizer.from_pretrained(size)
        model = TFPegasusModel.from_pretrained(size)

        encoder_ids = tokenizer(
            "Studies have been shown that owning a dog is good for you",
            return_tensors="tf").input_ids
        decoder_ids = tokenizer("Studies show that", return_tensors="tf").input_ids
        input_dict = {
            "input_ids": encoder_ids,
            "decoder_input_ids": decoder_ids
        }

        # These values come from TFPegasusEncoder/Decoder, as in:
        #   self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
        # They are meant to come from the config, but TF reports them as model
        # inputs. This may be new in transformers-2.4.2; it was not seen before.
        extra_input = {
            "tf_pegasus_model/model/decoder/mul/y:0":
            np.array([32.], dtype=np.float32),
            "tf_pegasus_model/model/encoder/mul/y:0":
            np.array([32.], dtype=np.float32)
        }

        spec, input_dict = self.spec_and_pad(
            input_dict, max_length=model.config.max_length)
        self.run_test(model,
                      input_dict,
                      input_signature=spec,
                      outputs=["last_hidden_state"],
                      large=large,
                      extra_input=extra_input)

示例#6

0

显示文件

    def __init__(self, model: str = None):
        """Load the requested summarization model and its tokenizer.

        :param model: ``"t5"`` (used when ``None``), ``"google/pegasus-newsroom"``
            or ``"facebook/bart-large-cnn"``
        :raises Exception: for any other model name
        """
        log.info(model)
        torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        log.info(torch_device)

        model = "t5" if model is None else model
        self.modelName = model

        # Locations of the files used for inference.
        self.path = f"./app/api/{model}/"
        self.model_path = f"{self.path}pytorch_model.bin"
        self.config_path = f"{self.path}config.json"

        if model == "t5":
            # T5 is built from the local config and local weights.
            self.config = T5Config.from_json_file(self.config_path)
            self.model = T5ForConditionalGeneration(self.config)
            self.tokenizer = T5Tokenizer.from_pretrained(self.path)
            self.model.eval()
            weights = torch.load(self.model_path, map_location=torch_device)
            self.model.load_state_dict(weights)
        elif model == "google/pegasus-newsroom":
            # Config is read from the local path; model and tokenizer are
            # loaded by checkpoint name.
            self.config = PegasusConfig.from_json_file(self.config_path)
            self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = PegasusTokenizer.from_pretrained(model)
        elif model == "facebook/bart-large-cnn":
            self.config = BartConfig.from_json_file(self.config_path)
            self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = BartTokenizer.from_pretrained(model)
        else:
            raise Exception("This model is not supported")

        self.text = ""

示例#7

0

显示文件

def compute(sm):
    """Summarize ``sm`` section by section with ``google/pegasus-xsum``.

    :param sm: text to summarize; ``splitText(sm, len(sm))`` returns the
        sections that are summarized
    :return: list with one summary string per section, or ``""`` when
        tokenizing any section fails (kept for existing callers)
    """
    model_name = 'google/pegasus-xsum'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)

    sections = splitText(sm, len(sm))

    try:
        # Call the tokenizer directly with an explicit tensor type:
        # prepare_seq2seq_batch is deprecated and its return_tensors default
        # is not the same across transformers releases.
        batches = [
            tokenizer([section], truncation=True, padding='longest',
                      return_tensors='pt').to(torch_device)
            for section in sections
        ]
    except Exception:
        # Best-effort contract kept from the original, but no longer
        # swallowing KeyboardInterrupt/SystemExit as the bare except did.
        return ""

    final_summary = []
    for batch in batches:
        generated = model.generate(**batch)
        final_summary.append(
            tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

    return final_summary

示例#8

0

显示文件

 def __init__(self, config):
     """Load the ``google/pegasus-reddit_tifu`` tokenizer and model.

     ``config`` is accepted but not read in this method.
     """
     self.model_name = 'google/pegasus-reddit_tifu'
     if torch.cuda.is_available():
         self.device = 'cuda'
     else:
         self.device = 'cpu'
     print(f"using device: {self.device}")

     # Both loads pass force_download=True.
     fetch_options = {'force_download': True}
     self.tokenizer = PegasusTokenizer.from_pretrained(self.model_name,
                                                       **fetch_options)
     loaded = PegasusForConditionalGeneration.from_pretrained(
         self.model_name, **fetch_options)
     self.model = loaded.to(self.device)

示例#9

0

显示文件

    def load_model(self):
        """Load the Pegasus model and tokenizer stored under the project.

        :return: ``(model, tokenizer, device)``; the model is already on ``device``
        """
        assets_dir = os.path.join(settings.BASE_DIR, 'paraphrase_utils')
        model = PegasusForConditionalGeneration.from_pretrained(
            os.path.join(assets_dir, 'model'))
        tokenizer = PegasusTokenizer.from_pretrained(
            os.path.join(assets_dir, 'tokenizer'))

        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")

        return model.to(device), tokenizer, device

示例#10

0

显示文件

文件： AbstractiveSummarizationParaphraser.py 项目： alexstoken/nlp-qa-finalproj

 def __init__(self, args, device):
     """Load the Pegasus model and tokenizer named by ``args``.

     ``args.pretrained_model_name`` must be one of
     ``self.PRETRAINED_MODEL_NAMES``. ``self.device`` is presumably set by
     the parent constructor; confirm against the base class.
     """
     super().__init__(args, device)
     requested_name = args.pretrained_model_name
     assert requested_name in self.PRETRAINED_MODEL_NAMES
     self.pretrained_model_name = requested_name
     logging.info(f'Loading Pegasus ({self.pretrained_model_name})')

     loaded_model = PegasusForConditionalGeneration.from_pretrained(
         self.pretrained_model_name)
     self.model = loaded_model.to(self.device)
     self.tokenizer: PegasusTokenizer = PegasusTokenizer.from_pretrained(
         self.pretrained_model_name)

示例#11

0

显示文件

文件： abstractive_summaries.py 项目： psmukherjee009/summarizatio_final

def generate_summary(text, model_name):
    """Summarize ``text`` with the Pegasus checkpoint ``model_name``.

    :param text: source text (or list of texts) passed to the tokenizer
    :param model_name: checkpoint name or path given to ``from_pretrained``
    :return: the decoded summary of the first sequence in the batch
    """
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    # The tokenizer is called directly because prepare_seq2seq_batch is
    # deprecated; the arguments are unchanged.
    batch = tokenizer(
        text, truncation=True, padding="longest",
        return_tensors="pt").to(torch_device)
    translated = model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]

示例#12

0

显示文件

文件： main.py 项目： surbhihirawat88/Pegasus

def generate_summary(context):
    """Summarize ``context`` with ``google/pegasus-xsum``.

    :param context: source text, as one string or a list of strings
    :return: list of decoded summaries, one per input text
    """
    model_name = 'google/pegasus-xsum'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)

    # The original passed the literal string 'context' rather than the
    # argument, so every call summarized the word "context".
    src_texts = [context] if isinstance(context, str) else context
    # 'max-length' is not a valid padding strategy; the valid spelling is
    # 'max_length'. The tokenizer is called directly because
    # prepare_seq2seq_batch is deprecated.
    batch = tokenizer(src_texts,
                      truncation=True,
                      padding='max_length',
                      return_tensors="pt")
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

示例#13

0

显示文件

文件： pegasus_inference.py 项目： sakshitantak/Pegasus

    def single_document_summarization(self, src_text):
        """Summarize ``src_text`` with the checkpoint ``self.model_name``.

        :param src_text: source text (or list of texts) passed to the tokenizer
        :return: list of decoded summaries, one per input sequence
        """
        # Use one device for both model and inputs. The original moved the
        # model to a bare ``torch_device`` name but the batch to
        # ``self.torch_device``.
        device = self.torch_device
        tokenizer = PegasusTokenizer.from_pretrained(self.model_name)
        model = PegasusForConditionalGeneration.from_pretrained(
            self.model_name).to(device)
        batch = tokenizer(src_text,
                          truncation=True,
                          padding=True,
                          return_tensors='pt').to(device)

        translated = model.generate(**batch)
        generated_summary = tokenizer.batch_decode(translated,
                                                   skip_special_tokens=True)
        return generated_summary

示例#14

0

显示文件

def get_summary(text):
    """Summarize ``text`` with ``google/pegasus-xsum``.

    :param text: source text to summarize
    :return: the decoded summary string, or ``-100`` when loading,
        tokenizing, generating or decoding fails (sentinel kept for
        existing callers)
    """
    try:
        model_name = 'google/pegasus-xsum'
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
        # The original wrapped text in empty-string concatenations, which
        # did nothing for string input.
        src_text = [text]
        batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
        translated = model.generate(**batch)
        target = tokenizer.batch_decode(translated, skip_special_tokens=True)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        print("API Error occured")
        return -100
    return target[0]

示例#15

0

显示文件

文件： test_tokenization_sst2.py 项目： guillaume-be/rust-tokenizers

    def test_tokenization_pegasus(self):
        """Compare the Rust Pegasus tokenizer with the Python reference.

        Token ids may differ in order only: a mismatch fails when the two
        sequences differ in length or in token counts. The special-tokens
        masks must always match.
        """
        # Given
        self.base_tokenizer = PegasusTokenizer.from_pretrained(
            'google/pegasus-cnn_dailymail', cache_dir=self.test_dir)
        spiece_model = get_from_cache(
            'https://cdn.huggingface.co/google/pegasus-cnn_dailymail/spiece.model'
        )
        self.rust_tokenizer = PyPegasusTokenizer(spiece_model,
                                                 do_lower_case=False)

        # NOTE(review): the baseline uses max_length=128 while the Rust call
        # below uses max_len=256; confirm this is intentional.
        output_baseline = [
            self.base_tokenizer.encode_plus(
                example.text_a,
                add_special_tokens=True,
                return_overflowing_tokens=True,
                return_special_tokens_mask=True,
                max_length=128)
            for example in self.examples
        ]

        # When
        # Note: the original sentence piece tokenizer strips trailing spaces
        stripped_texts = [example.text_a.strip() for example in self.examples]
        output_rust = self.rust_tokenizer.encode_list(
            stripped_texts,
            max_len=256,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
            reference_ids = baseline['input_ids']
            if rust.token_ids != reference_ids:
                reordered_only = (
                    len(rust.token_ids) == len(reference_ids)
                    and Counter(rust.token_ids) == Counter(reference_ids))
                if not reordered_only:
                    raise AssertionError(
                        f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n '
                        f'Sentence a: {self.examples[idx].text_a} \n'
                        f'Sentence b: {self.examples[idx].text_b} \n'
                        f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
                        f'Rust: {rust.token_ids} \n'
                        f'Python {baseline["input_ids"]}')
            assert (
                rust.special_tokens_mask == baseline['special_tokens_mask'])

示例#16

0

显示文件

文件： util.py 项目： jiacheng-xu/text-sum-uncertainty

def load_BART_or_PEGASUS(mname):
    """Load a BART or Pegasus model and tokenizer chosen by ``mname``.

    The family is selected by a case-insensitive substring match on the name.

    :param mname: checkpoint name or path given to ``from_pretrained``
    :return: ``(model, tokenizer)``
    :raises NotImplementedError: when the name contains neither family
    """
    lowered = mname.lower()
    if 'bart' in lowered:
        from transformers import BartForConditionalGeneration as model_cls
        from transformers import BartTokenizer as tokenizer_cls
    elif 'pegasus' in lowered:
        from transformers import PegasusForConditionalGeneration as model_cls
        from transformers import PegasusTokenizer as tokenizer_cls
    else:
        raise NotImplementedError("UNKOWN model name.")

    model = model_cls.from_pretrained(mname)
    tokenizer = tokenizer_cls.from_pretrained(mname)
    return model, tokenizer

示例#17

0

显示文件

def summarizeP(src_text, variant="xsum", device=None):
    """Summarize ``src_text`` with the ``google/pegasus-<variant>`` checkpoint.

    :param src_text: source text or list of texts passed to the tokenizer
    :param variant: suffix appended to ``"google/pegasus-"``
    :param device: torch device to use; when ``None``, CUDA if available
        and CPU otherwise
    :return: list of decoded summaries, one per input sequence
    """
    model_name = "google/pegasus-" + variant
    if device is None:
        torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        torch_device = device
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    # Call the tokenizer directly with an explicit tensor type:
    # prepare_seq2seq_batch is deprecated and its return_tensors default
    # is not the same across transformers releases.
    batch = tokenizer(src_text,
                      truncation=True,
                      padding='longest',
                      return_tensors='pt').to(torch_device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

示例#18

0

显示文件

文件： util.py 项目： jiacheng-xu/text-sum-uncertainty

def load_data(dataset_dir, data_name, tokenizer_name='bart-large-cnn',
              batch_size=7, split='test', max_sample_num=34, max_length=500):
    """Yield tokenized (document, summary) batches for a summarization dataset.

    This is a generator, so nothing runs (including the argument checks)
    until it is first iterated. Only full batches are yielded; a trailing
    partial batch is dropped. The loop stops once more than
    ``max_sample_num`` examples have been consumed.

    :param dataset_dir: cache directory passed to ``load_dataset`` for xsum
    :param data_name: ``'xsum'``, ``'cnndm'`` or ``'cnn_dailymail'``
    :param tokenizer_name: name containing ``'bart'``, ``'gpt'`` or ``'pegasus'``
    :param batch_size: number of examples per yielded batch
    :param split: dataset split passed to ``load_dataset`` for xsum
    :param max_sample_num: loop cut-off (see above)
    :param max_length: ``max_length`` passed to the tokenizer
    :raises NotImplementedError: for an unknown dataset or tokenizer name
    """
    is_cnndm = data_name in ('cnndm', 'cnn_dailymail')

    if data_name == 'xsum':
        dataset = load_dataset(data_name, cache_dir=dataset_dir, split=split)
        print("Assume only use one subset of the dataset")
        if len(dataset) > max_sample_num:
            dataset = dataset.shuffle()
    elif is_cnndm:
        dataset = yield_cnndm()
    else:
        raise NotImplementedError("Unkown dataset")

    if 'bart' in tokenizer_name:
        tokenizer = BartTokenizer.from_pretrained(tokenizer_name)
    elif 'gpt' in tokenizer_name:
        from transformers import GPT2Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif 'pegasus' in tokenizer_name:
        from transformers import PegasusTokenizer
        tokenizer = PegasusTokenizer.from_pretrained(tokenizer_name)
        print("Load PEGASUS tokenizer...")
    else:
        raise NotImplementedError

    pending_docs, pending_sums = [], []
    for seen, example in enumerate(dataset, start=1):
        if is_cnndm:
            # CNN/DM examples unpack directly into (document, summary).
            doc, summary = example
        else:
            # xsum examples are indexed by the keys recorded in dataset_meta.
            doc = example[dataset_meta[data_name]['key_doc']]
            summary = example[dataset_meta[data_name]['key_sum']]
        pending_docs.append(doc)
        pending_sums.append(summary)

        if len(pending_docs) == batch_size:
            assert len(pending_docs) == len(pending_sums)
            yield tokenizer.prepare_seq2seq_batch(pending_docs, tgt_texts=pending_sums, max_length=max_length,
                                                  truncation=True, padding='longest', return_tensors='pt')
            pending_docs, pending_sums = [], []

        if seen > max_sample_num:
            break

示例#19

0

显示文件

def convert_pegasus_ckpt_to_pytorch(ckpt_path, save_dir):
    """Convert a TF Pegasus checkpoint and save tokenizer and model to ``save_dir``.

    The dataset name is taken from the checkpoint's parent directory name
    and selects the per-dataset length settings.

    :param ckpt_path: path of the TF checkpoint
    :param save_dir: directory that receives the tokenizer and model
    """
    dataset = Path(ckpt_path).parent.name

    # Tokenizer first.
    wanted_length = max_model_length[dataset]
    tok = PegasusTokenizer.from_pretrained(
        "sshleifer/pegasus", model_max_length=wanted_length)
    assert tok.model_max_length == wanted_length
    tok.save_pretrained(save_dir)

    # Then the model.
    tf_weights = get_tf_weights_as_numpy(ckpt_path)
    cfg_updates = {
        "max_length": max_gen_length[dataset],
        "length_penalty": expected_alpha.get(dataset, 0.8),
    }
    converted = convert_pegasus_to_bart(tf_weights, cfg_updates)
    converted.save_pretrained(save_dir)

示例#20

0

显示文件

文件： paraphrasing_augmentation.py 项目： cmazzoni87/SentimentAnalysis

def execute_pegasus_augmentation(data, file_path) -> pd.DataFrame:
    """Augment the ``summary`` column with Pegasus paraphrases and save a CSV.

    :param data: frame with at least ``summary`` and ``sentiment`` columns
    :param file_path: prefix of the CSV file that is written
    :return: one row per generated paraphrase, with rows containing NaN dropped
    """
    MODEL_NAME = var.PARAPHRASING_MODEL
    tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
    model = PegasusForConditionalGeneration.from_pretrained(MODEL_NAME).to(torch_device)

    number_sequences = 10
    train = data.copy()
    train = train[['summary', 'sentiment']]
    paraphrases = train['summary'].progress_apply(
        get_response,
        num_return_sequences=number_sequences,
        tokenizer=tokenizer,
        model=model)
    train['paraphrased text'] = paraphrases

    # explode() turns each element of a list-valued cell into its own row.
    generated = train.explode('paraphrased text').dropna()
    output_csv = '{}-Processed-Summarized-Augmented.csv'.format(file_path)
    generated.to_csv(output_csv, index=False)
    return generated

示例#21

0

显示文件

文件： pegasus.py 项目： quantapix/qnarre

def to_pytorch(ckpt_path, save_path):
    """Convert a TF Pegasus checkpoint and save tokenizer and model to ``save_path``.

    The dataset name comes from the checkpoint's parent directory name and
    selects the ``sum_<dataset>`` entry of ``task_params``. After the normal
    save, ``pytorch_model.bin`` is rewritten without the two positional
    embedding weights.
    """
    dataset = Path(ckpt_path).parent.name
    params_key = f"sum_{dataset}"

    wanted_length = task_params[params_key]["n_pos"]
    tok = PegasusTokenizer.from_pretrained(
        "sshleifer/pegasus", model_max_length=wanted_length)
    assert tok.model_max_length == wanted_length
    tok.save_pretrained(save_path)

    tf_weights = get_tf_weights_as_numpy(ckpt_path)
    # This is the shared task_params entry itself, not a copy.
    cfg_updates = task_params[params_key]
    if dataset == "large":
        cfg_updates["task_params"] = task_params
    torch_model = convert_pegasus(tf_weights, cfg_updates)
    torch_model.save_pretrained(save_path)

    sd = torch_model.state_dict()
    for dropped_key in ("model.decoder.embed_positions.weight",
                        "model.encoder.embed_positions.weight"):
        sd.pop(dropped_key)
    torch.save(sd, Path(save_path) / "pytorch_model.bin")

示例#22

0

显示文件

文件： paraphrasing.py 项目： paper2code/rasa-kairon

class ParaPhrasing:
    """Loads the Pegasus paraphrase model once and generates paraphrases."""
    # The tokenizer and model are class attributes, so they are loaded once,
    # when this class body is executed.
    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    @staticmethod
    def paraphrases(input_text, num_return_sequences=10, num_beams=10):
        """Generate paraphrases of ``input_text``.

        :param input_text: a single string or a list of strings
        :param num_return_sequences: number of sequences returned by ``generate``
        :param num_beams: number of beams passed to ``generate``
        :return: flat list of decoded paraphrases
        """
        if isinstance(input_text, str):
            input_text = [input_text]
        # Call the tokenizer directly with an explicit tensor type:
        # prepare_seq2seq_batch is deprecated and its return_tensors default
        # is not the same across transformers releases.
        batch = ParaPhrasing.tokenizer(input_text, truncation=True, padding='longest',
                                       max_length=60, return_tensors='pt').to(
            ParaPhrasing.torch_device)
        # NOTE(review): temperature is passed without do_sample; confirm it
        # has the intended effect together with beam search.
        translated = ParaPhrasing.model.generate(**batch, max_length=60, num_beams=num_beams,
                                                 num_return_sequences=num_return_sequences, temperature=1.5)
        tgt_text = ParaPhrasing.tokenizer.batch_decode(translated, skip_special_tokens=True)
        return tgt_text

示例#23

0

显示文件

文件： app.py 项目： dataisamazing/Machine-Learning

def generate_summary(text):
    """Summarize ``text`` with ``google/pegasus-xsum``.

    :param text: source text passed to the tokenizer
    :return: the decoded summary of the first generated sequence
    """
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    model = PegasusForConditionalGeneration.from_pretrained(
        "google/pegasus-xsum")

    # Convert the text into token ids (number representation of text).
    tokens = tokenizer(text,
                       truncation=True,
                       padding="longest",
                       return_tensors="pt")
    summary = model.generate(**tokens)
    # Drop special tokens so that padding and end-of-sequence markers do not
    # appear in the returned text.
    Summarized = tokenizer.decode(summary[0], skip_special_tokens=True)
    return Summarized

示例#24

0

显示文件

文件： test_modeling_flax_pegasus.py 项目： yulinggu-cs/transformers

    def test_pegasus_xsum_summary(self):
        """Check that Flax pegasus-xsum produces the expected summaries for two articles."""
        checkpoint = "google/pegasus-xsum"
        model = FlaxPegasusForConditionalGeneration.from_pretrained(checkpoint)
        tokenizer = PegasusTokenizer.from_pretrained(checkpoint)

        # Source articles.
        src_text = [
            """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""",
            """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a  re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """,
        ]

        # Expected summaries, in the same order as the articles.
        tgt_text = [
            "California's largest electricity provider has turned off power to hundreds of thousands of customers.",
            "Pop group N-Dubz have revealed they were surprised to get four nominations for this year's Mobo Awards.",
        ]

        encoded = tokenizer(src_text, return_tensors="np", truncation=True, max_length=512, padding=True)
        generated = model.generate(**encoded, num_beams=2)
        decoded = tokenizer.batch_decode(generated.sequences, skip_special_tokens=True)
        assert tgt_text == decoded

示例#25

0

显示文件

文件： result_pred.py 项目： xbqnl/NLP-model

def main():
    """Run the trained Pegasus model over the test data and write the predictions.

    Reads the tokenizer and config from ``./page_arciv/``, the weights from
    ``./pagesus_section/best_model.hdf5`` and the data from
    ``./final_test_data_list.json``, then writes ``./pred_result.json``.
    """
    pagesus_pretrain_path = './page_arciv/'
    MAX_LEN = 1024
    decode_max_len = 256

    tokenizer = PegasusTokenizer.from_pretrained(pagesus_pretrain_path)
    psus_config = PegasusConfig.from_json_file(
        os.path.join(pagesus_pretrain_path, 'config.json'))
    data = load_data('./final_test_data_list.json')

    model = build_model(pagesus_pretrain_path, psus_config, MAX_LEN,
                        decode_max_len)
    model.load_weights('./pagesus_section/best_model.hdf5')

    # Decoding starts from the pad token id and stops at the EOS token id.
    autotitle = AutoTitle(start_id=tokenizer.pad_token_id,
                          end_id=tokenizer.eos_token_id,
                          maxlen=256,
                          max_decode_len=decode_max_len,
                          model=model)

    result = just_predict(autotitle, tokenizer, MAX_LEN, data)
    with open('./pred_result.json', 'w', encoding='utf-8') as out_file:
        serialized = json.dumps(result, ensure_ascii=False, cls=NpEncoder)
        out_file.write(serialized)

示例#26

0

显示文件

文件： convert_pegasus_tf_to_pytorch.py 项目： sshleifer/transformers_fork

def convert_pegasus_ckpt_to_pytorch(ckpt_path: str, save_dir: str):
    """Convert a TF Pegasus checkpoint and save tokenizer and model to ``save_dir``.

    The dataset name comes from the checkpoint's parent directory name and
    selects the ``summarization_<dataset>`` entry of ``task_specific_params``.
    After the normal save, ``pytorch_model.bin`` is rewritten without the
    two positional embedding weights.
    """
    dataset = Path(ckpt_path).parent.name
    params_key = f"summarization_{dataset}"

    # Tokenizer first.
    wanted_length = task_specific_params[params_key]["max_position_embeddings"]
    tok = PegasusTokenizer.from_pretrained(
        "sshleifer/pegasus", model_max_length=wanted_length)
    assert tok.model_max_length == wanted_length
    tok.save_pretrained(save_dir)

    # Then the model.
    tf_weights = get_tf_weights_as_numpy(ckpt_path)
    # This is the shared task_specific_params entry itself, not a copy.
    cfg_updates = task_specific_params[params_key]
    if dataset == "large":
        cfg_updates["task_specific_params"] = task_specific_params
    torch_model = convert_pegasus(tf_weights, cfg_updates)
    torch_model.save_pretrained(save_dir)

    sd = torch_model.state_dict()
    for dropped_key in ("model.decoder.embed_positions.weight",
                        "model.encoder.embed_positions.weight"):
        sd.pop(dropped_key)
    torch.save(sd, Path(save_dir) / "pytorch_model.bin")

示例#27

0

显示文件

def index(request):
    """Render the summary form and, for a valid POST, the requested summary.

    A valid POST with ``_type`` ``'Extractive'`` or ``'Abstractive'`` renders
    ``summary/summary.html``. Every other request (GET, invalid form, other
    ``_type``) renders ``summary/index.html`` with a fresh form.
    """
    if request.method == 'POST':
        form = textForm(request.POST, request.FILES)
        if form.is_valid():
            _type = form.cleaned_data['_type']
            text = form.cleaned_data['text']
            percent = form.cleaned_data['percent']
            if text == "":
                # No text submitted: use the uploaded file instead. join()
                # avoids repeated string concatenation.
                text = "".join(
                    line.decode() for line in request.FILES['file'])

            if _type == 'Extractive':
                # Sentence tokenization is only needed on this path.
                summary = summarize(sent_tokenize(text), percent)
                return render(request, 'summary/summary.html', {
                    'text': text,
                    'summary': summary,
                    'percent': percent
                })

            if _type == 'Abstractive':
                import torch

                model_name = 'google/pegasus-xsum'
                # The original hard-coded 'cuda', which fails on hosts
                # without a GPU.
                torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
                tokenizer = PegasusTokenizer.from_pretrained(model_name)
                model = PegasusForConditionalGeneration.from_pretrained(
                    model_name).to(torch_device)
                # Call the tokenizer directly with an explicit tensor type:
                # prepare_seq2seq_batch is deprecated.
                batch = tokenizer(
                    [text], truncation=True, padding='longest',
                    return_tensors='pt').to(torch_device)
                translated = model.generate(**batch)
                summary = tokenizer.batch_decode(translated,
                                                 skip_special_tokens=True)
                return render(
                    request, 'summary/summary.html', {
                        'text': text,
                        'summary': summary[0],
                        'percent': "Not Applicable"
                    })
    return render(request, 'summary/index.html', {'form': textForm()})

示例#28

0

显示文件

def main():
    """Train the Pegasus-based model on the abstract-generation data.

    The data is shuffled, the first five examples are held out for the
    evaluator callback, and the model is built inside a
    ``tf.distribute.MirroredStrategy`` scope.
    """
    pagesus_pretrain_path = './page_arciv/'
    MAX_LEN = 1920
    decode_max_len = 600
    batch_size = 2
    epochs = 50

    tokenizer = PegasusTokenizer.from_pretrained(pagesus_pretrain_path)
    psus_config = PegasusConfig.from_json_file(
        os.path.join(pagesus_pretrain_path, 'config.json'))

    data = load_data(
        '/home_zyz/abstract_generate/final_abdata/union_add_noabs_cleaned_1920.json'
    )
    random.shuffle(data)
    print(len(data))
    first_example = data[0]
    print(first_example[0])
    print(first_example[1])

    # The first five shuffled examples are for validation, the rest for training.
    valid_data, train_data = data[:5], data[5:]
    train_generator = data_generator(train_data, batch_size, MAX_LEN,
                                     decode_max_len, tokenizer)

    K.clear_session()
    strategy = tf.distribute.MirroredStrategy()
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
    with strategy.scope():
        model = build_model(pagesus_pretrain_path, psus_config, MAX_LEN,
                            decode_max_len)

    # Decoding starts from the pad token id and stops at the EOS token id.
    autotitle = AutoTitle(start_id=tokenizer.pad_token_id,
                          end_id=tokenizer.eos_token_id,
                          maxlen=599,
                          max_decode_len=decode_max_len,
                          model=model)
    evaluator = Evaluator(tokenizer, MAX_LEN, autotitle, valid_data)
    steps = len(train_generator) - 1
    model.fit(train_generator.forfit(),
              steps_per_epoch=steps,
              epochs=epochs,
              callbacks=[evaluator])

示例#29

0

显示文件

文件： attention_y_entropy.py 项目： jiacheng-xu/text-sum-uncertainty

def run_one_fig(spec, args, num_samples=300):
    """Process a random sample of files for ``spec`` in parallel.

    :param spec: sub-directory of ``args.prob_meta_dir`` to read
    :param args: namespace; ``args.cur_dir`` is set to the resolved directory
    :param num_samples: maximum number of files to process
    :return: result of ``proceed_data(10, ...)`` over the flattened results
    :raises NotImplementedError: when ``args.model_name`` contains none of
        ``'pegasus'``, ``'gpt'`` or ``'bart'``
    """
    print(f"--{spec}--")
    CUR_DIR = os.path.join(args.prob_meta_dir, spec)
    args.cur_dir = CUR_DIR

    files = os.listdir(CUR_DIR)
    random.shuffle(files)
    files = files[:num_samples]

    BOS_TOKEN = 0  # not used below
    print(args.spec_name)

    model_name = args.model_name
    if 'pegasus' in model_name:
        from transformers import PegasusTokenizer

        bpe_tokenizer = PegasusTokenizer.from_pretrained(model_name)
        EOS_TOK_IDs = [106, bpe_tokenizer.eos_token_id, 2]  # <n>
    elif 'gpt' in model_name:
        from transformers import GPT2Tokenizer

        bpe_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        EOS_TOK_IDs = [bpe_tokenizer.eos_token_id]
    elif 'bart' in model_name:
        from transformers import BartTokenizer

        bpe_tokenizer = BartTokenizer.from_pretrained(model_name)
        EOS_TOK_IDs = [bpe_tokenizer.eos_token_id]
    else:
        raise NotImplementedError

    # One worker process per CPU; every file gets the same args and EOS ids.
    worker_count = multiprocessing.cpu_count()
    job_args = zip(itertools.repeat(args), files, itertools.repeat(EOS_TOK_IDs))
    with multiprocessing.Pool(processes=worker_count) as pool:
        results = pool.starmap(process_data_single, job_args)

    output = list(itertools.chain.from_iterable(results))
    print(f"Samples: {len(output)}")
    return proceed_data(10, output)

示例#30

0

显示文件

        if '## Example' in text:
            text = re.sub(r'## Example(.*)', '', text)
            text = re.sub(r"\`\`\`.*?\`\`\`", '', text, flags=re.DOTALL)
        return text

    for i, doc in enumerate(docs):
        markdown_without_example = remove_example_from_description(doc['markdown_description'])
        docs[i]['markdown_without_example'] = markdown_without_example
        # LOGGER.debug(markdown_without_example)

    # Generate 1 sentence summaries for the models
    if not args.quick_run:
        from transformers import PegasusTokenizer, PegasusForConditionalGeneration
        mname = "google/pegasus-large"
        model = PegasusForConditionalGeneration.from_pretrained(mname)
        tok = PegasusTokenizer.from_pretrained(mname)

        def summarise(text):
            """Return the model's summary of ``text`` as one decoded string."""
            # Call the tokenizer directly with explicit options:
            # prepare_seq2seq_batch is deprecated and its defaults are not
            # the same across transformers releases.
            batch = tok([text], truncation=True, padding="longest",
                        return_tensors="pt")
            gen = model.generate(**batch)
            return tok.batch_decode(gen, skip_special_tokens=True)[0]

        for i, doc in enumerate(docs):
            if 'short_description' not in docs[i].keys():
                short_description = summarise(doc['description'])
                docs[i]['short_description'] = short_description
                # LOGGER.debug(short_description)

    vi_client = ViClient(os.environ['VH_USERNAME'], os.environ['VH_API_KEY'])
    ids = vi_client.get_field_across_documents('_id', docs)
    if args.reset_collection: