Example #1
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained('gpt2-medium')
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')  # 'gpt2' (small) is another option
    model.eval()
    model.to(device)

    # context is the universal trigger
    trigger = "TH PEOPLEMan goddreams Blacks"
    trigger_tokens = enc.encode(trigger)

    # get samples from the model
    for _ in range(100):
        out = sample_sequence(model=model,
                              length=200,
                              context=trigger_tokens,
                              temperature=1.5,
                              top_k=5)
        out = out[:, len(trigger_tokens):].tolist()
        for i in range(1):
            text = enc.decode(out[i])
            if text.find('<|endoftext|>') > 0:
                text = text[0:text.find('<|endoftext|>')]
            print("Prompt: " + trigger)
            print("Output: " + text)
            print("=" * 80)
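The snippet above calls a sample_sequence helper that is not shown. As a rough, hypothetical sketch (not the project's actual implementation), such a helper could be written against the plain GPT2LMHeadModel API like this:

import torch
import torch.nn.functional as F

def sample_sequence(model, length, context, temperature=1.0, top_k=0):
    # context: list of prompt token ids; returns a [1, len(context) + length] tensor of ids
    device = next(model.parameters()).device
    generated = torch.tensor([context], dtype=torch.long, device=device)
    with torch.no_grad():
        for _ in range(length):
            logits = model(generated)[0][:, -1, :] / temperature
            if top_k > 0:
                # mask everything outside the top_k most likely tokens
                kth_best = torch.topk(logits, top_k)[0][:, -1, None]
                logits[logits < kth_best] = -float('inf')
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return generated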
Example #2
 def load(self):
     try:
         self._tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
         self._model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
     except Exception:
         self._model = None
     return self
def generate_samples(args):
    """Use a pre-trained GPT-2 model to generate a set of samples from scratch."""
    # Set seed
    set_random_seeds(args.random_seed)

    # Initialize training
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('Device: {}'.format(str(device)))

    # Load pre-trained network weights
    print('Loading pre-trained model...')
    config = GPT2Config.from_pretrained(args.gpt2_version)
    model = GPT2LMHeadModel(config)
    model.load_state_dict(torch.load(args.model_load_path))
    model = model.to(device)
    model.eval()

    # Create tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(args.gpt2_version)

    # Generate some samples
    print('Generating...')
    generated = generate_sequence(model,
                                  tokenizer,
                                  context=args.context,
                                  max_length=args.max_gen_len,
                                  num_samples=args.num_samples,
                                  top_k=args.sampling_top_k,
                                  device=device)
    print('Generated samples:')
    print(*generated, sep="\n---\n")
Example #4
def sample_sequence(cfg,
                    model: JointSentiGPT2Model,
                    tokenizer: GPT2Tokenizer,
                    context_token: torch.Tensor,
                    token_type: torch.Tensor,
                    context_emotion: torch.Tensor,
                    cls_mask: torch.Tensor,
                    emotion_pad=0,
                    speaker1_state=2,
                    decoding_strategy='sampling'):
    cls_mask_extra = torch.LongTensor([[[1], [0], [0], [0]]]).to(cfg.device)

    context_len = context_token.shape[1]
    generated = context_token

    past, pred_response_emotion = None, None
    result = []
    for step in range(cfg.max_decode_length):
        inputs = {
            'input_ids': generated,
            'token_type_ids': token_type,
            'emotion_ids': context_emotion,
            'pred_response_emotion_vector': pred_response_emotion,
            'cls_mask': cls_mask,
            'past': past,
            'decoding': True
        }
        outputs = model.decoding(
            **inputs
        )  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
        pred_response_emotion, past = outputs[1:]
        next_token_logits = outputs[0][0, -1, :] / cfg.sampling_temperature
        if decoding_strategy == 'sampling':
            filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                    top_k=cfg.top_k,
                                                    top_p=cfg.top_p)
            prob = F.softmax(filtered_logits, dim=-1)
            next_token = torch.multinomial(prob, num_samples=1)
        else:
            next_token = torch.argmax(next_token_logits, dim=-1)
            next_token = next_token.unsqueeze(0)

        if next_token.item() == tokenizer.eos_token_id and step >= cfg.min_decode_length:
            break

        result.append(next_token.item())
        generated = next_token.unsqueeze(0)
        token_type = torch.LongTensor([[speaker1_state]]).to(cfg.device)
        cls_mask = torch.cat((cls_mask, cls_mask_extra), dim=-1)

    # generated = generated[0, context_len:].tolist()
    result = [
        token_id for token_id in result if token_id not in cfg.special_id_list
    ]
    text = tokenizer.decode(result,
                            skip_special_tokens=True,
                            clean_up_tokenization_spaces=False)
    text = text.replace("\n", "").replace("\r", "")
    return text
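Example #4 relies on a top_k_top_p_filtering helper that is not included in the snippet. For reference, the widely circulated implementation from the Hugging Face generation example looks roughly like this (restated here as a sketch):

import torch
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('inf')):
    # logits: 1-D tensor of next-token logits
    if top_k > 0:
        # keep only the k most likely tokens
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # drop tokens past the nucleus, shifting right so the first token above top_p is kept
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits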
Example #5
def get_tokenizer(model_path=None, name="bert"):
    tokenizer = None

    if name == "bert":
        from pytorch_transformers import BertTokenizer
        tokenizer = BertTokenizer.from_pretrained(model_path)
        tokenizer = Seq2SeqAdapterTokenizer(tokenizer)
    if name == "gpt2":
        from pytorch_transformers import GPT2Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        tokenizer = Seq2SeqAdapterTokenizer(tokenizer)
    if name == "xlnet":
        from pytorch_transformers import XLNetTokenizer
        tokenizer = XLNetTokenizer.from_pretrained(model_path)
        tokenizer = Seq2SeqAdapterTokenizer(tokenizer)
    if name == "roberta":
        tokenizer = RoBertaTokenizer(model_path)

    if name == "simple":
        tokenizer = SimpleTokenizer()
    if name == "spacy":
        tokenizer = SpacyTokenizer()
    if name == "corenlp":
        tokenizer = CoreNLPTokenizer()

    if tokenizer is None:
        raise RuntimeError("tokenizer:{} is not supported!".format(name))

    return tokenizer
Example #6
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
    def __init__(
        self,
        pretrained_model_name_or_path: str = "gpt2-medium",
        layers: str = "1",
        pooling_operation: str = "first_last",
        use_scalar_mix: bool = False,
    ):
        """OpenAI GPT-2 embeddings, as proposed in Radford et al. 2019.
        :param pretrained_model_name_or_path: name or path of OpenAI GPT-2 model
        :param layers: comma-separated list of layers
        :param pooling_operation: defines pooling operation for subwords
        :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
        """
        super().__init__()

        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = GPT2Model.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path, output_hidden_states=True
        )
        self.name = pretrained_model_name_or_path
        self.layers: List[int] = [int(layer) for layer in layers.split(",")]
        self.pooling_operation = pooling_operation
        self.use_scalar_mix = use_scalar_mix
        self.static_embeddings = True

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )
Example #8
    def __init__(self, model_path='gpt2', top_k=None, top_p=None, device=None):
        super().__init__(device, top_k=top_k, top_p=top_p)
        self.model_path = model_path

        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel.from_pretrained(model_path)
        self.model.to(device)
        self.model.eval()
Example #9
 def __init__(self):
     self.device = torch.device(
         "cuda" if torch.cuda.is_available() else "cpu")
     # TODO maybe smaller gpt2 model separately
     self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
     self.model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
     self.model.to(self.device)
     self.model.eval()
Example #10
def get_special_token_ids(cfg, tokenizer: GPT2Tokenizer):
    special_id_list = []
    for key, value in cfg.SPECIAL_tokens.items():
        if key == 'additional_special_tokens':
            special_id_list.extend(value)
        else:
            special_id_list.append(value)
    return tokenizer.convert_tokens_to_ids(special_id_list)
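A hypothetical usage of the helper above, assuming cfg carries a SPECIAL_tokens dict that mirrors what was registered on the tokenizer elsewhere in the project:

from pytorch_transformers import GPT2Tokenizer

class Cfg:
    # illustrative values only; the real keys and values live in the project's config
    SPECIAL_tokens = {
        'bos_token': '<bos>',
        'eos_token': '<eos>',
        'additional_special_tokens': ['<speaker1>', '<speaker2>'],
    }

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens(Cfg.SPECIAL_tokens)
special_ids = get_special_token_ids(Cfg, tokenizer)  # ids later stripped from decoded output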
Example #11
    def __init__(self, **kwargs):
        self.beam_width = kwargs['beam_width']
        self.beam_depth = kwargs['beam_depth']
        self.timeout = kwargs['timeout']
        random.seed(kwargs['seed'])

        self.model = GPT2LMHeadModel.from_pretrained('gpt2')
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
Example #12
File: gpt2.py Project: nbaghel777/nlpaug
    def __init__(self, model_path='gpt2', device='cuda'):
        super().__init__()
        self.model_path = model_path
        self.device = device

        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel.from_pretrained(model_path)
        self.model.to(device)
        self.model.eval()
Example #13
    def __init__(self):
        super(GPT2, self).__init__()

        self.model_type = "GPT2"

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

        # Load pre-trained model (weights)
        self.model = GPT2LMHeadModel.from_pretrained("gpt2")
    def gpt2(self, prep_obj):
        self.vector_corpus = []

        model = GPT2LMHeadModel.from_pretrained('gpt2')
        token_maker = GPT2Tokenizer.from_pretrained('gpt2')
        for tweet in prep_obj.detokenized_corpus:
            text_index = token_maker.encode(tweet)
            vector = (model.transformer.wte.weight[text_index, :])
            vector = vector.detach().numpy()
            vector = np.sum(vector, axis=0)
            self.vector_corpus.append(vector)
Example #15
 def __init__(self, chunck_size=64, max_length=35, device=torch.device('cuda:0')):
     super(GPT2Client, self).__init__()
     self.chunck_size = chunck_size
     self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
     self.max_length = max_length
     # load the model
     self.model = GPT2Model.from_pretrained('gpt2')
     self.model.eval()
     self.device = device
     # move model to device
     self.model.to(self.device)
Example #16
def add_pytorch_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in pytorch_transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added token
    if tokenizer_name.startswith("roberta-"):
        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
            vocab_size -= 1
        else:
            log.info("Time to delete vocab_size-1 in preprocess.py !!!")
    # due to a quirk in huggingface's file, the last token of RobertaTokenizer is None, remove
    # this when they fix the problem

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added pytorch_transformers vocab (%s): %d tokens",
             tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
Example #17
    def __init__(self, max_size=None, vocab_file=None):
        from pytorch_transformers import GPT2Tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.EOT = self.tokenizer.encoder['<|endoftext|>']
        self.max_size = max_size
        self.vocab_file = vocab_file

        pad = 8
        vocab_size = len(self.tokenizer)
        padded_vocab_size = (vocab_size + pad - 1) // pad * pad
        for i in range(0, padded_vocab_size - vocab_size):
            token = f'madeupword{i:09d}'
            self.tokenizer.add_tokens([token])
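The loop above pads the vocabulary up to the next multiple of 8 with filler tokens, a common trick for keeping the embedding matrix size hardware friendly. A quick check of the arithmetic, assuming the stock GPT-2 vocabulary of 50257 tokens:

vocab_size = 50257                         # stock GPT-2 BPE vocabulary
pad = 8
padded_vocab_size = (vocab_size + pad - 1) // pad * pad
print(padded_vocab_size)                   # 50264
print(padded_vocab_size - vocab_size)      # 7 filler 'madeupword...' tokens added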
Example #18
File: nubia.py Project: mhkane/nubia
    def __init__(self):
        if not os.path.exists(AGGREGATOR_DIR):
            os.makedirs(AGGREGATOR_DIR)
        if not os.path.isfile(AGGREGATOR_2015_2016):
            print("Downloading aggregators from s3...")
            wget.download(AGGREGATOR_2015_2016_URL,
                          AGGREGATOR_2015_2016,
                          bar=self._download_progress_bar)
        if not os.path.isfile(AGGREGATOR_2015_2017):
            print("Downloading aggregators from s3...")
            wget.download(AGGREGATOR_2015_2017_URL,
                          AGGREGATOR_2015_2017,
                          bar=self._download_progress_bar)
        if not os.path.isfile(AGGREGATOR_2015_2016_8_dim):
            print("Downloading aggregators from s3...")
            wget.download(AGGREGATOR_2015_2016_8_dim_URL,
                          AGGREGATOR_2015_2016_8_dim,
                          bar=self._download_progress_bar)
        if not os.path.isfile(AGGREGATOR_2015_2017_8_dim):
            print("Downloading aggregators from s3...")
            wget.download(AGGREGATOR_2015_2017_8_dim_URL,
                          AGGREGATOR_2015_2017_8_dim,
                          bar=self._download_progress_bar)
        if not os.path.isfile(ROBERTA_STS_PATH + '/checkpoint_best.pt'):
            print("Downloading ROBERTA STS model from s3...")
            wget.download(ROBERTA_STS_URL,
                          ROBERTA_STS_PATH + '/checkpoint_best.pt',
                          bar=self._download_progress_bar)
        if not os.path.isfile(ROBERTA_MNLI_PATH + '/model_mnli.pt'):
            print("Downloading ROBERTA MNLI model from s3...")
            wget.download(ROBERTA_MNLI_URL,
                          ROBERTA_MNLI_PATH + '/model_mnli.pt',
                          bar=self._download_progress_bar)

        self.roberta_STS = RobertaModel.from_pretrained(
            checkpoint_file='checkpoint_best.pt',
            model_name_or_path=ROBERTA_STS_PATH)
        self.roberta_STS.eval()

        self.roberta_MNLI = RobertaModel.from_pretrained(
            checkpoint_file='model_mnli.pt',
            model_name_or_path=ROBERTA_MNLI_PATH)
        self.roberta_MNLI.eval()
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')
        self.agg_one = load(AGGREGATOR_2015_2016)
        self.agg_two = load(AGGREGATOR_2015_2017)
        self.agg_one_8_dim = load(AGGREGATOR_2015_2016_8_dim)
        self.agg_two_8_dim = load(AGGREGATOR_2015_2017_8_dim)
    def __init__(self, opt, shared=None):
        super(TransformerAgent, self).__init__(opt, shared)

        args = AttrDict(
            opt)  # to keep most commands identical to the interact.py script
        self.args = args

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__file__)
        self.logger.info(pformat(args))

        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

        if shared is None:
            self.logger.info("Get pretrained model and tokenizer")
            if args.model_checkpoint == "":
                args.model_checkpoint = download_pretrained_model()
            if 'gpt2' in args.model_checkpoint:
                self.tokenizer = GPT2Tokenizer.from_pretrained(
                    args.model_checkpoint)
                model_class = GPT2DoubleHeadsModel if self.args.eval_type == "hits@1" else GPT2LMHeadModel
            else:
                self.tokenizer = OpenAIGPTTokenizer.from_pretrained(
                    args.model_checkpoint)
                model_class = OpenAIGPTDoubleHeadsModel if self.args.eval_type == "hits@1" else OpenAIGPTLMHeadModel

            self.model_checkpoint = model_class.from_pretrained(
                args.model_checkpoint)
            self.model_checkpoint.to(args.device)

            self.logger.info("Build BPE prefix dictionary")
            convai_dict = build_dict()
            assert len(convai_dict) == 19304
            self.prefix2words = self.get_prefix2words(convai_dict)
        else:
            self.model_checkpoint = shared['model']
            self.tokenizer = shared['tokenizer']
            self.prefix2words = shared['prefix2words']
        add_special_tokens_(self.model_checkpoint, self.tokenizer)
        self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(
            SPECIAL_TOKENS)

        self.persona = []
        self.history = []
        self.labels = []

        self.reset()
Example #20
def gpt_predictor(n=3):
    if request.method == 'GET':
        return render_template('index.html', value='hi')

    if request.method == 'POST':
        tok = GPT2Tokenizer.from_pretrained("gpt2")
        model = GPT2LMHeadModel.from_pretrained("gpt2")
        text = request.form.get('text')
        n = request.form.get('n')
        for i in range(int(n)):
            pred = get_pred(text, model, tok)
            if pred == "<|endoftext|>":
                break
            else:
                text += pred
        return render_template('result.html', text=text)
Example #21
    def __init__(self, type, file_prefix=None):
        self.type = type

        self.bos_token = None
        self.eos_token = None
        self.unk_token = None
        self.sep_token = None
        self.pad_token = None
        self.cls_token = None
        self.mask_token = None

        if type == "gpt2":
            from pytorch_transformers import GPT2Tokenizer
            self._tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            # add <PAD> special token
            self._tokenizer.add_special_tokens({'pad_token': '<PAD>'})

            for i in range(len(self._tokenizer)):
                token = self._tokenizer.convert_ids_to_tokens(i)
            if self._tokenizer._bos_token:  # using _xxx_token instead of xxx_token to silence gpt2tokenizer not set errors
                self.bos_token = self._tokenizer.bos_token
            if self._tokenizer._eos_token:
                self.eos_token = self._tokenizer.eos_token
            if self._tokenizer._unk_token:
                self.unk_token = self._tokenizer.unk_token
            if self._tokenizer._sep_token:
                self.sep_token = self._tokenizer.sep_token
            if self._tokenizer._pad_token:
                self.pad_token = self._tokenizer.pad_token
            if self._tokenizer._cls_token:
                self.cls_token = self._tokenizer.cls_token
            if self._tokenizer._mask_token:
                self.mask_token = self._tokenizer.mask_token

        if type == "bpe":
            self.bpe_vocab_size = 0
            self.bos_token = "<BOS>"
            self.eos_token = "<EOS>"
            self.unk_token = "<UNK>"
            self.sep_token = "<SEP>"
            self.pad_token = "<PAD>"
            self.cls_token = "<CLS>"
            self.mask_token = "<MASK>"
            self._recreate_special_tokens()

        if file_prefix:
            self.load(file_prefix)
def get_model(seed=1234, model_name='gpt2'):
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(model_name)
    enc.unk_token = None
    enc.bos_token = None
    enc.eos_token = None

    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.to(device)
    model.eval()
    #model.double()

    return enc, model
Example #23
def predict_next_word(phrase):
    """
    Function to process the phrase using GPT-2
    :param phrase:
    :return:
    """
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Tokenize the input phrase
    tokenized_phrase = tokenizer.encode(phrase)
    print("Tokenized Phrase: {}".format(tokenized_phrase))

    # Convert tokenized phrase to pytorch tensor
    tokenized_phrase_tensor = torch.tensor([tokenized_phrase])
    print("Tokenized Phrase Tensor: {}".format(tokenized_phrase_tensor))

    # Load pretrained model. This will have weights and biases
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Set the model to evaluation mode to deactivate dropout
    model.eval()

    try:
        tokenized_phrase_tensor = tokenized_phrase_tensor.to('cuda')
        model.to('cuda')
        print("CUDA present. Running code on GPU")
    except AssertionError:
        print("Torch not compiled with CUDA. Running on CPU.")
    except Exception:
        print("CUDA not present. Running on CPU")

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokenized_phrase_tensor)
        print("Outputs: {}".format(outputs))

        predictions = outputs[0]
        print("Prediction: {}".format(predictions))

    # Get the predicted next sub-word
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_text = tokenizer.decode(tokenized_phrase + [predicted_index])

    return predicted_text
Example #24
def main():
    nltk.data.path.append('/data/chuancen/pip_package/nltk_data')
    print(nltk.__version__)
    file_handler = open('../../result/reference_SR_only.txt', 'r')
    ref = file_handler.readlines()
    file_handler = open('../../result/SR_only.txt', 'r')
    hyp = file_handler.readlines()

    print("#ref{} #hyp{}".format(len(ref), len(hyp)))
    meteor_sum = 0
    for i in range(min(len(ref), len(hyp))):
        meteor_sum += meteor_score([ref[i]], hyp[i])

    meteor_sum /= min(len(ref), len(hyp))
    print(meteor_sum)

    tokenizer = GPT2Tokenizer.from_pretrained(
        '/data/chuancen/LIT/models/345M_Alex')
Example #25
def load_model(args):
    """
    Load model and the corresponding tokenizer from pre-trained weight.
    :param args: The command line arguments.
    :return model: The main model.
    :return tokenizer: The tokenizer that comes with the main model.
    """
    USE_CUDA = torch.cuda.is_available()
    # ====== Load GPT2 model ========
    model_dir = '../models/' + args.model_dir
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    # model = GPT2LMHeadModel.from_pretrained('gpt2')
    if USE_CUDA:
        model.cuda()
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    # tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    print('Model loaded.')
    return model, tokenizer
Example #26
def evaluate_ppl_gpt(args):
    """
    Evaluate on raw text, use this with GPT which has its own tokenizer
    """
    if args.expanded_dataset:
        path = ".data/stories/story_commonsense/torchtext_expanded"
    else:
        path = ".data/stories/story_commonsense/torchtext"
    # Data
    test_src = [line.rstrip('\n') for line in open(path + "/test.src")]
    test_trg = [line.rstrip('\n') for line in open(path + "/test.trg")]

    # Model
    enc = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.to(device)
    model.eval()
    loss = 0
    batch_size = 1

    print("Evaluating test set with GPT2")
    for i in trange(len(test_src)):
        src, trg = test_src[i], test_trg[i]
        context = enc.encode(src)
        target = enc.encode(trg)
        length = len(target)

        # Generate prediction
        out = utils.sample_sequence(model,
                                    length,
                                    batch_size=1,
                                    context=context,
                                    top_k=10,
                                    device=device)
        out = out[:, len(context):]

        # Get model loss
        target = torch.tensor([target]).to(device)
        with torch.no_grad():
            #pred, past  = model(out)
            l = model(out, labels=target)[0]  # first element is the LM loss
            loss += float(l)
    av_loss = loss / len(test_src)
    print(f"ppl: {math.exp(av_loss):.04f}")
Example #27
    def __init__(self, gpt2_model, language, name, loi, cuda=False):
        super(GPT2, self).__init__()
        # Load pre-trained model tokenizer (vocabulary)
        # Crucially, do not do basic tokenization; PTB is pre-tokenized. Just do byte-pair (BPE) tokenization.
        if gpt2_model not in ['small', 'medium']:
            raise ValueError("GPT2 model must be small or medium")
        self.model = GPT2Model.from_pretrained(
            'gpt2{}'.format('' if gpt2_model == 'small' else '-medium'),
            output_hidden_states=True)
        self.tokenizer = GPT2Tokenizer.from_pretrained(
            'gpt2{}'.format('' if gpt2_model == 'small' else '-medium'))

        self.language = language
        self.LAYER_COUNT = parameters[gpt2_model]['LAYER_COUNT']
        self.FEATURE_COUNT = parameters[gpt2_model]['FEATURE_COUNT']
        self.name = name
        self.loi = np.array(loi) if loi else np.arange(
            parameters[gpt2_model]['LAYER_COUNT'])  # loi: layers of interest
        self.cuda = cuda
Example #28
def gpt_predictor(request, n=3):
    tok = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    if request.method == 'GET':
        return "Welcome to GPT predictor"

    if request.method == 'POST':
        data = request.get_json()
        text = data["text"]
        res = []
        n = data["n"]
        for i in range(n):
            pred = get_pred(text, model, tok)
            if pred == "<|endoftext|>":
                break
            else:
                text += pred
        return text
Example #29
 def test_special_tokens_checkpoint_behavior(self):
     toks = [
         OpenAIGPTTokenizer.from_pretrained('openai-gpt'),
         GPT2Tokenizer.from_pretrained('gpt2')
     ]
     for tok in toks:
         self.assertEqual(len(tok.added_tokens_encoder), 0)
         tok.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
         self.assertEqual(len(tok.added_tokens_encoder), 5)
         # Make sure we never split
         self.assertEqual(len(tok.tokenize("<bos> <speaker1>")), 2)
         ids = tok.convert_tokens_to_ids(SPECIAL_TOKENS)
         self.assertTrue(
             all([x > 0 for x in ids]),
             f'some tokens failed to tokenize {SPECIAL_TOKENS} -> {ids}')
         # Need to maintain indices through save. (this is also tested in pytorch-transformers)
         tok.save_pretrained(self.save_dir)
         tok_loaded = tok.from_pretrained(str(self.save_dir))
         ids2 = tok_loaded.convert_tokens_to_ids(SPECIAL_TOKENS)
         self.assertListEqual(ids, ids2)
Example #30
def get_textgen(sentence: str) -> str:
    """
    Runs text_generation GPT2 model and returns generated text.
    :param sentence: sentence taken from serializer.data.
    :return: Generated text.
    """
    output_dir = './models/text_gen'
    tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
    model = GPT2LMHeadModel.from_pretrained(output_dir)
    tokens = tokenizer.encode(sentence)
    tokens_tensor = torch.tensor([tokens])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokens_tensor = tokens_tensor.to(device)
    model.to(device)
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_text = tokenizer.decode(tokens + [predicted_index])
    return predicted_text
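Examples #23 and #30 only append the single most likely next token. A minimal, hypothetical extension to multi-token greedy decoding with the same API might look like this:

import torch
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel

def greedy_generate(prompt, steps=20, model_name='gpt2'):
    # repeatedly append the argmax token until <|endoftext|> or the step budget is hit
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.eval()
    ids = tokenizer.encode(prompt)
    with torch.no_grad():
        for _ in range(steps):
            logits = model(torch.tensor([ids]))[0]
            next_id = torch.argmax(logits[0, -1, :]).item()
            if next_id == tokenizer.encoder['<|endoftext|>']:
                break
            ids.append(next_id)
    return tokenizer.decode(ids)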