Example No. 1
def main(model: GPT2LMHeadModel, enc: GPT2Tokenizer, phrase: str = ''):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    nsamples = 1
    length = 40
    temperature = 1.2
    top_k = 0
    top_p = 0.9
    batch_size = 1
    stop_token = [enc.encoder[x] for x in ('<|endoftext|>', '.', '?', '!')]
    assert nsamples % batch_size == 0

    if length == -1:
        length = model.config.n_ctx // 2
    elif length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         model.config.n_ctx)

    context_tokens = enc.encode(phrase) if phrase else [
        enc.encoder['<|endoftext|>']
    ]
    generated = 0
    out = sample_sequence(model=model,
                          length=length,
                          context=context_tokens,
                          start_token=None,
                          batch_size=batch_size,
                          temperature=temperature,
                          top_k=top_k,
                          device=device,
                          top_p=top_p,
                          stop_token=stop_token)
    out = out[:, len(context_tokens):].tolist()
    return enc.decode(out[0])
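Example No. 1 passes top_k and top_p through to a sample_sequence helper that is not shown here. As a rough sketch of the usual top-k / nucleus (top-p) filtering step such a helper applies to the next-token logits (plain PyTorch, assumed names, not the example's own implementation):

import torch
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('inf')):
    # logits: 1-D tensor of next-token logits for a single sequence
    if top_k > 0:
        # drop everything below the k-th largest logit
        kth_value = torch.topk(logits, top_k)[0][-1]
        logits[logits < kth_value] = filter_value
    if top_p > 0.0:
        # nucleus filtering: keep the smallest set of tokens whose
        # cumulative probability exceeds top_p
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        remove_mask = cumulative_probs > top_p
        remove_mask[1:] = remove_mask[:-1].clone()  # keep the first token over the threshold
        remove_mask[0] = False
        logits[sorted_indices[remove_mask]] = filter_value
    return logits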
Example No. 2
    def __init__(self, *args, language_model=None, template_loc='./relation_map_multiple.json',
                 use_local_model=False):
        super().__init__(*args, language_model=language_model, template_loc=template_loc,
                         use_local_model=use_local_model)
        self.nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        if use_local_model:
            self.enc = GPT2Tokenizer.from_pretrained("../models/GPT2LMHeadModel")
        else:
            self.enc = GPT2Tokenizer.from_pretrained('gpt2')

        with open(self.template_loc, 'r') as f:
            self.templates = json.load(f)
Example No. 3
def main(args):
    # TODO specify vocab path to avoid download
    if args.tokenizer is not None:
        toker = GPT2Tokenizer.from_pretrained(args.tokenizer)
    else:
        toker = GPT2Tokenizer.from_pretrained('gpt2')
    assert args.corpus.endswith('.txt') or args.corpus.endswith('.tsv')
    db_path = f'{args.corpus[:-4]}.db/db'
    if exists(dirname(db_path)):
        raise ValueError('Found existing DB, please backup')
    else:
        os.makedirs(dirname(db_path))
    with open(args.corpus, "r", encoding="utf-8") as reader, \
            shelve.open(db_path, 'n') as db:
        chunk = []
        n_chunk = 0
        n_example = 0
        for line in tqdm(reader, total=_get_file_len(args.corpus)):
            try:
                if len(chunk) >= args.chunk_size:
                    # save and renew chunk
                    db[f'chunk_{n_chunk}'] = gzip.compress(
                        json.dumps(chunk[:args.chunk_size]).encode('utf-8'))
                    chunk = chunk[args.chunk_size:]
                    n_chunk += 1

                weights, inputs, attn_masks, position_ids, type_ids = _get_inputs_from_text(
                    line, toker)
                if len(weights) < 2:
                    continue
                features = _make_features(n_example, weights, inputs,
                                          attn_masks, position_ids, type_ids,
                                          toker, args.max_seq_len)
                for feature in features:
                    chunk.append(vars(feature))
                    n_example += 1
            except Exception as e:
                raise e

        # save last chunk
        db[f'chunk_{n_chunk}'] = gzip.compress(
            json.dumps(chunk).encode('utf-8'))
    # save relevant information to reproduce
    meta = {
        'n_example': n_example,
        'chunk_size': args.chunk_size,
        'max_seq_len': args.max_seq_len
    }
    with open(join(dirname(db_path), 'meta.json'), 'w') as writer:
        json.dump(meta, writer, indent=4)
    torch.save(toker, join(dirname(db_path), 'tokenizer.pt'))
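The script above writes each chunk as gzip-compressed JSON into a shelve DB; a minimal sketch of reading one chunk back from the resulting DB (the path is illustrative):

import gzip, json, shelve

with shelve.open('corpus.db/db', 'r') as db:
    chunk_0 = json.loads(gzip.decompress(db['chunk_0']).decode('utf-8'))
print(len(chunk_0), 'feature dicts in the first chunk')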
Example No. 4
def main():
    """Preprocess a dataset."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--dataset_path', type=str, default='')
    parser.add_argument('--model_name_or_path',
                        type=str,
                        default='gpt2',
                        help='pretrained model name')
    parser.add_argument(
        '--min_file_len',
        type=int,
        help=
        "When loading dataset, throw out files with fewer than this many characters"
    )
    parser.add_argument(
        '--max_file_len',
        type=int,
        help=
        "When loading dataset, throw out files with greater than this many characters"
    )

    args = parser.parse_args()
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    _ = lazy_load(args.dataset_path, enc, args)
Example No. 5
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='', help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--load_checkpoint", '-c', type=str, default='')
    parser.add_argument("--fp16", type=boolean_string, default=False)
    parser.add_argument("--max_seq_length", type=int, default=128)
    
    parser.add_argument("--generation_length", type=int, default=20)
    parser.add_argument("--max_history", type=int, default=2)

    parser.add_argument("--temperature", type=float, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--top_p", type=float, default=0.9)

    parser.add_argument('--use_gpu', action='store_true')
    parser.add_argument("--gpu", type=int, default=0)

    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)


    device = torch.device("cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    #### load the GPT-2 model 
    config = GPT2Config.from_json_file(os.path.join(args.model_name_or_path, 'config.json'))
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = load_model(GPT2LMHeadModel(config), args.load_checkpoint, args, verbose=True)
    model.to(device)
    model.eval()

    history = []
    while True:
        raw_text = input("USR >>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input("USR >>> ")
        if raw_text.lower() == 'quit':
            print('SYS >>> Goodbye!')
            break
        history.append(raw_text)
        context_tokens = sum([enc.encode(h) + [EOS_ID] for h in history],[]) #+ [EOS_ID]
        context_tokens = torch.tensor(context_tokens, device=device, dtype=torch.long).unsqueeze(0)
        position_ids = torch.arange(0, context_tokens.size(-1), dtype=torch.long, device=context_tokens.device)

        out = generate_sequence(model, context_tokens, position_ids=position_ids,
                                length=args.generation_length, temperature=args.temperature, 
                                top_k=args.top_k, top_p= args.top_p) 

        out = out.tolist()                        
        text = enc.decode(cut_seq_to_eos(out[0])).encode('ascii','ignore').decode('ascii')
        print("SYS >>> ", text)
        history.append(text)
        history = history[-(2*args.max_history+1):]
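For clarity, the context built in the loop above simply flattens the dialogue history into one token list with an EOS separator after each turn; a small illustration (assuming EOS_ID is the GPT-2 <|endoftext|> id, 50256):

history = ["Hi there", "Hello! How can I help?", "Tell me a joke"]
EOS_ID = 50256
context_tokens = sum([enc.encode(h) + [EOS_ID] for h in history], [])
# -> one flat list: tokens of turn 1, 50256, tokens of turn 2, 50256, tokens of turn 3, 50256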
Example No. 6
def fluency_score(rated_a, opt):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(opt.pretrained_model_path)
    model = GPT2LMHeadModel.from_pretrained(opt.pretrained_model_path)
    model.to(device)

    model.eval()
    nb_steps, eval_loss, exp_average_loss = 0, 0, None
    score_list = []
    # k = "the book is on the desk. These impressions show , when alive , they had smooth skin , robust limbs with webbed feet , and a ridge of skin on their undersides." tensor(169.6684, device='cuda:0')
    with torch.no_grad():
        for step, s in enumerate(rated_a):  # effectively a batch with batch size 1
            if not s:
                print('space sentence')
                score_list.append(1e6)
                continue
            s = enc.encode(s)  # optionally append 50256, the token id for <|endoftext|>
            batch = torch.tensor([s]).to(device)
            loss = model(batch, lm_labels=batch)  # average -log p per token
            # print(loss*len(s))
            eval_loss += loss.item()
            nb_steps += 1

            score_list.append(loss.item())

    cutoff = np.quantile([-t for t in score_list], 0.05)
    modified_rating = np.array(
        [cutoff if -t < cutoff else -t for t in score_list])
    normed_rating = (modified_rating - cutoff) / np.abs(cutoff)
    return normed_rating
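The normalization at the end of fluency_score clamps the worst 5% of (negated) losses and rescales everything relative to that cutoff; a toy run on made-up per-sentence losses shows the effect:

import numpy as np

score_list = [2.1, 3.4, 5.0, 2.8]                     # hypothetical average NLL per sentence
cutoff = np.quantile([-t for t in score_list], 0.05)  # 5th percentile of negated losses
modified = np.array([cutoff if -t < cutoff else -t for t in score_list])
normed = (modified - cutoff) / np.abs(cutoff)
print(normed)  # 0 for the clamped, least fluent sentence; larger values for lower-loss sentences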
Example No. 7
def init_model(seed=0, model_path='gpt2'):
    '''
    Parameters
    ----------
    seed : int
        seed number for the different randomizers
    model_path : string, optional
        path to the trained model checkpoint (state dict) to load on top of the
        pretrained 'gpt2' weights
    '''
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model = model.module
    
    model.to(device)
    model.eval()
    return model, enc, device
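The DataParallel wrap / unwrap in Example No. 7 exists because a checkpoint saved from an nn.DataParallel-wrapped model has every key prefixed with 'module.'. An equivalent sketch that strips the prefix instead of wrapping (the checkpoint path is hypothetical):

state = torch.load('trained_gpt2.bin', map_location='cpu')
state = {k[len('module.'):] if k.startswith('module.') else k: v
         for k, v in state.items()}
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.load_state_dict(state)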
Example No. 8
 def __init__(self, *args, language_model=None, template_loc='./relation_map_multiple.json'):
     super().__init__(*args, language_model=language_model, template_loc=template_loc)
     self.nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
     self.enc = GPT2Tokenizer.from_pretrained('gpt2')
     print("Loading template JSON.")
     with open(self.template_loc, 'r') as f:
         self.templates = json.load(f)
Example No. 9
def main(mode: str = 'baseline', max_length: int = None):
    enc = GPT2Tokenizer.from_pretrained('gpt2')
    CORPUS_FILE = '/Users/ben/data/wikitext-2/wiki.train.tokens'
    with open(CORPUS_FILE) as f:
        corpus = f.read()
    if max_length:
        corpus = corpus[:max_length]

    # Reprocess vocab as real bytes
    vocab = [
        bytes(enc.byte_decoder[c] for c in token) for token in enc.encoder
    ]
    encoder = dict(zip(vocab, range(len(vocab))))
    greedy = Encoder(vocab)

    with Timer():
        if mode == 'baseline':
            out = enc.encode(corpus)
        elif mode == 'greedy':
            out = list(greedy.encode(corpus))
        elif mode == 'greedy-c':
            pass
        elif mode == 'numba':
            out = list(
                numba_bpe.numba_encode(numba_bpe.random_str(100000),
                                       numba_bpe.fake_vocab()))
        elif mode == 'nonumba':
            out = list(
                numba_bpe.encode(numba_bpe.random_str(100000),
                                 numba_bpe.fake_vocab()))
        else:
            raise Exception('Unknown mode: {}'.format(mode))
        print(f'Compression ratio {len(out)/len(corpus):.4f}')
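As a quick sanity check on the 'baseline' branch, the printed compression ratio is just tokens per character from the stock tokenizer (a tiny illustrative string; the ratio varies with the text):

enc = GPT2Tokenizer.from_pretrained('gpt2')
s = "The quick brown fox jumps over the lazy dog."
print(len(enc.encode(s)) / len(s))  # roughly 0.2 tokens per character for plain English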
Example No. 10
def extract_gpt2_hidden_activations(
        text_path, save_activs_to):  #, mode="full_model", focus_layers=[]):
    # read in text samples to pass through single layer of gpt2 model
    text_inputs = []
    with open(text_path, "rb") as infile:
        text_inputs = pickle.load(infile)

    # num_inputs = len(text_inputs)

    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # get the hidden activations - assumes a gpu is available
    layer_activs = []
    for text in text_inputs:
        # tokenize text
        indexed_tokens = tokenizer.encode(text)
        tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
        # set up model
        model = GPT2Model.from_pretrained('gpt2')
        model.eval()
        model.to('cuda')

        # grab the hidden activations and save them to layer_actives
        with torch.no_grad():
            hidden, _ = model(tokens_tensor)
            layer_activs.append(hidden.cpu().numpy().squeeze())

        # clear gpu memory in preparation for next text sample
        torch.cuda.empty_cache()

    # save layer dimensions
    with open(save_activs_to, "wb") as outfile:
        pickle.dump(layer_activs, outfile)
    pass
Example No. 11
def download_model(name):
    if not name in MODELS:
        raise Exception(str(name) + ' not a model in the list')
    if not exists(PATH):
        print("# ", str(PATH), "not found, creating dir.")
        mkdir(PATH)
    print('# Downloading model: ' + str(name))
    name_path = MODEL_PATH_DICT[name]
    if name == 'word2vec':
        if not exists(join(PATH, name_path)):
            wget.download(
                'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
            )
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded word2vec')
        else:
            print('# Already downloaded')
    if name == 'glove':
        if not exists(join(PATH, name_path)):
            wget.download(
                'http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip')
            zip = zipfile.ZipFile('./glove.840B.300d.zip')
            zip.extractall()
            _ = glove2word2vec('./glove.840B.300d.txt', join(PATH, name_path))
            print('# Downloaded glove')
        else:
            print('# Already downloaded')
    if name == 'dict2vec':
        if not exists(join(PATH, name_path)):
            wget.download(
                'https://dict2vec.s3.amazonaws.com/dict2vec300.tar.bz2')
            tar = tarfile.open("dict2vec300.tar.bz2")
            tar.extractall()
            tar.close()
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded dict2vec')
        else:
            print('# Already downloaded')

    if name == 'conceptnet':
        if not exists(join(PATH, name_path)):
            wget.download(
                'https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz'
            )
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded Conceptnet Numberbatch')
        else:
            print('# Already downloaded')
    if name == 'bert' or name == 'bert-context':
        _ = BertTokenizer.from_pretrained('bert-large-uncased')
        _ = BertModel.from_pretrained(
            'bert-large-uncased').embeddings.word_embeddings.weight.data.numpy(
            )
        print('# Downloaded bert')
    if name == 'gpt2' or name == 'gpt2-context':
        _ = GPT2Tokenizer.from_pretrained('gpt2')
        _ = GPT2LMHeadModel.from_pretrained('gpt2')
        _ = GPT2Model.from_pretrained('gpt2')
        print('# Downloaded gpt-2')
Example No. 12
 def __init__(self, GPU, model_name_or_path="gpt2"):
     self.device = torch.device(GPU if torch.cuda.is_available() else "cpu")
     self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
     self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
     self.model.to(self.device)
     self.model.eval()
     self.start_token = '<|endoftext|>'
     print("Loaded GPT-2 model!")
Example No. 13
 def __init__(self, model_name_or_path="gpt2"):
     super(LM, self).__init__()
     self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
     self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
     self.model.to(self.device)
     self.model.eval()
     self.start_token = '<|endoftext|>'
     print("Loaded GPT-2 model!")
Example No. 14
 def construct_encoder(self):
     model = GPT2Model.from_pretrained(self.model_name)
     model.cuda()
     model = torch.nn.DataParallel(model)
     model.eval()
     tokenizer = GPT2Tokenizer.from_pretrained(self.model_name)
     print("Model and tokenzier are constructed!")
     return model, tokenizer
Example No. 15
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=int, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = model.config.n_ctx // 2
    elif args.length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

    while True:
        context_tokens = []
        if not args.unconditional:
            raw_text = input("Model prompt >>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input("Model prompt >>> ")
            context_tokens = enc.encode(raw_text)
        generated = 0
        for _ in range(args.nsamples // args.batch_size):
            out = sample_sequence(
                model=model, length=args.length,
                context=context_tokens if not args.unconditional else None,
                start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
                batch_size=args.batch_size,
                temperature=args.temperature, top_k=args.top_k, device=device
            )
            out = out[:, len(context_tokens):].tolist()
            for i in range(args.batch_size):
                generated += 1
                text = enc.decode(out[i])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
        print("=" * 80)
        if args.unconditional:
            break
Example No. 16
def get_tokenizer(tokenizer_name):
    if tokenizer_name == 'GPT-2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    elif tokenizer_name == 'GPT':
        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    else:
        raise NotImplementedError(f'{tokenizer_name} -- No such tokenizer')

    return tokenizer
Example No. 17
def encode_many_texts(tokenizer: GPT2Tokenizer,
                      texts: Iterable[str]) -> torch.Tensor:
    """Uses -1 as padding."""
    encoded_texts = [tokenizer.encode(text) for text in texts]
    max_len = max(len(text) for text in encoded_texts)
    padded_encoded_texts = [
        text + [-1] * (max_len - len(text)) for text in encoded_texts
    ]
    return torch.tensor(padded_encoded_texts)
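A hypothetical usage of encode_many_texts, showing the shape of the padded batch and how the -1 padding can be turned into a mask:

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
batch = encode_many_texts(tokenizer, ["Hello world", "A somewhat longer sentence here"])
print(batch.shape)   # (2, max_len), with -1 in the padded positions
mask = batch.ne(-1)  # True where a real token id sits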
Example No. 18
 def __init__(self, type, model_name_or_path="gpt2"):
     super(LM, self).__init__()
     self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
     if type == '345M':
         self.model = GPT2LMHeadModel.from_pretrained('output/')
     elif type == '117M':
         self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
     self.model.to(self.device)
     self.model.eval()
     self.start_token = '<|endoftext|>'
Example No. 19
 def __init__(
         self,
         model_name_or_path="/data/pradeesh/detecting-fake-text/pytorch/"):
     super(LM, self).__init__()
     self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
     self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
     self.model.to(self.device)
     self.model.eval()
     self.start_token = '<|endoftext|>'
     print("Loaded GPT-2 model!")
Example No. 20
def load_model_fromlist(name):
    if not name in MODELS:
        raise Exception(str(name) + ' not a model in the list')
    print('# Loading model: ' + str(name))
    name_path = MODEL_PATH_DICT[name]
    if name == 'word2vec':
        if not exists(join(PATH, name_path)): download_model(name)
        return (gensim.models.KeyedVectors.load_word2vec_format(join(
            PATH, name_path),
                                                                binary=True))
    if name == 'glove':
        if not exists(join(PATH, name_path)): download_model(name)
        return (gensim.models.KeyedVectors.load_word2vec_format(
            join(PATH, name_path)))
    if name == 'dict2vec':
        if not exists(join(PATH, name_path)): download_model(name)
        return (gensim.models.KeyedVectors.load_word2vec_format(
            join(PATH, name_path), binary=False, unicode_errors="ignore"))
    if name == 'conceptnet':
        if not exists(join(PATH, name_path)): download_model(name)
        return (gensim.models.KeyedVectors.load_word2vec_format(
            join(PATH, name_path)))
    if name == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = BertModel.from_pretrained(
            'bert-large-uncased').embeddings.word_embeddings.weight.data.numpy(
            )
        return ([model, tokenizer])
    if name == 'bert-context':
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = BertModel.from_pretrained('bert-large-uncased',
                                          output_hidden_states=True)
        return ([model, tokenizer])
    if name == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2LMHeadModel.from_pretrained(
            'gpt2').transformer.wte.weight.data.numpy()
        return ([model, tokenizer])
    if name == 'gpt2-context':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2', output_hidden_states=True)
        return ([model, tokenizer])
Example No. 21
    def __init__(self, args):
        super().__init__()

        if args.gpt2_model_dir is not None:
            # load GPT2 model from file
            gpt_model_name = str(args.gpt2_model_dir) + "/"
            dict_file = gpt_model_name
            print("loading GPT2 model from {}".format(gpt_model_name))
        else:
            # load GPT2 model from huggingface cache
            gpt_model_name = args.gpt2_model_name
            dict_file = gpt_model_name

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = GPT2Tokenizer.from_pretrained(dict_file)

        # GPT-2 represents BPE pieces differently than BERT. A piece that
        # follows whitespace carries a leading 'Ġ', while word-internal pieces
        # are written as is. In BERT the prefixes are written as is, while the
        # parts that must follow (not be followed!) have a '##' prefix.
        # There is no one-to-one conversion, but at least we may make pieces
        # that may form a full word look the same.
        # Note that we should be very careful now:
        # tokenizer.convert_tokens_to_ids won't work with our vocabulary.

        def convert_word(word):
            if word == GPT2_EOS:
                return word

            if word.startswith('Ġ'):  # the token is preceded by whitespace
                return word[1:]

            return f'_{word}_'  # the token is not preceded by whitespace:
            # it may not be the head of a word,
            # or it may be the head of a sentence.

        _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items()))
        self.vocab = [convert_word(word) for word in gpt_vocab]
        self._init_inverse_vocab()

        # Load pre-trained model (weights)
        self.gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_name)
        self.gpt_model.eval()
        # print(self.gpt_model.config)

        # Sanity check.
        assert len(self.vocab) == self.gpt_model.config.vocab_size
        #assert 0 == self.gpt_model.config.n_special

        self.eos_id = self.gpt_model.config.eos_token_id
        self.pad_id = self.gpt_model.config.eos_token_id
        self.unk_id = self.gpt_model.config.eos_token_id
        self.bos_id = self.gpt_model.config.bos_token_id
        self.model_vocab = self.vocab
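The 'Ġ' convention that convert_word relies on can be checked directly against the tokenizer's decoder dictionary (a small sketch, not part of the example):

tok = GPT2Tokenizer.from_pretrained('gpt2')
ids = tok.encode("Hello world")
print([tok.decoder[i] for i in ids])  # ['Hello', 'Ġworld'] - 'Ġ' marks a token preceded by a space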
Example No. 22
def init():
    #seed = 42
    #np.random.seed(seed)
    #torch.random.manual_seed(seed)
    #torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.to(device)
    model.eval()
    return enc, model
Example No. 23
    def __init__(self, cuda_device=-1):
        super(GPT2Embedder, self).__init__()

        self.cuda_device = 'cpu' if cuda_device == -1 else f'cuda:{cuda_device}'

        # Load pre-trained model tokenizer (vocabulary)
        self.enc = GPT2Tokenizer.from_pretrained('gpt2')
        # Load pre-trained model (weights)
        self.model = GPT2Model.from_pretrained('gpt2')

        self.model.to(self.cuda_device)
        self.model.eval()  # we only use the evaluation mode of the pretrained model

        self._bos_id = self.enc.encoder['<|endoftext|>']
        self._bos_past = None
Example No. 24
def token_split(s, method='split', tokenizer=None):
    ''' Given a string s, tokenize '''
    if method == 'split':
        return s.split()
    if method == 'moses':
        tokenized_text = mt.tokenize(s, return_str=True)
        return tokenized_text.split()
    if method == 'gpt2':
        if tokenizer is None:
            global global_tokenizer
            if global_tokenizer is None:
                global_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            tokenizer = global_tokenizer
        return gpt2_split(tokenizer, s)

    assert (False)
Example No. 25
def fetch_objects():
    bert = BertModel.from_pretrained(
        'bert-base-uncased').embeddings.position_embeddings.weight.data
    gpt = OpenAIGPTModel.from_pretrained(
        'openai-gpt').positions_embed.weight.data
    gpt2 = GPT2Model.from_pretrained('gpt2').wpe.weight.data
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    gpt_tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    return {
        'bert': bert,
        'gpt': gpt,
        'gpt2': gpt2
    }, {
        'bert': bert_tokenizer,
        'gpt': gpt_tokenizer,
        'gpt2': gpt2_tokenizer
    }
Example No. 26
    def init(self, model_path, model_checkpoint):
        self.config = GPT2Config.from_json_file(os.path.join(model_path, "config.json"))
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel(self.config)

        model_state_dict = fix_state_dict_namespace(torch.load(model_checkpoint))

        start_model = self.model
        if hasattr(self.model, "transformer") and all(not s.startswith('transformer.') for s in model_state_dict.keys()):
            print('loading transformer only')
            start_model = self.model.transformer
        start_model.load_state_dict(model_state_dict)

        if self.fp16:
            self.model.half()

        self.model.to(self.device)
        self.model.eval()
Example No. 27
    def __init__(self,
                 text,
                 lens,
                 target,
                 identity_df,
                 weights,
                 model="gpt2",
                 split_point=0.25):
        super(TrainDataset, self).__init__()

        self._text = text
        self._lens = lens
        self._target = target
        self._identity_df = identity_df
        self._weights = weights
        self._split_point = split_point
        VOCAB_PATH = Path('../input/torch-bert-weights/%s' % (model))
        self._tokenizer = GPT2Tokenizer.from_pretrained(VOCAB_PATH)
Example No. 28
    def __init__(self, tuple_dir, device, language_model=None, template_loc=None):
        """
        Args:
            tuple_dir (string): Path to the csv file with commonsense tuples
        """
        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.device = device
        self.model = language_model
        if self.model is not None:
            self.model.eval()
            self.model.to(self.device)
        self.template_loc = template_loc

        # Load tuples
        with open(tuple_dir) as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            self.tuples = [row for row in reader]
Example No. 29
 def __init__(self, text_sequence, model_type, temperature = 1.0, top_k = 0, batch_size = 1, length = 1, nsamples =1, debug = True):
     self.text_sequence = text_sequence
     #eventually will differentiate between gpt-2, BERT, etc.
     self.model_type = model_type
     model_name = 'gpt2'
     self.debug = debug
     #detect device
     self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     self.temperature = temperature
     self.top_k = top_k
     self.batch_size = batch_size
     self.length = length
     self.nsamples = nsamples
     #create encoder and model
     self.enc = GPT2Tokenizer.from_pretrained(model_name)
     self.model = GPT2LMHeadModel.from_pretrained(model_name)
     self.model.to(self.device)
     self.model.eval()
Example No. 30
    def transform(self, X):
        # Load pre-trained model tokenizer (vocabulary)
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # Load pre-trained model (weights)
        model = GPT2Model.from_pretrained('gpt2', cache_dir='tmp/gpt2/')
        model.eval()

        output = []
        for idx, row in tqdm(X.iterrows(), total=len(X)):
            # Encode some inputs
            indexed_tokens_1 = tokenizer.encode(row.text)

            # If you have a GPU, put everything on cuda
            # Convert inputs to PyTorch tensors
            tokens_tensor_1 = torch.tensor([indexed_tokens_1])
            tokens_tensor_1 = tokens_tensor_1.to('cuda')
            model.to('cuda')

            # Predict hidden states features for each layer
            with torch.no_grad():
                hidden_states_1, past = model(tokens_tensor_1)

            tokens = [
                tokenizer.decoder[token].replace('Ġ', '')
                for token in indexed_tokens_1
            ]
            output.append([tokens, hidden_states_1.cpu()[0]])

        output = pd.DataFrame(output, columns=['tokens', 'layer_-1'])
        res = []
        for idx, row in X.iterrows():
            res.append(self.get_sample_props(output.loc[idx], **row)[1:])

        res = pd.DataFrame(res,
                           columns=[
                               'tokens', 'pronoun_offset_token',
                               'a_offset_token', 'b_offset_token', 'a_span',
                               'b_span', 'pronoun_token', 'a_tokens',
                               'b_tokens', 'bert', 'cls'
                           ])

        cols = set(X.columns).difference(res.columns)
        return {'X': pd.concat([X[cols], res], axis=1)}
import numpy as np
import torch
import torch.nn.functional as F
import tqdm
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, Dataset
from tqdm import trange

import pytorch_pretrained_bert
from data_loader import get_data_loader
from model_sampler import print_samples
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer, OpenAIAdam
from torch.utils.data import DataLoader, Dataset, Subset
model_name = 'gpt2'
enc = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


device='cpu'
beam_width = 130
stopwords = []

def to_list(tensor):
    return list(tensor.cpu().numpy())

def predict(line, max_predictions):
    """Give continuation of the line with at most max_predictions BPE tokens. Returns line extended with predictions of
## Predict all tokens
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)
print(predictions.shape)  # torch.Size([1, 14, 30522])

## confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item(); print(predicted_index)  # 27227
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)  # ['henson']

##################################################################
## OpenAI GPT2
##################################################################
## GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/gpt2/')
print(tokenizer.max_len)  # 1000000000000
print(len(tokenizer.encoder))  # 50257
print(type(tokenizer.encoder))  # <class 'dict'>
print(tokenizer.encoder.keys())
print(len(tokenizer.decoder))  # 50257
print(type(tokenizer.decoder))  # <class 'dict'>
print(tokenizer.decoder.get(56))

## Encode some inputs
text_1 = "Who was Jim Henson ?"  # mixed upper and lower case...
text_2 = "Jim Henson was a puppeteer"
indexed_tokens_1 = tokenizer.encode(text_1); print(indexed_tokens_1, type(indexed_tokens_1))  # [8241, 373, 5395, 367, 19069, 5633]; <class 'list'>
print(tokenizer.encode("who was jim henson ?"))  # [8727, 373, 474, 320, 30963, 1559, 5633]
print(tokenizer.decode(indexed_tokens_1))  # Who was Jim Henson ?
print(tokenizer.decode([8727, 373, 474, 320]))  # who was jim