Example #1
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
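
A minimal usage sketch (hypothetical; assumes the transformers tokenizer classes imported by the surrounding module are in scope):

# Hypothetical call into get_tokenizer() above; any of the supported prefixes works the same way.
tokenizer = get_tokenizer("bert-base-uncased")
if tokenizer is not None:
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))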
Example #2
def reset_bot():
    global history, tokenizer, model, personality
    dataset_path = './chatapp/data/counsel_chat_250-tokens_full.json'
    dataset_cache = './chatapp/dataset_cache'
    model_checkpoint = download_pretrained_model()
    device = "cpu"
    seed = random.randrange(0, 100)
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Get pretrained model and tokenizer
    tokenizer = OpenAIGPTTokenizer.from_pretrained(model_checkpoint)
    model = OpenAIGPTLMHeadModel.from_pretrained(model_checkpoint)
    model.to(device)
    add_special_tokens_(model, tokenizer)

    # Sample a personality
    dataset = get_dataset(tokenizer, dataset_path, dataset_cache)
    personalities = [
        dialog["personality"] for dataset in dataset.values()
        for dialog in dataset
    ]
    personality = random.choice(personalities)

    history = []
    return ""
Example #3
    def __init__(self, bot):
        self.bot = bot
        self.src_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..'))

        model_path = os.path.join(self.src_dir, "conv_ai/model/")
        self.args = {
            "max_history": 2,
            "device": "cpu",
            "max_length": 20,
            "min_length": 1,
            "temperature": 0.7,
            "top_k": 0,
            "top_p": 0.9,
            "no_sample": 1
        }
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(model_path)
        self.model = OpenAIGPTLMHeadModel.from_pretrained(model_path)
        self.model.to('cpu')
        add_special_tokens_(self.model, self.tokenizer)
        dataset = get_dataset(
            self.tokenizer, "",
            os.path.join(self.src_dir, "conv_ai/dataset_cache"))

        self.personalities = [
            dialog["personality"] for dataset in dataset.values()
            for dialog in dataset
        ]
        self.personality = random.choice(self.personalities)

        self.history = []
        print("Conversational AI model loaded successfully.")
Example #4
    def test_openai(self):
        for tokenizer_name in OpenAIGPTTokenizer.pretrained_vocab_files_map["vocab_file"].keys():
            tokenizer_p = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
            tokenizer_r = OpenAIGPTTokenizerFast.from_pretrained(tokenizer_name)

            # Check we have the same number of added_tokens for both pair and non-pair inputs.
            self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False))
            self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True))

            # Check we have the correct max_length for both pair and non-pair inputs.
            self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
            self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)

            # Assert the set of special tokens match.
            self.assertSequenceEqual(
                tokenizer_p.special_tokens_map.items(),
                tokenizer_r.special_tokens_map.items(),
                "GPT tokenizers doesn't have the same set of special_tokens",
            )

            # Assure tokenization overlap between python and rust impl.
            self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.0)

            # Ensure add_tokens and add_special_tokens return the correct vocab size
            self.assert_add_tokens(tokenizer_r)

            # Check for offsets mapping
            self.assert_offsets_mapping(tokenizer_r)

            # Check for dynamic encoding sequence handling in batch_encode_plus
            self.assertRaises(ValueError, self.assert_batch_encode_dynamic_overflowing, tokenizer_r)

            # Check alignment for build_inputs_with_special_tokens
            self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
Example #5
def create_dataset(input_dir: str, output_file: str, num_candidates: int,
                   max_history: int):
    files_to_parse = os.listdir(input_dir)

    dataset = defaultdict(list)

    parsed_logs = []
    for file in files_to_parse:
        parsed = parse_chat_logs(os.path.join(input_dir, file))
        parsed_logs.append(parsed)

    # init tokenizer for checking
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    tokenizer.add_special_tokens(SPECIAL_TOKENS)

    utterances_set = set()
    for parsed in parsed_logs:
        for dialog in parsed:
            for utterance in dialog:
                length = len(tokenizer.tokenize(utterance[1]))
                if length > config["dataset"]["max_message_length"]:
                    print("Skipping following message:\n{}".format(
                        utterance[1] if len(utterance[1]) < 512 else
                        utterance[1][:510] + "..."))
                else:
                    utterances_set.add(utterance)
    utterances = list(utterances_set)
    # print(utterances)

    for parsed in parsed_logs:
        for dialog in parsed:
            # remove invalid dialogs
            clean_dialog = []
            for utterance in dialog:
                if utterance in utterances_set:
                    clean_dialog.append(utterance)
            dialog = clean_dialog
            for i, utterance in enumerate(dialog):
                history = dialog[max(0, i - max_history):i]
                # replies = utterance + np.random.choice(utterances, NUM_CANDIDATES - 1)
                candidates = [utterance]
                for _ in range(num_candidates - 1):
                    x = utterances[np.random.randint(0, len(utterances))]
                    while x in candidates:
                        x = utterances[np.random.randint(0, len(utterances))]
                    candidates.append(x)
                correct = np.random.randint(0, num_candidates)
                candidates[correct], candidates[0] = candidates[0], candidates[
                    correct]
                dataset["history"].append(history)
                dataset["candidates"].append(candidates)
                dataset["correct"].append(correct)

    torch.save(dataset, output_file)
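
A hedged invocation sketch for create_dataset (the paths are hypothetical; parse_chat_logs, config and SPECIAL_TOKENS come from the surrounding module):

# Hypothetical paths: builds a next-utterance-selection dataset with 3 distractors per reply.
create_dataset(input_dir="chat_logs/", output_file="chat_dataset.pt",
               num_candidates=4, max_history=2)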
Example #6
 def test_TFOpenAIGPTDoubleHeadsModel(self):
     from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
     pretrained_weights = 'openai-gpt'
     tokenizer = OpenAIGPTTokenizer.from_pretrained(pretrained_weights)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     model = TFOpenAIGPTDoubleHeadsModel.from_pretrained(pretrained_weights)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(
         run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                          predictions, self.model_files))
Example #7
def load_model_and_tokenizer(file_path: str) -> Tuple[OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer]:
    model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)

    model.load_state_dict(torch.load(file_path))

    return model, tokenizer
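
A hedged sketch of the counterpart save step that produces file_path (assumes a fine-tuned OpenAIGPTDoubleHeadsModel in a variable named model; the file name is hypothetical):

# Hypothetical: persist the fine-tuned weights, then restore them with the loader above.
torch.save(model.state_dict(), "finetuned_double_heads.pt")
model, tokenizer = load_model_and_tokenizer("finetuned_double_heads.pt")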
Example #8
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if tokenizer_name.startswith(
            "bert-"
    ) or 'rubert' in tokenizer_name or '/bert-' in tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith(
            "roberta-"):  # or 'roberta' in tokenizer_name:
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place that can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added tokens

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
Example #9
    def __init__(self, opt, shared=None):
        super(TransformerAgent, self).__init__(opt, shared)

        args = AttrDict(
            opt)  # to keep most commands identical to the interact.py script
        self.args = args

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__file__)
        self.logger.info(pformat(args))

        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

        if shared is None:
            self.logger.info("Get pretrained model and tokenizer")
            if args.model_checkpoint == "":
                args.model_checkpoint = download_pretrained_model()
            if 'gpt2' in args.model_checkpoint:
                self.tokenizer = GPT2Tokenizer.from_pretrained(
                    args.model_checkpoint)
                model_class = GPT2DoubleHeadsModel if self.args.eval_type == "hits@1" else GPT2LMHeadModel
            else:
                self.tokenizer = OpenAIGPTTokenizer.from_pretrained(
                    args.model_checkpoint)
                model_class = OpenAIGPTDoubleHeadsModel if self.args.eval_type == "hits@1" else OpenAIGPTLMHeadModel

            self.model_checkpoint = model_class.from_pretrained(
                args.model_checkpoint)
            self.model_checkpoint.to(args.device)

            self.logger.info("Build BPE prefix dictionary")
            convai_dict = build_dict()
            assert len(convai_dict) == 19304
            self.prefix2words = self.get_prefix2words(convai_dict)
        else:
            self.model_checkpoint = shared['model']
            self.tokenizer = shared['tokenizer']
            self.prefix2words = shared['prefix2words']
        add_special_tokens_(self.model_checkpoint, self.tokenizer)
        self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(
            SPECIAL_TOKENS)

        self.persona = []
        self.persona1 = []
        self.persona2 = []
        self.history = []
        self.labels = []

        self.reset()
Example #10
def get_gpt2_perplexity(sentence):
    global model, tokenizer  # keep the lazily-initialized tokenizer available across calls as well
    if model is None:
        from transformers import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
        import torch
        model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        model.eval()
        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    tokenize_input = tokenizer.tokenize(sentence)
    tensor_input = torch.tensor(
        [tokenizer.convert_tokens_to_ids(tokenize_input)])
    loss = model(tensor_input, lm_labels=tensor_input)
    return math.exp(loss[0].item())
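
A hedged usage sketch (assumes a module-level model = None and import math, which the snippet above relies on; note that despite its name the function scores with the original "openai-gpt" model):

# Hypothetical call: lower perplexity means the sentence is more fluent under the LM.
model = None  # lazily initialized on the first call
print(get_gpt2_perplexity("the quick brown fox jumps over the lazy dog"))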
Example #11
    def __init__(self):
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        self.gpt = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt').cuda()
        self.embedder = SentenceTransformer('bert-base-nli-mean-tokens').cuda()
        self.pos_phrase = "I have an undiagnosed disease. "

        self.keywords = [term.strip().lower() for term in open('tweet_crawler/terms.txt').read().split('\n')
                         if term != "" and term != "undiagnosed" and term != "disease"]

        self.udn_examples = list(open('data/UDN_patient_search_TWEET_samples.txt').read().split('\n')) + \
                            list(open('data/UDN_patient_search_WEB_samples.txt').read().split('\n'))

        # self.phrase_gpt_score = gpt_log_prob_score([self.phrase], self.gpt, self.tokenizer)
        self.pos_phrase_emb = self.embedder.encode([self.pos_phrase])[0]
Example #12
 def setup_class(self):
     self.processor = Sst2Processor()
     self.test_dir = Path(tempfile.mkdtemp())
     sst2_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
     contents = requests.get(sst2_url)
     (self.test_dir / 'SST-2.zip').open('wb').write(contents.content)
     with ZipFile(self.test_dir / 'SST-2.zip', 'r') as zipObj:
         zipObj.extractall(self.test_dir)
     self.examples = self.processor.get_train_examples(self.test_dir /
                                                       'SST-2')
     self.base_tokenizer = OpenAIGPTTokenizer.from_pretrained(
         'openai-gpt', do_lower_case=True, cache_dir=self.test_dir)
     self.rust_tokenizer = PyOpenAiGptTokenizer(
         get_from_cache(
             self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
             ['openai-gpt']),
         get_from_cache(
             self.base_tokenizer.pretrained_vocab_files_map['merges_file']
             ['openai-gpt']),
         do_lower_case=True)
Example #13
 def test_special_tokens_checkpoint_behavior(self):
     toks = [
         OpenAIGPTTokenizer.from_pretrained('openai-gpt'),
         GPT2Tokenizer.from_pretrained('gpt2')
     ]
     for tok in toks:
         self.assertEqual(len(tok.added_tokens_encoder), 0)
         tok.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
         self.assertEqual(len(tok.added_tokens_encoder), 5)
         # Make sure we never split
         self.assertEqual(len(tok.tokenize("<bos> <speaker1>")), 2)
         ids = tok.convert_tokens_to_ids(SPECIAL_TOKENS)
         self.assertTrue(
             all([x > 0 for x in ids]),
             f'some tokens failed to tokenize {SPECIAL_TOKENS} -> {ids}')
         # Need to maintain indices through save. (this is also tested in pytorch-transformers)
         tok.save_pretrained(self.save_dir)
         tok_loaded = tok.from_pretrained(str(self.save_dir))
         ids2 = tok_loaded.convert_tokens_to_ids(SPECIAL_TOKENS)
         self.assertListEqual(ids, ids2)
Example #14
    def test_tokenization_openai_gpt(self):
        # Given
        self.base_tokenizer = OpenAIGPTTokenizer.from_pretrained(
            'openai-gpt', do_lower_case=True, cache_dir=self.test_dir)
        self.rust_tokenizer = PyOpenAiGptTokenizer(
            get_from_cache(
                self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
                ['openai-gpt']),
            get_from_cache(
                self.base_tokenizer.pretrained_vocab_files_map['merges_file']
                ['openai-gpt']),
            do_lower_case=True)
        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.encode_plus(
                    example.text_a,
                    add_special_tokens=True,
                    return_overflowing_tokens=True,
                    return_special_tokens_mask=True,
                    max_length=128))

        # When
        output_rust = self.rust_tokenizer.encode_list(
            [example.text_a for example in self.examples],
            max_len=128,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for idx, (rust,
                  baseline) in enumerate(zip(output_rust, output_baseline)):
            assert rust.token_ids == baseline[
                'input_ids'], f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n ' \
                              f'Sentence a: {self.examples[idx].text_a} \n' \
                              f'Sentence b: {self.examples[idx].text_b} \n' \
                              f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
                              f'Rust: {rust.token_ids} \n' \
                              f'Python {baseline["input_ids"]}'
            assert (
                rust.special_tokens_mask == baseline['special_tokens_mask'])
Example #15
    def test_fuzz_convert_df_to_conv_ai_dict(self):
        df = pd.read_csv("data/20200325_counsel_chat.csv")
        df = df[df["split"] == "train"]
        tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
        for i in range(5):
            temp_df = df.sample(100)
            max_tokens = np.random.randint(1, 200)
            n_candidates = np.random.randint(1, 10)
            d = convert_df_to_conv_ai_dict(temp_df,
                                           [""],
                                           ["answerText"],
                                           tokenizer,
                                           max_tokens=max_tokens,
                                           n_candidates=n_candidates)

            # Test max length
            self.assertLessEqual(max([len(x["utterances"][0]["history"][0].split()) for x in d["train"]]), max_tokens)

            # Test n_candidates is equal to the number in the candidates list plus the one true response.
            train_lengths = [len(x["utterances"][0]["candidates"]) for x in d["train"]]
            self.assertEqual(n_candidates + 1, max(train_lengths))
            self.assertEqual(n_candidates + 1, min(train_lengths))
Example #16
	args.device = device

	# Set logging
	timestr = time.strftime("%Y%m%d-%H%M%S")
	logging.basicConfig(filename = os.path.join(args.output_dir, 'log_{0}_{1}.log'.format(
		str(args.task_name), timestr)),
	                    filemode = 'a', format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
	                    datefmt = '%m/%d/%Y %H:%M:%S',
	                    level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
	logger = logging.getLogger(__name__)
	logger.info("device: {} n_gpu: {}, distributed training: {}".format(
		device, n_gpu, bool(args.local_rank != -1)))

	if args.do_train and args.do_eval:
		# Set tokenizer
		tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

		# Parameters for cross-validation
		length_list = list(map(int, args.length[0].split(' ')))
		lr_list = list(map(float, args.learning_rate[0].split(' ')))
		print(length_list)
		print(lr_list)

		for (length, learning_rate) in [(i, j) for i in length_list for j in lr_list]:
			logger.info("***** Validation parameters*****")
			logger.info(" Sequence length = %d" % length)
			logger.info(" Learning rate = %f" % learning_rate)
			logger.info("***** Load training data*****")
			# Read and save training input_ids
			cached_input_file = os.path.join(args.data_dir, 'train_{0}_{1}'.format(
				str(args.task_name), str(length)))
Example #17
from typing import *

import torch
from transformers import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer

from itertools import chain

from special_tokens import bos, eos, speaker_self, speaker_other, lsep, pad, SPECIAL_TOKENS

model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

# history = [[(True, "hello"), (True, "how"), (True, "are"), (True, "you"), (True, "?")],
#            [(False, "i"), (False, "am"), (False, "fine"), (False, "thanks"), (False, ".")]]

history = [(True, tokenizer.tokenize("hello how are you?")),
           (False, tokenizer.tokenize("i am fine thanks."))]

reply = (True, ["good", "to", "hear", "."])

orig_num_tokens = len(tokenizer.encoder)
print(orig_num_tokens)
num_added_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS)
model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)


def build_inputs(history: List[Tuple[bool, List[str]]], reply: Tuple[bool, List[str]]):
    history = history + [reply]
    sequence = list(map(lambda x: [speaker_self if x[0] else speaker_other] + x[1], history))
    # print(sequence)
    sequence[0] = [bos] + sequence[0]
Example #18
import datetime
# import spacy
# from allennlp.commands.elmo import ElmoEmbedder
torch.cuda.is_available()

tokenizer_gpt2 = GPT2Tokenizer.from_pretrained('gpt2')
model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2', output_hidden_states=True)
model_gpt2.eval()
model_gpt2.to('cuda')

tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')
model_bert = BertModel.from_pretrained('bert-base-cased')
model_bert.eval()
model_bert.to('cuda')

tokenizer_gpt = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model_gpt = OpenAIGPTModel.from_pretrained('openai-gpt')
model_gpt.eval()
model_gpt.to('cuda')

# weat 1
flowers = [
    'aster', 'clover', 'hyacinth', 'marigold', 'poppy', 'azalea', 'crocus',
    'iris', 'orchid', 'rose', 'bluebell', 'daffodil', 'lilac', 'pansy',
    'tulip', 'buttercup', 'daisy', 'lily', 'peony', 'violet', 'carnation',
    'magnolia', 'petunia', 'zinnia', 'gladiola'
]  # 'gladiola' deleted since it does not appear
insects = [
    'ant', 'caterpillar', 'flea', 'locust', 'spider', 'bedbug', 'centipede',
    'fly', 'maggot', 'tarantula', 'bee', 'cockroach', 'gnat', 'mosquito',
    'termite', 'beetle', 'cricket', 'hornet', 'moth', 'wasp', 'dragonfly',
Example #19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps",
                        default=-1,
                        type=int,
                        help="If > 0: set total number of training \
                        steps to perform. Override num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before\
                        performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_tokens(special_tokens)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps //\
                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader)\
                // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids,
                               mc_token_ids=mc_token_ids,
                               lm_labels=lm_labels,
                               mc_labels=mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                scheduler.step()  # step the LR scheduler after the optimizer update
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(input_ids,
                                                 mc_token_ids=mc_token_ids,
                                                 lm_labels=lm_labels,
                                                 mc_labels=mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Example #20
import torch.nn as nn
# import a config from transformers
from transformers import Trainer, TrainingArguments
from transformers import TextDataset
# OpenAI GPT for text generation
from transformers import OpenAIGPTConfig, OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
from transformers import DataCollatorForLanguageModeling
from process_data import *

# initialize a model from config
config = OpenAIGPTConfig(vocab_size=100000, n_positions=512, n_layer=6)
model = OpenAIGPTLMHeadModel(config)

# the pretrained tokenizer
tname = "Jojo_Tokenizer"
tokenizer = OpenAIGPTTokenizer.from_pretrained(tname)

# initialize a data collator
# https://github.com/huggingface/transformers/blob/1af58c07064d8f4580909527a8f18de226b226ee/src/transformers/data/data_collator.py#L68
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# initialize dataset - process_data
# https://github.com/huggingface/transformers/blob/1af58c07064d8f4580909527a8f18de226b226ee/src/transformers/data/datasets/language_modeling.py#L16
dataset = False

output = "output"

# initialize training arguments
training_args = TrainingArguments(
    output_dir="./" + output,
    overwrite_output_dir=True,
Example #21
def main():
    config = get_config(mode='test')

    if config.data_name == "cornell":
        vocab = Vocab()
        vocab.load(config.word2id_path,
                   config.id2word_path,
                   ptb=(config.model == "PTB"))
        print(f'Vocabulary size: {vocab.vocab_size}')
        config.vocab_size = vocab.vocab_size

        if config.users:
            test_users = load_pickle(config.convs_users_path)
            config.user_size = max([x for xx in test_users for x in xx]) + 1
            print(f'User size: {config.user_size}')
        else:
            test_users = None

        data_loader = get_loader(
            convs=load_pickle(config.convs_path),
            convs_length=load_pickle(config.conversations_length_path),
            utterances_length=load_pickle(config.utterances_length_path),
            vocab=vocab,
            batch_size=config.batch_size,
            shuffle=False,
            convs_users=test_users,
            is_ptb_model=(config.model == "PTB"))

    elif config.model == "DialoGPT":
        if config.users:
            vocab = GPT2Tokenizer.from_pretrained(config.user_vocab_path)
        else:
            vocab = GPT2Tokenizer.from_pretrained('gpt2')
        config.vocab_size = len(vocab)
        config.vocab = vocab
        config.export_test = True
        data_loader = get_loader(convs=load_pickle(config.convs_path),
                                 vocab=vocab,
                                 batch_size=config.batch_size,
                                 model=config.model,
                                 dataset=config.data_name,
                                 config=config,
                                 shuffle=False)

    elif config.data_name == "cornell2" or config.data_name == "ubuntu" or config.data_name == "twitter_s":
        vocab = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        special_tokens = {
            'pad_token': PAD_TOKEN,
            'bos_token': SOS_TOKEN,
            'eos_token': EOS_TOKEN,
            'sep_token': SEP_TOKEN,
        }
        vocab.add_special_tokens(special_tokens)
        config.vocab_size = len(vocab)
        config.vocab = vocab
        config.pad_id = vocab.pad_token_id
        config.eos_id = vocab.eos_token_id
        config.sos_id = vocab.bos_token_id

        data_loader = get_loader(convs=load_pickle(config.convs_path),
                                 vocab=vocab,
                                 batch_size=config.batch_size,
                                 model=config.model,
                                 dataset=config.data_name,
                                 config=config,
                                 shuffle=False)
    else:
        raise ValueError("{} Sorry... We don't support that data".format(
            config.data_name))

    model_solver = getattr(solvers, "Solver{}".format(config.model))
    test_solver = model_solver(config,
                               None,
                               data_loader,
                               vocab=vocab,
                               is_train=False)

    test_solver.build()
    test_solver.export_samples(config.beam_size)
Example #22
def main():

    config = get_config(mode="test")

    if config.data_name == "cornell2":
        vocab = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        special_tokens = {
            'pad_token': PAD_TOKEN,
            'bos_token': SOS_TOKEN,
            'eos_token': EOS_TOKEN,
            'sep_token': SEP_TOKEN,
        }
        vocab.add_special_tokens(special_tokens)
        config.vocab_size = len(vocab)
        config.vocab = vocab
        config.pad_id = vocab.pad_token_id
        config.eos_id = vocab.eos_token_id 
        config.sos_id = vocab.bos_token_id 

        convs = [
            # [["u0", "how's the weather today in Daejeon?"], ["u1", "It's rainy... "], ["u0", "Did you take your umbrella?"], ["u1", "Sure I did"]],
            [["u0", "how's the weather today?"], ["u1", "Sure I did"]],
            [["u0", "did you have a nice weekends?"], ["u1", "sure"], ["u0", "where did you go?"]],
            # [["u0", "did you have a nice weekends?"], ["u1", "sure, It was wonderful :)"]],
            [["u0", "did you take your umbrella?"], ["u1", "sure, It was wonderful :)"]], 
            [["u0", "I hurt my legs"], ["u1", "oh,, i'm sorry to hear that"]],
            [["u200", "Do u love me?"], ["u1", "oh,, i'm sorry to hear that"]],
            [["u0", "I hurt my legs"], ["u1", "oh,, i'm sorry to hear that"], ["u0", "thanks"]],
            [["u0", "how's the weather today in Daejeon?"], ["u1", "Sure I did"]],
            # [["u0", "how's the weather today in Daejeon?"], ["u1", "It's sunny today!"], ["u0", "Did you take your umbrella?"], ["u1", "Sure I did"]],
            # [["u0", "hello"], ["u1", "i hate you"], ["u0", "what??"]],
            # [["u0", "hello"], ["u1", "i love you"], ["u0", "what??"]],
            [["u0", "hello"], ["u1", "i dont't have a girlfriend likes you"], ["u0", "i know"]]
        ]
    
    else: 
        raise ValueError("{} Sorry... We don't support that data".format(config.data_name))   

    models_path = os.path.join(config.dataset_dir, "model_infos.json")
    with open(models_path) as f: 
        models = json.load(f)["models"]

    project_dir = config.dataset_dir.parent.parent

    total_outputs = []
    model_names = []
    
    for model_i, model in enumerate(models):
        config.model = model["name"]
        config.checkpoint = os.path.join(project_dir, "results", config.data_name, model["name"], model["path"])
        model_names.append(model["name"] + "/" + model["path"])

        if model.get('config'):
            for key in model["config"]:
                setattr(config, key, model["config"][key])
        
        data_loader = get_loader(convs=convs,
                                vocab=vocab,
                                batch_size=1,
                                model=config.model,
                                dataset=config.data_name,
                                config=config,
                                shuffle=False)

        model_solver = getattr(solvers, "Solver{}".format(config.model))

        solver = model_solver(config, None, data_loader, vocab=vocab, is_train=False)

        solver.build()
        inputs, outputs = solver.export_samples(config.beam_size, file_write=False)

        for i, utter in enumerate(outputs):
            if model_i == 0: 
                total_outputs.append([utter])
            else:
                total_outputs[i].append(utter)

    result_path = os.path.join(project_dir, "results", config.data_name, "qualitative_samples.txt")

    with open(result_path, 'w') as fw:
        for input_utter, outputs in zip(inputs, total_outputs): 
            # print(input_utter, file=fw)
            # for i, output in enumerate(outputs):
            #     print("{} : {}".format(model_names[i], output), file=fw)
            # print('============================', file=fw)
            print(input_utter)
            for i, output in enumerate(outputs):
                print("{} : {}".format(model_names[i], output.split('<eos>')[0]))
            print('============================')
Example #23
def test_gpt_embeddings():
    gpt_model: str = "openai-gpt"

    tokenizer = OpenAIGPTTokenizer.from_pretrained(gpt_model)
    model = OpenAIGPTModel.from_pretrained(
        pretrained_model_name_or_path=gpt_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize(s)

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #     0             1           2            3          4         5         6        7       8       9        10        11         12
    #
    # 'berlin</w>', 'and</w>', 'munich</w>', 'have</w>', 'a</w>', 'lot</w>', 'of</w>', 'pupp', 'ete', 'er</w>', 'to</w>', 'see</w>', '.</w>'
    #     |             |           |            |          |         |         |         \      |      /          |         |          |
    #   Berlin         and        Munich        have        a        lot        of           puppeteer             to       see         .
    #
    #     0             1           2            3          4         5         6                7                  8        9          10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = OpenAIGPTEmbeddings(
            pretrained_model_name_or_path=gpt_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation="first")

    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[7].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation="last")

    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[0],
                                           first_layer[0]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[7], first_layer[9]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[0]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[7], first_layer[8], first_layer[9]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * 768
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * 768
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
Example #24
parser.add_argument('--data_dir', type=str, default='../../data')
parser.add_argument('--n_batch', type=int, default=1)
parser.add_argument('--beam', type=int, default=10)
parser.add_argument('--filter_decode', type=bool, default=True)
parser.add_argument('--mem_k', type=int, default=1)
args = parser.parse_args()
print(args)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
use_mem = args.use_mem
device = torch.device(device)
text_encoder = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
encoder = text_encoder.encoder
decoder = text_encoder.decoder

#sentence-level special tokens
encoder['<|sent0|>'] = len(encoder)
decoder[len(decoder)] = '<|sent0|>'

encoder['<|sent1|>'] = len(encoder)
decoder[len(decoder)] = '<|sent1|>'

encoder['<|sent2|>'] = len(encoder)
decoder[len(decoder)] = '<|sent2|>'

encoder['<|sent3|>'] = len(encoder)
decoder[len(decoder)] = '<|sent3|>'
Example #25
with open('./chatapp/data/censoring.csv', 'r') as triggers_file:
    triggers = {row[0]: row[1] for row in csv.reader(triggers_file)}

max_history = 2
min_length, max_length = 1, 20
dataset_path = './chatapp/data/counsel_chat_250-tokens_full.json'
dataset_cache = './chatapp/dataset_cache'
model_checkpoint = download_pretrained_model()
device = "cpu"
seed = 0
random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Get pretrained model and tokenizer
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_checkpoint)
model = OpenAIGPTLMHeadModel.from_pretrained(model_checkpoint)
model.to(device)
add_special_tokens_(model, tokenizer)

# Sample a personality
dataset = get_dataset(tokenizer, dataset_path, dataset_cache)
personalities = [
    dialog["personality"] for dataset in dataset.values() for dialog in dataset
]
personality = random.choice(personalities)

history = []


@app.route("/")
Example #26
def get_gpt_token_num():
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    tokenizer.add_tokens(GPT_SPECIAL_TOKENS)
    return len(tokenizer)
Example #27
def load_model(name: str) -> Tuple[OpenAIGPTLMHeadModel, OpenAIGPTTokenizer]:
    model = OpenAIGPTLMHeadModel.from_pretrained(name)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(name)
    model.eval()
    return model, tokenizer
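
A hedged usage sketch for load_model (assumes a transformers version whose forward output can be indexed with [0] to get the LM logits):

import torch

model, tokenizer = load_model("openai-gpt")
input_ids = torch.tensor([tokenizer.encode("the weather today is")])
with torch.no_grad():
    lm_logits = model(input_ids)[0]  # (batch, seq_len, vocab_size)
next_token_id = int(lm_logits[0, -1].argmax())
print(tokenizer.decode([next_token_id]))  # greedy single-token continuation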
Example #28
File: GPT.py  Project: HUSTLyn/SentEval
}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        default='openai-gpt',
                        help='model name or path')
    args = parser.parse_args()

    config = OpenAIGPTConfig.from_pretrained(args.model)
    model = OpenAIGPTModel.from_pretrained(args.model, config=config)

    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model)
    special_tokens_dict = {'pad_token': '<pad>'}
    tokenizer.add_special_tokens(special_tokens_dict=special_tokens_dict)

    model.resize_token_embeddings(len(tokenizer))

    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA',
        'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth',
        'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
        'OddManOut', 'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI'
Example #29
def load_gpt_input_tensors(statement_jsonl_path, max_seq_length):
    def _truncate_seq_pair(tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def load_qa_dataset(dataset_path):
        """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
        with open(dataset_path, "r", encoding="utf-8") as fin:
            output = []
            for line in fin:
                input_json = json.loads(line)
                label = ord(input_json.get("answerKey", "A")) - ord("A")
                output.append(
                    (input_json['id'], input_json["question"]["stem"], *[
                        ending["text"]
                        for ending in input_json["question"]["choices"]
                    ], label))
        return output

    def pre_process_datasets(encoded_datasets, num_choices, max_seq_length,
                             start_token, delimiter_token, clf_token):
        """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)

            To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
            input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
        """
        tensor_datasets = []
        for dataset in encoded_datasets:
            n_batch = len(dataset)
            input_ids = np.zeros((n_batch, num_choices, max_seq_length),
                                 dtype=np.int64)
            mc_token_ids = np.zeros((n_batch, num_choices), dtype=np.int64)
            lm_labels = np.full((n_batch, num_choices, max_seq_length),
                                fill_value=-1,
                                dtype=np.int64)
            mc_labels = np.zeros((n_batch, ), dtype=np.int64)
            for i, data in enumerate(dataset):
                q, mc_label = data[0], data[-1]
                choices = data[1:-1]
                for j in range(len(choices)):
                    _truncate_seq_pair(q, choices[j], max_seq_length - 3)
                    qa = [start_token] + q + [delimiter_token
                                              ] + choices[j] + [clf_token]
                    input_ids[i, j, :len(qa)] = qa
                    mc_token_ids[i, j] = len(qa) - 1
                    lm_labels[i, j, :len(qa) - 1] = qa[1:]
                mc_labels[i] = mc_label
            all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
            tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
        return tensor_datasets

    def tokenize_and_encode(tokenizer, obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        else:
            return list(tokenize_and_encode(tokenizer, o) for o in obj)

    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    tokenizer.add_tokens(GPT_SPECIAL_TOKENS)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(GPT_SPECIAL_TOKENS)

    dataset = load_qa_dataset(statement_jsonl_path)
    examples_ids = [data[0] for data in dataset]
    dataset = [data[1:] for data in dataset]  # discard example ids
    num_choices = len(dataset[0]) - 2

    encoded_dataset = tokenize_and_encode(tokenizer, dataset)

    (input_ids, mc_token_ids, lm_labels,
     mc_labels), = pre_process_datasets([encoded_dataset], num_choices,
                                        max_seq_length, *special_tokens_ids)
    return examples_ids, mc_labels, input_ids, mc_token_ids, lm_labels
Example #30
def initialize():
    global model, tokenizer
    model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
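
A hedged usage sketch for the TensorFlow variant above (assumes eager TensorFlow and that indexing the model output with [0] yields the LM logits):

import tensorflow as tf

initialize()
input_ids = tf.constant([tokenizer.encode("the weather today is")])
lm_logits = model(input_ids)[0]  # (batch, seq_len, vocab_size)
next_token_id = int(tf.argmax(lm_logits[0, -1]))
print(tokenizer.decode([next_token_id]))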