Example #1
    def InitModel(self):
        """Takes care of loading the model, dataset, and tokenizer. Can be called
        asynchronously or in a separate thread to avoid a long wait."""
        logger.info(
            f"Starting conv model with gpu: {torch.cuda.is_available()}")

        # Start with the model and download the pretrained one if necessary
        if self.args["model_checkpoint"] == "":
            logger.debug("Downloading pretrained model...")
            self.args["model_checkpoint"] = download_pretrained_model()
        # Set up the model and load the tokenizer vocabulary
        tokenizer_class = (GPT2Tokenizer if self.args["model"] == "gpt2" else
                           OpenAIGPTTokenizer)
        logger.debug("Opening tokenizer class from pretrained model...")
        self.tokenizer = tokenizer_class.from_pretrained(
            self.args["model_checkpoint"])

        model_class = (GPT2LMHeadModel if self.args["model"] == "gpt2" else
                       OpenAIGPTLMHeadModel)
        logger.debug("Opening model class from pretrained model...")
        self.model = model_class.from_pretrained(self.args["model_checkpoint"])
        self.model.to(self.args["device"])
        self.model.eval()
        logger.debug("Getting dataset personalities...")
        personalities = get_dataset_personalities(self.tokenizer,
                                                  self.args["dataset_path"],
                                                  self.args["dataset_cache"])
        logger.debug("Selecting a random personality...")
        self.personality = random.choice(personalities)
        logger.info(f"Selected personality: " +
                    f"{self.tokenizer.decode(chain(*self.personality))}")
        self.is_ready = True
        logger.info("⭐Model initialized and ready to go! ⭐")
Example #2
def run_interactive(tokenizer, model, args):
    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                              args.dataset_cache)
    personality = list(chain(*random.choice(personalities)))
    history_encoded = []
    history_types_encoded = []
    logger.info("Selected personality: %s", tokenizer.decode(personality))
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history_encoded.append(tokenizer.encode(raw_text))
        history_types_encoded.append(
            tokenizer.convert_tokens_to_ids(TYPE_USER))
        with torch.no_grad():
            out_ids, _ = sample_sequence(personality=personality,
                                         utterances=history_encoded,
                                         utterance_types=history_types_encoded,
                                         tokenizer=tokenizer,
                                         model=model,
                                         args=args)
        history_encoded.append(out_ids)
        history_types_encoded.append(tokenizer.convert_tokens_to_ids(TYPE_BOT))
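        # Truncate the history to the last (2 * max_history + 1) utterances so the model input stays bounded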
        history_encoded = history_encoded[-(2 * args.max_history + 1):]
        history_types_encoded = history_types_encoded[-(2 * args.max_history +
                                                        1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
Example #3
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")

    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
    model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    model.eval()

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2*args.max_history+1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
Example #4
    def WriteAvailablePersonalities(self, filename="/tmp/personalities.txt"):
        """Lists and decodes all personalities and writes them to filename"""
        personalities = get_dataset_personalities(self.tokenizer,
                                                  self.args["dataset_path"],
                                                  self.args["dataset_cache"])
        with open(filename, "w") as out:
            maxFailures = 5
            failures = 0
            successes = 0
            for p in personalities:
                if failures > maxFailures:
                    logger.error(
                        "Too many failures. Aborting personality write")
                    break
                try:
                    out.write(self.tokenizer.decode(chain(*p)))
                    out.write("\n" + ("-" * 50) + "\n")
                    successes += 1
                except Exception:
                    logger.warning(f"Couldn't write personality: {p}")
                    failures += 1
Example #5
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model",
                        type=str,
                        default="gpt2",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint",
                        "-mc",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=100,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    # add option to not use personality
    parser.add_argument("--no_personality",
                        type=bool,
                        default=True,
                        help="Set to not sample a personality.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if os.path.isdir("./huggingface_s3/"):
            args.model_checkpoint = "./huggingface_s3/"
            logger.info("Loading from pre-downloaded temp path: {}".format(
                args.model_checkpoint))
        else:
            args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer,
        GPT2LMHeadModel) if "gpt2" == args.model else (OpenAIGPTTokenizer,
                                                       OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)

    model.to(args.device)
    add_special_tokens_(model, tokenizer)
    model.eval()

    # added the option to opt out of using a personality
    if args.no_personality:
        logger.info("No personality is sampled for this chatbot.")
        personality = ""
        # personality = ["My name is Isabelle Hawkins.",
        #                "I am five years old.",
        #                "My phone number is 959-100-9300.",
        #                "Here is a link I would like you to check out: google.com.",
        #                "I would like to know more about you."]
        # personality = [tokenizer.encode(p) for p in personality]
        # logger.info("Selected custom personality: %s",tokenizer.decode(chain(*personality)))
    else:
        logger.info("Sample a personality")
        personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                                  args.dataset_cache)
        personality = random.choice(personalities)
        # import pdb; pdb.set_trace()
        logger.info("Selected personality: %s",
                    tokenizer.decode(chain(*personality)))

    history = []
    # while True:
    #     custom_history = input("Press 0 to end\n\tAdd history: ")
    #     if custom_history == '0':
    #         break
    #     else:
    #         history.append(tokenizer.encode(custom_history))

    while True:
        history = []
        args.temperature = float(input("Set temperature (> 0 and <= 1): "))
        prompt = input("Speaker 1 >>> ")
        while not prompt:
            print('Prompt should not be empty!')
            prompt = input("Speaker 1 >>> ")
        history.append(tokenizer.encode(prompt))

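        # Let the model converse with itself for 10 turns, feeding each
        # response back into the history and alternating the speaker label.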
        i = 0
        while True:
            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer,
                                          model, args)
            history.append(out_ids)
            history = history[-(2 * args.max_history + 1):]
            out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
            i += 1
            speaker = "Speaker 2" if i % 2 else "Speaker 1"
            print(f"{speaker}: {out_text}")

            if i == 10:
                break
Example #6
def main(args):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Import %s", args.train_mod)
    train_mod = __import__(args.train_mod, globals(), locals(), [
        'GPT2BPETokenizer_CN', 'SPECIAL_TOKENS', 'build_input_from_segments',
        'add_special_tokens_'
    ], 0)
    global SPECIAL_TOKENS, build_input_from_segments, add_special_tokens_
    GPT2BPETokenizer_CN = train_mod.GPT2BPETokenizer_CN
    SPECIAL_TOKENS = train_mod.SPECIAL_TOKENS
    build_input_from_segments = train_mod.build_input_from_segments
    add_special_tokens_ = train_mod.add_special_tokens_

    logger.info("Get pretrained model and tokenizer")
    model_class, tokenizer_class = GPT2DoubleHeadsModel, GPT2BPETokenizer_CN

    logger.info("load tokenizer....")
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    logger.info("load model....")
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)
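    # Number of learned position embeddings, i.e. the longest sequence the model accepts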
    n_positions = len(model.transformer.wpe.weight)

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                              args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))

    # SIGHUP clears the history
    def sighup_fn(signum, frame):
        logger.info('Signal %s!', signum)
        nonlocal history
        history = []

    signal.signal(signal.SIGHUP, sighup_fn)

    history = []
    try:
        while True:
            raw_text = input(">>> ").strip()
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input(">>> ").strip()
            history.append(tokenizer.encode(raw_text))
            with torch.no_grad():
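                # Stream tokens one at a time and print them as they are generated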
                out_ids = []
                for out_id in sample_generate(personality, history, tokenizer,
                                              model, args):
                    out_ids.append(out_id)
                    out_text = tokenizer.decode([out_id],
                                                skip_special_tokens=True)
                    print(out_text.strip(), end='')
                print()
            history.append(out_ids)
            history = history[-(2 * args.max_history + 1):]
            history_length = sum(len(ids) for ids in history)
            if history_length >= n_positions:
                warnings.warn(
                    'History token length %s is >= %s; the program will crash!'
                    % (history_length, n_positions))
    except KeyboardInterrupt:
        logger.warning('KeyboardInterrupt')
Example #7
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt2", help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", "-mc", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    # add option to not use personality
    parser.add_argument("--no_personality", type=bool, default=True, help="Set to not sample a personality.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if os.path.isdir("./huggingface_s3/"): 
            args.model_checkpoint = "./huggingface_s3/"
            logger.info("Loading from pre-downloaded temp path: {}".format(args.model_checkpoint))
        else: 
            args.model_checkpoint = download_pretrained_model()

    with open("dailydialog_formatted.json", "r") as f: 
        valid = json.load(f) 

    inputs = [utterance['history'] for instance in valid for utterance in instance['utterances']]
    shuffled_inputs = shuffle(inputs, random_state=42, n_samples=1000)

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    add_special_tokens_(model, tokenizer)

    model.to(args.device)
    model.eval()

    # added the option to opt out of using a personality 
    if args.no_personality: 
        logger.info("No personality is sampled for this chatbot.")
        personality = [""]
        # personality = ["My name is Isabelle Hawkins.", 
        #                "I am five years old.", 
        #                "My phone number is 959-100-9300.", 
        #                "Here is a link I would like you to check out: google.com.", 
        #                "I would like to know more about you."]
        # personality = [tokenizer.encode(p) for p in personality] 
        # logger.info("Selected custom personality: %s",tokenizer.decode(chain(*personality)))
    else:
        logger.info("Sample a personality")
        personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
        personality = random.choice(personalities)
        # import pdb; pdb.set_trace()
        logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    device = 'gpu' if args.device == 'cuda' else 'cpu'
    time_path = Path(args.model_checkpoint).absolute().name + f"{device}_predict_time.txt"
    f = open(time_path, 'w')

    total_time = 0 
    lines = []
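    # Benchmark: time one generation per sampled dialogue history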
    for history in tqdm(shuffled_inputs): 
        start = time.time() 
        tokenized_history = [tokenizer.encode(h) for h in history]
        
        with torch.no_grad(): 
            out_ids = sample_sequence(personality, tokenized_history, tokenizer, model, args)
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        elapsed = time.time() - start 
        rounded_ = round(elapsed, 2)
        total_time += elapsed 

        input_text = 'Input: ' + ' --- '.join(history) + '\n'
        output = f'Output: {out_text}\n'
        elapsed_text = f'Elapsed time: {rounded_}s\n'

        lines.append(input_text + output + elapsed_text)



    if args.device == 'cpu':
        processor = get_processor_name()
        f.write(f"CPU predictions\nProcessor: {processor}\n")
    else:
        f.write("GPU predictions\n")

    avg_time = round(total_time / len(shuffled_inputs), 2)
    f.write(f"Average time elapsed per prediction: {avg_time} seconds\n\n")

    f.writelines(lines)
    f.close()
Example #8
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model",
                        type=str,
                        default="gpt",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature",
                        type=int,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)

    model.to(args.device)
    model.eval()

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                              args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))

    history = []
    while True:
        print(
            "MyData: Type ==> 'Convo' for an interactive chat OR  Type ==> 'ask' for Asking Questions... "
        )
        raw_text = input("You: ")
        raw_text = raw_text.lower()
        if raw_text.replace(' ', '') == 'ask':
            print('MyData: Please Ask your Question..')
            message = input('You: ')
            main(message)

        elif raw_text.replace(' ', '') == 'convo':
            print('MyData: Hello, whats up?')
            continue
        while not raw_text:
            print('Please i did not get that')
            raw_text = input("You: ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model,
                                      args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print("MyData: {}".format(out_text))
Example #9
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt", help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)
    B_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    B_model = BertModel.from_pretrained('bert-base-uncased')
    B_model.eval()

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []

    class WindowClass(QMainWindow, form_class):
        def __init__(self):
            super().__init__()
            self.setupUi(self)
            self.enter.clicked.connect(self.enterPlainTextEdit)

        # Handler for the PlainTextEdit input field
        def enterPlainTextEdit(self):
            nonlocal history
            raw_text = self.user.toPlainText()
            self.bot.append(raw_text)
            history.append(tokenizer.encode(raw_text))
            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer, B_tokenizer, model, args, B_model)
            history.append(out_ids)
            history = history[-(2 * args.max_history + 1):]
            out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
            self.bot.append(out_text)

    app = QApplication(sys.argv)
    myWindow = WindowClass()
    myWindow.show()
    app.exec_()
Example #10
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--model", type=str, default="bert", help="Model type (gpt or gpt2)")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="/home/rohola/codes/transfer-learning-conv-ai/logs/logs", help="Path, url or short name of the model")
    #parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    #parser.add_argument("--model_checkpoint", type=str, default="/home/rohola/data/bert_checkpoint", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))



    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    if args.model == "bert":
        tokenizer_class = BertTokenizer
        model_class = BertLMHeadModel
    elif args.model == "gpt2":
        tokenizer_class = GPT2Tokenizer
        model_class = GPT2LMHeadModel

    if args.model == "bert":
        SPECIAL_TOKENS = ["[BOS]", "[EOS]", "[SPEAKER1]", "[SPEAKER2]", "[PAD]"]
    else:
        SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]

    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)

    model.to(args.device)
    model.eval()


    # Clamp the generation length to what the model's position embeddings allow
    if args.max_length < 0 and model.config.max_position_embeddings > 0:
        args.max_length = model.config.max_position_embeddings
    elif 0 < model.config.max_position_embeddings < args.max_length:
        args.max_length = model.config.max_position_embeddings  # No generation bigger than model size

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            #out_ids = sample_sequence(personality, history, tokenizer, model, args, SPECIAL_TOKENS)

            out_ids = sample_sequence(
                args=args,
                model=model,
                personality=personality,
                history=history,
                length=args.max_length,
                temperature=args.temperature,
                top_k=args.top_k,
                top_p=args.top_p,
                device=args.device,
                is_xlnet=bool(args.model == "xlnet"),
            )

        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
Example #11
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model",
                        type=str,
                        default="gpt2",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint",
                        "-mc",
                        type=str,
                        default="runs/gpt2_convai_yesand",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        "-mh",
        type=int,
        default=5,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature",
                        type=int,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    # set a number of top choices to show
    parser.add_argument("--top_c",
                        type=int,
                        default=10,
                        help="Determine how many top choices to be shown.")
    # add option to not use personality
    parser.add_argument("--no_personality",
                        "-np",
                        action='store_true',
                        help="Set to not sample a personality.")
    # use text file to deduce results
    parser.add_argument(
        "--email_sequence",
        "-es",
        default=None,
        help="Provide a text file for which to generate chatbot outputs.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if os.path.isdir("./huggingface_s3/"):
            args.model_checkpoint = "./huggingface_s3/"
            logger.info("Loading from pre-downloaded temp path: {}".format(
                args.model_checkpoint))
        else:
            args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)

    add_special_tokens_(model, tokenizer)
    model.to(args.device)
    model.eval()

    # added the option to opt out of using a personality
    if args.no_personality:
        logger.info("No personality is sampled for this chatbot.")
        personality = ""
        # personality = ["My name is Isabelle Hawkins.",
        #                "I am five years old.",
        #                "My phone number is 959-100-9300.",
        #                "Here is a link I would like you to check out: google.com.",
        #                "I would like to know more about you."]
        # personality = [tokenizer.encode(p) for p in personality]
        # logger.info("Selected custom personality: %s",tokenizer.decode(chain(*personality)))
    else:
        logger.info("Sample a personality")
        personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                                  args.dataset_cache)
        personality = random.choice(personalities)
        # import pdb; pdb.set_trace()
        logger.info("Selected personality: %s",
                    tokenizer.decode(chain(*personality)))

    # test case to see if decoder works as expected. If not, see if special tokens are appropriately added to the tokenizer
    test_case = "Hello, my name is Justin. I'd like a strawberry cheesecake."
    test_encode = tokenizer.encode(test_case)
    test_decode = tokenizer.decode(test_encode)
    assert test_case == test_decode

    # adapt code for efficient experimentation of existing email exchanges
    if args.email_sequence:
        logger.info(
            f"Running chatbot generations for {os.path.split(args.email_sequence)[-1]}"
        )
        while True:
            with open(args.email_sequence, 'r') as f:
                email_sequence = f.readlines()
            his_length = int(
                input(
                    "Indicate how many exchanges you want to refer back to - must be an integer. \n1 indicates only the most recent email from the scammer: "
                ))
            email_sequence = [
                e for e in email_sequence if re.sub('\n', '', e)
            ]  # remove any empty lines
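            # Keep the last (his_length - 1) * 2 + 1 lines, i.e. the his_length most
            # recent scammer emails with the user replies interleaved between them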
            email_sequence = email_sequence[-((his_length - 1) * 2 + 1):]

            history = [tokenizer.encode(e) for e in email_sequence]

            logger.info("Used input:\n")
            for idx, e in enumerate(email_sequence):
                output = f"\tUser: {e}" if idx % 2 else f"\tScammer: {e}"
                print(output)
            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer,
                                          model, args)

            out_texts = [
                tokenizer.decode(o, skip_special_tokens=True) for o in out_ids
            ]

            print(
                f"Top {args.top_c} choices of history length = {his_length}:")
            for idx, o in enumerate(out_texts):
                print(f"\t{idx}: {o}")

    # manual generation
    else:
        history = []
        while True:
            custom_history = input("Press 0 to end\n\tAdd history: ")
            if custom_history == '0':
                break
            else:
                history.append(tokenizer.encode(custom_history))

        while True:
            raw_text = input("Scammer >>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input("Scammer >>> ")
            history.append(tokenizer.encode(raw_text))

            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer,
                                          model, args)
            # multiple retries
            # history.append(out_ids)
            # history = history[-(2*args.max_history+1):]
            out_texts = [
                tokenizer.decode(o, skip_special_tokens=True) for o in out_ids
            ]
            print(f"Top {args.top_c} choices:")
            for idx, o in enumerate(out_texts):
                print(f"\t{idx}: {o}")