def interact_single(self, message, history, personality=None, encode_history=True):
        """
        Get a response from the model based on the conversation history and a new message.

        Args:
            message: A message to be sent to the model.
            history: A list of sentences representing the interaction history between the model and the user.
            personality (optional): A list of sentences that the model will use to build a personality.
            encode_history (optional): If True, `history` is expected to be in text (string) form,
                            and it will be tokenized and encoded before use.

        Returns:
            out_text: The response generated by the model based on the personality, history and message.
            history: The updated history of the conversation. If encode_history is True, this will be in text form.
                        If not, it will be in encoded form.
        """
        model = self.model
        args = self.args
        tokenizer = self.tokenizer
        process_count = self.args.process_count

        if self.args.fp16:
            from torch.cuda import amp

        self._move_model_to_device()

        if not personality:
            dataset = get_dataset(
                tokenizer,
                None,
                args.cache_dir,
                process_count=process_count,
                proxies=self.__dict__.get("proxies", None),
                interact=True,
            )
            personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
            personality = random.choice(personalities)
        else:
            personality = [tokenizer.encode(s.lower()) for s in personality]

        if encode_history:
            raw_history = history.copy()
            raw_history.append(message)
            history = [tokenizer.encode(sentence) for sentence in history]
        history.append(tokenizer.encode(message))
        with torch.no_grad():
            if args.fp16:
                with amp.autocast():
                    out_ids = self.sample_sequence(personality, history, tokenizer, model, args)
            else:
                out_ids = self.sample_sequence(personality, history, tokenizer, model, args)
        out_text = tokenizer.decode(out_ids, skip_special_tokens=self.args.skip_special_tokens)

        if encode_history:
            raw_history.append(out_text)
            history = raw_history
        else:
            history.append(out_ids)

        return out_text, history
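A minimal usage sketch for `interact_single` (the checkpoint directory and persona sentences below are placeholders, not taken from the snippet):

from simpletransformers.conv_ai import ConvAIModel

model = ConvAIModel("gpt", "gpt_personachat_cache", use_cuda=False)  # placeholder checkpoint dir
history = []
reply, history = model.interact_single(
    "Hi! What do you do for a living?",
    history,
    personality=["i am a writer .", "i live by the sea ."],
)
print(reply)    # response text
print(history)  # text-form history, since encode_history defaults to True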
Example #2
    def get_answer(self, question="", personality=[], history=[]):
        model = self.model
        args = self.args
        tokenizer = self.tokenizer
        process_count = self.args["process_count"]

        self._move_model_to_device()

        if history is None:
            history = []  # avoid a mutable default argument, which would leak state across calls

        if not personality:
            personality = ["i like medicine .", "i'm a doctor ."]

        if not personality:  # note: never reached, since a default persona is always set above
            dataset = get_dataset(
                tokenizer,
                None,
                args["cache_dir"],
                process_count=process_count,
                #proxies=self.__dict__.get("proxies", None),
                interact=True,
            )
            personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
            personality = random.choice(personalities)
        else:
            personality = [tokenizer.encode(s.lower()) for s in personality]

        history.append(tokenizer.encode(question))
        with torch.no_grad():
            out_ids = self.sample_sequence(personality, history, tokenizer, model, args)
            history.append(out_ids)
        history = history[-(2 * args["max_history"] + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        return out_text
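A usage sketch (the wrapper class name is hypothetical; the history list is threaded explicitly between turns, since `get_answer` appends the encoded question and reply to the list it receives):

bot = DoctorBot()  # hypothetical class exposing get_answer()
history = []
print(bot.get_answer("i have a headache, what should i do?", history=history))
print(bot.get_answer("thank you, doctor!", history=history))  # same list carries the context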
Example #3
import json
import random

import torch

# get_dataset, sample_sequence, and save_chat_history are assumed to be
# defined elsewhere in the original module.


def interact(raw_text, model, personality, userid, history):
    args = model.args
    tokenizer = model.tokenizer
    process_count = model.args["process_count"]

    model._move_model_to_device()

    if not personality:
        dataset = get_dataset(
            tokenizer,
            None,
            args["cache_dir"],
            process_count=process_count,
            proxies=model.__dict__.get("proxies", None),
            interact=True,
        )
        personalities = [
            dialog["personality"] for dataset in dataset.values()
            for dialog in dataset
        ]
        personality = random.choice(personalities)
    else:
        personality = [tokenizer.encode(s.lower()) for s in personality]

    history.append(tokenizer.encode(raw_text))
    with torch.no_grad():
        out_ids = sample_sequence(model, personality, history, tokenizer,
                                  model.model, args)
    history.append(out_ids)
    history = history[-(2 * args["max_history"] + 1):]
    out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
    save_chat_history(userid, json.dumps({"history": history}))
    return out_text
Example #4
    def interact(self, personality=None):
        """
        Interact with a model in the terminal.

        Args:
            personality: A list of sentences that the model will use to build a personality.

        Returns:
            None
        """

        model = self.model
        args = self.args
        tokenizer = self.tokenizer
        process_count = self.args.process_count

        if self.args.fp16:
            from torch.cuda import amp

        self._move_model_to_device()

        if not personality:
            dataset = get_dataset(
                tokenizer,
                None,
                args.cache_dir,
                process_count=process_count,
                proxies=self.__dict__.get("proxies", None),
                interact=True,
                args=args,
            )
            personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
            personality = random.choice(personalities)
        else:
            personality = [tokenizer.encode(s.lower()) for s in personality]

        history = []
        while True:
            raw_text = input(">>> ")
            while not raw_text:
                print("Prompt should not be empty!")
                raw_text = input(">>> ")
            history.append(tokenizer.encode(raw_text))
            with torch.no_grad():
                if args.fp16:
                    with amp.autocast():
                        out_ids = self.sample_sequence(personality, history, tokenizer, model, args)
                else:
                    out_ids = self.sample_sequence(personality, history, tokenizer, model, args)
            history.append(out_ids)
            history = history[-(2 * args.max_history + 1) :]
            out_text = tokenizer.decode(out_ids, skip_special_tokens=self.args.skip_special_tokens)
            print(out_text)
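A usage sketch for the terminal loop above (checkpoint directory and persona are placeholders):

model = ConvAIModel("gpt", "gpt_personachat_cache", use_cuda=False)  # placeholder checkpoint dir
model.interact(personality=["i love hiking .", "i have two dogs ."])  # loops until interrupted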
Example #5
    def load_and_cache_examples(self,
                                dataset_path=None,
                                evaluate=False,
                                no_cache=False,
                                verbose=True,
                                silent=False):
        """
        Loads, tokenizes, and prepares data for training and/or evaluation.

        Utility function for train() and eval() methods. Not intended to be used directly.
        """  # noqa: ignore flake8"

        process_count = self.args["process_count"]

        tokenizer = self.tokenizer
        args = self.args

        if not no_cache:
            no_cache = args["no_cache"]

        os.makedirs(self.args["cache_dir"], exist_ok=True)

        dataset_path = dataset_path if dataset_path else ""

        dataset = get_dataset(
            tokenizer,
            dataset_path,
            args["cache_dir"],
            process_count=process_count,
            evaluate=evaluate,
            no_cache=no_cache,
        )
        # print(personachat.keys())
        # datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
        # for dataset_name, dataset in personachat.items():
        datasets = defaultdict(list)
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        if args["num_candidates"] > 0 and not evaluate:
            num_candidates = min(args["num_candidates"], num_candidates)
        for dialog in dataset:
            persona = dialog["personality"].copy()
            for _ in range(args["personality_permutations"]):
                for utterance in dialog["utterances"]:
                    history = utterance["history"][-(2 * args["max_history"] + 1):]
                    for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                        lm_labels = bool(j == num_candidates - 1)
                        instance = self.build_input_from_segments(
                            persona, history, candidate, tokenizer, lm_labels)
                        for input_name, input_array in instance.items():
                            datasets[input_name].append(input_array)
                    datasets["mc_labels"].append(num_candidates - 1)
                    datasets["n_candidates"] = num_candidates
                persona = [persona[-1]] + persona[:-1]  # rotate personality sentences

        # logger.info("Pad inputs and convert to Tensor")
        # tensor_datasets = {"train": [], "valid": []}
        # for dataset_name, dataset in datasets.items():
        tensor_datasets = []
        dataset = self.pad_dataset(datasets,
                                   padding=tokenizer.convert_tokens_to_ids(
                                       SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            if input_name != "mc_labels":
                tensor = tensor.view((-1, datasets["n_candidates"]) +
                                     tensor.shape[1:])
            tensor_datasets.append(tensor)

        # logger.info("Build train and validation dataloaders")
        # train_dataset, valid_dataset = (
        #     TensorDataset(*tensor_datasets["train"]),
        #     TensorDataset(*tensor_datasets["valid"]),
        # )
        tensor_dataset = TensorDataset(*tensor_datasets)
        if not evaluate:
            data_sampler = RandomSampler(tensor_dataset)
            data_loader = DataLoader(tensor_dataset,
                                     sampler=data_sampler,
                                     batch_size=args["train_batch_size"])
        else:
            data_sampler = SequentialSampler(tensor_dataset)
            data_loader = DataLoader(tensor_dataset,
                                     sampler=data_sampler,
                                     batch_size=args["eval_batch_size"])

        # logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
        # logger.info("valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
        return data_loader, data_sampler
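For reference, a minimal sketch of the `pad_dataset` helper called above, modeled on the TransferTransfo reference implementation this code derives from; the `PADDED_INPUTS` field list and the -100 label padding are assumptions from that codebase (and the original is a method, written here as a free function):

PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]  # assumed field names


def pad_dataset(dataset, padding=0):
    """Pad every padded field up to the length of the longest input_ids sequence."""
    max_l = max(len(x) for x in dataset["input_ids"])
    for name in PADDED_INPUTS:
        # lm_labels are padded with -100 so the LM loss ignores padded positions
        pad_value = padding if name != "lm_labels" else -100
        dataset[name] = [x + [pad_value] * (max_l - len(x)) for x in dataset[name]]
    return dataset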
Example #6
# TOPK and TOPP are assumed to be defined earlier in the original script.
interact_args = {
    "top_k": TOPK,
    "top_p": TOPP,
    "max_history": 5,
    "min_length": 1,
    "do_sample": True
}

tuned_model = ConvAIModel("gpt", "./saved_model",
                          use_cuda=False,
                          args=interact_args)

tokenizer = tuned_model.tokenizer
args = tuned_model.args
dataset = get_dataset(tokenizer, None,
                      args.cache_dir,
                      process_count=tuned_model.args.process_count,
                      proxies=tuned_model.__dict__.get("proxies", None),
                      interact=True,
                      args=args)

personalities = [dialog["personality"]
                 for dataset in dataset.values() for dialog in dataset]
personality = random.choice(personalities)
global personality_decode  # only has an effect if this code runs inside a function
personality_decode = [tuned_model.tokenizer.decode(desc) for desc in personality]

HISTORY_FPATH = '/tmp/history.csv'  # /tmp is writable inside the Docker container
hist_df = pd.DataFrame({'history': [],
                        'history_decode': []})
hist_df.to_csv(HISTORY_FPATH)
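A hypothetical follow-up turn, not in the original snippet: it assumes `interact_single` from Example #1 is available on `tuned_model`, and the contents of the two CSV columns are guessed from their names:

history = []
reply, history = tuned_model.interact_single(
    "hi! tell me about yourself.", history, personality=personality_decode)
print(reply)

# Persist the turn so a later request inside the container can resume it.
pd.DataFrame({"history": [history],
              "history_decode": [reply]}).to_csv(HISTORY_FPATH)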