def load_vgg19(input_image):
    """
    Load the pretrained VGG-19 weights into a TensorFlow model.
    Use a dictionary to hold the layer tensors instead of a Python class.
    """
    # VGG-19 parameters file
    VGG19_DOWNLOAD_LINK = 'http://www.vlfeat.org/matconvnet/models/imagenet-vgg-verydeep-19.mat'
    VGG19_MODEL = 'imagenet-vgg-verydeep-19.mat'
    VGG19_EXPECTED_BYTES = 534904783
    download_pretrained_model(VGG19_DOWNLOAD_LINK, VGG19_MODEL,
                              VGG19_EXPECTED_BYTES)

    vgg = scipy.io.loadmat(VGG19_MODEL)
    vgg_layers = vgg['layers']

    graph = {}
    graph['conv1_1'] = _conv2d_relu(vgg_layers, input_image, 0, 'conv1_1')
    graph['conv1_2'] = _conv2d_relu(vgg_layers, graph['conv1_1'], 2, 'conv1_2')
    graph['avgpool1'] = _avgpool(graph['conv1_2'])
    graph['conv2_1'] = _conv2d_relu(vgg_layers, graph['avgpool1'], 5,
                                    'conv2_1')
    graph['conv2_2'] = _conv2d_relu(vgg_layers, graph['conv2_1'], 7, 'conv2_2')
    graph['avgpool2'] = _avgpool(graph['conv2_2'])
    graph['conv3_1'] = _conv2d_relu(vgg_layers, graph['avgpool2'], 10,
                                    'conv3_1')
    graph['conv3_2'] = _conv2d_relu(vgg_layers, graph['conv3_1'], 12,
                                    'conv3_2')
    graph['conv3_3'] = _conv2d_relu(vgg_layers, graph['conv3_2'], 14,
                                    'conv3_3')
    graph['conv3_4'] = _conv2d_relu(vgg_layers, graph['conv3_3'], 16,
                                    'conv3_4')
    graph['avgpool3'] = _avgpool(graph['conv3_4'])
    graph['conv4_1'] = _conv2d_relu(vgg_layers, graph['avgpool3'], 19,
                                    'conv4_1')
    graph['conv4_2'] = _conv2d_relu(vgg_layers, graph['conv4_1'], 21,
                                    'conv4_2')
    graph['conv4_3'] = _conv2d_relu(vgg_layers, graph['conv4_2'], 23,
                                    'conv4_3')
    graph['conv4_4'] = _conv2d_relu(vgg_layers, graph['conv4_3'], 25,
                                    'conv4_4')
    graph['avgpool4'] = _avgpool(graph['conv4_4'])
    graph['conv5_1'] = _conv2d_relu(vgg_layers, graph['avgpool4'], 28,
                                    'conv5_1')
    graph['conv5_2'] = _conv2d_relu(vgg_layers, graph['conv5_1'], 30,
                                    'conv5_2')
    graph['conv5_3'] = _conv2d_relu(vgg_layers, graph['conv5_2'], 32,
                                    'conv5_3')
    graph['conv5_4'] = _conv2d_relu(vgg_layers, graph['conv5_3'], 34,
                                    'conv5_4')
    graph['avgpool5'] = _avgpool(graph['conv5_4'])

    return graph
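
The helper above returns the VGG-19 network as a plain dictionary of tensors, so individual layer activations can be fetched by name. Below is a minimal usage sketch, assuming TensorFlow 1.x-style graph/session execution and that the _conv2d_relu, _avgpool and download_pretrained_model helpers referenced above are defined in the same module:

import numpy as np
import tensorflow.compat.v1 as tf  # assumption: TF 1.x-style execution (or the compat shim)

tf.disable_eager_execution()

# VGG-19 expects a 4-D float input: [batch, height, width, channels]
input_image = tf.Variable(np.zeros((1, 224, 224, 3), dtype=np.float32),
                          trainable=False, name="input_image")
vgg = load_vgg19(input_image)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # fetch one layer's activations by its dictionary key
    conv4_2 = sess.run(vgg['conv4_2'])
    print(conv4_2.shape)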
Example #2
def model_tokenizer(args):
    if torch.cuda.is_available():
        args["device"] = "cuda"
    else:
        args["device"] = "cpu"

    if args["model_checkpoint"] == "":
        if args["model"] == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args["model_checkpoint"] = download_pretrained_model()

    if args["seed"] != 0:
        random.seed(args["seed"])
        torch.random.manual_seed(args["seed"])
        torch.cuda.manual_seed(args["seed"])

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer,
        GPT2LMHeadModel) if args["model"] == 'gpt2' else (OpenAIGPTTokenizer,
                                                          OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args["model_checkpoint"])
    model = model_class.from_pretrained(args["model_checkpoint"])
    model.to(args["device"])
    add_special_tokens_(model, tokenizer)

    logger.info("Get text to emote model")
    emote_clf = txtemote_model(args["txtemotion_dataset_path"])
    return model, emote_clf, tokenizer
Example #3
def model_pretrained(path_dir=None, model=None):
    if model is not None:
        path_dir = download_pretrained_model(model)
    config = load_dict(path_dir)
    model = Property_Prediction(**config)
    model.load_pretrained(path_dir + '/model.pt')
    return model
Example #4
def run():
    pretrained_model = utils.download_pretrained_model()
    tokenizer_class, model_class = (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(pretrained_model)
    model = model_class.from_pretrained(pretrained_model)
    model.to("cpu")
    add_special_tokens_(model, tokenizer)
    dataset = utils.get_dataset(tokenizer, "./dataset_cache")
    features = [
        dialog["feature"] for dataset in dataset.values() for dialog in dataset
    ]
    feature = random.choice(features)
    print("Examples of selected feature:\n",
          tokenizer.decode(itertools.chain(*feature)))
    background = [tokenizer.encode("tell me about yourself")]
    generated_lyrics = []
    hist_size = 2
    for _ in range(
            5
    ):  # how many lines of lyrics to generate - time grows exponentially with this value
        with torch.no_grad():
            out_ids = sample_sequence(feature, background, tokenizer, model)
        background.append(out_ids)
        background.append(random.choice(background))
        background = background[
            -5:]  # size of history to retain (needs to be odd number since we're using two headed model)
        this_line = tokenizer.decode(out_ids, skip_special_tokens=True)
        generated_lyrics.append(this_line)
    print("\nGenerated lyrics:")
    print("\n".join(generated_lyrics))
Example #5
    def InitModel(self):
        """This takes care of loading the model/dataset/tokenizing. Can be called
        async or in a separate thread so as to avoid a long waiting time."""
        logger.info(
            f"Starting conv model with gpu: {torch.cuda.is_available()}")

        # Start with the model and download pretrained weights if necessary
        if self.args["model_checkpoint"] == "":
            logger.debug("Downloading pretrained model...")
            self.args["model_checkpoint"] = download_pretrained_model()
        # do model setup and tokenize vocabulary
        tokenizer_class = (GPT2Tokenizer if self.args["model"] == "gpt2" else
                           OpenAIGPTTokenizer)
        logger.debug("Opening tokenizer class from pretrained model...")
        self.tokenizer = tokenizer_class.from_pretrained(
            self.args["model_checkpoint"])

        model_class = (GPT2LMHeadModel if self.args["model"] == "gpt2" else
                       OpenAIGPTLMHeadModel)
        logger.debug("Opening model class from pretrained model...")
        self.model = model_class.from_pretrained(self.args["model_checkpoint"])
        self.model.to(self.args["device"])
        self.model.eval()
        logger.debug("Getting dataset personalities...")
        personalities = get_dataset_personalities(self.tokenizer,
                                                  self.args["dataset_path"],
                                                  self.args["dataset_cache"])
        logger.debug("Selecting a random personality...")
        self.personality = random.choice(personalities)
        logger.info(f"Selected personality: " +
                    f"{self.tokenizer.decode(chain(*self.personality))}")
        self.is_ready = True
        logger.info("⭐Model initialized and ready to go! ⭐")
Example #6
def run():
    config_file = "configs/interact_config.json"
    config = InteractConfig.from_json_file(config_file)

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(config))

    if config.model_checkpoint == "":
        config.model_checkpoint = download_pretrained_model()

    random.seed(config.seed)
    torch.random.manual_seed(config.seed)
    torch.cuda.manual_seed(config.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == config.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == config.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(config.model_checkpoint)

    model.to(config.device)
    model.eval()

    dataset = get_dataset(tokenizer, config.dataset_path, config.dataset_cache)

    special_tokens = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
    calculate_metrics(config, model, tokenizer, dataset, special_tokens)
Example #7
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")

    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()
	
	
    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)


    logger.info("Get pretrained model and tokenizer")
    
    
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    global tokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    global model
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    return model, tokenizer, args, personality
Example #8
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")

    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
    model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    model.eval()

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2*args.max_history+1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
Example #9
    def __init__(self, opt, shared=None):
        super(TransformerAgent, self).__init__(opt, shared)

        args = AttrDict(
            opt)  # to keep most commands identical to the interact.py script
        self.args = args

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__file__)
        self.logger.info(pformat(args))

        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

        if shared is None:
            self.logger.info("Get pretrained model and tokenizer")
            if args.model_checkpoint == "":
                args.model_checkpoint = download_pretrained_model()
            if 'gpt2' in args.model_checkpoint:
                self.tokenizer = GPT2Tokenizer.from_pretrained(
                    args.model_checkpoint)
                model_class = GPT2DoubleHeadsModel if self.args.eval_type == "hits@1" else GPT2LMHeadModel
            else:
                self.tokenizer = OpenAIGPTTokenizer.from_pretrained(
                    args.model_checkpoint)
                model_class = OpenAIGPTDoubleHeadsModel if self.args.eval_type == "hits@1" else OpenAIGPTLMHeadModel

            self.model_checkpoint = model_class.from_pretrained(
                args.model_checkpoint)
            self.model_checkpoint.to(args.device)

            self.logger.info("Build BPE prefix dictionary")
            convai_dict = build_dict()
            assert len(convai_dict) == 19304
            self.prefix2words = self.get_prefix2words(convai_dict)
        else:
            self.model_checkpoint = shared['model']
            self.tokenizer = shared['tokenizer']
            self.prefix2words = shared['prefix2words']
        add_special_tokens_(self.model_checkpoint, self.tokenizer)
        self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(
            SPECIAL_TOKENS)

        self.persona = []
        self.history = []
        self.labels = []

        self.reset()
Example #10
def run():
    config_file = "configs/interact_config.json"
    config = InteractConfig.from_json_file(config_file)

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(config))

    if config.model_checkpoint == "":
        config.model_checkpoint = download_pretrained_model()

    torch.random.manual_seed(config.seed)
    torch.cuda.manual_seed(config.seed)

    logger.info("Get pretrained model and tokenizer")
    if config.model == "bert":
        tokenizer_class = BertTokenizer
        model_class = BertLMHeadModel
    elif config.model == "gpt2":
        tokenizer_class = GPT2Tokenizer
        model_class = GPT2LMHeadModel
    else:
        tokenizer_class = OpenAIGPTTokenizer
        model_class = OpenAIGPTLMHeadModel

    SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]

    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
    model = model_class.from_pretrained(config.model_checkpoint)

    model.to(config.device)
    model.eval()

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(history, tokenizer, model, config,
                                      SPECIAL_TOKENS)
        history.append(out_ids)
        history = history[-(2 * config.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
Example #11
def get_model(
    dataset_path="",
    dataset_cache='./dataset_cache',
    model="openai-gpt",
    model_checkpoint="",
    device="cuda" if torch.cuda.is_available() else "cpu",
    seed=0,
):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)

    if model_checkpoint == "":
        if model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            model_checkpoint = download_pretrained_model()

    if seed != 0:
        random.seed(seed)
        torch.random.manual_seed(seed)
        torch.cuda.manual_seed(seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer,
                                    GPT2LMHeadModel) if model == 'gpt2' else (
                                        OpenAIGPTTokenizer,
                                        OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(model_checkpoint)
    model = model_class.from_pretrained(model_checkpoint)
    model.to(device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, dataset_path, dataset_cache)
    personalities = [
        dialog["personality"] for dataset in dataset.values()
        for dialog in dataset
    ]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))
    return model, personality, tokenizer
Example #12
def init():
    args = {
        "dataset_path": "",
        "dataset_cache": "./dataset_cache_GPT2tokenizer",
        "model": "gp2",
        "model_checkpoint": "../runs/Sep19_21-11-42_micah-HP-ENVY-x360-Convertible-15-ee0xxx_gpt2/",
        "max_history": 2,
        "device": "cpu",
        "max_length": 20,
        "min_length": 1,
        "seed": 0,
        "temperature": 0.7,
        "top_k": 0,
        "top_p": 0.9
    }

    if args.get("model_checkpoint") == "":
        if args.get("model") == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args["model_checkpoint"] = download_pretrained_model()
	
    if args.get("seed") != 0:
        random.seed(args.get("seed"))
        torch.random.manual_seed(args.get("seed"))
        torch.cuda.manual_seed(args.get("seed"))

    print("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.get("model") == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.get("model_checkpoint"))
    model = model_class.from_pretrained(args.get("model_checkpoint"))
    model.to(args.get("device"))
    add_special_tokens_(model, tokenizer)

    print("Sample a personality")
    dataset = get_dataset(tokenizer, args.get("dataset_path"), args.get("dataset_cache"))
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset] 
    personality = random.choice(personalities)
    print(tokenizer.decode(chain(*personality)))

    return tokenizer, personality, model, args
Example #13
def load_model(model_checkpoint, model_type):
    if model_checkpoint == "":
        model_checkpoint = download_pretrained_model()
    else:
        assert os.path.exists(
            model_checkpoint
        ), f'checkpoint directory not found: {model_checkpoint}'

    logger.info("Get pretrained model and tokenizer")
    if model_type not in MODELS:
        raise NotImplementedError('model "%s" not implemented. use one of %s' %
                                  (model_type, list(MODELS.keys())))
    config_class, tokenizer_class, _, model_class = MODELS[model_type]

    _tokenizer = tokenizer_class.from_pretrained(model_checkpoint)
    _model = model_class.from_pretrained(model_checkpoint)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    _model.to(device)
    _model.eval()
    return _model, _tokenizer, os.path.basename(
        model_checkpoint) if model_checkpoint else model_checkpoint
Example #14
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model",
                        type=str,
                        default="gpt",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    #personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    #personality = random.choice(personalities)
    #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
    wordfile = './data/truncate.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = './auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word

    p = 0
    start_time = time.time()
    with open('data_volunteers.json') as json_file:
        json_data = json.load(json_file)
        for i in json_data:
            p += 1
            #if p <1100:
            #    continue
            history = []
            personality = []
            query_set = []
            json_dialog = i["dialog"]
            json_bot = i["bot_profile"]
            for j in json_bot:
                personality.append(tokenizer.encode(j))
            #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
            persona = tokenizer.decode(chain(*personality))
            row = {"Personality": persona}
            text = []
            for j in json_dialog:
                if j["sender_class"] == "Human":
                    json_text = j["text"]
                    raw_text = json_text
                    check = tokenizer.decode(tokenizer.encode(raw_text),
                                             skip_special_tokens=True)
                    if check == "":
                        history.append(tokenizer.encode(raw_text))
                        with torch.no_grad():
                            out_ids = normal_sample_sequence(
                                personality, history, tokenizer, model, args)
                        # history.append(out_ids)
                        history = history[-(2 * args.max_history + 1):]
                        out_text = tokenizer.decode(out_ids,
                                                    skip_special_tokens=True)
                        text.append({
                            "evaluation_score": j["evaluation_score"],
                            "id": j["id"],
                            "sender": j["sender"],
                            "sender_class": j["sender_class"],
                            "text": raw_text,
                            "generated_text": out_text
                        })
                        continue
                    history.append(tokenizer.encode(raw_text))
                    with torch.no_grad():
                        out_ids = sample_sequence(personality, history,
                                                  tokenizer, model, args,
                                                  words, weight4ind, We)
                    # history.append(out_ids)
                    history = history[-(2 * args.max_history + 1):]
                    out_text = tokenizer.decode(out_ids,
                                                skip_special_tokens=True)
                    text.append({
                        "evaluation_score": j["evaluation_score"],
                        "id": j["id"],
                        "sender": j["sender"],
                        "sender_class": j["sender_class"],
                        "text": raw_text,
                        "generated_text": out_text
                    })
                else:
                    json_text = j["text"]
                    raw_text = json_text
                    history.append(tokenizer.encode(raw_text))
                    text.append({
                        "evaluation_score": j["evaluation_score"],
                        "id": j["id"],
                        "sender": j["sender"],
                        "sender_class": j["sender_class"],
                        "text": raw_text
                    })
            row["dialog"] = text
            query_set.append(row)
            #print(query_set)
            with open('./sif_set/sif' + str(p) + '.json',
                      'w',
                      encoding='utf-8') as make_file:
                json.dump(query_set, make_file)
            if not p % 10:
                print(
                    str(p * 100 / 1111) + '%, ' +
                    str(time.time() - start_time) + 'sec')
Example #15
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt',
                 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")

    parser.add_argument("--n_samples", type=int, default=10)
    parser.add_argument("--sample_term", type=int, default=1)

    args = parser.parse_args()

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    tokenizer_class, model_class = (
        GPT2Tokenizer,
        GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer,
                                                       OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)

    #sample_idxs = range(args.n_samples)
    sample_idxs = [args.sample_term * i for i in range(1, args.n_samples + 1)]
    for i in sample_idxs:
        personality = dataset['valid'][i]['personality']
        history = dataset['valid'][i]['utterances'][4]['history']
        target = dataset['valid'][i]['utterances'][4]['candidates'][-1]
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model,
                                      args)
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)

        print('Persona info:')
        for persona in personality:
            print(tokenizer.decode(persona, skip_special_tokens=True), end=' ')
        print('\nDialog:')
        for his in history:
            print(tokenizer.decode(his, skip_special_tokens=True))
        print('Target:')
        print(tokenizer.decode(target, skip_special_tokens=True))
        print('Prediction:')
        print(out_text, end='\n\n')
Example #16
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")

    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()
	
	
    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)


    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    #add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)

    with open('test.txt', 'w') as file:
        for item in dataset['valid']:
            personality = item["personality"]
            file.write("Personality: ")
            file.write(tokenizer.decode(chain(*personality)))
            file.write("\n")
            for op in item["utterances"]:
                history = op["history"]
                history = history[-(2 * args.max_history + 1):]
                file.write("Other speaker said: ")
                file.write(tokenizer.decode(history[-1]))
                file.write("\n")
                with torch.no_grad():
                    out_ids = sample_sequence(personality, history, tokenizer, model, args)
                out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
                file.write("Generated reply: ")
                file.write(out_text)
                file.write("\n")
                file.write("Gold answer: ")
                file.write(tokenizer.decode(op["candidates"][-1]))
                file.write("\n")
            file.write("\n\n")
Example #17
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt',
                 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer,
        GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer,
                                                       OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [
        dialog["personality"] for dataset in dataset.values()
        for dialog in dataset
    ]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")

            #classifier code starts
            # zz = ['I like to sleep',"that's cool other cultures are nice", "where is Geneva cats?", "What public figure defended New York in Januar"]
        zz = [raw_text]
        valDF = pd.DataFrame()
        valDF['question_text'] = zz

        # prediction part
        batch_size = 256

        def batch_gen(test_df):
            n_batches = math.ceil(len(test_df) / batch_size)
            for i in range(n_batches):
                texts = test_df.iloc[i * batch_size:(i + 1) * batch_size, 0]
                text_arr = np.array([text_to_array(text) for text in texts])
                yield text_arr

        # test_df = pd.read_csv("../input/quora-insincere-questions-classification/test.csv")
        test_df = valDF

        all_preds = []
        for x in tqdm(batch_gen(test_df)):
            all_preds.extend(classifier_model.predict(x).flatten())

        y_te = (np.array(all_preds) > 0.5).astype(int)
        print(y_te)
        print(valDF['question_text'])

        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model,
                                      args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
Example #18
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt',
                 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument(
        "--conv_limit",
        type=int,
        default=None,
        help="Length of conversation - number of times Speaker1 can respond")

    args = parser.parse_args()

    #logging.basicConfig(level=logging.INFO)
    #logger = logging.getLogger(__file__)
    #logger.info(pformat(args))

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    print("Select type of chat:\n1. Counselling\n2. Task-Oriented")
    raw_text = input(">>> ")

    initial = [
        "Will you like to learn a new recipe?",
        "Do you want to learn a new recipe?", "Let us learn a new recipe."
    ]
    sents = ["To sum up, ", "Thus, as I understand, ", "So, to summarize, "]

    history = []

    if raw_text == "1":
        if args.model_checkpoint == "":
            if args.model == 'gpt2':
                raise ValueError(
                    "Interacting with GPT2 requires passing a finetuned model_checkpoint"
                )
            else:
                args.model_checkpoint = download_pretrained_model()

        #logger.info("Get pretrained model and tokenizer")
        tokenizer_class, model_class = (
            GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (
                OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
        tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
        model = model_class.from_pretrained(args.model_checkpoint)
        model.to(args.device)
        add_special_tokens_(model, tokenizer)

        #logger.info("Sample a personality")
        dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
        personalities = [dialog["personality"] for dialog in dataset]
        personality = random.choice(personalities)
        print("Selected personality: ", tokenizer.decode(chain(*personality)))

        if args.conv_limit:
            conv_len = args.conv_limit
        else:
            conv_len = -1

        utt = 0
        text_summary = []
        while utt != conv_len:
            raw_text = input(">>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input(">>> ")
            history.append(tokenizer.encode(raw_text))
            text_summary.append(raw_text)
            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer,
                                          model, args)
            history.append(out_ids)
            history = history[-(2 * args.max_history + 1):]
            out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
            print(out_text)
            utt = utt + 1
            if utt == conv_len:
                if out_text.endswith("?"):
                    utt = utt - 1

        # generate emotion
        raw_text = 'exit chat'
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model,
                                      args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print("\n" + "Chat Emotion: " + out_text)

        # generate summary
        text = ".".join(text_summary)
        summary = summarizer(text, max_length=50)
        print("\n" + "Summary:\n" + random.choice(sents) +
              create_reflection(summary[0]['summary_text']))

        # generate a supporting response to the summary
        raw_text = 'summarize-chat'
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model,
                                      args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print("\n" + "Response:\n" + out_text)

    elif raw_text == "2":
        print(random.choice(initial))
        raw_text = input(">>> ")
        scores = sentiment.polarity_scores(raw_text)
        if scores['pos'] > scores['neg']:
            print("Great, here is a recipe for you ...")
            create_recipe()
            raw_text = input(">>> ")
        elif scores['neg'] > scores['pos']:
            print(
                "ok, then maybe you will like to chat with the counsellor. Please choose option 1. Thank you."
            )
        else:
            print("I could not understand what you are asking.")

    else:
        print("Please select the correct choice.")
Example #19
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="data/en_book_conversational.json",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt',
                 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer,
        GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer,
                                                       OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [
        dialog["personality"] for dataset in dataset.values()
        for dialog in dataset
    ]
    personality = random.choice(personalities)
    # persona = get_persona_label(tokenizer)

    tokenizer.decode(personalities)

    def matching_personality(persona_text):
        # re-sample until the chosen personality contains the target sentence
        personality = random.choice(personalities)
        while 'i have got a headache and a fever.' not in tokenizer.decode(
                chain(*personality)):
            personality = random.choice(personalities)
        return personality

    """'immigration checkpoint', 'in a taxi', 'hotel check-in', 'at a restaurant', 'getting a dessert', 'asking for directions',
        'at a shopping mall', 'hotel check-out', 'checking in at the airport', 'in flight', 'currency exchange', 'renting a car',
        'making a hotel reservation','room service', 'buying a camera', 'at a supermarket', 'in a hospital',
        'getting a souvenir', 'asking someone to take a photo'"""

    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model,
                                      args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
Example #20
parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
parser.add_argument("--seed", type=int, default=0, help="Seed")
parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
args = parser.parse_args()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__file__)
logger.info(pformat(args))

if args.model_checkpoint == "":
    if args.model == 'gpt2':
        raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
    else:
        args.model_checkpoint = download_pretrained_model()


if args.seed != 0:
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)


logger.info("Get pretrained model and tokenizer")
tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
model = model_class.from_pretrained(args.model_checkpoint)
model.to(args.device)
add_special_tokens_(model, tokenizer)
Example #21
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt',
                 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument(
        "--conv_limit",
        type=int,
        default=None,
        help="Length of conversation - number of times Speaker1 can respond")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer,
        GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer,
                                                       OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dialog in dataset]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))

    sents = ["To sum up, ", "Thus, as I understand, ", "So, to summarize, "]

    if args.conv_limit:
        conv_len = args.conv_limit
    else:
        conv_len = -1

    text_summary = []

    utt = 0

    history = []
    while utt != conv_len:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        text_summary.append(raw_text)
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model,
                                      args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
        utt = utt + 1
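        # if the bot's reply ends with a question, allow one more user turn before stopping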
        if utt == conv_len:
            if out_text.endswith("?"):
                utt = utt - 1

    text = ".".join(text_summary)
    summary = summarizer(text, max_length=50)
    print("\n" + random.choice(sents) +
          create_reflection(summary[0]['summary_text']))
Example #22
class run:
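    # Note: the argparse/model setup below executes once, when the class body is
    # evaluated; process_text then reuses these objects as shared class attributes.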
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt',
                 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=20,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer,
        GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer,
                                                       OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    #personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []

    def process_text(self, raw_text):
        #personality = random.choice(self.personalities)
        personality = [
            'i am a robot.', 'my job is to give or deny permission.',
            'i love my job.', 'josh is my favorite person.',
            'my name is permissioner-bot.', 'i do not have a gender.'
        ]
        personality = [self.tokenizer.encode(line) for line in personality]
        self.history.append(self.tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, self.history,
                                      self.tokenizer, self.model, self.args)
        self.history.append(out_ids)
        self.history = self.history[-(2 * self.args.max_history + 1):]
        out_text = self.tokenizer.decode(out_ids, skip_special_tokens=True)
        return out_text
Example #23
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt',
                 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer,
        GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer,
                                                       OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [
        dialog["personality"] for dataset in dataset.values()
        for dialog in dataset
    ]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))

    history = []
    engine = pyttsx3.init()
    r = sr.Recognizer()
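    # speech loop: transcribe microphone input with Google Speech Recognition,
    # generate a reply, and speak it aloud with pyttsx3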
    while True:
        print("Talk:")
        with sr.Microphone() as source:
            audio = r.listen(source)
        raw_text = r.recognize_google(audio)
        print(raw_text)
        #        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model,
                                      args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
        engine.say(out_text)
        engine.runAndWait()
Example #24
def train():
    parser = ArgumentParser()
    parser.add_argument(
        "--data_path",
        type=str,
        default=None,
        help=
        "Path to conversational data (by default will look for single file in ./data)"
    )
    parser.add_argument("--run_name",
                        type=str,
                        default='run1',
                        help="The name of the run (subdirectory in ./runs)")
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help=
        "Initialize model from path to checkpoint or with model name (openai-gpt/openai-gpt2)"
    )
    parser.add_argument("--save_every",
                        type=int,
                        default=100,
                        help="Save checkpoint every n updates steps.")
    parser.add_argument("--num_candidates",
                        type=int,
                        default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history",
                        type=int,
                        default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument(
        "--max_input_length",
        type=int,
        default=200,
        help=
        "Number of tokens which will be fed into the model (reduce this number if you have memory constraints)"
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for training")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--lm_coef",
                        type=float,
                        default=1.0,
                        help="LM loss coefficient")
    parser.add_argument("--mc_coef",
                        type=float,
                        default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=3,
                        help="Number of training epochs")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--use_huggingface_model",
        action='store_true',
        help="Start training from pre-trained model by Huggingface")
    args = parser.parse_args()

    # Set seed
    set_seed(args.seed)

    if args.use_huggingface_model:
        args.model = download_pretrained_model()
        logger.info(f'Using pre-trained Personachat model {args.model}')

    # Load tokenizer
    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model else OpenAIGPTTokenizer  # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.model)
    # Load model
    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model)
    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)

    # Get data loaders
    logger.info("Prepare datasets")
    train_loader = get_data_loader(args, tokenizer, use_cache=True)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.lr,
                      eps=args.adam_epsilon)
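    # total number of optimizer updates: batches per epoch (after gradient
    # accumulation) times the number of epochs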
    t_total = len(
        train_loader) // args.gradient_accumulation_steps * args.n_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Train!
    logger.info("***** Running training *****")
    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model):
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(args.model.split("-")[-1].split("/")[0])
        except ValueError:
            global_step = 0
        epochs_trained = global_step // (len(train_loader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_loader) // args.gradient_accumulation_steps)
        logger.info(
            "Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info(f"Continuing training from epoch {epochs_trained}")
        logger.info(f"Continuing training from global step {global_step}")
        logger.info(
            f"Will skip the first {steps_trained_in_current_epoch} steps in the first epoch"
        )

    # Training loop
    model.zero_grad()
    epoch_pbar = trange(epochs_trained, int(args.n_epochs))
    av_loss = 0
    for current_epoch in epoch_pbar:
        epoch_pbar.set_description(
            f"Epoch [{current_epoch+1}/{args.n_epochs}]")
        pbar = tqdm(train_loader)
        for step, batch in enumerate(pbar):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            (lm_loss), (mc_loss), *_ = model(input_ids,
                                             token_type_ids=token_type_ids,
                                             mc_token_ids=mc_token_ids,
                                             mc_labels=mc_labels,
                                             lm_labels=lm_labels)
            loss = (lm_loss * args.lm_coef +
                    mc_loss * args.mc_coef) / args.gradient_accumulation_steps
            loss.backward()
            tr_loss = loss.item()
            # calculate the running average of the loss over the epoch
            av_loss = (step * av_loss + tr_loss) / (step + 1)
            pbar.set_description(f"Average loss: {av_loss:.4f}")
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
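            # only update the weights once every gradient_accumulation_steps mini-batches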
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if global_step % args.save_every == 0 and global_step > 0:
                    checkpoint_prefix = "checkpoint"
                    output_dir = os.path.join(
                        'runs', args.run_name,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    logger.info(f"Saving model checkpoint to {output_dir}")
                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    logger.info(
                        f"Saving optimizer and scheduler states to {output_dir}"
                    )
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))

    # save model
    output_dir = os.path.join('runs', args.run_name)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logger.info(f"Saving model checkpoint to {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    # Good practice: save your training arguments together with the trained model
    torch.save(args, os.path.join(output_dir, "training_args.bin"))
Example #25
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt',
                 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=200,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer,
        GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer,
                                                       OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [
        dialog["personality"] for dataset in dataset.values()
        for dialog in dataset
    ]
    # personality = random.choice(personalities)

    personality = [string_transformer('my name is WabiSabi', tokenizer, False)]
    quotes = [
        'do not be afraid to ask for yourself',
        'to escape fear , you must go through it',
        'I am timeless, incomplete and imperfect. No age. No sense of time.',
        ' failure is another steppingstone to greatness . ',
        'think like a queen .  queen is not afraid to fail . failure is another steppingstone to greatness . ',
        'be thankful for what you have ; you will end up having more . if you concentrate on what you do not have, you will never, ever have enough .',
        'surround yourself with only people who are going to lift you higher .',
        'the biggest adventure you can ever take is to live the life of your dreams  .',
        'doing the best at this moment puts you in the best place for the next moment .',
        'real integrity is doing the right thing , knowing that nobody is going to know whether you did it or not .',
        'the more you praise and celebrate your life , the more there is in life to celebrate .',
        'passion is energy . feel the power that comes from focusing on what excites you .',
        'lots of people want to ride with you in the limo , but what you want is someone who will take the bus with you when the limo breaks down .',
        'turn your wounds into wisdom . ',
        'you can have it all . just not all at once . ',
        'one of the hardest things in life to learn are which bridges to cross and which bridges to burn . ',
        'challenges are gifts that force us to search for a new center of gravity .',
        'the thing you fear most has no power . your fear of it is what has the power . facing the truth really will set you free .',
        'surround yourself only with people who are going to take you higher .',
        'you get in life what you have the courage to ask for .',
        'i trust that everything happens for a reason , even when we are not wise enough to see it .',
        'everybody has a calling . and your real job in life is to figure out as soon as possible what that is , who you were meant to be , and to begin to honor that in the best way possible for yourself .',
        'the key to realizing a dream is to focus not on success but on significance , and then even the small steps and little victories along your path will take on greater meaning .',
        'the biggest adventure you can ever take is to live the life of your dreams .',
        'self-esteem comes from being able to define the world in your own terms and refusing to abide by the judgments of others .',
        'forgiveness is giving up the hope that the past could have been any different .',
        'luck is a matter of preparation meeting opportunity .',
        'the whole point of being alive is to evolve into the complete person you were intended to be .',
        'wisdom equals knowledge plus courage . you have to not only know what to do and when to do it , but you have to also be brave enough to follow through .',
        'surround yourself with great people .',
        'i alone cannot change the world , but i can cast a stone across the water to create many ripples .',
        'whatever the mind of man can conceive and believe, it can achieve .',
        'whenever you see a successful person, you only see the public glories,  never the private sacrifices to reach them .',
        'at some point you are bound to stumble because if you are constantly doing what we do , raising the bar . if you are constantly pushing yourself higher, higher the law of averages not to mention the myth of icarus predicts that you will at some point fall . And when you do i want you to know this , remember this : there is no such thing as failure . failure is just life trying to move us in another direction . now when you are down there in the hole , it looks like failure .',
        'and when you are down in the hole when that moment comes , it is really okay to feel bad for a little while . give yourself time to mourn what you think you may have lost but then here is the key , learn from every mistake because every experience , encounter , and particularly your mistakes are there to teach you and force you into being more who you are . and then figure out what is the next right move .',
        'because when you inevitably stumble and find yourself stuck in a hole that is the story that will get you out : what is your true calling ? what is your dharma ? what is your purpose ?',
        'i know that you all might have a little anxiety now but no matter what challenges or setbacks or disappointments you may encounter along the way , you will find true success and happiness if you have only one goal , there really is only one , and that is this : to fulfill the highest most truthful expression of yourself as a human being . you want to max out your humanity by using your energy to lift yourself up , your family and the people around you .',
        'from time to time you may stumble , fall , you will for sure , you will have questions and you will have doubts about your path . but i know this , if you are willing to be guided by , that still small voice that is the gps within yourself , to find out what makes you come alive , you will be more than okay . you will be happy , you will be successful , and you will make a difference in the world .'
    ]
    random.shuffle(quotes)
    quotes = quotes[:24]
    for s in quotes:
        personality.append(string_transformer(s, tokenizer))
    # print(personality)
    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(string_transformer(raw_text, tokenizer))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model,
                                      args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
Example #26
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model",
                        type=str,
                        default="gpt2",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=150,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument(
        "--task",
        type=str,
        default="dialogue",
        help="one of task from [dialogue, qa, mt, nlg, summarization]")
    parser.add_argument("--self_copy",
                        action='store_true',
                        help="add self copy")
    parser.add_argument("--perturbation_layers",
                        type=int,
                        default=0,
                        help="number of perturbation layers")
    parser.add_argument("--adapter_bottleneck",
                        type=int,
                        default=0,
                        help="adapter layer bottleneck")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(
        args.model_checkpoint,
        perturbation_layers=args.perturbation_layers,
        self_copy=args.self_copy,
        adapter_bottleneck=args.adapter_bottleneck)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)
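
    # For each task below, decode candidates with the fine-tuned model and write
    # them back into the cached dataset as distillation targets.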

    if args.task == "dialogue":
        output_text = []
        ref_text = []
        loaded_dataset, cache_path = get_dataset(tokenizer,
                                                 args.dataset_path,
                                                 args.dataset_cache,
                                                 args.task,
                                                 return_cachepath=True)
        persona_text = []
        distillated_dataset = loaded_dataset
        for i, pair in enumerate(tqdm(loaded_dataset["train"])):
            persona = pair["personality"].copy()
            for j, utterance in enumerate(pair["utterances"]):
                history = utterance["history"][-(2 * args.max_history + 1):]
                with torch.no_grad():
                    out_ids = sample_sequence(tokenizer,
                                              model,
                                              args,
                                              personality=persona,
                                              history=history)
                distillated_dataset["train"][i]["utterances"][j][
                    "distillated_candidates"] = [out_ids]
        torch.save(distillated_dataset, cache_path)
    # qa interact
    if args.task == "qa":
        output_text = []
        ref_text = []
        loaded_dataset, cache_path = get_dataset(tokenizer,
                                                 args.dataset_path,
                                                 args.dataset_cache,
                                                 args.task,
                                                 return_cachepath=True)
        distillated_dataset = loaded_dataset
        for i, pair in enumerate(tqdm(loaded_dataset["train"])):
            evidence = pair["document"].copy()
            evidence = [evidence[0][:MAXLEN_MAP[args.task]['document']]]
            for j, utterance in enumerate(pair["utterances"]):
                history = utterance["history"][-(2 * args.max_history + 1):]
                with torch.no_grad():
                    out_ids = sample_sequence(tokenizer,
                                              model,
                                              args,
                                              personality=evidence,
                                              history=history)
                distillated_dataset["train"][i]["utterances"][j][
                    "distillated_candidates"] = [out_ids]
        torch.save(distillated_dataset, cache_path)

    # nlg interact
    if args.task == "nlg":
        output_text = []
        ref_text = []
        loaded_dataset, cache_path = get_dataset(tokenizer,
                                                 args.dataset_path,
                                                 args.dataset_cache,
                                                 args.task,
                                                 return_cachepath=True)
        distillated_dataset = loaded_dataset
        for i, pair in enumerate(tqdm(loaded_dataset["train"])):
            source = pair["src"]
            target = pair["tgt"]
            with torch.no_grad():
                out_ids = sample_sequence(tokenizer,
                                          model,
                                          args,
                                          source=source,
                                          target=target)
            distillated_dataset["train"][i]["distillated_tgt"] = out_ids
        torch.save(distillated_dataset, cache_path)

    if (args.task == "mt" or args.task == "summarization"):
        output_text = []
        ref_text = []
        loaded_dataset, cache_path = get_dataset(tokenizer,
                                                 args.dataset_path,
                                                 args.dataset_cache,
                                                 args.task,
                                                 return_cachepath=True)
        distillated_dataset = loaded_dataset
        for i, pair in enumerate(tqdm(loaded_dataset["train"])):
            source = pair["src"][:MAXLEN_MAP[args.task]['src']]
            target = pair["tgt"]  #[:MAXLEN_MAP[args.task]['tgt']]
            with torch.no_grad():
                out_ids = sample_sequence(tokenizer,
                                          model,
                                          args,
                                          source=source,
                                          target=target)
            distillated_dataset["train"][i]["distillated_tgt"] = out_ids
        torch.save(distillated_dataset, cache_path)
Example #27
def run(chapter):
    args = easydict.EasyDict({
        "dataset_path": "data/en_book_conversational.json",
        "dataset_cache": './dataset_cache',
        "model": "gpt2",
        "model_checkpoint":
        "/home/ubuntu/GraduateProject/transfer-learning-conv-ai/runs/Jun04_18-39-17_ime-502_gpt2",
        "max_history": 4,
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "max_length": 20,
        "min_length": 1,
        "seed": 0,
        "top_p": 0.9
    })

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer,
        GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer,
                                                       OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [
        dialog["personality"] for dataset in dataset.values()
        for dialog in dataset
    ]
    personality = random.choice(personalities)
    print("Selected personality: ", tokenizer.decode(chain(*personality)))

    while get_persona_label(chapter) not in tokenizer.decode(
            chain(*personality)):
        personality = random.choice(personalities)
    return personality
Example #28
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model",
                        type=str,
                        default="gpt2",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint",
                        "-mc",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=100,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    # add option to not use personality
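    # NOTE: argparse's type=bool treats any non-empty string as True, so passing
    # "--no_personality False" still yields True; edit the default to disable it.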
    parser.add_argument("--no_personality",
                        type=bool,
                        default=True,
                        help="Set to not sample a personality.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if os.path.isdir("./huggingface_s3/"):
            args.model_checkpoint = "./huggingface_s3/"
            logger.info("Loading from pre-downloaded temp path: {}".format(
                args.model_checkpoint))
        else:
            args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer,
        GPT2LMHeadModel) if "gpt2" == args.model else (OpenAIGPTTokenizer,
                                                       OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)

    model.to(args.device)
    add_special_tokens_(model, tokenizer)
    model.eval()

    # added the option to opt out of using a personality
    if args.no_personality:
        logger.info("No personality is sampled for this chatbot.")
        personality = ""
        # personality = ["My name is Isabelle Hawkins.",
        #                "I am five years old.",
        #                "My phone number is 959-100-9300.",
        #                "Here is a link I would like you to check out: google.com.",
        #                "I would like to know more about you."]
        # personality = [tokenizer.encode(p) for p in personality]
        # logger.info("Selected custom personality: %s",tokenizer.decode(chain(*personality)))
    else:
        logger.info("Sample a personality")
        personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                                  args.dataset_cache)
        personality = random.choice(personalities)
        # import pdb; pdb.set_trace()
        logger.info("Selected personality: %s",
                    tokenizer.decode(chain(*personality)))

    history = []
    # while True:
    #     custom_history = input("Press 0 to end\n\tAdd history: ")
    #     if custom_history == '0':
    #         break
    #     else:
    #         history.append(tokenizer.encode(custom_history))

    while True:
        history = []
        args.temperature = float(input("Set temperature (> 0 and <= 1): "))
        prompt = input("Speaker 1 >>> ")
        while not prompt:
            print('Prompt should not be empty!')
            prompt = input("Speaker 1 >>> ")
        history.append(tokenizer.encode(prompt))
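
        # let the model continue the conversation by itself for 10 turns,
        # alternating the printed speaker label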

        i = 0
        while True:
            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer,
                                          model, args)
            history.append(out_ids)
            history = history[-(2 * args.max_history + 1):]
            out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
            i += 1
            speaker = "Speaker 2" if i % 2 else "Speaker 1"
            print(f"{speaker}: {out_text}")

            if i == 10:
                break
Example #29
def init(quotes, quotes_num=16):
    global history
    global personality
    global tokenizer
    global model
    global args
    global parser
    global logger
    
    # new conversation
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")

    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=200, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)


    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Get personality")
    personality = [string_transformer('my name is WabiSabi', tokenizer, False)]
    random.shuffle(quotes)
    # quotes = quotes[:16]
    # quotes = [q for _, q in zip(range(quotes_num), quotes)]
    concatenated = " ".join(quotes)[0:1600]
    quotes = concatenated.split('.')
    print(quotes)
    for s in quotes:
        personality.append(string_transformer(s, tokenizer))
    # print(personality)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
Example #30
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--use_adapter",
                        default=False,
                        action='store_true',
                        help="Use adapter or not")
    parser.add_argument("--keyword_module",
                        type=str,
                        default="",
                        help="add, attention, ")
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt',
                 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--bert_model_path",
                        default="./",
                        type=str,
                        help="Bert pre-trained model path")
    parser.add_argument(
        "--vocab_file",
        default="./vocab.korean.rawtext.list",
        type=str,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=50,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=50,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")

    # Load KoBERT model and tokenizer
    bert_tokenizer = BertTokenizer.from_pretrained(
        args.vocab_file, do_lower_case=args.do_lower_case)
    bert_model = BertModel.from_pretrained(args.bert_model_path)
    bert_model.to(args.device)
    bert_model.eval()

    # Load KoGPT2 model and tokenizer
    tok_path = get_tokenizer()
    gpt_model, gpt_vocab = get_pytorch_conkogpt2_model2(
        use_adapter=args.use_adapter)
    gpt_tokenizer = SentencepieceTokenizer(tok_path)
    gpt_model.to(args.device)
    gpt_model.eval()

    model = Seq2Seq(bert_model, gpt_model, gpt_vocab, args)
    model.load_state_dict(torch.load(args.model_checkpoint), strict=False)
    model.to(args.device)
    model.eval()

    logger.info("Load test data")
    sourceList, targetList = get_test_dataset(bert_tokenizer, gpt_tokenizer,
                                              gpt_vocab, args.dataset_path)
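
    # generate a reply for every test source sentence and write the detokenized
    # text to "<model_checkpoint>_output.txt"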

    f1 = open((args.model_checkpoint + "_output.txt"), 'w')
    for line in zip(sourceList, targetList):
        out_ids = sample_sequence(line[0], bert_model, bert_tokenizer,
                                  gpt_model, gpt_vocab, args)
        out_texts = gpt_vocab.to_tokens(out_ids)
        for text in out_texts:
            f1.write(text.replace('▁', ' ').replace('</s>', ' '))
        """
        for id in out_ids:
            f1.write(str(id))
            f1.write(' ')
        """
        f1.write("\n")
    f1.close()