예제 #1
0
def _get_encoded_datasets(tokenizer, config):
    """Load the train/eval ROCStories splits and encode them with *tokenizer*.

    Returns whatever _tokenize_and_encode produces for the
    (train, eval) dataset pair.
    """
    pylog.info("Encoding dataset...")
    raw_splits = tuple(
        _load_rocstories_dataset(config[split_key])
        for split_key in ("train_dataset", "eval_dataset")
    )
    return _tokenize_and_encode(raw_splits, tokenizer)
예제 #2
0
def _check_is_cached_the_file(url, cache_path):
    """Return True when *cache_path* already exists on disk, else False.

    Logs the pending download of *url* when the cache entry is missing.
    """
    if os.path.exists(cache_path):
        pylog.debug("do not need download, exists %s" % cache_path)
        return True
    pylog.info("not found %s, downloading %s to tempfile" % (cache_path, url))
    return False
예제 #3
0
파일: show.py 프로젝트: lgstd/gpt2
def _show_result(result, config):
    """Log every metric in *result* and persist them to eval_results.txt."""
    eval_path = os.path.join(config["output_dir"], "eval_results.txt")
    with open(eval_path, "w") as writer:
        pylog.info("***** Eval results *****")
        # Keys are emitted in sorted order so runs are easy to diff.
        for key in sorted(result.keys()):
            line = "%s = %s" % (key, str(result[key]))
            pylog.info(line)
            writer.write(line + "\n")
예제 #4
0
파일: down_cache.py 프로젝트: lgstd/gpt2
def _get_spacy_en_install_path(name, config):
    """Build and log the site-packages install path for the spaCy model *name*."""
    # NOTE(review): url/cache_dir are never used below — confirm
    # _get_url_and_cache_dir has no required side effect before removing.
    url, cache_dir = _get_url_and_cache_dir(name, config)
    spacy_dict = config["cache_dict"][name]
    site_root = os.path.dirname(os.__file__)
    install_path = "%s/site-packages/%s/%s" % (
        site_root, spacy_dict["name"], spacy_dict["name_ver"])
    pylog.info(install_path)
    return install_path
예제 #5
0
def _write_temp_file(url, cache_path):
    """Download *url* into a NamedTemporaryFile, then copy it to *cache_path*."""
    with tempfile.NamedTemporaryFile() as spool:
        pylog.info("tempfile: %s" % (spool.name))
        _http_get(url, spool)
        spool.flush()  # push buffered bytes to disk
        spool.seek(0)  # rewind so the copy starts at byte 0
        pylog.info("copying %s to cache at %s" % (spool.name, cache_path))
        with open(cache_path, 'wb') as dest:
            shutil.copyfileobj(spool, dest)
예제 #6
0
def _get_max_and_input_length(model, encoded_datasets):
    """Derive the per-field truncation cap and the padded input length.

    max_length caps each of story/continuation; input_length is the longest
    assembled example (truncated story + longer truncated continuation + 3
    extra tokens), clamped to the model's position budget.
    """
    max_length = model.config.n_positions // 2 - 2
    pylog.info("max_length:%s" % max_length)

    # len(x[:max_length]) == min(len(x), max_length) — length after truncation.
    candidate_lengths = [
        len(story[:max_length])
        + max(len(cont1[:max_length]), len(cont2[:max_length]))
        + 3
        for dataset in encoded_datasets
        for story, cont1, cont2, _ in dataset
    ]
    input_length = min(max(candidate_lengths), model.config.n_positions)  # Max size of input for the pre-trained model
    pylog.info("input_length:%s" % input_length)
    return max_length, input_length
예제 #7
0
def show_msg(class_name, missing_keys, unexpected_keys, error_msgs):
    """Report weight-loading diagnostics for *class_name*.

    Logs missing and unexpected state_dict keys, and raises RuntimeError
    when loading produced any error messages.
    """
    if missing_keys:
        pylog.info("Weights of %s not initialized from pretrained model: %s" %
                   (class_name, missing_keys))
    if unexpected_keys:
        pylog.info("Weights from pretrained model not used in %s: %s" %
                   (class_name, unexpected_keys))
    if error_msgs:
        joined = "\n\t".join(error_msgs)
        raise RuntimeError(
            "Error(s) in loading state_dict for {}:\n\t{}".format(
                class_name, joined))
예제 #8
0
파일: main.py 프로젝트: lgstd/gpt2
def run_finetuned20(config):
    """Fine-tune for 20 epochs, evaluating and reporting after each epoch.

    The model object is threaded through do_train so training resumes
    from the previous epoch's weights.

    Bug fix: the original `while True` loop only broke once epochs_count
    exceeded 20, so it actually ran 21 epochs; it now runs exactly 20.
    """
    _set_config_files(config)
    model = None
    epochs_count = 0
    while epochs_count < 20:
        train_loss, model_config, model = train_graph.do_train(config, model)
        epochs_count += 1
        eval_loss, eval_accuracy = eval_graph.do_eval(model_config, config)
        pylog.info(epochs_count)
        show.show_result_detail(eval_loss, eval_accuracy, train_loss, config)
예제 #9
0
 def start_server(self, delay):
     """Start the tornado HTTP server on this instance's listen port.

     When *delay* is truthy, the IOLoop is scheduled to stop after *delay*
     seconds; otherwise it serves until stopped externally.

     Bug fix: the Python-3 check compared the version *string*
     (`sys.version > '3'`), which misorders versions lexically
     (e.g. "10.0" < "3"); use the version_info tuple instead.
     """
     if sys.version_info >= (3,):
         # Tornado on py3 needs an asyncio event loop bound to this thread.
         asyncio.set_event_loop(asyncio.new_event_loop())
     http_server = tornado.httpserver.HTTPServer(self.deal_request,
                                                 no_keep_alive=False)
     pylog.info("listen_port:%s" % self.get_listen_port())
     http_server.listen(self.get_listen_port())
     if delay:
         tornado.ioloop.IOLoop.instance().call_later(
             delay,
             tornado.ioloop.IOLoop.instance().stop)
     tornado.ioloop.IOLoop.instance().start()
예제 #10
0
def show_model_paramter_size(state_dict):
    """Log each tensor's name and shape plus the total parameter count.

    The bytes/MB figures assume 4-byte (float32) parameters.
    (The function name keeps its historical "paramter" typo so existing
    callers keep working.)

    Fix: corrected the "paramete" typo in the summary log labels.
    """
    count = 0
    for key in state_dict:
        pylog.info(key)
        pylog.info(state_dict[key].size())
        # Element count of this tensor = product of its dimensions.
        tensor_elems = 1
        for dim in state_dict[key].size():
            tensor_elems *= dim
        count += tensor_elems

    pylog.info("parameter count:%d" % count)
    pylog.info("parameter count :%d bytes" % (count * 4))
    pylog.info("parameter count :%d M" % ((count * 4) / (1024 * 1024)))
예제 #11
0
def _get_encoded_datasets(tokenizer, config):
    """Load and encode the train/eval ROCStories splits for the configured model.

    Dispatches on config["model_name"]: "gpt" encodes with
    _tokenize_and_encode, "gpt2" with _gpt2_tokenize_and_encode.

    Raises:
        ValueError: for an unrecognized model name. (The original fell
        through and silently returned None, which surfaced later as an
        obscure TypeError at the call site.)
    """
    pylog.info("Encoding dataset...")
    train_dataset = _load_rocstories_dataset(config["train_dataset"])
    eval_dataset = _load_rocstories_dataset(config["eval_dataset"])
    datasets = (train_dataset, eval_dataset)
    model_name = config["model_name"]
    if model_name == "gpt":
        return _tokenize_and_encode(datasets, tokenizer)
    if model_name == "gpt2":
        return _gpt2_tokenize_and_encode(datasets, tokenizer)
    raise ValueError("unsupported model_name: %r" % model_name)
예제 #12
0
    def set_special_tokens(self, special_tokens_lt):
        """Add a list of additional tokens to the encoder.

        The additional tokens are indexed starting from the last index of
        the current vocabulary, in the order of *special_tokens_lt*.
        An empty/None list clears any previously registered special tokens.
        """
        if not special_tokens_lt:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return

        self.special_tokens = _get_special_tokens(
            special_tokens_lt, self.encoder)
        self.special_tokens_decoder = _get_special_tokens_decoder(
            self.special_tokens)

        if self.nlp_type == "bert":
            # Using BERT's BasicTokenizer: we can update the tokenizer
            # so it never splits the special tokens apart.
            self.nlp.never_split = special_tokens_lt
        pylog.info("Special tokens %s" % self.special_tokens)
예제 #13
0
def one_test():
    """Round-trip a sample string through DES-CBC and log the results."""
    data = "Pleaseencrypt my data"
    cipher = des("DESCRYPT", CBC, "\0\0\0\0\0\0\0\0", pad=None, padmode=PAD_PKCS5)
    encrypted = cipher.encrypt(data)
    pylog.info("Encrypted:%r" % binascii.hexlify(encrypted))
    pylog.info("Decrypted:%r" % cipher.decrypt(encrypted))
    pylog.info("data:%s" % data)
예제 #14
0
 def make_handler(self, func):
     """Register *func* in the interface-mapping dict under its interface name."""
     name = self.get_interface_name(func)
     pylog.info("interface_name:%s" % name)
     self.get_interface_mapping_dic()[name] = func
예제 #15
0
def show_interface_load_count_dic():
    """Log the current per-interface load-count dictionary."""
    pylog.info(get_interface_load_count_dic())
예제 #16
0
 def walk(self):
     """Log and return "<ClassName> walk", taking the verb from the live frame."""
     frame_name = sys._getframe().f_code.co_name
     con = "%s %s" % (self.__class__.__name__, frame_name)
     pylog.info(con)
     return con
예제 #17
0
 def eat(self, food):
     """Log and return "<ClassName> eat:<food>", taking the verb from the live frame."""
     frame_name = sys._getframe().f_code.co_name
     con = "%s %s:%s" % (self.__class__.__name__, frame_name, food)
     pylog.info(con)
     return con
예제 #18
0
 def collect_sunshine(self):
     """Log and return "<ClassName> collect_sunshine" via the live frame name."""
     frame_name = sys._getframe().f_code.co_name
     con = "%s %s" % (self.__class__.__name__, frame_name)
     pylog.info(con)
     return con
예제 #19
0
 def move(self):
     """Log and return the fixed message "Animal move"."""
     message = "Animal move"
     pylog.info(message)
     return message
예제 #20
0
 def start(self, delay):
     """Log the worker thread id, map URL handlers, then run the server.

     *delay* is forwarded to start_server; always returns True.
     """
     pylog.info("thread id:%s" % pythread.get_thread_id())
     self.map_url_handler()
     self.start_server(delay)
     return True
예제 #21
0
def _untar(filename, dirs):
    """Extract the tar archive *filename* into directory *dirs*.

    Bug fix: the original opened the archive but never closed it; the
    context manager guarantees the handle is released even if
    extraction raises.
    """
    pylog.info("decompression %s" % filename)
    # NOTE(review): extractall trusts member paths; archives from untrusted
    # sources can escape *dirs* (path traversal) — add a filter if needed.
    with tarfile.open(filename) as archive:
        archive.extractall(path=dirs)
예제 #22
0
파일: show.py 프로젝트: lgstd/gpt2
def show_devices(config):
    """Log the configured device together with the number of visible CUDA GPUs."""
    gpu_total = torch.cuda.device_count()
    pylog.info("device:%s, number of gpus:%s" % (config["device"], gpu_total))
예제 #23
0
def _get_spacy_en_install_path(config):
    """Build and log the expected site-packages path of the spaCy model."""
    spacy_dict = config["pretrained_model_dict"]["spacy"]
    # Locate this interpreter's lib directory via the os module's file path.
    site_root = os.path.dirname(os.__file__)
    install_path = "%s/site-packages/%s/%s" % (
        site_root, spacy_dict["name"], spacy_dict["name_ver"])
    pylog.info(install_path)
    return install_path
예제 #24
0
def _install_spacy(cache_path):
    """Install the spaCy model archive at *cache_path* with pip.

    Fix: run pip through subprocess with an argument list instead of
    os.system with an interpolated string, so a path containing spaces
    or shell metacharacters cannot break (or inject into) the command.
    Like the original, the exit status is logged-and-ignored best-effort.
    """
    import subprocess  # local import keeps this fix self-contained

    pylog.info(cache_path)
    cmd = ["pip", "install", cache_path]
    pylog.info(" ".join(cmd))
    subprocess.call(cmd)