def _get_encoded_datasets(tokenizer, config):
    """Load the train/eval ROCStories splits and tokenize+encode them.

    NOTE(review): a later definition of ``_get_encoded_datasets`` in this
    file shadows this one — confirm which version is intended to be kept.
    """
    pylog.info("Encoding dataset...")
    train_split = _load_rocstories_dataset(config["train_dataset"])
    eval_split = _load_rocstories_dataset(config["eval_dataset"])
    return _tokenize_and_encode((train_split, eval_split), tokenizer)
def _check_is_cached_the_file(url, cache_path):
    """Return True when *cache_path* already exists locally, else False.

    Logs which branch was taken; *url* is only used in the miss message.
    """
    if os.path.exists(cache_path):
        pylog.debug("do not need download, exists %s" % cache_path)
        return True
    pylog.info("not found %s, downloading %s to tempfile" % (cache_path, url))
    return False
def _show_result(result, config):
    """Log every eval metric and persist them to <output_dir>/eval_results.txt."""
    output_eval_file = os.path.join(config["output_dir"], "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        pylog.info("***** Eval results *****")
        for key in sorted(result):
            # Build the formatted line once and reuse it for log and file.
            line = "%s = %s" % (key, str(result[key]))
            pylog.info(line)
            writer.write(line + "\n")
def _get_spacy_en_install_path(name, config):
    """Build and log the site-packages install path for the named spaCy model."""
    # NOTE(review): url/cache_dir are computed but never used below — the call
    # is kept in case _get_url_and_cache_dir has side effects; confirm.
    url, cache_dir = _get_url_and_cache_dir(name, config)
    model_info = config["cache_dict"][name]
    site_root = os.path.dirname(os.__file__)
    install_path = "%s/site-packages/%s/%s" % (
        site_root, model_info["name"], model_info["name_ver"])
    pylog.info(install_path)
    return install_path
def _write_temp_file(url, cache_path):
    """Download *url* into a temporary file, then copy it to *cache_path*."""
    with tempfile.NamedTemporaryFile() as tmp:
        pylog.info("tempfile: %s" % (tmp.name))
        _http_get(url, tmp)
        # Flush pending writes and rewind so the copy starts from byte 0.
        tmp.flush()
        tmp.seek(0)
        pylog.info("copying %s to cache at %s" % (tmp.name, cache_path))
        with open(cache_path, 'wb') as dst:
            shutil.copyfileobj(tmp, dst)
def _get_max_and_input_length(model, encoded_datasets):
    """Derive per-part truncation length and total input length from the model.

    max_length caps each story/continuation; input_length is the longest
    combined example (+3 special tokens), clamped to the model's limit.
    """
    max_length = model.config.n_positions // 2 - 2
    pylog.info("max_length:%s" % max_length)
    example_lengths = (
        len(story[:max_length])
        + max(len(cont1[:max_length]), len(cont2[:max_length]))
        + 3
        for dataset in encoded_datasets
        for story, cont1, cont2, _ in dataset
    )
    # Max size of input for the pre-trained model
    input_length = min(max(example_lengths), model.config.n_positions)
    pylog.info("input_length:%s" % input_length)
    return max_length, input_length
def show_msg(class_name, missing_keys, unexpected_keys, error_msgs):
    """Report state-dict loading diagnostics; raise if any load errors occurred.

    Raises:
        RuntimeError: when error_msgs is non-empty, joining all messages.
    """
    if missing_keys:
        pylog.info("Weights of %s not initialized from pretrained model: %s" % (class_name, missing_keys))
    if unexpected_keys:
        pylog.info("Weights from pretrained model not used in %s: %s" % (class_name, unexpected_keys))
    if error_msgs:
        raise RuntimeError(
            "Error(s) in loading state_dict for {}:\n\t{}".format(
                class_name, "\n\t".join(error_msgs)))
def run_finetuned20(config):
    """Alternate training and evaluation epochs, logging results after each.

    NOTE(review): the loop breaks only once epochs_count exceeds 20, so it
    runs 21 train/eval rounds despite the "20" in the name — confirm intent.
    """
    _set_config_files(config)
    model = None
    epochs_count = 0
    while True:
        # do_train returns the (possibly freshly built) model so it is
        # carried over and reused in the next epoch.
        train_loss, model_config, model = train_graph.do_train(config, model)
        epochs_count += 1
        eval_loss, eval_accuracy = eval_graph.do_eval(model_config, config)
        pylog.info(epochs_count)
        show.show_result_detail(eval_loss, eval_accuracy, train_loss, config)
        if epochs_count > 20:
            break
def start_server(self, delay):
    """Start the Tornado HTTP server on this instance's listen port.

    When *delay* is truthy, schedule the IOLoop to stop after *delay*
    seconds (bounded run); otherwise the loop runs until stopped externally.
    """
    # Bug fix: the original compared sys.version (a string) with '3', which
    # misbehaves for versions like "10.0"; compare the version tuple instead.
    if sys.version_info >= (3,):
        # Tornado on Python 3 needs an asyncio event loop in this thread.
        asyncio.set_event_loop(asyncio.new_event_loop())
    http_server = tornado.httpserver.HTTPServer(self.deal_request, no_keep_alive=False)
    pylog.info("listen_port:%s" % self.get_listen_port())
    http_server.listen(self.get_listen_port())
    if delay:
        tornado.ioloop.IOLoop.instance().call_later(
            delay, tornado.ioloop.IOLoop.instance().stop)
    tornado.ioloop.IOLoop.instance().start()
def show_model_paramter_size(state_dict):
    """Log each tensor's key and shape, then the total parameter count and
    its approximate footprint assuming 4 bytes (float32) per parameter.
    """
    count = 0
    # Iterate items() instead of re-looking-up state_dict[key] per access.
    for key, tensor in state_dict.items():
        pylog.info(key)
        pylog.info(tensor.size())
        size_num = 1
        for dim in tensor.size():
            size_num *= dim
        count += size_num
    # Fixed typo in the original log messages ("paramete" -> "parameter").
    pylog.info("parameter count:%d" % count)
    pylog.info("parameter count :%d bytes" % (count * 4))
    pylog.info("parameter count :%d M" % ((count * 4) / (1024 * 1024)))
def _get_encoded_datasets(tokenizer, config):
    """Load and encode the train/eval ROCStories splits for the configured model.

    Encoding differs between the "gpt" and "gpt2" tokenizers.

    Returns:
        The encoded (train, eval) datasets.

    Raises:
        ValueError: if config["model_name"] is neither "gpt" nor "gpt2"
            (the original fell through and silently returned None).
    """
    pylog.info("Encoding dataset...")
    train_dataset = _load_rocstories_dataset(config["train_dataset"])
    eval_dataset = _load_rocstories_dataset(config["eval_dataset"])
    datasets = (train_dataset, eval_dataset)
    model_name = config["model_name"]
    if model_name == "gpt":
        return _tokenize_and_encode(datasets, tokenizer)
    if model_name == "gpt2":
        return _gpt2_tokenize_and_encode(datasets, tokenizer)
    raise ValueError("unsupported model_name: %r" % model_name)
def set_special_tokens(self, special_tokens_lt):
    """Register additional tokens on top of the base vocabulary.

    New tokens are indexed starting from the last index of the current
    vocabulary, in the order given by ``special_tokens_lt``. Passing a
    falsy value clears any previously registered special tokens.
    """
    if special_tokens_lt:
        self.special_tokens = _get_special_tokens(special_tokens_lt, self.encoder)
        self.special_tokens_decoder = _get_special_tokens_decoder(
            self.special_tokens)
        if self.nlp_type == "bert":
            # Using BERT's BasicTokenizer: we can update the tokenizer
            self.nlp.never_split = special_tokens_lt
        pylog.info("Special tokens %s" % self.special_tokens)
    else:
        self.special_tokens = {}
        self.special_tokens_decoder = {}
def one_test():
    """Round-trip a sample string through DES-CBC with PKCS5 padding, logging results."""
    data = "Pleaseencrypt my data"
    cipher = des("DESCRYPT", CBC, "\0\0\0\0\0\0\0\0", pad=None, padmode=PAD_PKCS5)
    encrypted = cipher.encrypt(data)
    pylog.info("Encrypted:%r" % binascii.hexlify(encrypted))
    pylog.info("Decrypted:%r" % cipher.decrypt(encrypted))
    pylog.info("data:%s" % data)
def make_handler(self, func):
    """Register *func* in the interface-mapping dict under its interface name."""
    interface_name = self.get_interface_name(func)
    pylog.info("interface_name:%s" % interface_name)
    self.get_interface_mapping_dic()[interface_name] = func
def show_interface_load_count_dic():
    """Log the current per-interface load-count dictionary."""
    pylog.info(get_interface_load_count_dic())
def walk(self):
    """Log and return "<ClassName> walk" (method name taken from the running frame)."""
    message = "%s %s" % (self.__class__.__name__, sys._getframe().f_code.co_name)
    pylog.info(message)
    return message
def eat(self, food):
    """Log and return "<ClassName> eat:<food>" (method name from the running frame)."""
    message = "%s %s:%s" % (self.__class__.__name__, sys._getframe().f_code.co_name, food)
    pylog.info(message)
    return message
def collect_sunshine(self):
    """Log and return "<ClassName> collect_sunshine" (method name from the running frame)."""
    message = "%s %s" % (self.__class__.__name__, sys._getframe().f_code.co_name)
    pylog.info(message)
    return message
def move(self):
    """Log and return the generic animal movement message."""
    message = "Animal move"
    pylog.info(message)
    return message
def start(self, delay):
    """Log the worker thread id, wire up URL handlers, then launch the server.

    Always returns True once start_server() has been invoked.
    """
    pylog.info("thread id:%s" % pythread.get_thread_id())
    self.map_url_handler()
    self.start_server(delay)
    return True
def _untar(filename, dirs):
    """Extract the tar archive *filename* into directory *dirs*.

    Fix: the original opened the tarfile and never closed it; a context
    manager releases the handle even if extraction raises.
    """
    pylog.info("decompression %s" % filename)
    with tarfile.open(filename) as archive:
        archive.extractall(path=dirs)
def show_devices(config):
    """Log the configured device together with the number of visible GPUs."""
    gpu_count = torch.cuda.device_count()
    pylog.info("device:%s, number of gpus:%s" % (config["device"], gpu_count))
def _get_spacy_en_install_path(config):
    """Build and log the site-packages install path of the configured spaCy model."""
    model_info = config["pretrained_model_dict"]["spacy"]
    site_root = os.path.dirname(os.__file__)
    install_path = "%s/site-packages/%s/%s" % (
        site_root, model_info["name"], model_info["name_ver"])
    pylog.info(install_path)
    return install_path
def _install_spacy(cache_path):
    """pip-install the downloaded spaCy model archive at *cache_path*.

    Fix: the original interpolated the path into a shell string for
    os.system, which breaks on paths with spaces/metacharacters and
    silently ignores the exit status. Use the current interpreter's pip
    via subprocess with an argument list and fail loudly on error.

    Raises:
        subprocess.CalledProcessError: if pip exits non-zero.
    """
    # Local imports: the file's top-level import block is outside this view.
    import subprocess
    import sys

    pylog.info(cache_path)
    cmd = [sys.executable, "-m", "pip", "install", cache_path]
    pylog.info(" ".join(cmd))
    subprocess.run(cmd, check=True)