def encode_file(self, path, ordered=False, verbose=False, add_eos=False,
                add_double_eos=False):
    if verbose:
        print('encoding file {} ...'.format(path))
    assert exists(path)

    encoded = []
    with open(path, 'r') as f:
        for idx, line in enumerate(f):
            if verbose and idx > 0 and idx % 500000 == 0:
                print('    line {}'.format(idx))
            if len(line.strip()) == 0:
                continue
            symbols = self.tokenize(line, add_eos=add_eos,
                                    add_double_eos=add_double_eos)
            encoded.append(self.convert_to_nparray(symbols))

    if ordered:
        encoded = np.concatenate(encoded)

    return encoded
def main(unused_argv):
    del unused_argv  # Unused

    corpus = get_lm_corpus(FLAGS.data_dir, FLAGS.dataset)

    save_dir = os.path.join(FLAGS.data_dir, "tfrecords")
    if not exists(save_dir):
        makedirs(save_dir)

    # test mode
    if FLAGS.per_host_test_bsz > 0:
        corpus.convert_to_tfrecords("test", save_dir, FLAGS.per_host_test_bsz,
                                    FLAGS.tgt_len, FLAGS.num_core_per_host,
                                    FLAGS=FLAGS)
        return

    for split, batch_size in zip(
        ["train", "valid"],
        [FLAGS.per_host_train_bsz, FLAGS.per_host_valid_bsz]):

        if batch_size <= 0:
            continue
        print("Converting {} set...".format(split))
        corpus.convert_to_tfrecords(split, save_dir, batch_size, FLAGS.tgt_len,
                                    FLAGS.num_core_per_host, FLAGS=FLAGS)
def get_lm_corpus(data_dir, dataset):
    fn = os.path.join(data_dir, "cache.pkl")

    if exists(fn):
        print("Loading cached dataset...")
        with open(fn, "rb") as fp:
            corpus = pickle.load(fp)
    else:
        print("Producing dataset...")
        kwargs = {}
        kwargs["special"] = ["<bos>", "<eos>", "<UNK>"]
        kwargs["lower_case"] = False
        kwargs["max_size"] = FLAGS.max_size
        kwargs["min_freq"] = FLAGS.min_freq
        kwargs["vocab_file"] = FLAGS.vocab_dir

        corpus = Corpus(data_dir, dataset, **kwargs)
        if FLAGS.vocab_dir is None:
            corpus.vocab.save_vocab(os.getcwd(), dataset)

        print("Saving dataset...")
        with open(fn, "wb") as fp:
            pickle.dump(corpus, fp, protocol=2)

    corpus_info = {
        "vocab_size": len(corpus.vocab),
        "cutoffs": corpus.cutoffs,
        "dataset": corpus.dataset
    }
    with open(os.path.join(data_dir, "corpus-info.json"), "w") as fp:
        json.dump(corpus_info, fp)

    return corpus
def get_lm_corpus(data_dir, dataset):
    fn = os.path.join(data_dir, "cache.pkl")
    print(fn)

    if exists(fn):
        print("Loading cached dataset...")
        with open(fn, "rb") as fp:
            corpus = pickle.load(fp)
    else:
        print("Producing dataset...")
        kwargs = {}
        if dataset in ["wt103", "wt2"]:
            kwargs["special"] = ["<eos>"]
            kwargs["lower_case"] = False
        elif dataset == "ptb":
            kwargs["special"] = ["<eos>"]
            kwargs["lower_case"] = True
        elif dataset == "lm1b":
            kwargs["special"] = []
            kwargs["lower_case"] = False
            kwargs["vocab_file"] = os.path.join(data_dir, "1b_word_vocab.txt")

        corpus = Corpus(data_dir, dataset, **kwargs)

    corpus_info = {
        "vocab_size": len(corpus.vocab),
        "cutoffs": corpus.cutoffs,
        "dataset": corpus.dataset
    }
    with open(os.path.join(data_dir, "corpus-info.json"), "w") as fp:
        json.dump(corpus_info, fp)

    return corpus
def get_lm_corpus(data_dir, dataset):
    fn = os.path.join(data_dir, "cache.pkl")

    if exists(fn):
        print("Loading cached dataset...")
        with open(fn, "rb") as fp:
            corpus = pickle.load(fp)
    else:
        print("Producing dataset...")
        kwargs = {}
        if dataset in ["doupo", "test", "wt103", "zhihu", "poetry", "tangshi"]:
            kwargs["special"] = ["<eos>"]
            kwargs["lower_case"] = False

        corpus = Corpus(data_dir, dataset, **kwargs)

        print("Saving dataset...")
        with open(fn, "wb") as fp:
            pickle.dump(corpus, fp, protocol=2)

    corpus_info = {
        "vocab_size": len(corpus.vocab),
        "cutoffs": corpus.cutoffs,
        "dataset": corpus.dataset
    }
    with open(os.path.join(data_dir, "corpus-info.json"), "w") as fp:
        json.dump(corpus_info, fp)

    return corpus
def count_file(self, path, verbose=False, add_eos=False):
    if verbose:
        print('counting file {} ...'.format(path))
    assert exists(path)

    with open(path, 'r') as f:
        for idx, line in enumerate(f):
            if verbose and idx > 0 and idx % 500000 == 0:
                print('    line {}'.format(idx))
            symbols = self.tokenize(line, add_eos=add_eos)
            self.counter.update(symbols)
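# A minimal usage sketch of how count_file and encode_file above are typically
# combined on a vocabulary object. The `vocab` argument and its build_vocab()
# method are assumed from the surrounding vocabulary class (not shown here),
# and the file paths are placeholders.
def build_and_encode(vocab, train_path, valid_path):
    vocab.count_file(train_path, add_eos=True)   # accumulate token counts
    vocab.build_vocab()                           # freeze the symbol table (assumed method)
    train = vocab.encode_file(train_path, ordered=True, add_eos=True)
    valid = vocab.encode_file(valid_path, ordered=True, add_eos=True)
    return train, valid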
def get_lm_corpus(data_dir, dataset):
    fn = os.path.join(data_dir, "cache.pkl")

    if exists(fn):
        print("Loading cached dataset...")
        with open(fn, "rb") as fp:
            corpus = pickle.load(fp)
    else:
        print("Producing dataset...")
        kwargs = {}
        if dataset in ["wt103", "wt2"]:
            kwargs["special"] = ["<eos>"]
            kwargs["lower_case"] = False
        elif dataset == "generic_dataset":
            with open(os.path.join(data_dir, 'vocab-params.json')) as f:
                kwargs = json.load(f)
        elif dataset == "ptb":
            kwargs["special"] = ["<eos>"]
            kwargs["lower_case"] = True
        elif dataset == "lm1b":
            kwargs["special"] = []
            kwargs["lower_case"] = False
            kwargs["vocab_file"] = os.path.join(data_dir, "1b_word_vocab.txt")
        elif dataset in ["enwik8", "text8"]:
            pass

        corpus = Corpus(data_dir, dataset, **kwargs)

        # TODO do something smarter here, maybe joblib would work better?
        print("Saving dataset...")
        try:
            with open(fn, "wb") as fp:
                pickle.dump(corpus, fp, protocol=2)
        except Exception:
            traceback.print_exc()
            if os.path.exists(fn):
                os.unlink(fn)
            print('Ignored error when saving dataset')

    corpus_info = {
        "vocab_size": len(corpus.vocab),
        "cutoffs": corpus.cutoffs,
        "dataset": corpus.dataset
    }
    with open(os.path.join(data_dir, "corpus-info.json"), "w") as fp:
        json.dump(corpus_info, fp)

    return corpus
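# A hedged sketch of consuming the "corpus-info.json" file that the
# get_lm_corpus variants above write out; the keys mirror the corpus_info dict
# built there. `load_corpus_info` and the `data_dir` path are illustrative,
# not an existing helper in the repository.
import json
import os

def load_corpus_info(data_dir):
    with open(os.path.join(data_dir, "corpus-info.json")) as fp:
        info = json.load(fp)
    # e.g. info["vocab_size"], info["cutoffs"], info["dataset"]
    return info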
def get_corpus(dataset_name, data_dir, pitch_classes, time_steps_vocab,
               processing_conf):
    """
    Load groove data into custom Corpus class

    Param
    =====
    dataset_name: str
        Name of groove dataset to download from tensorflow datasets
    data_dir: str
        Path to store data in (corpus, tf records)
    pitch_classes: list
        list of lists indicating pitch class groupings
    time_steps_vocab: dict
        Dict of {number of ticks: token} for converting silence to tokens
    processing_conf: dict
        Dict of processing options

    Returns
    =======
    bumblebeat.data.Corpus object
    """
    fn = os.path.join(data_dir, dataset_name, "cache.pkl")

    if exists(fn):
        print("Loading cached dataset...")
        with open(fn, "rb") as fp:
            corpus = pickle.load(fp)
    else:
        bumblebeat.utils.data.create_dir_if_not_exists(fn)

        print("Producing dataset...")
        corpus = Corpus(
            data_dir=data_dir,
            dataset_name=dataset_name,
            pitch_classes=pitch_classes,
            time_steps_vocab=time_steps_vocab,
            processing_conf=processing_conf
        )

        print("Saving dataset...")
        with open(fn, "wb") as fp:
            pickle.dump(corpus, fp, protocol=2)

    return corpus
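# A hypothetical call to get_corpus above, following its docstring. The dataset
# name, pitch groupings, time-step vocab, and processing_conf keys below are
# illustrative placeholders, not the project's actual defaults.
corpus = get_corpus(
    dataset_name="groove/full-midionly",                # example TFDS groove dataset name
    data_dir="./data",
    pitch_classes=[[36], [38, 40], [42, 44], [46]],     # e.g. kick / snare / hats groupings
    time_steps_vocab={1: 0, 2: 1, 4: 2, 8: 3, 16: 4},   # ticks-of-silence -> token
    processing_conf={"quantize": True, "steps_per_quarter": 4},
)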
def get_lm_corpus(data_dir, dataset):
    fn = os.path.join(data_dir, "cache.pkl")

    if exists(fn):
        print("Loading cached dataset...")
        with open(fn, "rb") as fp:
            corpus = joblib.load(fp)
    else:
        print("Producing dataset...")
        kwargs = {}
        if dataset in ["wt103", "wt2"]:
            kwargs["special"] = ["<eos>"]
            kwargs["lower_case"] = False
        elif dataset == "mn-dataset":
            kwargs["model_path"] = '../data'
            kwargs["vocab_file"] = 'mn_cased.vocab'
        elif dataset == "ptb":
            kwargs["special"] = ["<eos>"]
            kwargs["lower_case"] = True
        elif dataset == "lm1b":
            kwargs["special"] = []
            kwargs["lower_case"] = False
            kwargs["vocab_file"] = os.path.join(data_dir, "1b_word_vocab.txt")
        elif dataset in ["enwik8", "text8"]:
            pass

        corpus = Corpus(data_dir, dataset, **kwargs)

        print("Saving dataset...")
        with open(fn, "wb") as fp:
            joblib.dump(corpus, fp)

    corpus_info = {
        "vocab_size": len(corpus.vocab),
        "cutoffs": corpus.cutoffs,
        "dataset": corpus.dataset
    }
    with open(os.path.join(data_dir, "corpus-info.json"), "w") as fp:
        json.dump(corpus_info, fp)

    return corpus
def main(unused_argv):
    del unused_argv  # Unused

    corpus = get_lm_corpus(FLAGS.data_dir, FLAGS.dataset)

    save_dir = os.path.join(FLAGS.data_dir, "tfrecords")
    if not exists(save_dir):
        makedirs(save_dir)

    # test mode
    if FLAGS.per_host_test_bsz > 0:
        corpus.convert_to_tfrecords("test", save_dir, FLAGS.per_host_test_bsz,
                                    FLAGS.tgt_len, FLAGS.num_core_per_host,
                                    FLAGS=FLAGS)
        return

    for split, batch_size in zip(
        ["train", "valid"],
        [FLAGS.per_host_train_bsz, FLAGS.per_host_valid_bsz]):

        if batch_size <= 0:
            continue
        print("Converting {} set...".format(split))
        corpus.convert_to_tfrecords(split, save_dir, batch_size, FLAGS.tgt_len,
                                    FLAGS.num_core_per_host, FLAGS=FLAGS)

    fn = os.path.join(FLAGS.data_dir, "cache.pkl")
    print("Saving dataset...")
    with open(fn, "wb") as fp:
        pickle.dump(corpus, fp, protocol=2)
def main(unused_argv):
    del unused_argv  # Unused

    tf.logging.set_verbosity(tf.logging.INFO)

    # Get corpus info
    corpus_info = data_utils.get_corpus_info(FLAGS.corpus_info_path)
    n_token = corpus_info["vocab_size"]
    cutoffs = corpus_info["cutoffs"][1:-1]

    if FLAGS.save_steps == 0:
        FLAGS.save_steps = None

    if not FLAGS.do_eval_only:
        # Get train input function
        train_input_fn, train_record_info = data_utils.get_input_fn(
            record_info_dir=FLAGS.record_info_dir,
            split="train",
            per_host_bsz=FLAGS.train_batch_size // FLAGS.num_hosts,
            tgt_len=FLAGS.tgt_len,
            num_core_per_host=FLAGS.num_core_per_host,
            num_hosts=FLAGS.num_hosts,
            use_tpu=FLAGS.use_tpu)
        train_bin_sizes = train_record_info["bin_sizes"]
        num_train_batch = train_record_info["num_batch"]

        # Get train cache function
        train_cache_fn = get_cache_fn(FLAGS.mem_len)
    else:
        train_bin_sizes = []
        num_train_batch = None
        train_cache_fn = None

    if FLAGS.do_eval or FLAGS.do_eval_only:
        assert FLAGS.num_hosts == 1

        # Get eval input function
        eval_input_fn, eval_record_info = data_utils.get_input_fn(
            record_info_dir=FLAGS.record_info_dir,
            split=FLAGS.eval_split,
            per_host_bsz=FLAGS.eval_batch_size // FLAGS.num_hosts,
            tgt_len=FLAGS.tgt_len,
            num_core_per_host=FLAGS.num_core_per_host,
            num_hosts=FLAGS.num_hosts,
            use_tpu=FLAGS.use_tpu)
        eval_bin_sizes = eval_record_info["bin_sizes"]
        num_eval_batch = eval_record_info["num_batch"]

        if FLAGS.max_eval_batch > 0:
            num_eval_batch = min(FLAGS.max_eval_batch, num_eval_batch)

        # Get eval cache function
        eval_cache_fn = get_cache_fn(FLAGS.mem_len)
        model_fn = get_model_fn(n_token, cutoffs, train_bin_sizes, eval_bin_sizes)
    else:
        eval_cache_fn = None
        model_fn = get_model_fn(n_token, cutoffs, train_bin_sizes, [])

    ##### Create estimator
    # TPU Configuration
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    per_host_input = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations,
            num_shards=FLAGS.num_core_per_host * FLAGS.num_hosts,
            per_host_input_for_training=per_host_input),
        keep_checkpoint_max=100000,  # effectively save all checkpoints
        save_checkpoints_secs=None,
        save_checkpoints_steps=FLAGS.save_steps)

    # warm start
    warm_start_from = None
    if FLAGS.warm_start_path is not None:
        warm_start_from = tf.estimator.WarmStartSettings(
            ckpt_to_initialize_from=FLAGS.warm_start_path)

    # TPU Estimator
    estimator = tpu_estimator.TPUEstimator(
        model_fn=model_fn,
        train_cache_fn=train_cache_fn,
        eval_cache_fn=eval_cache_fn,
        use_tpu=FLAGS.use_tpu,
        config=run_config,
        params={
            "data_dir": FLAGS.data_dir,
            "track_mean": FLAGS.track_mean
        },
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        warm_start_from=warm_start_from)

    if FLAGS.do_eval_only:
        if FLAGS.eval_ckpt_path is not None:
            ret = estimator.evaluate(input_fn=eval_input_fn,
                                     steps=num_eval_batch,
                                     checkpoint_path=FLAGS.eval_ckpt_path)
            tf.logging.info("=" * 200)
            log_str = "Eval results | "
            for key, val in ret.items():
                log_str += "{} {} | ".format(key, val)
            tf.logging.info(log_str)
            tf.logging.info("=" * 200)
        else:
            ckpt_state = tf.train.get_checkpoint_state(FLAGS.model_dir)
            eval_results = []
            for eval_checkpoint in ckpt_state.all_model_checkpoint_paths:
                if not exists(eval_checkpoint + ".index"):
                    continue
                global_step = int(eval_checkpoint.split("-")[-1])
                if (global_step < FLAGS.start_eval_steps or
                        global_step > FLAGS.train_steps):
                    continue
                ret = estimator.evaluate(input_fn=eval_input_fn,
                                         steps=num_eval_batch,
                                         checkpoint_path=eval_checkpoint)
                eval_results.append(ret)

            eval_results.sort(key=lambda x: x["perplexity"])

            tf.logging.info("=" * 200)
            log_str = "Best results | "
            for key, val in eval_results[0].items():
                log_str += "{} {} | ".format(key, val)
            tf.logging.info(log_str)
            tf.logging.info("=" * 200)
    else:
        if not FLAGS.do_eval:
            estimator.train(input_fn=train_input_fn, steps=FLAGS.train_steps)
        else:
            for step in range(0, FLAGS.train_steps, num_train_batch):
                train_steps = min(FLAGS.train_steps - step, num_train_batch)
                estimator.train(input_fn=train_input_fn, steps=train_steps)
                estimator.evaluate(input_fn=eval_input_fn, steps=num_eval_batch)