def load_vocabulary():
    if os.path.exists(config.vocabulary_path):
        word2index = {}
        with open(config.vocabulary_path) as file:
            for line in file:
                line_spl = line[:-1].split()
                word2index[line_spl[0]] = int(line_spl[1])
        index2word = dict(zip(word2index.values(), word2index.keys()))
        vocab = Vocabulary()
        vocab.word2index = word2index
        vocab.index2word = index2word
        return vocab
    else:
        raise FileNotFoundError('not found %s' % config.vocabulary_path)
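# --- Hedged usage sketch (not part of the example above) ---
# load_vocabulary() only assumes a plain-text file with one "<token> <index>"
# pair per line. The Vocabulary and config objects below are minimal
# hypothetical stand-ins so this sketch can run on its own.
import os
import tempfile


class Vocabulary:
    """Stand-in container; load_vocabulary assigns word2index / index2word on it."""


class config:
    # Hypothetical config exposing the single attribute the loader reads.
    vocabulary_path = os.path.join(tempfile.gettempdir(), "vocabulary.txt")


with open(config.vocabulary_path, "w") as f:
    f.write("<unk> 0\n<s> 1\n</s> 2\nhello 3\nworld 4\n")

vocab = load_vocabulary()
print(vocab.word2index["hello"])  # -> 3
print(vocab.index2word[3])        # -> "hello"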
def load_vocabulary():
    if os.path.exists(CKPT_PATH + config['TRAIN']['VOCABULARY']):
        word2index = {}
        with open(CKPT_PATH + config['TRAIN']['VOCABULARY']) as file:
            for line in file:
                line_spl = line[:-1].split()
                word2index[line_spl[0]] = int(line_spl[1])
        index2word = dict(zip(word2index.values(), word2index.keys()))
        vocab = Vocabulary()
        vocab.word2index = word2index
        vocab.index2word = index2word
        return vocab
    else:
        raise FileNotFoundError(
            'not found %s' % (CKPT_PATH + config['TRAIN']['VOCABULARY']))
def test_vocabulary(self):
    vocab = Vocabulary.from_file("testdata/test_vocab.txt")
    self.assertEqual(vocab.num_tokens, 1000)
    self.assertEqual(vocab.s_id, 2)
    self.assertEqual(vocab.s, "<S>")
    self.assertEqual(vocab.unk_id, 38)
    self.assertEqual(vocab.unk, "<UNK>")
def main(_):
    """
    Start either train or eval. Note hardcoded parts of path for training and eval data
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')

    vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "vocabulary.txt"))

    if FLAGS.mode == "train":
        # hps.batch_size = 256
        dataset = Dataset(vocab, os.path.join(FLAGS.datadir, "train.txt"))
        run_train(dataset, hps, os.path.join(FLAGS.logdir, "train"),
                  ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval"):
        data_dir = os.path.join(FLAGS.datadir, "eval.txt")
        # predict_model = prediction.Model('/dir/ckpt', os.path.join(FLAGS.datadir, "vocabulary.txt"), hps)
        dataset = Dataset(vocab, data_dir, deterministic=True)
        prefix_words = "<brk>".split()
        predict_model = predict.Model(hps, FLAGS.logdir, FLAGS.datadir)
        print('start input')
        out = predict_model.predictnextkwords(prefix_words, FLAGS.num_sen)
        for row in out:
            print(' '.join(row) + "\n")
        print("len_out: " + str(len(out)))
def test_vectorize_smile(self):
    """Test the functionality of vectorize_smile."""
    dataset = make_generative_dataset(self.data_path)
    vocab = Vocabulary.get_default_vocab()
    data_hparams = build_base_data_hparams()
    vec_func = functools.partial(vectorize_smile, vocab=vocab,
                                 data_hparams=data_hparams)
    data_iter = dataset.map(vec_func).make_one_shot_iterator().get_next()
    with tf.Session() as sess:
        line_id = 0
        while True:
            try:
                data_dict = sess.run(data_iter)
                seq_inputs = data_dict["seq_inputs"]
                seq_labels = data_dict["seq_labels"]
                # pylint: disable=no-member
                self.assertEqual(seq_inputs.argmax(1)[0], vocab.GO_ID)
                self.assertEqual(seq_labels[-1], vocab.EOS_ID)
                # pylint: enable=no-member
                self.assertEqual(seq_inputs.shape[0], seq_labels.shape[0])
                if line_id == 0:
                    # Note the sequence length is 35 (plus a EOS symbol).
                    self.assertEqual(data_dict["seq_lens"], 36)
                line_id += 1
            except tf.errors.OutOfRangeError:
                break
def __init__(self, model_path: str, vocab_path: str):
    self.vocabulary = Vocabulary.from_file(vocab_path)
    config = tf.ConfigProto(allow_soft_placement=True)
    self.session = tf.Session(config=config)
    saver = tf.train.import_meta_graph('{}.meta'.format(model_path))
    saver.restore(self.session, str(model_path))
    self.input_xs = tf.get_collection('input_xs')[0]
    self.batch_size = tf.get_collection('batch_size')[0]
    self.softmax = tf.get_collection('softmax')[0]
    self.num_steps = 20
def train(hparams, data_hparams):
    vocab = Vocabulary.get_default_vocab(not data_hparams.skip_at_symbol)
    # Create global step variable first.
    train_data, val_data, test_data = make_train_data(
        json.loads(FLAGS.dataset_spec), vocab, data_hparams, FLAGS.epochs)
    model = DiscoveryModel(data_hparams, hparams, vocab)
    train_outputs, _, _ = model.build_train_graph(train_data)
    seq_loss_op, train_op = model.build_train_loss(train_data, train_outputs)
    with tf.control_dependencies([val_data.initializer, test_data.initializer]):
        _, val_ctr_smile_op, val_sampled_smiles_op = model.build_val_net(
            val_data.get_next())
        model.build_test_net(val_ctr_smile_op, val_sampled_smiles_op,
                             test_data.get_next())
    train_summary_ops = tf.summary.merge(tf.get_collection("train_summaries"))
    val_summary_ops = tf.summary.merge(tf.get_collection("val_summaries"))
    test_summary_ops = tf.summary.merge(tf.get_collection("test_summaries"))
    stale_global_step_op = tf.train.get_or_create_global_step()
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.train_dir or None,
            save_checkpoint_steps=FLAGS.steps_per_checkpoint or None,
            log_step_count_steps=FLAGS.steps_per_checkpoint or None) as sess:
        if FLAGS.train_dir:
            summary_writer = tf.summary.FileWriterCache.get(FLAGS.train_dir)
        else:
            summary_writer = None
        # step = 0
        while not sess.should_stop():
            # while step < 10:
            #     step += 1
            stale_global_step, seq_loss, _, train_summary = sess.run([
                stale_global_step_op, seq_loss_op, train_op, train_summary_ops
            ])
            if summary_writer is not None:
                summary_writer.add_summary(train_summary, stale_global_step)
            # Run validation and test.
            # Trigger test events.
            if stale_global_step % FLAGS.steps_per_checkpoint == 0:
                # if True:
                try:
                    sess.run([val_data.initializer, test_data.initializer])
                    _, _ = sess.run([val_summary_ops, test_summary_ops])
                    # The monitored training session will pick up the summary
                    # and automatically add them.
                except tf.errors.OutOfRangeError:
                    logging.info("Test finished. Continue training.")
                    continue
                except Exception as ex:
                    logging.error(str(ex))
                    raise
    logging.info("Coordinator request to stop.")
def main(_):
    """
    Start either train or eval. Note hardcoded parts of path for training and eval data
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')
    print_debug('our training DataSetDir=%s , LogDir=%s' % (FLAGS.datadir, FLAGS.logdir))
    # vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))
    vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "vocabulary.txt"))
    FLAGS.mode = "train"
    for i in range(10):
        print("Iteration ", i, " phase: ", FLAGS.mode)
        if FLAGS.mode == "train":
            # hps.batch_size = 256
            # dataset = Dataset(vocab, os.path.join(FLAGS.datadir,
            #                   "training-monolingual.tokenized.shuffled/*"))
            dataset = Dataset(vocab, os.path.join(FLAGS.datadir, "ptb.train.txt"))
            trainlogdir = (FLAGS.logdir + str("/") + "train")
            # (FLAGS.logdir+str("\\")+"train")  # os.path.join(FLAGS.logdir, "train")
            print_debug('train log dir=%s' % (trainlogdir))
            run_train(dataset, hps, trainlogdir, ps_device="/gpu:0")
            print_debug('Finished run_train !!!!!!!!!!!')
        elif FLAGS.mode.startswith("eval"):
            print_debug('eval mode')
            # if FLAGS.mode.startswith("eval_train"):
            #     data_dir = os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*")
            # elif FLAGS.mode.startswith("eval_full"):
            #     data_dir = os.path.join(FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/*")
            # else:
            #     data_dir = os.path.join(FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050")
            dataset = Dataset(vocab, os.path.join(FLAGS.datadir, "ptb.test.txt"),
                              deterministic=True)
            run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
            print_debug('Finished run_eval !!!!!!!!!!!')
        if FLAGS.mode == "train":
            FLAGS.mode = "eval_full"
        else:
            FLAGS.mode = "train"
def load_data(config, vocab=None):
    test_df = pd.read_csv(config.test_file, header=0,
                          names=['face_id', 'content', 'label'])
    test_data, test_label, test_num_sent, test_num_word = build_data(
        test_df['content'], test_df['label'])
    if vocab is None:
        vocab = Vocabulary()
        [[vocab.add_sentence(x, y) for (x, y) in zip(data, test_label)]
         for data in test_data]
    test_input = [[[vocab.word_to_id(word) for word in sent] for sent in doc]
                  for doc in test_data]
    test_label = [vocab.tag_to_id(label) for label in test_label]
    test_input = pad_sequence(test_input, True, config.max_sent, config.max_word)
    # t = torch.tensor(test_input)
    # print(t.size())
    # print(test_label)
    test_dataset = myDataset(test_input, test_label)
    return test_dataset, vocab
def main(_):
    """
    Start either train or eval. Note hardcoded parts of path for training and eval data
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')
    vocab = Vocabulary.from_file(
        os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))
    if FLAGS.mode == "train":
        # hps.batch_size = 256
        dataset = Dataset(
            vocab,
            os.path.join(FLAGS.datadir,
                         "training-monolingual.tokenized.shuffled/*"))
        run_train(dataset, hps, os.path.join(FLAGS.logdir, "train"),
                  ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        if FLAGS.mode.startswith("eval_train"):
            data_dir = os.path.join(
                FLAGS.datadir, "training-monolingual.tokenized.shuffled/*")
        elif FLAGS.mode.startswith("eval_full"):
            data_dir = os.path.join(
                FLAGS.datadir,
                "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050")
        else:
            data_dir = os.path.join(
                FLAGS.datadir,
                "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050")
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
    elif FLAGS.mode.startswith("infer"):
        data_dir = os.path.join(
            FLAGS.datadir,
            "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050")
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_infer(dataset, hps, FLAGS.logdir, FLAGS.mode, vocab)
def main(_):
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus
    vocab = Vocabulary.from_file("1b_word_vocab.txt")
    if FLAGS.mode == "train":
        hps.batch_size = 256
        dataset = Dataset(
            vocab, FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*")
        run_train(dataset, hps, FLAGS.logdir + "/train", ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        data_dir = FLAGS.datadir
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps,
                 FLAGS.ckptpath)
def test_dataset(self):
    vocab = Vocabulary.from_file("testdata/test_vocab.txt")
    dataset = Dataset(vocab, "testdata/*")

    def generator():
        for i in range(1, 10):
            yield [0] + list(range(1, i + 1)) + [0]

    counts = [0] * 10
    for seq in generator():
        for v in seq:
            counts[v] += 1
    counts2 = [0] * 10
    for x, y in dataset._iterate(generator(), 2, 4):
        for v in x.ravel():
            counts2[v] += 1
    for i in range(1, 10):
        self.assertEqual(counts[i], counts2[i],
                         "Mismatch at i=%d. counts[i]=%s, counts2[i]=%s" %
                         (i, counts[i], counts2[i]))
def main(_):
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    # Read the training text.
    with open(datafile, 'r', encoding='utf-8') as f:
        train_data = f.read()
    # Load or build the vocabulary.
    vocabulary = Vocabulary()
    if FLAGS.vocab_file:
        vocabulary.load_vocab(FLAGS.vocab_file)
    else:
        vocabulary.build_vocab(train_data)
        vocabulary.save(FLAGS.vocab_file)
    input_ids = vocabulary.encode(train_data)
    g = batch_generator(input_ids, FLAGS.batch_size, FLAGS.num_steps)
    model = LSTMModel(vocabulary.vocab_size,
                      batch_size=FLAGS.batch_size,
                      num_steps=FLAGS.num_steps,
                      lstm_size=FLAGS.lstm_size,
                      num_layers=FLAGS.num_layers,
                      learning_rate=FLAGS.learning_rate,
                      train_keep_prob=FLAGS.train_keep_prob,
                      use_embedding=FLAGS.use_embedding,
                      embedding_size=FLAGS.embedding_size)
    model.train(
        g,
        FLAGS.max_steps,
        checkpoint_path,
        FLAGS.save_every_n,
        FLAGS.log_every_n,
    )
def main(config, local):
    n_gpu = int(GPU_NUM)
    n_gpu = 1 if n_gpu == 0 else n_gpu
    np.random.seed(config.random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.random_seed)

    # Create data instances
    vocab = Vocabulary(config.vocab_path)

    if config.mode == 'train':
        # Prepare train data loader
        train_dataset, val_dataset = Dataset(vocab), Dataset(vocab)
        train_path = os.path.join(config.data_dir, 'train_data/train_data')
        val_path = os.path.join(config.data_dir, 'train_data/val_data')
        train_dataset.create_instances(train_path, config.max_seq_length, type='train')
        val_dataset.create_instances(val_path, config.max_seq_length, type='val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=config.batch_size * n_gpu,
                                  shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=config.batch_size * n_gpu)
    else:
        train_loader, val_loader = None, None

    trainer = Trainer(config, n_gpu, vocab, train_loader, val_loader)

    if nsml.IS_ON_NSML:
        bind_model(trainer.model, vocab, config)
        if config.pause:
            nsml.paused(scope=local)

    if config.mode == 'train':
        trainer.train()
def __init__(self, hps, logdir, datadir, mode='eval'):
    with tf.variable_scope("model"):
        hps.num_sampled = 0
        hps.keep_prob = 1.0
        self.model = LM(hps, "eval", "/gpu:0")
    if hps.average_params:
        print("Averaging parameters for evaluation.")
        saver = tf.train.Saver(self.model.avg_dict)
    else:
        saver = tf.train.Saver()
    config = tf.ConfigProto(allow_soft_placement=True)
    self.sess = tf.Session(config=config)
    sw = tf.summary.FileWriter(logdir + "/" + mode, self.sess.graph)
    self.hps = hps
    self.num_steps = self.hps.num_steps
    vocab_path = os.path.join(datadir, "vocabulary.txt")
    with self.sess.as_default():
        success = common.load_from_checkpoint(saver, logdir + "/train")
        if not success:
            raise Exception('Loading Checkpoint failed')
    self.vocabulary = Vocabulary.from_file(vocab_path)
def main(_):
    hvd.init()
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus
    vocab = Vocabulary.from_file(FLAGS.vocab)
    hps.vocab_size = vocab.num_tokens
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
    if FLAGS.logdir is None:
        FLAGS.logdir = os.path.join('/tmp', 'lm-run-{}'.format(int(time.time())))
    print('logdir: {}'.format(FLAGS.logdir))
    hps.batch_size = 256
    dataset = Dataset(vocab, FLAGS.datadir)
    run_train(dataset, hps, FLAGS.logdir + '/train',
              ps_device='/gpu:' + str(hvd.local_rank()))
def main(_):
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus
    vocab = Vocabulary.from_file(FLAGS.datadir + "/lm_vocab.txt", hps.vocab_size)
    if FLAGS.mode == "train":
        hps.batch_size = 256  # reset batch size
        dataset = Dataset(vocab, FLAGS.datadir + "/train/*")
        run_train(dataset, hps, FLAGS.logdir + "/train", ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        if FLAGS.mode.startswith("eval_train"):
            data_dir = FLAGS.datadir + "/train/*"
        elif FLAGS.mode.startswith("eval_test"):
            data_dir = FLAGS.datadir + "/heldout/*"
        print("data_dir:", data_dir)
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
    elif FLAGS.mode.startswith("predict_next"):
        data_dir = "data/news.en.heldout-00001-of-00050"
        dataset = Dataset(vocab, data_dir)
        predict_next(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps, vocab)
def main(_):
    vocabulary = Vocabulary()
    vocabulary.load_vocab(FLAGS.vocab_file)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = \
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    model = LSTMModel(vocabulary.vocab_size,
                      sampling=True,
                      lstm_size=FLAGS.lstm_size,
                      num_layers=FLAGS.num_layers,
                      use_embedding=FLAGS.use_embedding,
                      embedding_size=FLAGS.embedding_size)
    model.load(FLAGS.checkpoint_path)
    start = vocabulary.encode(FLAGS.start_string)
    arr = model.predict(FLAGS.max_length, start, vocabulary.vocab_size)
    print(vocabulary.decode(arr))
print("INDEX: %s" % task_index) cluster = tf.train.ClusterSpec(cluster_spec) server = tf.train.Server(cluster, job_name=role, task_index=task_index) if role == "ps": server.join() else: ps_device = '/job:ps/task:0' """ Start either train or eval. Note hardcoded parts of path for training and eval data """ hps = LM.get_default_hparams().parse(FLAGS.hpconfig) hps._set("num_gpus", FLAGS.num_gpus) print('*****HYPER PARAMETERS*****') print(hps) print('**************************') vocab = Vocabulary.from_file( os.path.join(FLAGS.datadir, "1b_word_vocab.txt")) if FLAGS.mode == "train": #hps.batch_size = 256 dataset = Dataset( vocab, os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*")) run_train(dataset, hps, os.path.join(FLAGS.logdir, "train"), ps_device=ps_device)
subset = subset_df(df, n_samples=n_subset)

# Create train, val, test sets
train, validation, test = split_df(subset, size_train=train_size,
                                   size_valtest=valtest_size)

# Compute main target class weights
target_weights = class_weights(train, target='overall', p_expect=(1 / 3))
np.savetxt("train_class_weights.csv", target_weights, delimiter=",")

# Compute conditional independent sample weights
train = sample_weights(train)

# Create Vocab on train set
vocab = Vocabulary(freq_threshold=5)
wordidx, idxword = vocab.build_vocab(train['reviewText'].tolist())

# Save train, val, test sets
train.to_csv(save_train, index=False)
validation.to_csv(save_val, index=False)
test.to_csv(save_test, index=False)

# Save wordidx and idxword
with open(save_wordidx, 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in wordidx.items():
        writer.writerow([key, value])
with open(save_idxword, 'w') as csv_file:
    writer = csv.writer(csv_file)
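# --- Hedged companion sketch (not part of the script above) ---
# Reads the two-column "key,value" CSVs written above back into dicts.
# save_wordidx / save_idxword are the same path variables the script uses;
# casting indices back to int is an assumption about the saved values.
import csv


def load_mapping(path):
    """Rebuild a dict from a two-column CSV produced by csv.writer."""
    mapping = {}
    with open(path, newline='') as csv_file:
        for row in csv.reader(csv_file):
            if row:  # skip blank lines, if any
                key, value = row
                mapping[key] = value
    return mapping


wordidx = {word: int(idx) for word, idx in load_mapping(save_wordidx).items()}
idxword = {int(idx): word for idx, word in load_mapping(save_idxword).items()}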
def main(_):
    vocab = Vocabulary.from_file(
        os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))
    dataset = Dataset(
        vocab,
        os.path.join(FLAGS.datadir,
                     "training-monolingual.tokenized.shuffled/*"))

    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        with tf.variable_scope("model"):
            model = language_model_graph.build_model()

    def run(sess, num_workers, worker_id, num_replicas_per_worker):
        state_c = []
        state_h = []
        if len(state_c) == 0:
            state_c.extend([
                np.zeros([FLAGS.batch_size, model.state_size], dtype=np.float32)
                for _ in range(num_replicas_per_worker)
            ])
            state_h.extend([
                np.zeros([FLAGS.batch_size, model.projected_size],
                         dtype=np.float32)
                for _ in range(num_replicas_per_worker)
            ])
        prev_global_step = sess.run(model.global_step)[0]
        prev_time = time.time()
        data_iterator = dataset.iterate_forever(
            FLAGS.batch_size * num_replicas_per_worker, FLAGS.num_steps,
            num_workers, worker_id)
        fetches = {
            'global_step': model.global_step,
            'loss': model.loss,
            'train_op': model.train_op,
            'final_state_c': model.final_state_c,
            'final_state_h': model.final_state_h
        }
        for local_step in range(FLAGS.max_steps):
            if FLAGS.use_synthetic:
                x = np.random.randint(
                    low=0, high=model.vocab_size,
                    size=(FLAGS.batch_size * num_replicas_per_worker,
                          FLAGS.num_steps))
                y = np.random.randint(
                    low=0, high=model.vocab_size,
                    size=(FLAGS.batch_size * num_replicas_per_worker,
                          FLAGS.num_steps))
                w = np.ones((FLAGS.batch_size * num_replicas_per_worker,
                             FLAGS.num_steps))
            else:
                x, y, w = next(data_iterator)
            feeds = {}
            feeds[model.x] = np.split(x, num_replicas_per_worker)
            feeds[model.y] = np.split(y, num_replicas_per_worker)
            feeds[model.w] = np.split(w, num_replicas_per_worker)
            feeds[model.initial_state_c] = state_c
            feeds[model.initial_state_h] = state_h
            fetched = sess.run(fetches, feeds)
            state_c = fetched['final_state_c']
            state_h = fetched['final_state_h']
            if local_step % FLAGS.log_frequency == 0:
                cur_time = time.time()
                elapsed_time = cur_time - prev_time
                num_words = FLAGS.batch_size * FLAGS.num_steps
                wps = ((fetched['global_step'][0] - prev_global_step) *
                       num_words / elapsed_time)
                prev_global_step = fetched['global_step'][0]
                parallax.log.info(
                    "Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f"
                    % (fetched['global_step'][0], cur_time - prev_time, wps,
                       fetched['loss'][0]))
                prev_time = cur_time

    sess, num_workers, worker_id, num_replicas_per_worker = \
        parallax.parallel_run(single_gpu_graph,
                              FLAGS.resource_info_file,
                              sync=FLAGS.sync,
                              parallax_config=parallax_config.build_config())
    run(sess, num_workers, worker_id, num_replicas_per_worker)
import numpy as np
import time
import tensorflow as tf

from data_utils import Vocabulary, Dataset
from language_model import LM
from common import CheckpointLoader

BATCH_SIZE = 1
NUM_TIMESTEPS = 1
MAX_WORD_LEN = 50

UPLOAD_FOLDER = '/data/ngramTest/uploads'
UPLOAD_FOLDER = './'

hps = LM.get_default_hparams()
vocab = Vocabulary.from_file("1b_word_vocab.txt")

with tf.variable_scope("model"):
    hps.num_sampled = 0  # Always using full softmax at evaluation. run out of memory
    hps.keep_prob = 1.0
    hps.num_gpus = 1
    model = LM(hps, "predict_next", "/cpu:0")

if hps.average_params:
    print("Averaging parameters for evaluation.")
    saver = tf.train.Saver(model.avg_dict)
else:
    saver = tf.train.Saver()

# Use only 4 threads for the evaluation.
config = tf.ConfigProto(allow_soft_placement=True,
                        intra_op_parallelism_threads=20,