def main(args):
    if args.mode == 'prepare':
        # python3 run.py --mode prepare --pointer-gen
        prepare(args)
    elif args.mode == 'train':
        # python3 run.py --mode train -b 100 -o output --gpu 0 --restore
        train(args)
    elif args.mode == 'eval':
        # python3 run.py --mode eval --eval-model
        evaluate(args)
    elif args.mode == 'decode':
        # python3 run.py --mode decode --beam-size 10 --decode-model output_big_data/model/model-250000 --decode-dir output_big_data/result --gpu 1
        args.batch_size = args.beam_size
        vocab_encoder = Vocab(args, "encoder_vocab")
        vocab_decoder = Vocab(args, "decoder_vocab")
        vocab_user = User_Vocab(args, name="user_vocab")
        test_file = "./test.data"
        # test_file = os.path.join(args.data, 'chat_data/tmp.data')
        # test_file = os.path.join(args.data, 'news_train_span_50.data')
        batcher = TestBatcher(args, vocab_encoder, vocab_decoder, vocab_user, test_file).batcher()
        if args.cpu:
            with tf.device('/cpu:0'):
                model = CommentModel(args, vocab_decoder)
        else:
            model = CommentModel(args, vocab_decoder)
        decoder = BeamSearchDecoder(args, model, batcher, vocab_decoder)
        decoder.decode()
    elif args.mode == 'debug':
        debug(args)
    else:
        raise RuntimeError(f'mode {args.mode} is invalid.')
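# --- Hedged usage sketch (not from the original source): a minimal argparse
# front-end that could drive main(); flag names follow the CLI examples in the
# comments above, everything else is an assumption.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', required=True,
                        choices=['prepare', 'train', 'eval', 'decode', 'debug'])
    parser.add_argument('--beam-size', dest='beam_size', type=int, default=10)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--cpu', action='store_true')
    main(parser.parse_args())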
def prepare(args):
    if not os.path.exists(args.records_dir):
        os.makedirs(args.records_dir)
    train_file = os.path.join(args.data, 'chat_data/tmp.data')
    dev_file = os.path.join(args.data, 'chat_data/tmp.data')
    vocab_encoder = Vocab(args, name="encoder_vocab")
    vocab_decoder = Vocab(args, name="decoder_vocab")
    vocab_user = User_Vocab(args, name="user_vocab")
    dataset = Dataset(args, vocab_encoder, vocab_decoder, vocab_user, train_file, dev_file)
    dataset.save_datasets(['train', 'dev'])
def __init__(self, data_dict, train=True, vocabulary=None, support=False, device=None):
    """Expected keys in data_dict:
        'datas': all_datas
        'maxlen_story': maxlen_story
        'maxlen_query': maxlen_query
        'maxlen_sent': maxlen_sent
    """
    self.examples = data_dict['datas']
    self.maxlen_story = data_dict['maxlen_story']
    self.maxlen_query = data_dict['maxlen_query']
    self.maxlen_sent = data_dict['maxlen_sent']
    self.support = support
    self.device = device
    self.flatten = lambda x: [tkn for sublists in x for tkn in sublists]
    stories, questions, answers, supports = list(zip(*self.examples))
    if train:
        self.vocab = Vocab()
        self._build_vocab(stories, questions, answers)
    else:
        self.vocab = vocabulary
    # numericalize & add padding
    stories, questions, answers = self._preprocess(stories, questions, answers)
    if self.support:
        self.data = list(zip(stories, questions, answers, supports))
    else:
        self.data = list(zip(stories, questions, answers))
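# --- Hedged usage sketch (not from the original source): each entry of
# data_dict['datas'] appears to be a (story, question, answer, support) tuple,
# since the constructor unzips it that way; the bAbI-style values are made up.
example = (
    [['mary', 'moved', 'to', 'the', 'bathroom'],   # story: list of token lists
     ['john', 'went', 'to', 'the', 'hallway']],
    ['where', 'is', 'mary'],                       # question: token list
    'bathroom',                                    # answer
    [0],                                           # supporting sentence index
)
data_dict = {'datas': [example], 'maxlen_story': 2, 'maxlen_query': 3, 'maxlen_sent': 5}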
def __init__(self, filename, vocab_file=None, vocab_dump=None, label_vocab_dump=None, n_prev_turns=0, indices=None):
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        self.data = [row for row in reader]
    if indices is not None:
        self.data = [self.data[i] for i in indices]
    if "id" in self.data[0]:
        self.id2idx = {row["id"]: i for i, row in enumerate(self.data)}
    self.n_prev_turns = n_prev_turns
    if vocab_dump is None:
        self.vocab = Vocab(vocab_file)
    else:
        with open(vocab_dump, 'rb') as fp:
            self.vocab = pickle.load(fp)
    if label_vocab_dump is None:
        labels = [row["label"] for row in self.data]
        self.label_vocab = LabelVocab(labels)
    else:
        with open(label_vocab_dump, 'rb') as fp:
            self.label_vocab = pickle.load(fp)
def __init__(self, path, dataset, *args, **kwargs):
    self.dataset = dataset
    self.vocab = Vocab(*args, **kwargs)

    if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
        self.vocab.count_file(os.path.join(path, "train.txt"))
        self.vocab.count_file(os.path.join(path, "valid.txt"))
        self.vocab.count_file(os.path.join(path, "test.txt"))
    elif self.dataset == "wt103":
        self.vocab.count_file(os.path.join(path, "train.txt"))
    elif self.dataset == "lm1b":
        train_path_pattern = os.path.join(
            path, "1-billion-word-language-modeling-benchmark-r13output",
            "training-monolingual.tokenized.shuffled", "news.en-*")
        train_paths = glob(train_path_pattern)
        # the vocab will load from file when build_vocab() is called
        # for train_path in sorted(train_paths):
        #     self.vocab.count_file(train_path, verbose=True)

    self.vocab.build_vocab()

    if self.dataset in ["ptb", "wt2", "wt103"]:
        self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
        self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)
    elif self.dataset in ["enwik8", "text8"]:
        self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False)
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
        self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False)
    elif self.dataset == "lm1b":
        self.train = train_paths
        valid_path = os.path.join(path, "valid.txt")
        test_path = valid_path
        self.valid = self.vocab.encode_file(valid_path, ordered=True, add_double_eos=True)
        self.test = self.vocab.encode_file(test_path, ordered=True, add_double_eos=True)

    if self.dataset == "wt103":
        self.cutoffs = [0, 20000, 40000, 200000] + [len(self.vocab)]
    elif self.dataset == "lm1b":
        self.cutoffs = [0, 60000, 100000, 640000] + [len(self.vocab)]
    else:
        self.cutoffs = []
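# --- Hedged usage sketch (not from the original source): assuming this __init__
# belongs to a Transformer-XL-style Corpus class, and that Vocab accepts the
# usual `special` / `lower_case` keyword arguments.
corpus = Corpus('./data/wikitext-103', 'wt103', special=['<eos>'], lower_case=False)
print(len(corpus.vocab), corpus.cutoffs)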
def __init__(self, dataset, batch_size=None, vocab_created=False, vocab=None, target_col=None,
             word2index=None, sos_token='<SOS>', eos_token='<EOS>', unk_token='<UNK>',
             pad_token='<PAD>', min_word_count=5, max_vocab_size=None, max_seq_len=0.8,
             use_pretrained_vectors=False, glove_path='Glove/', glove_name='glove.6B.100d.txt',
             weights_file_name='Glove/weights.npy'):
    if not vocab_created:
        self.vocab = Vocab(dataset, target_col=target_col, word2index=word2index,
                           sos_token=sos_token, eos_token=eos_token, unk_token=unk_token,
                           pad_token=pad_token, min_word_count=min_word_count,
                           max_vocab_size=max_vocab_size, max_seq_len=max_seq_len,
                           use_pretrained_vectors=use_pretrained_vectors, glove_path=glove_path,
                           glove_name=glove_name, weights_file_name=weights_file_name)
        self.dataset = self.vocab.dataset
    else:
        self.dataset = dataset
        self.vocab = vocab
    self.target_col = target_col
    self.word2index = self.vocab.word2index
    if batch_size:
        self.batch_size = batch_size
    else:
        self.batch_size = len(self.dataset)
    self.x_lengths = np.array(self.vocab.x_lengths)
    if self.target_col:
        self.y_lengths = np.array(self.vocab.y_lengths)
    self.pad_token = self.vocab.word2index[pad_token]
    self.sort_and_batch()
def __init__(self, filename, vocab_file=None, vocab_dump=None, label_vocab_dump=None, n_prev_turns=0, text_input=False):
    self.text_input = text_input
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        self.data = [row for row in reader]
    lattice_reader = LatticeReader(text_input=text_input)
    for i, row in enumerate(tqdm(self.data)):
        row["lattice"] = lattice_reader.read_sent(row["text"], i)
        row["rev_lattice"] = row["lattice"].reversed()
    self.id2idx = {row["id"]: i for i, row in enumerate(self.data)}
    self.n_prev_turns = n_prev_turns
    if vocab_dump is None:
        self.vocab = Vocab(vocab_file)
    else:
        with open(vocab_dump, 'rb') as fp:
            self.vocab = pickle.load(fp)
    if label_vocab_dump is None:
        labels = [row["label"] for row in self.data]
        self.label_vocab = LabelVocab(labels)
    else:
        with open(label_vocab_dump, 'rb') as fp:
            self.label_vocab = pickle.load(fp)
def Vocabulary(self, update, context):
    try:
        chat_message = update.message.text
        x = Vocab(chat_message).mean()
        context.bot.send_message(chat_id=update.effective_chat.id, text=x)
    except KeyError:
        context.bot.send_message(chat_id=update.effective_chat.id, text="Invalid Syntax :(")
def word(self, update, context):
    try:
        chat_message = update.message.text
        # str methods return a new string, so assign the result back
        chat_message = chat_message.lower().capitalize()
        x = Vocab(chat_message).mean()
        context.bot.send_message(chat_id=update.effective_chat.id, text=x)
    except KeyError:
        context.bot.send_message(chat_id=update.effective_chat.id, text="Invalid Syntax :(")
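# --- Hedged usage sketch (not from the original source): wiring the two
# handlers above into python-telegram-bot's v13-style API; `bot` is a
# hypothetical instance of the class these methods belong to, and the token
# is a placeholder.
from telegram.ext import Updater, MessageHandler, Filters

updater = Updater(token='YOUR_BOT_TOKEN', use_context=True)
updater.dispatcher.add_handler(MessageHandler(Filters.text & ~Filters.command, bot.word))
updater.start_polling()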
def __init__(self, path, dataset, *args, **kwargs):
    self.dataset = dataset
    self.vocab = Vocab(*args, **kwargs)
    self.vocab.count_file(os.path.join(path, "train.txt"))
    self.vocab.build_vocab()
    self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
    self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
    self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)
    vocab_len = len(self.vocab)
    self.cutoffs = [0, int(vocab_len * 0.1), int(vocab_len * 0.2), int(vocab_len * 0.4)] + [vocab_len]
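# --- Hedged note (not from the original source): these proportional cutoffs are
# presumably fed to an adaptive softmax; with a 100,000-word vocab they would
# come out as [0, 10000, 20000, 40000, 100000].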
def __init__(self, path, dataset, *args, **kwargs):
    self.dataset = dataset
    self.vocab = Vocab(*args, **kwargs)
    self.vocab.count_file(os.path.join(path, "train.txt"))
    self.vocab.build_vocab()
    self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
    self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
    # bug fix: the test split should encode test.txt, not train.txt
    self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)
    self.cutoffs = []
def __init__(self, path, dataset, *args, **kwargs):
    self.dataset = dataset
    self.vocab = Vocab(*args, **kwargs)
    # update the counter inside the vocab object (tracks how often each distinct word occurs)
    self.vocab.count_file(os.path.join(path, "train.txt"))
    self.vocab.count_file(os.path.join(path, "valid.txt"))  # same as above, on the validation set
    # build idx2sym and sym2idx, mapping words to indices and indices back to words
    self.vocab.build_vocab()
    self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
    self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
def train(**kwargs):
    args = DefaultConfig()
    args.parse(kwargs)
    vocab = Vocab()
    loss_functions = transformer_celoss
    score_functions = rouge_func
    model = getattr(Models, args.model_name)(vocab, args)
    train_loader = get_loaders('train', args.batch_size, 12)
    dev_loader = get_loaders('val', args.batch_size, 12)
    trainer = ScheduledTrainerTrans(args, model, loss_functions, score_functions, train_loader, dev_loader)
    if args.resume is not None:
        trainer.init_trainner(resume_from=args.resume)
    else:
        trainer.init_trainner()
    trainer.train()
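# --- Hedged usage sketch (not from the original source): DefaultConfig.parse
# presumably overrides config attributes from the keyword arguments, so a run
# could look like this (the model name is borrowed from the test block further
# down; treat all values as assumptions).
train(model_name='VGGTransformerNew1', batch_size=16, resume=None)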
def __init__(self, filename, vocab_file=None, vocab_dump=None, stop_word_file=None):
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        data = [row for row in reader]
    self.stop_words = set()
    if stop_word_file is not None:
        for line in open(stop_word_file):
            self.stop_words.add(line.strip())
    datas = []
    count, total = 0, 0
    for row in data:
        ref = row["transcription"]
        hyp = row["hypothesis"]
        score = float(row["score"])
        # "confusion" appears to hold whitespace-separated (ref_word, hyp_word,
        # separator) triples with no trailing separator, hence the `// 3 + 1`
        # (assumption inferred from the indexing below)
        confs = row["confusion"].split()
        confs = [(confs[i * 3], confs[i * 3 + 1]) for i in range(len(confs) // 3 + 1)]
        conf_ids = []
        ref_id = hyp_id = 0
        for ref_w, hyp_w in confs:
            ref_eps = (ref_w == "<eps>")
            hyp_eps = (hyp_w == "<eps>")
            if not ref_eps and not hyp_eps and ref_w != hyp_w:
                total += 1
                if ref_w not in self.stop_words and hyp_w not in self.stop_words:
                    conf_ids.append((ref_id, hyp_id))
                else:
                    count += 1
            if not ref_eps:
                ref_id += 1
            if not hyp_eps:
                hyp_id += 1
        datas.append((ref, hyp, conf_ids, score))
    print(count, total)  # stop-word-filtered vs. total substitutions
    self.data = datas
    if vocab_file is not None:
        self.vocab = Vocab(vocab_file)
    elif vocab_dump is not None:
        with open(vocab_dump, 'rb') as fp:
            self.vocab = pickle.load(fp)
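# --- Hedged worked example (not from the original source): given a row whose
# "confusion" field is "the thee _ cat <eps> _ sat sat" (the third token of each
# triple is a separator), the pair list becomes
#   [('the', 'thee'), ('cat', '<eps>'), ('sat', 'sat')]
# Only ('the', 'thee') is a substitution between two real words, so conf_ids
# would end up as [(0, 0)].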
def load_model(exp_name):
    exp_root = os.path.join(ckpt_root, exp_name)
    best_model_folder = get_best_k_model_path(os.path.join(exp_root, 'saved_models'))[0]
    best_model_folder = os.path.join(exp_root, 'saved_models', best_model_folder)
    model_state = t.load(os.path.join(best_model_folder, 'model'), map_location='cpu')
    try:
        for i in model_state:
            model_state[i] = model_state[i].cpu()
    except Exception:  # avoid a bare except; non-tensor entries are simply left as-is
        pass
    trainner_state = t.load(os.path.join(best_model_folder, 'trainner_state'))
    args = trainner_state['args']
    vocab = Vocab()
    model = getattr(Models, args.model_name)(vocab, args)
    model.load_state_dict(model_state)
    model.eval()
    return model
def __init__(self, filename, vocab_file=None, vocab_dump=None, text_input=False):
    self.text_input = text_input
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        self.data = [row for row in reader]
    lattice_reader = LatticeReader(text_input=text_input)
    for i, row in enumerate(tqdm(self.data)):
        row["lattice"] = lattice_reader.read_sent(row["text"], i)
        row["rev_lattice"] = row["lattice"].reversed()
    if vocab_dump is None:
        self.vocab = Vocab(vocab_file)
    else:
        with open(vocab_dump, 'rb') as fp:
            self.vocab = pickle.load(fp)
def __init__(self, path, dataset, *args, **kwargs):
    self.dataset = dataset
    self.vocab = Vocab(*args, **kwargs)
    train_path = os.path.join(path, "train.txt")
    valid_path = os.path.join(path, "valid.txt")
    # test_path = os.path.join(path, "test.txt")
    # self.vocab.count_file(train_path)
    # self.vocab.count_file(valid_path)
    # self.vocab.count_file(test_path)
    self.vocab.build_vocab(add_bytes=True)
    self.train = train_path
    self.valid = self.vocab.encode_file(valid_path, ordered=True, add_eos=False)
    # self.test = self.vocab.encode_file(
    #     os.path.join(path, "test.txt"), ordered=True, add_eos=False)
    self.cutoffs = []
def __init__(self, filename, vocab_file=None, vocab_dump=None, label_vocab_dump=None):
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        self.data = [row for row in reader]
    if vocab_dump is None:
        self.vocab = Vocab(vocab_file)
    else:
        with open(vocab_dump, 'rb') as fp:
            self.vocab = pickle.load(fp)
    if label_vocab_dump is None:
        labels = [row["label"] for row in self.data]
        self.label_vocab = LabelVocab(labels)
    else:
        with open(label_vocab_dump, 'rb') as fp:
            self.label_vocab = pickle.load(fp)
def train_re(**kwargs):
    args = DefaultConfig()
    args.parse(kwargs)
    vocab = Vocab()
    loss_functions = transformer_celoss
    score_functions = rouge_func
    model = getattr(Models, args.model_name)(vocab, args)
    train_loader = get_loaders('train', args.batch_size, 12)
    dev_loader = get_loaders('val', args.batch_size, 12)
    trainer = ScheduledTrainerTrans(args, model, loss_functions, score_functions, train_loader, dev_loader)
    trainer.init_trainner(resume_from=args.resume)
    # try:
    #     trainer.model.vgg_feature.requires_grad = True
    #     trainer.model.vgg_input.requires_grad = True
    # except:
    #     trainer.model.module.vgg_feature.requires_grad = True
    #     trainer.model.module.vgg_input.requires_grad = True
    # trainer.optim.param_groups[0]['lr'] = 3e-5
    trainer.train()
def __init__(
        self,
        checkpoint_path='/home/mnakhodnov/sirius-stt/models/8_recovered_v3/epoch_17.pt',
        device=torch.device('cpu'), rescore=True, decoder_kwargs=None
):
    if not os.path.exists(checkpoint_path):
        raise ValueError(f'There is no checkpoint in {checkpoint_path}')

    self.device = device
    self.rescore = rescore
    self.decoder_kwargs = decoder_kwargs
    self.checkpoint_path = checkpoint_path

    self._vocab = Vocab(self._alphabet)
    self._num_tokens = get_num_tokens(self._vocab)
    self._blank_index = get_blank_index(self._vocab)

    self._sample_rate = 8000
    self._model_config = {
        'num_mel_bins': 64,
        'hidden_size': 512,
        'num_layers': 4,
        'num_tokens': len(self._vocab.tokens2indices()) - 1,
    }
    self.model = Model(**self._model_config)
    load_from_ckpt(self.model, self.checkpoint_path)
    self.model = self.model.to(device=self.device).eval()

    self.decoder = fast_beam_search_decode
    self._kenlm_binary_path = '/data/mnakhodnov/language_data/cc100/xaa.processed.3.binary'
    if self.decoder_kwargs is None:
        self.decoder_kwargs = {
            'beam_size': 200,
            'cutoff_top_n': 33,
            'cutoff_prob': 1.0,
            'ext_scoring_func': self._kenlm_binary_path,
            'alpha': 1.0,
            'beta': 0.3,
            'num_processes': 32,
        }
    if self.rescore:
        self.rescorer_model = torch.hub.load(
            'pytorch/fairseq', 'transformer_lm.wmt19.ru',
            tokenizer='moses', bpe='fastbpe', force_reload=False
        ).to(device=device)
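# --- Hedged usage sketch (not from the original source): the class name
# `Recognizer` is hypothetical; the constructor arguments mirror the __init__ above.
recognizer = Recognizer(
    checkpoint_path='/home/mnakhodnov/sirius-stt/models/8_recovered_v3/epoch_17.pt',
    device=torch.device('cuda:0'),
    rescore=False,  # skip the fairseq LM rescorer and its torch.hub download
)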
def __init__(self, text_path, vocab_file=None, vocab_dump=None):
    self.data = []
    print_time_info("Reading text from {}".format(text_path))
    with open(text_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for i, row in enumerate(reader):
            words = row["text"].split()
            if "id" in row:
                self.data.append((row["id"], words))
            else:
                self.data.append((i, words))
    # for line in tqdm(open(text_path)):
    #     uid, *words = line.strip().split()
    #     self.data.append((uid, words))
    if vocab_dump is None:
        self.vocab = Vocab(vocab_file)
    else:
        with open(vocab_dump, 'rb') as fp:
            self.vocab = pickle.load(fp)
def forward(self, position_feature):
    # inputs [B, max_lenth]
    positions_encoded = self.position_encoding(position_feature)
    return positions_encoded


if __name__ == '__main__':
    import ipdb
    from tqdm import tqdm
    from loaders import get_loaders
    from configs_transformer import DefaultConfig
    from vocabulary import Vocab

    args = DefaultConfig
    args.batch_size = 2
    loader = get_loaders('val', args.batch_size, 2)
    vocab = Vocab()
    for i in tqdm(loader):
        feature, caption, lenth = [j for j in i]
        batch_size, c, h, w = feature.size()
        _, n, l = caption.size()
        feature = feature.unsqueeze(1).expand(
            (batch_size, n, c, h, w)).contiguous().view(-1, c, h, w)
        caption = caption.long()
        caption = caption.view(-1, l)
        model = VGGTransformerNew1(vocab, args)
        output_log_prob, output_token = model(feature, caption)
        token = model.greedy_search(feature)
        loss = output_log_prob.sum()
        loss.backward()
def main():
    clock = Clock()
    clock.start()
    random.seed(SEED)
    np.random.seed(SEED)
    assert START_TOKEN == 0

    parser = argparse.ArgumentParser(description='conditional SeqGAN')
    parser.add_argument('--conditional', '-c', type=int, default=0,
                        help='If you make SeqGAN conditional, set `-c` 1.')
    args = parser.parse_args()
    cond = args.conditional

    vocab = Vocab()
    vocab.construct(parsed_haiku_file)
    vocab.word2id(parsed_haiku_file, positive_file)
    UNK = vocab.dic.token2id[u'<UNK>']
    COMMA = vocab.dic.token2id[u',']

    gen_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH, COND_LENGTH, UNK)
    # likelihood_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH, COND_LENGTH, UNK)  # For testing
    vocab_size = len(vocab.dic.token2id)
    with open(output_token2id, 'wb') as f:  # pickle needs a binary file handle under Python 3
        pickle.dump(vocab.dic.token2id, f)
    dis_data_loader = Dis_dataloader(BATCH_SIZE, SEQ_LENGTH, UNK)

    generator = Generator(vocab_size, BATCH_SIZE, EMB_DIM, HIDDEN_DIM, SEQ_LENGTH,
                          COND_LENGTH, START_TOKEN, is_cond=cond)
    # target_params = pickle.load(open('save/target_params.pkl', 'rb'))
    # target_lstm = TARGET_LSTM(vocab_size, BATCH_SIZE, EMB_DIM, HIDDEN_DIM,
    #                           SEQ_LENGTH, START_TOKEN, target_params)  # The oracle model
    discriminator = Discriminator(sequence_length=SEQ_LENGTH, cond_length=COND_LENGTH,
                                  num_classes=2, vocab_size=vocab_size, batch_size=BATCH_SIZE,
                                  embedding_size=dis_embedding_dim, filter_sizes=dis_filter_sizes,
                                  num_filters=dis_num_filters, l2_reg_lambda=dis_l2_reg_lambda,
                                  is_cond=cond)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    # First, use the oracle model to provide the positive examples,
    # which are sampled from the oracle data distribution
    # generate_samples(sess, target_lstm, BATCH_SIZE, generated_num, positive_file)
    gen_data_loader.create_batches(positive_file)
    if cond:
        vocab.word2id(parsed_kigo_file, positive_condition_file)
        vocab.load_cond(positive_condition_file, COND_LENGTH, UNK)
        gen_data_loader.create_cond_batches(positive_condition_file)

    log = open('save/experiment-log.txt', 'w')

    # pre-train generator
    print('Start pre-training...')
    log.write('pre-training...\n')
    for epoch in range(PRE_EPOCH_GEN_NUM):
        loss = pre_train_epoch(sess, generator, gen_data_loader, cond=cond)
        if epoch % 5 == 0:
            generate_samples(sess, generator, BATCH_SIZE, generated_num, eval_file, cond, vocab)
            # likelihood_data_loader.create_batches(eval_file)
            # test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            # print('pre-train epoch', epoch, 'test_loss', test_loss)
            # buffer = 'epoch:\t' + str(epoch) + '\tnll:\t' + str(test_loss) + '\n'
            # log.write(buffer)
    clock.check_HMS()

    print('Start pre-training discriminator...')
    # Train 3 epochs on the generated data and do this for PRE_EPOCH_DIS_NUM rounds
    for _ in range(PRE_EPOCH_DIS_NUM):
        generate_samples(sess, generator, BATCH_SIZE, generated_num, negative_file, cond, vocab)
        dis_data_loader.load_train_data(positive_file, negative_file)
        for _ in range(3):
            dis_data_loader.reset_pointer()
            for it in range(dis_data_loader.num_batch):
                x_batch, y_batch = dis_data_loader.next_batch()
                feed = {
                    discriminator.input_x: x_batch,
                    discriminator.input_y: y_batch,
                    discriminator.dropout_keep_prob: dis_dropout_keep_prob
                }
                _ = sess.run(discriminator.train_op, feed)
    clock.check_HMS()

    rollout = ROLLOUT(generator, 0.8, SEQ_LENGTH)

    print('#########################################################################')
    print('Start Adversarial Training...')
    log.write('adversarial training...\n')
    for total_batch in range(TOTAL_BATCH):
        # Train the generator for one step
        for it in range(1):
            if cond:
                cond_batch = vocab.choice_cond(BATCH_SIZE)
                samples = generator.generate(sess, cond=cond_batch)
                rewards = rollout.get_reward(sess, samples, 16, discriminator, cond=cond_batch)
            else:
                samples = generator.generate(sess)
                rewards = rollout.get_reward(sess, samples, 16, discriminator)
            feed = {generator.x: samples, generator.rewards: rewards}
            if cond:
                feed[generator.cond] = cond_batch
            _ = sess.run(generator.g_updates, feed_dict=feed)

        # Test
        if total_batch % 5 == 0 or total_batch == TOTAL_BATCH - 1:
            generate_samples(sess, generator, BATCH_SIZE, generated_num, eval_file, cond, vocab)
            # likelihood_data_loader.create_batches(eval_file)
            # test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            # buffer = 'epoch:\t' + str(total_batch) + '\tnll:\t' + str(test_loss) + '\n'
            # print('total_batch:', total_batch, 'test_loss:', test_loss)
            # log.write(buffer)
        if total_batch % 20 == 0 or total_batch == TOTAL_BATCH - 1:
            if cond:
                vocab.id2word(eval_file, generated_haiku_with_kigo_file.format(total_batch))
            else:
                vocab.id2word(eval_file, generated_haiku_file.format(total_batch))

        # Update roll-out parameters
        rollout.update_params()

        # Train the discriminator
        for _ in range(5):
            generate_samples(sess, generator, BATCH_SIZE, generated_num, negative_file, cond, vocab)
            dis_data_loader.load_train_data(positive_file, negative_file)
            for _ in range(3):
                dis_data_loader.reset_pointer()
                for it in range(dis_data_loader.num_batch):
                    x_batch, y_batch = dis_data_loader.next_batch()
                    feed = {
                        discriminator.input_x: x_batch,
                        discriminator.input_y: y_batch,
                        discriminator.dropout_keep_prob: dis_dropout_keep_prob
                    }
                    _ = sess.run(discriminator.train_op, feed)
    clock.check_HMS()

    saver = tf.train.Saver()
    saver.save(sess, output_generator)
    log.close()
os.environ["CUDA_VISIBLE_DEVICES"] = '{0}'.format(
    str(cuda_device_id) if cuda_device_id is not None else '')
if cuda_device_id is not None and torch.cuda.is_available():
    device = 'cuda:{0:d}'.format(0)
else:
    device = torch.device('cpu')
print(f'dtype: {dtype}, device: {device}, cuda_device_id {cuda_device_id}')

alphabet = [
    'а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п',
    'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ь', 'ы', 'ъ', 'э', 'ю', 'я',
    ' ', '<blank>'
]
vocab = Vocab(alphabet)
blank_index = get_blank_index(vocab)

audio_transforms = get_default_audio_transforms()
# audio_transforms = None
sample_rate = 8000

# ## Load Common Voice dataset
common_voice_val_manifest_path = '/home/e.chuykova/data/val.txt'
common_voice_test_manifest_path = '/home/e.chuykova/data/test.txt'
common_voice_train_manifest_path = '/home/e.chuykova/data/train.txt'

common_voice_val_dataset = AudioDataset(
    common_voice_val_manifest_path,
def __init__(self, exp_name='20181212_214746'):
    self.vocab = Vocab()
    self.model = load_model(exp_name)
    self.model.eval()
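# --- Hedged usage sketch (not from the original source): the wrapper class name
# `Predictor` is hypothetical; it loads the best checkpoint of the named
# experiment and keeps the model in eval mode.
predictor = Predictor(exp_name='20181212_214746')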
def __init__(self, path, dataset, *args, **kwargs):
    self.dataset = dataset

    if self.dataset == "generic_dataset":
        encode_kwargs = dict(
            add_eos=kwargs.pop('add_eos', False),
            add_double_eos=kwargs.pop('add_double_eos', False),
            ordered=True,
            verbose=True,
        )
        if kwargs.get('vocab_file') is not None:
            kwargs['vocab_file'] = os.path.join(path, kwargs['vocab_file'])
        print(self.dataset, 'vocab params', kwargs)

    self.vocab = Vocab(*args, **kwargs)

    if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
        self.vocab.count_file(os.path.join(path, "train.txt"))
        self.vocab.count_file(os.path.join(path, "valid.txt"))
        self.vocab.count_file(os.path.join(path, "test.txt"))
    elif self.dataset == "generic_dataset" and not self.vocab.vocab_file:
        self.vocab.count_file(os.path.join(path, "train.txt"))
        self.vocab.count_file(os.path.join(path, "valid.txt"))
        self.vocab.count_file(os.path.join(path, "test.txt"))
    elif self.dataset == "wt103":
        self.vocab.count_file(os.path.join(path, "train.txt"))
    elif self.dataset == "lm1b":
        train_path_pattern = os.path.join(
            path, "1-billion-word-language-modeling-benchmark-r13output",
            "training-monolingual.tokenized.shuffled", "news.en-*")
        train_paths = glob(train_path_pattern)
        # the vocab will load from file when build_vocab() is called
        # for train_path in sorted(train_paths):
        #     self.vocab.count_file(train_path, verbose=True)

    self.vocab.build_vocab()

    if self.dataset in ["ptb", "wt2", "wt103"]:
        self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
        self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)
    elif self.dataset == "generic_dataset":
        self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), **encode_kwargs)
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), **encode_kwargs)
        self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), **encode_kwargs)
    elif self.dataset in ["enwik8", "text8"]:
        self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False)
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
        self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False)
    elif self.dataset == "lm1b":
        self.train = train_paths
        valid_path = os.path.join(path, "valid.txt")
        test_path = valid_path
        self.valid = self.vocab.encode_file(valid_path, ordered=True, add_double_eos=True)
        self.test = self.vocab.encode_file(test_path, ordered=True, add_double_eos=True)

    if self.dataset == "wt103":
        self.cutoffs = [0, 20000, 40000, 200000] + [len(self.vocab)]
    elif self.dataset == "generic_dataset":
        with open(os.path.join(path, "cutoffs.json")) as f:
            self.cutoffs = json.load(f)
    elif self.dataset == "lm1b":
        self.cutoffs = [0, 60000, 100000, 640000] + [len(self.vocab)]
    else:
        self.cutoffs = []
def get_all_data(args, training=True, batch_size=100):
    # NOTE: the original annotation Tuple[DataLoader, DataLoader, DataLoader] was
    # dropped; the function actually returns seven values (see the return below).

    # evaluation batch size
    eval_batch = args["eval_batch"] if args["eval_batch"] else batch_size

    # pickle file path
    if args['path']:
        saving_folder_path = args['path']
    else:
        saving_folder_path = 'save/{}-{}-{}-{}/'.format(
            args["decoder"], args["addName"], args['dataset'], args['task'])
    iprint('Path to save data: ' + saving_folder_path)
    if not os.path.exists(saving_folder_path):
        os.makedirs(saving_folder_path)

    # read domain-slot pairs
    ontology = json.load(open(FILE_ONTOLOGY, 'r'))
    all_slots = get_slot_info(ontology)

    # vocab
    vocab_name = 'vocab-all.pkl' if args["all_vocab"] else 'vocab-train.pkl'
    mem_vocab_name = 'mem-vocab-all.pkl' if args["all_vocab"] else 'mem-vocab-train.pkl'

    # if vocab files exist, read them in, otherwise create new ones
    if os.path.exists(saving_folder_path + vocab_name) and os.path.exists(saving_folder_path + mem_vocab_name):
        iprint('Loading saved vocab files...')
        with open(saving_folder_path + vocab_name, 'rb') as handle:
            vocab = pickle.load(handle)
        with open(saving_folder_path + mem_vocab_name, 'rb') as handle:
            mem_vocab = pickle.load(handle)
    else:
        vocab = Vocab()
        vocab.index_words(all_slots, 'slot')
        mem_vocab = Vocab()
        mem_vocab.index_words(all_slots, 'slot')

    if training:
        pair_train, train_max_len, slot_train, train_dataloader = get_data(
            args=args, file=FILE_TRAIN, slots=all_slots, dataset='train', vocab=vocab,
            mem_vocab=mem_vocab, training=training, batch_size=batch_size, shuffle=True)
        nb_train_vocab = vocab.n_words
    else:
        pair_train, train_max_len, slot_train, train_dataloader, nb_train_vocab = [], 0, {}, [], 0

    pair_dev, dev_max_len, slot_dev, dev_dataloader = get_data(
        args=args, file=FILE_DEV, slots=all_slots, dataset='dev', vocab=vocab,
        mem_vocab=mem_vocab, training=training, batch_size=eval_batch, shuffle=False)
    pair_test, test_max_len, slot_test, test_dataloader = get_data(
        args=args, file=FILE_TEST, slots=all_slots, dataset='test', vocab=vocab,
        mem_vocab=mem_vocab, training=training, batch_size=eval_batch, shuffle=False)

    iprint('Dumping vocab files...')
    with open(saving_folder_path + vocab_name, 'wb') as handle:
        pickle.dump(vocab, handle)
    with open(saving_folder_path + mem_vocab_name, 'wb') as handle:
        pickle.dump(mem_vocab, handle)

    embedding_dump_path = 'data/embedding{}.json'.format(len(vocab.index2word))
    if not os.path.exists(embedding_dump_path) and args["load_embedding"]:
        dump_pretrained_emb(vocab.word2index, vocab.index2word, embedding_dump_path)

    test_4d = []
    if args['except_domain'] != '':
        pair_test_4d, _, _, test_4d = get_data(
            args=args,  # bug fix: pass args, as in the other get_data calls
            file=FILE_TEST, slots=all_slots, dataset='dev', vocab=vocab,
            mem_vocab=mem_vocab, training=training, batch_size=eval_batch, shuffle=False)

    max_word = max(train_max_len, dev_max_len, test_max_len) + 1

    iprint('Read %s pairs train' % len(pair_train))
    iprint('Read %s pairs dev' % len(pair_dev))
    iprint('Read %s pairs test' % len(pair_test))
    iprint('Vocab_size: %s' % vocab.n_words)
    iprint('Vocab_size Training %s' % nb_train_vocab)
    iprint('Vocab_size Belief %s' % mem_vocab.n_words)
    iprint('Max. length of dialog words for RNN: %s' % max_word)
    # iprint('USE_CUDA={}'.format(USE_CUDA))

    # slots_list = [all_slots, slot_train, slot_dev, slot_test]
    slots_dict = {
        'all': all_slots,
        'train': slot_train,
        'val': slot_dev,
        'test': slot_test
    }
    iprint('[Train Set & Dev Set Slots]: Number is {} in total'.format(len(slots_dict['val'])))
    iprint(slots_dict['val'])
    iprint('[Test Set Slots]: Number is {} in total'.format(len(slots_dict['test'])))
    iprint(slots_dict['test'])

    vocabs = [vocab, mem_vocab]
    return train_dataloader, dev_dataloader, test_dataloader, test_4d, vocabs, slots_dict, nb_train_vocab
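# --- Hedged usage sketch (not from the original source): args is a dict-like
# config; only the keys referenced above are shown, and every value is made up.
args = {
    'eval_batch': 0, 'path': '', 'decoder': 'TRADE', 'addName': '', 'dataset': 'multiwoz',
    'task': 'dst', 'all_vocab': True, 'load_embedding': False, 'except_domain': '',
}
train_loader, dev_loader, test_loader, test_4d, (vocab, mem_vocab), slots_dict, nb_train_vocab = \
    get_all_data(args, training=True, batch_size=32)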
def main():
    clock = Clock()
    clock.start()
    random.seed(SEED)
    np.random.seed(SEED)
    assert START_TOKEN == 0

    vocab = Vocab()
    vocab.construct(parsed_tweet_file)
    vocab.word2id(parsed_tweet_file, positive_file)
    UNK = vocab.dic.token2id[u'<UNK>']

    gen_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH, UNK)
    likelihood_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH, UNK)  # For testing
    vocab_size = 5000  # note: hardcoded rather than len(vocab.dic.token2id)
    dis_data_loader = Dis_dataloader(BATCH_SIZE, SEQ_LENGTH, UNK)

    generator = Generator(vocab_size, BATCH_SIZE, EMB_DIM, HIDDEN_DIM, SEQ_LENGTH, START_TOKEN)
    with open('save/target_params.pkl', 'rb') as f:  # binary mode for pickle under Python 3
        target_params = pickle.load(f)
    target_lstm = TARGET_LSTM(vocab_size, BATCH_SIZE, EMB_DIM, HIDDEN_DIM,
                              SEQ_LENGTH, START_TOKEN, target_params)  # The oracle model
    discriminator = Discriminator(sequence_length=SEQ_LENGTH, num_classes=2, vocab_size=vocab_size,
                                  embedding_size=dis_embedding_dim, filter_sizes=dis_filter_sizes,
                                  num_filters=dis_num_filters, l2_reg_lambda=dis_l2_reg_lambda)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    # First, use the oracle model to provide the positive examples,
    # which are sampled from the oracle data distribution
    # generate_samples(sess, target_lstm, BATCH_SIZE, generated_num, positive_file)
    gen_data_loader.create_batches(positive_file)

    log = open('save/experiment-log.txt', 'w')

    # pre-train generator
    print('Start pre-training...')
    log.write('pre-training...\n')
    for epoch in range(PRE_EPOCH_NUM):
        loss = pre_train_epoch(sess, generator, gen_data_loader)
        if epoch % 5 == 0:
            generate_samples(sess, generator, BATCH_SIZE, generated_num, eval_file)
            likelihood_data_loader.create_batches(eval_file)
            test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            print('pre-train epoch', epoch, 'test_loss', test_loss)
            buffer = 'epoch:\t' + str(epoch) + '\tnll:\t' + str(test_loss) + '\n'
            log.write(buffer)
    clock.check_HMS()

    print('Start pre-training discriminator...')
    # Train 3 epochs on the generated data and do this for 50 rounds
    for _ in range(50):
        generate_samples(sess, generator, BATCH_SIZE, generated_num, negative_file)
        dis_data_loader.load_train_data(positive_file, negative_file)
        for _ in range(3):
            dis_data_loader.reset_pointer()
            for it in range(dis_data_loader.num_batch):
                x_batch, y_batch = dis_data_loader.next_batch()
                feed = {
                    discriminator.input_x: x_batch,
                    discriminator.input_y: y_batch,
                    discriminator.dropout_keep_prob: dis_dropout_keep_prob
                }
                _ = sess.run(discriminator.train_op, feed)
    clock.check_HMS()

    rollout = ROLLOUT(generator, 0.8, SEQ_LENGTH)

    print('#########################################################################')
    print('Start Adversarial Training...')
    log.write('adversarial training...\n')
    for total_batch in range(TOTAL_BATCH):
        # Train the generator for one step
        for it in range(1):
            samples = generator.generate(sess)
            rewards = rollout.get_reward(sess, samples, 16, discriminator)
            feed = {generator.x: samples, generator.rewards: rewards}
            _ = sess.run(generator.g_updates, feed_dict=feed)

        # Test
        if total_batch % 5 == 0 or total_batch == TOTAL_BATCH - 1:
            generate_samples(sess, generator, BATCH_SIZE, generated_num, eval_file)
            likelihood_data_loader.create_batches(eval_file)
            test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            buffer = 'epoch:\t' + str(total_batch) + '\tnll:\t' + str(test_loss) + '\n'
            print('total_batch:', total_batch, 'test_loss:', test_loss)
            log.write(buffer)
            vocab.id2word(eval_file, generated_tweet_file.format(total_batch))

        # Update roll-out parameters
        rollout.update_params()

        # Train the discriminator
        for _ in range(5):
            generate_samples(sess, generator, BATCH_SIZE, generated_num, negative_file)
            dis_data_loader.load_train_data(positive_file, negative_file)
            for _ in range(3):
                dis_data_loader.reset_pointer()
                for it in range(dis_data_loader.num_batch):
                    x_batch, y_batch = dis_data_loader.next_batch()
                    feed = {
                        discriminator.input_x: x_batch,
                        discriminator.input_y: y_batch,
                        discriminator.dropout_keep_prob: dis_dropout_keep_prob
                    }
                    _ = sess.run(discriminator.train_op, feed)
    clock.check_HMS()

    log.close()
def inference(n_token, cutoffs, ps_device):
    dataset_name = "doupo"
    tmp_Vocab = Vocab()
    tmp_Vocab.count_file("../data/{}/train.txt".format(dataset_name), add_eos=False)
    tmp_Vocab.build_vocab()
    n_token = len(tmp_Vocab)
    # print(tmp_Vocab.idx2sym)

    test_list = tf.placeholder(tf.int64, shape=[1, None])
    dataset = tf.data.Dataset.from_tensors(test_list)
    # dataset = dataset.batch(1, drop_remainder=True)
    iterator = dataset.make_initializable_iterator()
    input_feed = iterator.get_next()
    inputs = tf.split(input_feed, FLAGS.num_core_per_host, 0)
    # inputs = input_feed

    per_core_bsz = 1
    tower_mems, tower_losses, tower_new_mems = [], [], []
    tower_output = []
    tower_mems_id = []
    tower_new_mems_id = []
    tower_attn_prob = []

    for i in range(FLAGS.num_core_per_host):
        with tf.device(assign_to_gpu(i, ps_device)), \
                tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            mems_i = [tf.placeholder(tf.float32, [FLAGS.mem_len, per_core_bsz, FLAGS.d_model])
                      for _ in range(FLAGS.n_layer)]
            mems_i_id = [tf.placeholder(tf.int64, [FLAGS.mem_len, per_core_bsz])
                         for _ in range(FLAGS.n_layer)]
            new_mems_i, output_i, new_mems_i_id, attn_prob_i = single_core_graph_for_inference(
                n_token=n_token,
                cutoffs=cutoffs,
                is_training=False,
                inp=inputs[i],
                mems=mems_i,
                mems_id=mems_i_id)
            tower_mems.append(mems_i)
            tower_new_mems.append(new_mems_i)
            tower_output.append(output_i)
            tower_mems_id.append(mems_i_id)
            tower_new_mems_id.append(new_mems_i_id)
            tower_attn_prob.append(attn_prob_i)

    # Evaluation loop
    tower_mems_np = [
        [np.zeros([FLAGS.mem_len, per_core_bsz, FLAGS.d_model], dtype=np.float32)
         for layer in range(FLAGS.n_layer)]
        for core in range(FLAGS.num_core_per_host)
    ]
    tower_mems_id_np = [
        [np.zeros([FLAGS.mem_len, per_core_bsz], dtype=np.float32)
         for layer in range(FLAGS.n_layer)]
        for core in range(FLAGS.num_core_per_host)
    ]

    saver = tf.train.Saver()
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())
        if FLAGS.eval_ckpt_path is None:
            eval_ckpt_path = tf.train.latest_checkpoint(FLAGS.model_dir)
        else:
            eval_ckpt_path = FLAGS.eval_ckpt_path
        print('eval_ckpt_path:', eval_ckpt_path)
        saver.restore(sess, eval_ckpt_path)
        # attention_score = tf.get_variable('transformer/layer_2/rel_attn/transpose_1:0')
        fetches = [tower_new_mems, tower_output, tower_new_mems_id, tower_attn_prob,
                   'transformer/adaptive_embed/lookup_table:0']

        while True:
            input_text = input("seed text >>> ")
            while not input_text:
                print('Prompt should not be empty!')
                input_text = input("Model prompt >>> ")
            encoded_input = tmp_Vocab.encode_sents(input_text, ordered=True)
            with open('{}.txt'.format(dataset_name), 'a') as f:
                f.write('-' * 100 + '\n')
                f.write('input:\n')
                f.write(input_text + '\n')

            output_len = 200
            progress = ProgressBar()
            for step in progress(range(output_len)):
                time.sleep(0.01)
                feed_dict = {}
                for i in range(FLAGS.num_core_per_host):
                    for m, m_np in zip(tower_mems[i], tower_mems_np[i]):
                        feed_dict[m] = m_np
                    for id, id_np in zip(tower_mems_id[i], tower_mems_id_np[i]):
                        feed_dict[id] = id_np
                sess.run(iterator.initializer, feed_dict={test_list: [encoded_input]})
                fetched = sess.run(fetches, feed_dict=feed_dict)
                tower_mems_np, output = fetched[:2]
                tower_mems_id_np = fetched[2]
                attn_prob = fetched[3]
                lookup_table = fetched[4]
                # print(attention_score)
                # print(np.array(lookup_table).shape)
                # print(np.array(tower_mems_id_np).shape)
                tmp_list = output[0][-1][0]
                tmp_list = tmp_list.tolist()

                # Below are six ways to post-process the output distribution; keep
                # the one you need and comment out the others.
                # todo take the top-1 token
                index = top_one_result(tmp_list)
                # todo sample for diversity
                # index = gen_diversity(tmp_list)
                # todo generate based on a keyword
                # index = gen_on_keyword(tmp_Vocab, '喜', tmp_list, lookup_table)
                # todo visualize candidate tokens
                # visualize_prob(tmp_Vocab, tmp_list,
                #                '../exp_result/{}/candidates'.format(dataset_name + 'mem_len500'), len(input_text))
                # todo visualize attention per layer
                # visualize_attention_per_layer(tmp_Vocab, tower_mems_id_np, attn_prob, index,
                #                               '../exp_result/{}/attention_per_layer'.format(dataset_name + 'mem_len500'),
                #                               len(input_text))
                # todo visualize attention per head
                # visualize_attention_per_head(tmp_Vocab, tower_mems_id_np, attn_prob, index,
                #                              '../exp_result/{}/attention_per_head'.format(dataset_name + '_repeat'),
                #                              len(input_text))

                input_text += tmp_Vocab.get_sym(index) if tmp_Vocab.get_sym(index) != '<eos>' else '\n'
                encoded_input = [index]

            print(input_text)
            with open('{}.txt'.format(dataset_name), 'a') as f:
                f.write('output:\n')
                f.write(input_text + '\n')
                f.write('-' * 100 + '\n')
def main(unused_argv):
    del unused_argv  # Unused

    tf.logging.set_verbosity(tf.logging.INFO)

    # Get corpus info
    corpus_info = data_utils.get_corpus_info(FLAGS.corpus_info_path)
    n_token = corpus_info["vocab_size"]
    cutoffs = corpus_info["cutoffs"][1:-1]
    tf.logging.info("n_token {}".format(n_token))

    tmp_Vocab = Vocab(special=["<bos>", "<eos>", "<UNK>"])
    tmp_Vocab.count_file("../data/{}/train.txt".format(FLAGS.dataset), add_eos=False)
    tmp_Vocab.build_vocab()

    if FLAGS.do_sent_ppl_pred:
        encoded_txt_input = []
        txt_input = []
        input_csv = []
        with open(FLAGS.input_file_dir, "r") as read_file:
            csv_reader = csv.reader(read_file)
            for line in csv_reader:
                if len(line[0].strip()) != 0:  # bug fix: compare the length, not the string, to 0
                    input_csv.append(line)
        for i in range(1, len(input_csv)):
            txt_input.append(input_csv[i][0].strip())
            encoded_txt_input.append(list(tmp_Vocab.encode_sents(
                input_csv[i][0].strip(), add_eos=True, ordered=True)))
        encoded_txt_input = [
            line[:FLAGS.limit_len] if len(line) > FLAGS.limit_len else line
            for line in encoded_txt_input
        ]
        encoded_txt_input = np.array(encoded_txt_input)
        input_csv[0].append("ppl")

        pool = multiprocessing.Pool(FLAGS.multiprocess)
        parti_len = len(encoded_txt_input) // FLAGS.multiprocess
        pro_res_l = []
        for i in range(FLAGS.multiprocess):
            print("Setting process-%s" % i)
            # TODO: add a step that controls which gpu:xx is used
            # (fall back to the next GPU once gpu:1 is full)
            if i + 1 == FLAGS.multiprocess:
                end = len(encoded_txt_input)
            else:
                end = (i + 1) * parti_len
            pro_res_l.append(pool.apply_async(
                sent_ppl, args=(encoded_txt_input[i * parti_len:end], n_token, cutoffs, "/gpu:1")))
        res_l = []
        for i in range(len(pro_res_l)):
            proc_i_res = pro_res_l[i].get()
            res_l.extend(proc_i_res)
        pool.close()
        pool.join()
        print('All subprocesses done.')
        tf.logging.info('#time: {}'.format(time.time()))

        for i in range(1, len(input_csv)):
            input_csv[i].append(res_l[i - 1])
        output_df = pd.DataFrame(input_csv[1:], columns=input_csv[0])
        output_df.to_csv(FLAGS.output_file_dir, sep=",", index=False, encoding="utf-8-sig")
        with open("non_batch_ref_output.txt", "w") as write_res:
            for i in range(len(txt_input)):
                write_res.write(txt_input[i] + " " + str(encoded_txt_input[i]) + " " + str(res_l[i]) + "\n")
        # Check that the number of results is right, i.e. that multiprocessing worked correctly
        print(len(res_l))
    elif FLAGS.do_sent_gen:
        txt_gen_list = []
        with open(FLAGS.input_txt_dir, "r") as read_txt:
            for input_txt in read_txt:
                if len(input_txt.strip()) != 0:
                    txt_gen_list.append(sent_gen(tmp_Vocab, input_txt.strip(), n_token, cutoffs, "/gpu:1"))
        with open("sent_generation.txt", "w") as write_res:
            for line in txt_gen_list:
                write_res.write(line + "\n")