Example #1
def main(args):
    if args.mode == 'prepare':  # python3 run.py  --mode prepare --pointer-gen
        prepare(args)
    elif args.mode == 'train':  # python3 run.py  --mode train -b 100 -o output --gpu 0  --restore
        train(args)
    elif args.mode == 'eval':
        # python3 run.py --mode eval --eval-model
        evaluate(args)
    elif args.mode == 'decode':
        # python3 run.py --mode decode --beam-size 10 --decode-model output_big_data/model/model-250000 --decode-dir output_big_data/result --gpu 1
        args.batch_size = args.beam_size
        vocab_encoder = Vocab(args, "encoder_vocab")
        vocab_decoder = Vocab(args, "decoder_vocab")
        vocab_user = User_Vocab(args, name="user_vocab")
        test_file = "./test.data"
        #test_file = os.path.join(args.data, 'chat_data/tmp.data')
        # test_file = os.path.join(args.data, 'news_train_span_50.data')
        batcher = TestBatcher(args, vocab_encoder, vocab_decoder, vocab_user,
                              test_file).batcher()
        if args.cpu:
            with tf.device('/cpu:0'):
                model = CommentModel(args, vocab_decoder)
        else:
            model = CommentModel(args, vocab_decoder)

        decoder = BeamSearchDecoder(args, model, batcher, vocab_decoder)
        decoder.decode()
    elif args.mode == 'debug':
        debug(args)
    else:
        raise RuntimeError(f'mode {args.mode} is invalid.')
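The CLI comments in this dispatcher imply a small argparse front end. The sketch below is reconstructed from those usage strings and is an assumption, not the repository's actual parser: flag names come from the comments, while types and defaults are guesses.

import argparse


def build_parser():
    # Hypothetical parser inferred from the usage comments above; not the original run.py code.
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', required=True,
                        choices=['prepare', 'train', 'eval', 'decode', 'debug'])
    parser.add_argument('-b', '--batch-size', type=int, default=100)
    parser.add_argument('-o', '--output', default='output')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--cpu', action='store_true')
    parser.add_argument('--restore', action='store_true')
    parser.add_argument('--pointer-gen', action='store_true')
    parser.add_argument('--beam-size', type=int, default=10)
    parser.add_argument('--decode-model')
    parser.add_argument('--decode-dir')
    parser.add_argument('--eval-model')
    return parser


if __name__ == '__main__':
    main(build_parser().parse_args())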
Example #2
def prepare(args):
    if not os.path.exists(args.records_dir):
        os.makedirs(args.records_dir)

    train_file = os.path.join(args.data, 'chat_data/tmp.data')
    dev_file = os.path.join(args.data, 'chat_data/tmp.data')
    vocab_encoder = Vocab(args, name="encoder_vocab")
    vocab_decoder = Vocab(args, name="decoder_vocab")
    vocab_user = User_Vocab(args, name="user_vocab")
    dataset = Dataset(args, vocab_encoder, vocab_decoder, vocab_user,
                      train_file, dev_file)
    dataset.save_datasets(['train', 'dev'])
Example #3
    def __init__(self,
                 data_dict,
                 train=True,
                 vocabulary=None,
                 support=False,
                 device=None):
        """
        'datas': all_datas
        'maxlen_story': maxlen_story
        'maxlen_query': maxlen_query
        'maxlen_sent': maxlen_sent
        """

        self.examples = data_dict['datas']
        self.maxlen_story = data_dict['maxlen_story']
        self.maxlen_query = data_dict['maxlen_query']
        self.maxlen_sent = data_dict['maxlen_sent']
        self.support = support
        self.device = device
        self.flatten = lambda x: [tkn for sublists in x for tkn in sublists]

        stories, questions, answers, supports = list(zip(*self.examples))
        if train:
            self.vocab = Vocab()
            self._build_vocab(stories, questions, answers)
        else:
            self.vocab = vocabulary
        # numerical & add_pad
        stories, questions, answers = self._preprocess(stories, questions,
                                                       answers)

        if self.support:
            self.data = list(zip(stories, questions, answers, supports))
        else:
            self.data = list(zip(stories, questions, answers))
Example #4
    def __init__(self, filename, vocab_file=None,
                 vocab_dump=None, label_vocab_dump=None,
                 n_prev_turns=0, indices=None):
        with open(filename) as csvfile:
            reader = csv.DictReader(csvfile)
            self.data = [row for row in reader]

        if indices is not None:
            self.data = [self.data[i] for i in indices]

        if "id" in self.data[0]:
            self.id2idx = {row["id"]: i for i, row in enumerate(self.data)}

        self.n_prev_turns = n_prev_turns

        if vocab_dump is None:
            self.vocab = Vocab(vocab_file)
        else:
            with open(vocab_dump, 'rb') as fp:
                self.vocab = pickle.load(fp)
                
        if label_vocab_dump is None:
            labels = [row["label"] for row in self.data]
            self.label_vocab = LabelVocab(labels)
        else:
            with open(label_vocab_dump, 'rb') as fp:
                self.label_vocab = pickle.load(fp)
Example #5
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset == "wt103":
            self.vocab.count_file(os.path.join(path, "train.txt"))
        elif self.dataset == "lm1b":
            train_path_pattern = os.path.join(
                path, "1-billion-word-language-modeling-benchmark-r13output",
                "training-monolingual.tokenized.shuffled", "news.en-*")
            train_paths = glob(train_path_pattern)

            # the vocab will load from file when build_vocab() is called
            # for train_path in sorted(train_paths):
            #   self.vocab.count_file(train_path, verbose=True)

        self.vocab.build_vocab()

        if self.dataset in ["ptb", "wt2", "wt103"]:
            self.train = self.vocab.encode_file(os.path.join(
                path, "train.txt"),
                                                ordered=True)
            self.valid = self.vocab.encode_file(os.path.join(
                path, "valid.txt"),
                                                ordered=True)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"),
                                               ordered=True)
        elif self.dataset in ["enwik8", "text8"]:
            self.train = self.vocab.encode_file(os.path.join(
                path, "train.txt"),
                                                ordered=True,
                                                add_eos=False)
            self.valid = self.vocab.encode_file(os.path.join(
                path, "valid.txt"),
                                                ordered=True,
                                                add_eos=False)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"),
                                               ordered=True,
                                               add_eos=False)
        elif self.dataset == "lm1b":
            self.train = train_paths
            valid_path = os.path.join(path, "valid.txt")
            test_path = valid_path
            self.valid = self.vocab.encode_file(valid_path,
                                                ordered=True,
                                                add_double_eos=True)
            self.test = self.vocab.encode_file(test_path,
                                               ordered=True,
                                               add_double_eos=True)

        if self.dataset == "wt103":
            self.cutoffs = [0, 20000, 40000, 200000] + [len(self.vocab)]
        elif self.dataset == "lm1b":
            self.cutoffs = [0, 60000, 100000, 640000] + [len(self.vocab)]
        else:
            self.cutoffs = []
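Assuming this constructor belongs to a Transformer-XL style Corpus class (the class name and the data directory below are assumptions, not taken from the snippet), it would typically be used like this:

import os

# Hypothetical usage of the constructor above. `Corpus` and the dataset path are
# assumptions; count_file/build_vocab/encode_file are the Vocab calls shown above.
corpus = Corpus(os.path.join('data', 'wikitext-103'), 'wt103')
print(len(corpus.vocab))   # vocabulary size after build_vocab()
print(corpus.cutoffs)      # adaptive-softmax cutoffs, e.g. [0, 20000, 40000, 200000, len(vocab)]
train_ids = corpus.train   # token ids for train.txt, encoded with ordered=True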
Example #6
    def __init__(self, dataset, batch_size=None, vocab_created=False, vocab=None, target_col=None, word2index=None,
             sos_token='<SOS>', eos_token='<EOS>', unk_token='<UNK>', pad_token='<PAD>', min_word_count=5,
             max_vocab_size=None, max_seq_len=0.8, use_pretrained_vectors=False, glove_path='Glove/',
             glove_name='glove.6B.100d.txt', weights_file_name='Glove/weights.npy'):

        if not vocab_created:
            self.vocab = Vocab(dataset, target_col=target_col, word2index=word2index, sos_token=sos_token, eos_token=eos_token,
                               unk_token=unk_token, pad_token=pad_token, min_word_count=min_word_count,
                               max_vocab_size=max_vocab_size, max_seq_len=max_seq_len,
                               use_pretrained_vectors=use_pretrained_vectors, glove_path=glove_path,
                               glove_name=glove_name, weights_file_name=weights_file_name)

            self.dataset = self.vocab.dataset

        else:
            self.dataset = dataset
            self.vocab = vocab

        self.target_col = target_col

        self.word2index = self.vocab.word2index

        if batch_size:
            self.batch_size = batch_size
        else:
            self.batch_size = len(self.dataset)

        self.x_lengths = np.array(self.vocab.x_lengths)

        if self.target_col:
            self.y_lengths = np.array(self.vocab.y_lengths)

        self.pad_token = self.vocab.word2index[pad_token]

        self.sort_and_batch()
Example #7
    def __init__(self,
                 filename,
                 vocab_file=None,
                 vocab_dump=None,
                 label_vocab_dump=None,
                 n_prev_turns=0,
                 text_input=False):
        self.text_input = text_input
        with open(filename) as csvfile:
            reader = csv.DictReader(csvfile)
            self.data = [row for row in reader]
            lattice_reader = LatticeReader(text_input=text_input)
            for i, row in enumerate(tqdm(self.data)):
                row["lattice"] = lattice_reader.read_sent(row["text"], i)
                row["rev_lattice"] = row["lattice"].reversed()

        self.id2idx = {row["id"]: i for i, row in enumerate(self.data)}
        self.n_prev_turns = n_prev_turns
        if vocab_dump is None:
            self.vocab = Vocab(vocab_file)
        else:
            with open(vocab_dump, 'rb') as fp:
                self.vocab = pickle.load(fp)
        if label_vocab_dump is None:
            labels = [row["label"] for row in self.data]
            self.label_vocab = LabelVocab(labels)
        else:
            with open(label_vocab_dump, 'rb') as fp:
                self.label_vocab = pickle.load(fp)
Example #8
 def Vocabulary(self, update, context):
     try:
         chat_message = update.message.text
         x = Vocab(chat_message).mean()
         context.bot.send_message(chat_id=update.effective_chat.id, text=x)
     except KeyError:
         context.bot.send_message(chat_id=update.effective_chat.id,
                                  text="İnvaild Syntax :(")
Example #9
 def word(self, update, context):
     try:
         chat_message = update.message.text
         chat_message = chat_message.lower().capitalize()
         x = Vocab(chat_message).mean()
         context.bot.send_message(chat_id=update.effective_chat.id, text=x)
     except KeyError:
         context.bot.send_message(chat_id=update.effective_chat.id,
                                  text="Invalid Syntax :(")
Example #10
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        self.vocab.count_file(os.path.join(path, "train.txt"))
        self.vocab.build_vocab()

        self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
        self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)

        vocab_len = len(self.vocab)
        self.cutoffs = [0, int(vocab_len * 0.1), int(vocab_len * 0.2), int(vocab_len * 0.4)] + [vocab_len]
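For a concrete vocabulary size, the fractional cutoffs above work out as follows (the vocabulary size here is illustrative only):

# Illustrative arithmetic for the cutoff computation above; 267735 is just an example size.
vocab_len = 267735
cutoffs = [0, int(vocab_len * 0.1), int(vocab_len * 0.2), int(vocab_len * 0.4)] + [vocab_len]
print(cutoffs)  # [0, 26773, 53547, 107094, 267735]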
Example #11
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        self.vocab.count_file(os.path.join(path, "train.txt"))
        self.vocab.build_vocab()

        self.train = self.vocab.encode_file(os.path.join(path, "train.txt"),
                                            ordered=True)
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"),
                                            ordered=True)
        self.test = self.vocab.encode_file(os.path.join(path, "train.txt"),
                                           ordered=True)

        self.cutoffs = []
Example #12
    def __init__(self, path, dataset, *args, **kwargs):

        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        self.vocab.count_file(os.path.join(
            path, "train.txt"))  # update the Vocab counter (number of occurrences of each distinct word)
        self.vocab.count_file(os.path.join(path, "valid.txt"))  # same again, for the validation set

        self.vocab.build_vocab()  # build idx2sym and sym2idx: map words to indices and indices back to words

        self.train = self.vocab.encode_file(os.path.join(path, "train.txt"),
                                            ordered=True)
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"),
                                            ordered=True)
Example #13
def train(**kwargs):
    args = DefaultConfig()
    args.parse(kwargs)
    vocab = Vocab()
    loss_functions = transformer_celoss
    score_functions = rouge_func
    model = getattr(Models, args.model_name)(vocab, args)
    train_loader = get_loaders('train', args.batch_size, 12)
    dev_loader = get_loaders('val', args.batch_size, 12)
    trainer = ScheduledTrainerTrans(args, model, loss_functions, score_functions, train_loader, dev_loader)
    if args.resume is not None:
        trainer.init_trainner(resume_from=args.resume)
    else:
        trainer.init_trainner()
    trainer.train()
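Since the options arrive as **kwargs and are folded into DefaultConfig via args.parse(kwargs), this entry point is presumably called with keyword overrides. A hypothetical invocation; only batch_size and resume are visible in the snippet, any other option name would be a guess:

# Hypothetical call; resumes from a checkpoint path if one is given.
train(batch_size=16, resume=None)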
Example #14
    def __init__(self,
                 filename,
                 vocab_file=None,
                 vocab_dump=None,
                 stop_word_file=None):
        with open(filename) as csvfile:
            reader = csv.DictReader(csvfile)
            data = [row for row in reader]

        self.stop_words = set()
        if stop_word_file is not None:
            for line in open(stop_word_file):
                self.stop_words.add(line.strip())

        datas = []
        count, total = 0, 0
        for row in data:
            ref = row["transcription"]
            hyp = row["hypothesis"]
            score = float(row["score"])
            confs = row["confusion"].split()
            confs = [(confs[i * 3], confs[i * 3 + 1])
                     for i in range(len(confs) // 3 + 1)]
            conf_ids = []
            ref_id = hyp_id = 0
            for ref_w, hyp_w in confs:
                ref_eps = (ref_w == "<eps>")
                hyp_eps = (hyp_w == "<eps>")
                if not ref_eps and not hyp_eps and ref_w != hyp_w:
                    total += 1
                    if ref_w not in self.stop_words and hyp_w not in self.stop_words:
                        conf_ids.append((ref_id, hyp_id))
                    else:
                        count += 1

                if not ref_eps:
                    ref_id += 1
                if not hyp_eps:
                    hyp_id += 1
            datas.append((ref, hyp, conf_ids, score))
        print(count, total)
        self.data = datas

        if vocab_file is not None:
            self.vocab = Vocab(vocab_file)
        elif vocab_dump is not None:
            with open(vocab_dump, 'rb') as fp:
                self.vocab = pickle.load(fp)
Example #15
def load_model(exp_name):
    exp_root = os.path.join(ckpt_root, exp_name)
    best_model_folder = get_best_k_model_path(os.path.join(exp_root, 'saved_models'))[0]
    best_model_folder = os.path.join(exp_root, 'saved_models', best_model_folder)
    model_state = t.load(os.path.join(best_model_folder, 'model'), map_location='cpu')
    try:
        for i in model_state:
            model_state[i] = model_state[i].cpu()
    except:
        pass

    trainner_state = t.load(os.path.join(best_model_folder, 'trainner_state'))
    args = trainner_state['args']

    vocab = Vocab()
    model = getattr(Models, args.model_name)(vocab, args)
    model.load_state_dict(model_state)
    model.eval()
    return model
Example #16
    def __init__(self,
                 filename,
                 vocab_file=None,
                 vocab_dump=None,
                 text_input=False):
        self.text_input = text_input
        with open(filename) as csvfile:
            reader = csv.DictReader(csvfile)
            self.data = [row for row in reader]
            lattice_reader = LatticeReader(text_input=text_input)
            for i, row in enumerate(tqdm(self.data)):
                row["lattice"] = lattice_reader.read_sent(row["text"], i)
                row["rev_lattice"] = row["lattice"].reversed()

        if vocab_dump is None:
            self.vocab = Vocab(vocab_file)
        else:
            with open(vocab_dump, 'rb') as fp:
                self.vocab = pickle.load(fp)
Example #17
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        train_path = os.path.join(path, "train.txt")
        valid_path = os.path.join(path, "valid.txt")
        # test_path = os.path.join(path, "test.txt")

        # self.vocab.count_file(train_path)
        # self.vocab.count_file(valid_path)
        # self.vocab.count_file(test_path)
        self.vocab.build_vocab(add_bytes=True)

        self.train = train_path
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"),
                                            ordered=True,
                                            add_eos=False)
        # self.test  = self.vocab.encode_file(
        #     os.path.join(path, "test.txt"), ordered=True, add_eos=False)
        self.cutoffs = []
Example #18
    def __init__(self,
                 filename,
                 vocab_file=None,
                 vocab_dump=None,
                 label_vocab_dump=None):
        with open(filename) as csvfile:
            reader = csv.DictReader(csvfile)
            self.data = [row for row in reader]

        if vocab_dump is None:
            self.vocab = Vocab(vocab_file)
        else:
            with open(vocab_dump, 'rb') as fp:
                self.vocab = pickle.load(fp)
        if label_vocab_dump is None:
            labels = [row["label"] for row in self.data]
            self.label_vocab = LabelVocab(labels)
        else:
            with open(label_vocab_dump, 'rb') as fp:
                self.label_vocab = pickle.load(fp)
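Several of these dataset constructors can reload their vocabularies from pickle dumps (vocab_dump / label_vocab_dump). Below is a minimal sketch of producing such dumps, assuming the same Vocab and LabelVocab classes as in the examples; the file names are hypothetical.

import csv
import pickle

# Hypothetical one-off script: build the vocabularies once, dump them, and let the
# constructors above reload them through vocab_dump / label_vocab_dump.
vocab = Vocab('vocab.txt')
with open('train.csv') as csvfile:
    labels = [row['label'] for row in csv.DictReader(csvfile)]
label_vocab = LabelVocab(labels)

with open('vocab.pkl', 'wb') as fp:
    pickle.dump(vocab, fp)
with open('label_vocab.pkl', 'wb') as fp:
    pickle.dump(label_vocab, fp)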
Example #19
def train_re(**kwargs):
    args = DefaultConfig()
    args.parse(kwargs)
    vocab = Vocab()
    loss_functions = transformer_celoss
    score_functions = rouge_func
    model = getattr(Models, args.model_name)(vocab, args)
    train_loader = get_loaders('train', args.batch_size, 12)
    dev_loader = get_loaders('val', args.batch_size, 12)
    trainer = ScheduledTrainerTrans(args, model, loss_functions, score_functions, train_loader, dev_loader)
    trainer.init_trainner(resume_from=args.resume)
    # try:
    #     trainer.model.vgg_feature.requires_grad = True
    #     trainer.model.vgg_input.requires_grad = True
    #
    # except:
    #     trainer.model.module.vgg_feature.requires_grad = True
    #     trainer.model.module.vgg_input.requires_grad = True
    # trainer.optim.param_groups[0]['lr'] = 3e-5
    trainer.train()
Example #20
    def __init__(
            self, checkpoint_path='/home/mnakhodnov/sirius-stt/models/8_recovered_v3/epoch_17.pt',
            device=torch.device('cpu'), rescore=True, decoder_kwargs=None
    ):
        if not os.path.exists(checkpoint_path):
            raise ValueError(f'There is no checkpoint in {checkpoint_path}')

        self.device = device
        self.rescore = rescore
        self.decoder_kwargs = decoder_kwargs
        self.checkpoint_path = checkpoint_path

        self._vocab = Vocab(self._alphabet)

        self._num_tokens = get_num_tokens(self._vocab)
        self._blank_index = get_blank_index(self._vocab)

        self._sample_rate = 8000
        self._model_config = {
            'num_mel_bins': 64,
            'hidden_size': 512,
            'num_layers': 4,
            'num_tokens': len(self._vocab.tokens2indices()) - 1,
        }

        self.model = Model(**self._model_config)
        load_from_ckpt(self.model, self.checkpoint_path)
        self.model = self.model.to(device=self.device).eval()

        self.decoder = fast_beam_search_decode
        self._kenlm_binary_path = '/data/mnakhodnov/language_data/cc100/xaa.processed.3.binary'
        if self.decoder_kwargs is None:
            self.decoder_kwargs = {
                'beam_size': 200, 'cutoff_top_n': 33, 'cutoff_prob': 1.0,
                'ext_scoring_func': self._kenlm_binary_path, 'alpha': 1.0, 'beta': 0.3, 'num_processes': 32
            }

        if self.rescore:
            self.rescorer_model = torch.hub.load(
                'pytorch/fairseq', 'transformer_lm.wmt19.ru', tokenizer='moses', bpe='fastbpe', force_reload=False
            ).to(device=device)
Example #21
    def __init__(self, text_path, vocab_file=None, vocab_dump=None):
        self.data = []

        print_time_info("Reading text from {}".format(text_path))

        with open(text_path) as csvfile:
            reader = csv.DictReader(csvfile)
            for i, row in enumerate(reader):
                words = row["text"].split()
                if "id" in row:
                    self.data.append((row["id"], words))
                else:
                    self.data.append((i, words))
        # for line in tqdm(open(text_path)):
        #     uid, *words = line.strip().split()
        #     self.data.append((uid, words))

        if vocab_dump is None:
            self.vocab = Vocab(vocab_file)
        else:
            with open(vocab_dump, 'rb') as fp:
                self.vocab = pickle.load(fp)
Example #22
    def forward(self, position_feature):
        # inputs [B, max_lenth]
        positions_encoded = self.position_encoding(position_feature)
        return positions_encoded


if __name__ == '__main__':
    import ipdb
    from loaders import get_loaders
    from configs_transformer import DefaultConfig
    from tqdm import tqdm
    from vocabulary import Vocab
    args = DefaultConfig
    args.batch_size = 2
    loader = get_loaders('val', args.batch_size, 2)
    vocab = Vocab()

    for i in tqdm(loader):
        feature, caption, lenth = [j for j in i]
        batch_size, c, h, w = feature.size()
        _, n, l = caption.size()
        feature = feature.unsqueeze(1).expand(
            (batch_size, n, c, h, w)).contiguous().view(-1, c, h, w)
        caption = caption.long()
        caption = caption.view(-1, l)

        model = VGGTransformerNew1(vocab, args)
        output_log_prob, output_token = model(feature, caption)
        token = model.greedy_search(feature)
        loss = output_log_prob.sum()
        loss.backward()
Example #23
def main():
    clock = Clock()
    clock.start()
    random.seed(SEED)
    np.random.seed(SEED)
    assert START_TOKEN == 0

    parser = argparse.ArgumentParser(description='conditional SeqGAN')
    parser.add_argument('--conditional',
                        '-c',
                        type=int,
                        default=0,
                        help='If you make SeqGAN conditional, set `-c` 1.')
    args = parser.parse_args()
    cond = args.conditional

    vocab = Vocab()
    vocab.construct(parsed_haiku_file)
    vocab.word2id(parsed_haiku_file, positive_file)
    UNK = vocab.dic.token2id[u'<UNK>']
    COMMA = vocab.dic.token2id[u',']

    gen_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH, COND_LENGTH, UNK)
    # likelihood_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH, COND_LENGTH, UNK) # For testing
    vocab_size = len(vocab.dic.token2id)
    with open(output_token2id, 'w') as f:
        pickle.dump(vocab.dic.token2id, f)
    dis_data_loader = Dis_dataloader(BATCH_SIZE, SEQ_LENGTH, UNK)

    generator = Generator(vocab_size,
                          BATCH_SIZE,
                          EMB_DIM,
                          HIDDEN_DIM,
                          SEQ_LENGTH,
                          COND_LENGTH,
                          START_TOKEN,
                          is_cond=cond)
    # target_params = cPickle.load(open('save/target_params.pkl'))
    # target_lstm = TARGET_LSTM(vocab_size, BATCH_SIZE, EMB_DIM, HIDDEN_DIM, SEQ_LENGTH, START_TOKEN, target_params) # The oracle model

    discriminator = Discriminator(sequence_length=SEQ_LENGTH,
                                  cond_length=COND_LENGTH,
                                  num_classes=2,
                                  vocab_size=vocab_size,
                                  batch_size=BATCH_SIZE,
                                  embedding_size=dis_embedding_dim,
                                  filter_sizes=dis_filter_sizes,
                                  num_filters=dis_num_filters,
                                  l2_reg_lambda=dis_l2_reg_lambda,
                                  is_cond=cond)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    # First, use the oracle model to provide the positive examples, which are sampled from the oracle data distribution
    # generate_samples(sess, target_lstm, BATCH_SIZE, generated_num, positive_file)
    gen_data_loader.create_batches(positive_file)

    if cond:
        vocab.word2id(parsed_kigo_file, positive_condition_file)
        vocab.load_cond(positive_condition_file, COND_LENGTH, UNK)
        gen_data_loader.create_cond_batches(positive_condition_file)

    log = open('save/experiment-log.txt', 'w')
    #  pre-train generator
    print 'Start pre-training...'
    log.write('pre-training...\n')
    for epoch in xrange(PRE_EPOCH_GEN_NUM):
        loss = pre_train_epoch(sess, generator, gen_data_loader, cond=cond)
        if epoch % 5 == 0:
            generate_samples(sess, generator, BATCH_SIZE, generated_num,
                             eval_file, cond, vocab)
            # likelihood_data_loader.create_batches(eval_file)
            # test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            # print 'pre-train epoch ', epoch, 'test_loss ', test_loss
            # buffer = 'epoch:\t'+ str(epoch) + '\tnll:\t' + str(test_loss) + '\n'
            # log.write(buffer)
    clock.check_HMS()

    print 'Start pre-training discriminator...'
    # Train 3 epoch on the generated data and do this for 50 times
    for _ in range(PRE_EPOCH_DIS_NUM):
        generate_samples(sess, generator, BATCH_SIZE, generated_num,
                         negative_file, cond, vocab)
        dis_data_loader.load_train_data(positive_file, negative_file)
        for _ in range(3):
            dis_data_loader.reset_pointer()
            for it in xrange(dis_data_loader.num_batch):
                x_batch, y_batch = dis_data_loader.next_batch()
                feed = {
                    discriminator.input_x: x_batch,
                    discriminator.input_y: y_batch,
                    discriminator.dropout_keep_prob: dis_dropout_keep_prob
                }
                _ = sess.run(discriminator.train_op, feed)
    clock.check_HMS()

    rollout = ROLLOUT(generator, 0.8, SEQ_LENGTH)

    print '#########################################################################'
    print 'Start Adversarial Training...'
    log.write('adversarial training...\n')
    for total_batch in range(TOTAL_BATCH):
        # Train the generator for one step
        for it in range(1):
            if cond:
                cond_batch = vocab.choice_cond(BATCH_SIZE)
                samples = generator.generate(sess, cond=cond_batch)
                rewards = rollout.get_reward(sess,
                                             samples,
                                             16,
                                             discriminator,
                                             cond=cond_batch)
            else:
                samples = generator.generate(sess)
                rewards = rollout.get_reward(sess, samples, 16, discriminator)
            feed = {generator.x: samples, generator.rewards: rewards}
            if cond:
                feed[generator.cond] = cond_batch
            _ = sess.run(generator.g_updates, feed_dict=feed)

        # Test
        if total_batch % 5 == 0 or total_batch == TOTAL_BATCH - 1:
            generate_samples(sess, generator, BATCH_SIZE, generated_num,
                             eval_file, cond, vocab)
            # likelihood_data_loader.create_batches(eval_file)
            # test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            # buffer = 'epoch:\t' + str(total_batch) + '\tnll:\t' + str(test_loss) + '\n'
            # print 'total_batch: ', total_batch, 'test_loss: ', test_loss
            # log.write(buffer)
            if total_batch % 20 == 0 or total_batch == TOTAL_BATCH - 1:
                if cond:
                    vocab.id2word(
                        eval_file,
                        generated_haiku_with_kigo_file.format(total_batch))
                else:
                    vocab.id2word(eval_file,
                                  generated_haiku_file.format(total_batch))

        # Update roll-out parameters
        rollout.update_params()

        # Train the discriminator
        for _ in range(5):
            generate_samples(sess, generator, BATCH_SIZE, generated_num,
                             negative_file, cond, vocab)
            dis_data_loader.load_train_data(positive_file, negative_file)

            for _ in range(3):
                dis_data_loader.reset_pointer()
                for it in xrange(dis_data_loader.num_batch):
                    x_batch, y_batch = dis_data_loader.next_batch()
                    feed = {
                        discriminator.input_x: x_batch,
                        discriminator.input_y: y_batch,
                        discriminator.dropout_keep_prob: dis_dropout_keep_prob
                    }
                    _ = sess.run(discriminator.train_op, feed)
    clock.check_HMS()
    saver = tf.train.Saver()
    saver.save(sess, output_generator)
    log.close()
Example #24
os.environ["CUDA_VISIBLE_DEVICES"] = '{0}'.format(
    str(cuda_device_id) if cuda_device_id is not None else '')
if cuda_device_id is not None and torch.cuda.is_available():
    device = 'cuda:{0:d}'.format(0)
else:
    device = torch.device('cpu')

print(f'dtype: {dtype}, device: {device}, cuda_device_id {cuda_device_id}')

alphabet = [
    'а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н',
    'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ь', 'ы', 'ъ',
    'э', 'ю', 'я', ' ', '<blank>'
]

vocab = Vocab(alphabet)

blank_index = get_blank_index(vocab)

audio_transforms = get_default_audio_transforms()
# audio_transforms = None

sample_rate = 8000

# ## Load Common Voice dataset
common_voice_val_manifest_path = '/home/e.chuykova/data/val.txt'
common_voice_test_manifest_path = '/home/e.chuykova/data/test.txt'
common_voice_train_manifest_path = '/home/e.chuykova/data/train.txt'

common_voice_val_dataset = AudioDataset(
    common_voice_val_manifest_path,
Example #25
 def __init__(self, exp_name='20181212_214746'):
     self.vocab = Vocab()
     self.model = load_model(exp_name)
     self.model.eval()
Example #26
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        if self.dataset == "generic_dataset":
            encode_kwargs = dict(
                add_eos=kwargs.pop('add_eos', False),
                add_double_eos=kwargs.pop('add_double_eos', False),
                ordered=True,
                verbose=True,
            )
            if kwargs.get('vocab_file') is not None:
                kwargs['vocab_file'] = os.path.join(path, kwargs['vocab_file'])

        print(self.dataset, 'vocab params', kwargs)
        self.vocab = Vocab(*args, **kwargs)

        if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset == "generic_dataset" and not self.vocab.vocab_file:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset == "wt103":
            self.vocab.count_file(os.path.join(path, "train.txt"))
        elif self.dataset == "lm1b":
            train_path_pattern = os.path.join(
                path, "1-billion-word-language-modeling-benchmark-r13output",
                "training-monolingual.tokenized.shuffled", "news.en-*")
            train_paths = glob(train_path_pattern)

            # the vocab will load from file when build_vocab() is called
            # for train_path in sorted(train_paths):
            #   self.vocab.count_file(train_path, verbose=True)

        self.vocab.build_vocab()

        if self.dataset in ["ptb", "wt2", "wt103"]:
            self.train = self.vocab.encode_file(os.path.join(
                path, "train.txt"),
                                                ordered=True)
            self.valid = self.vocab.encode_file(os.path.join(
                path, "valid.txt"),
                                                ordered=True)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"),
                                               ordered=True)
        elif self.dataset == "generic_dataset":
            self.train = self.vocab.encode_file(
                os.path.join(path, "train.txt"), **encode_kwargs)
            self.valid = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), **encode_kwargs)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"),
                                               **encode_kwargs)
        elif self.dataset in ["enwik8", "text8"]:
            self.train = self.vocab.encode_file(os.path.join(
                path, "train.txt"),
                                                ordered=True,
                                                add_eos=False)
            self.valid = self.vocab.encode_file(os.path.join(
                path, "valid.txt"),
                                                ordered=True,
                                                add_eos=False)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"),
                                               ordered=True,
                                               add_eos=False)
        elif self.dataset == "lm1b":
            self.train = train_paths
            valid_path = os.path.join(path, "valid.txt")
            test_path = valid_path
            self.valid = self.vocab.encode_file(valid_path,
                                                ordered=True,
                                                add_double_eos=True)
            self.test = self.vocab.encode_file(test_path,
                                               ordered=True,
                                               add_double_eos=True)

        if self.dataset == "wt103":
            self.cutoffs = [0, 20000, 40000, 200000] + [len(self.vocab)]
        elif self.dataset == "generic_dataset":
            with open(os.path.join(path, "cutoffs.json")) as f:
                self.cutoffs = json.load(f)
        elif self.dataset == "lm1b":
            self.cutoffs = [0, 60000, 100000, 640000] + [len(self.vocab)]
        else:
            self.cutoffs = []
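In the generic_dataset branch above, cutoffs.json is read directly into self.cutoffs. A hypothetical way to produce such a file follows; the boundary values are placeholders and must match your vocabulary.

import json

# Hypothetical: write a cutoffs.json for the generic_dataset branch above.
# Values are placeholders; choose boundaries that suit your vocabulary size.
cutoffs = [0, 20000, 40000, 200000, 267735]
with open('cutoffs.json', 'w') as f:
    json.dump(cutoffs, f)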
Example #27
def get_all_data(args, training=True, batch_size=100) -> Tuple[DataLoader, DataLoader, DataLoader]:
    # evaluation batch size
    eval_batch = args["eval_batch"] if args["eval_batch"] else batch_size

    # pickle file path
    if args['path']:
        saving_folder_path = args['path']
    else:
        saving_folder_path = 'save/{}-{}-{}-{}/'.format(args["decoder"], args["addName"], args['dataset'], args['task'])
    iprint('Path to save data: ' + saving_folder_path)

    if not os.path.exists(saving_folder_path):
        os.makedirs(saving_folder_path)

    # read domain-slot pairs
    ontology = json.load(open(FILE_ONTOLOGY, 'r'))
    all_slots = get_slot_info(ontology)

    # vocab
    vocab_name = 'vocab-all.pkl' if args["all_vocab"] else 'vocab-train.pkl'
    mem_vocab_name = 'mem-vocab-all.pkl' if args["all_vocab"] else 'mem-vocab-train.pkl'
    # if vocab files exist, read them in, otherwise we create new ones
    if os.path.exists(saving_folder_path + vocab_name) and os.path.exists(saving_folder_path + mem_vocab_name):
        iprint('Loading saved vocab files...')
        with open(saving_folder_path + vocab_name, 'rb') as handle:
            vocab = pickle.load(handle)
        with open(saving_folder_path + mem_vocab_name, 'rb') as handle:
            mem_vocab = pickle.load(handle)
    else:
        vocab = Vocab()
        vocab.index_words(all_slots, 'slot')
        mem_vocab = Vocab()
        mem_vocab.index_words(all_slots, 'slot')

    if training:
        pair_train, train_max_len, slot_train, train_dataloader = get_data(
            args=args,
            file=FILE_TRAIN,
            slots=all_slots,
            dataset='train',
            vocab=vocab,
            mem_vocab=mem_vocab,
            training=training,
            batch_size=batch_size,
            shuffle=True
        )

        nb_train_vocab = vocab.n_words
    else:
        pair_train, train_max_len, slot_train, train_dataloader, nb_train_vocab = [], 0, {}, [], 0

    pair_dev, dev_max_len, slot_dev, dev_dataloader = get_data(
        args=args,
        file=FILE_DEV,
        slots=all_slots,
        dataset='dev',
        vocab=vocab,
        mem_vocab=mem_vocab,
        training=training,
        batch_size=eval_batch,
        shuffle=False
    )

    pair_test, test_max_len, slot_test, test_dataloader = get_data(
        args=args,
        file=FILE_TEST,
        slots=all_slots,
        dataset='test',
        vocab=vocab,
        mem_vocab=mem_vocab,
        training=training,
        batch_size=eval_batch,
        shuffle=False
    )

    iprint('Dumping vocab files...')
    with open(saving_folder_path + vocab_name, 'wb') as handle:
        pickle.dump(vocab, handle)
    with open(saving_folder_path + mem_vocab_name, 'wb') as handle:
        pickle.dump(mem_vocab, handle)
    embedding_dump_path = 'data/embedding{}.json'.format(len(vocab.index2word))
    if not os.path.exists(embedding_dump_path) and args["load_embedding"]:
        dump_pretrained_emb(vocab.word2index, vocab.index2word, embedding_dump_path)

    test_4d = []
    if args['except_domain'] != '':
        pair_test_4d, _, _, test_4d = get_data(
            file=FILE_TEST,
            slots=all_slots,
            dataset='dev',
            vocab=vocab,
            mem_vocab=mem_vocab,
            training=training,
            batch_size=eval_batch,
            shuffle=False
        )

    max_word = max(train_max_len, dev_max_len, test_max_len) + 1

    iprint('Read %s pairs train' % len(pair_train))
    iprint('Read %s pairs dev' % len(pair_dev))
    iprint('Read %s pairs test' % len(pair_test))
    iprint('Vocab_size: %s' % vocab.n_words)
    iprint('Vocab_size Training %s' % nb_train_vocab)
    iprint('Vocab_size Belief %s' % mem_vocab.n_words)
    iprint('Max. length of dialog words for RNN: %s' % max_word)
    # iprint('USE_CUDA={}'.format(USE_CUDA))

    # slots_list = [all_slots, slot_train, slot_dev, slot_test]
    slots_dict = {
        'all': all_slots,
        'train': slot_train,
        'val': slot_dev,
        'test': slot_test
    }
    iprint('[Train Set & Dev Set Slots]: Number is {} in total'.format(len(slots_dict['val'])))
    iprint(slots_dict['val'])
    iprint('[Test Set Slots]: Number is {} in total'.format(len(slots_dict['test'])))
    iprint(slots_dict['test'])
    vocabs = [vocab, mem_vocab]
    return train_dataloader, dev_dataloader, test_dataloader, test_4d, vocabs, slots_dict, nb_train_vocab
Example #28
def main():
    clock = Clock()
    clock.start()
    random.seed(SEED)
    np.random.seed(SEED)
    assert START_TOKEN == 0
    
    vocab = Vocab()
    vocab.construct(parsed_tweet_file)
    vocab.word2id(parsed_tweet_file, positive_file)
    UNK = vocab.dic.token2id[u'<UNK>']

    gen_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH, UNK)
    likelihood_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH, UNK) # For testing
    vocab_size = 5000
    dis_data_loader = Dis_dataloader(BATCH_SIZE, SEQ_LENGTH, UNK)

    generator = Generator(vocab_size, BATCH_SIZE, EMB_DIM, HIDDEN_DIM, SEQ_LENGTH, START_TOKEN)
    target_params = cPickle.load(open('save/target_params.pkl'))
    target_lstm = TARGET_LSTM(vocab_size, BATCH_SIZE, EMB_DIM, HIDDEN_DIM, SEQ_LENGTH, START_TOKEN, target_params) # The oracle model

    discriminator = Discriminator(sequence_length=SEQ_LENGTH, num_classes=2, vocab_size=vocab_size, embedding_size=dis_embedding_dim, 
                                filter_sizes=dis_filter_sizes, num_filters=dis_num_filters, l2_reg_lambda=dis_l2_reg_lambda)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    # First, use the oracle model to provide the positive examples, which are sampled from the oracle data distribution
    # generate_samples(sess, target_lstm, BATCH_SIZE, generated_num, positive_file)
    gen_data_loader.create_batches(positive_file)

    log = open('save/experiment-log.txt', 'w')
    #  pre-train generator
    print 'Start pre-training...'
    log.write('pre-training...\n')
    for epoch in xrange(PRE_EPOCH_NUM):
        loss = pre_train_epoch(sess, generator, gen_data_loader)
        if epoch % 5 == 0:
            generate_samples(sess, generator, BATCH_SIZE, generated_num, eval_file)
            likelihood_data_loader.create_batches(eval_file)
            test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            print 'pre-train epoch ', epoch, 'test_loss ', test_loss
            buffer = 'epoch:\t'+ str(epoch) + '\tnll:\t' + str(test_loss) + '\n'
            log.write(buffer)
    clock.check_HMS()
    
    print 'Start pre-training discriminator...'
    # Train 3 epoch on the generated data and do this for 50 times
    for _ in range(50):
        generate_samples(sess, generator, BATCH_SIZE, generated_num, negative_file)
        dis_data_loader.load_train_data(positive_file, negative_file)
        for _ in range(3):
            dis_data_loader.reset_pointer()
            for it in xrange(dis_data_loader.num_batch):
                x_batch, y_batch = dis_data_loader.next_batch()
                feed = {
                    discriminator.input_x: x_batch,
                    discriminator.input_y: y_batch,
                    discriminator.dropout_keep_prob: dis_dropout_keep_prob
                }
                _ = sess.run(discriminator.train_op, feed)
    clock.check_HMS()

    rollout = ROLLOUT(generator, 0.8, SEQ_LENGTH)

    print '#########################################################################'
    print 'Start Adversarial Training...'
    log.write('adversarial training...\n')
    for total_batch in range(TOTAL_BATCH):
        # Train the generator for one step
        for it in range(1):
            samples = generator.generate(sess)
            rewards = rollout.get_reward(sess, samples, 16, discriminator)
            feed = {generator.x: samples, generator.rewards: rewards}
            _ = sess.run(generator.g_updates, feed_dict=feed)

        # Test
        if total_batch % 5 == 0 or total_batch == TOTAL_BATCH - 1:
            generate_samples(sess, generator, BATCH_SIZE, generated_num, eval_file)
            likelihood_data_loader.create_batches(eval_file)
            test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            buffer = 'epoch:\t' + str(total_batch) + '\tnll:\t' + str(test_loss) + '\n'
            print 'total_batch: ', total_batch, 'test_loss: ', test_loss
            log.write(buffer)
            vocab.id2word(eval_file, generated_tweet_file.format(total_batch))

        # Update roll-out parameters
        rollout.update_params()

        # Train the discriminator
        for _ in range(5):
            generate_samples(sess, generator, BATCH_SIZE, generated_num, negative_file)
            dis_data_loader.load_train_data(positive_file, negative_file)

            for _ in range(3):
                dis_data_loader.reset_pointer()
                for it in xrange(dis_data_loader.num_batch):
                    x_batch, y_batch = dis_data_loader.next_batch()
                    feed = {
                        discriminator.input_x: x_batch,
                        discriminator.input_y: y_batch,
                        discriminator.dropout_keep_prob: dis_dropout_keep_prob
                    }
                    _ = sess.run(discriminator.train_op, feed)
    clock.check_HMS()
    log.close()
Example #29
def inference(n_token, cutoffs, ps_device):
    dataset_name = "doupo"
    tmp_Vocab = Vocab()
    tmp_Vocab.count_file("../data/{}/train.txt".format(dataset_name), add_eos=False)
    tmp_Vocab.build_vocab()

    n_token = len(tmp_Vocab)
    # print(tmp_Vocab.idx2sym)

    test_list = tf.placeholder(tf.int64, shape=[1, None])
    dataset = tf.data.Dataset.from_tensors(test_list)
    # dataset = dataset.batch(1, drop_remainder=True)

    iterator = dataset.make_initializable_iterator()
    input_feed = iterator.get_next()

    inputs = tf.split(input_feed, FLAGS.num_core_per_host, 0)
    # inputs = input_feed

    per_core_bsz = 1
    tower_mems, tower_losses, tower_new_mems = [], [], []
    tower_output = []
    tower_mems_id = []
    tower_new_mems_id = []
    tower_attn_prob = []

    for i in range(FLAGS.num_core_per_host):
        with tf.device(assign_to_gpu(i, ps_device)), \
             tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            mems_i = [tf.placeholder(tf.float32,
                                     [FLAGS.mem_len, per_core_bsz, FLAGS.d_model])
                      for _ in range(FLAGS.n_layer)]

            mems_i_id = [tf.placeholder(tf.int64,
                                     [FLAGS.mem_len, per_core_bsz])
                      for _ in range(FLAGS.n_layer)]

            new_mems_i, output_i, new_mems_i_id, attn_prob_i = single_core_graph_for_inference(
                n_token=n_token,
                cutoffs=cutoffs,
                is_training=False,
                inp=inputs[i],
                mems=mems_i,
                mems_id=mems_i_id)

            tower_mems.append(mems_i)
            tower_new_mems.append(new_mems_i)
            tower_output.append(output_i)
            tower_mems_id.append(mems_i_id)
            tower_new_mems_id.append(new_mems_i_id)
            tower_attn_prob.append(attn_prob_i)

    # Evaluation loop
    tower_mems_np = [
        [np.zeros([FLAGS.mem_len, per_core_bsz, FLAGS.d_model], dtype=np.float32)
         for layer in range(FLAGS.n_layer)]
        for core in range(FLAGS.num_core_per_host)
    ]

    tower_mems_id_np = [
        [np.zeros([FLAGS.mem_len, per_core_bsz], dtype=np.float32)
         for layer in range(FLAGS.n_layer)]
        for core in range(FLAGS.num_core_per_host)
    ]

    saver = tf.train.Saver()

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())

        if FLAGS.eval_ckpt_path is None:
            eval_ckpt_path = tf.train.latest_checkpoint(FLAGS.model_dir)
        else:
            eval_ckpt_path = FLAGS.eval_ckpt_path
        print('eval_ckpt_path:', eval_ckpt_path)
        saver.restore(sess, eval_ckpt_path)

        # attention_score = tf.get_variable('transformer/layer_2/rel_attn/transpose_1:0')

        fetches = [tower_new_mems,
                   tower_output,
                   tower_new_mems_id,
                   tower_attn_prob,
                   'transformer/adaptive_embed/lookup_table:0']

        while True:
            input_text = input("seed text >>> ")
            while not input_text:
                print('Prompt should not be empty!')
                input_text = input("Model prompt >>> ")
            encoded_input = tmp_Vocab.encode_sents(input_text, ordered=True)

            with open('{}.txt'.format(dataset_name), 'a') as f:
                f.write('-' * 100+'\n')
                f.write('input:\n')
                f.write(input_text+'\n')

            output_len = 200
            progress = ProgressBar()
            for step in progress(range(output_len)):
                time.sleep(0.01)
                feed_dict = {}
                for i in range(FLAGS.num_core_per_host):
                    for m, m_np in zip(tower_mems[i], tower_mems_np[i]):
                        feed_dict[m] = m_np

                    for id, id_np in zip(tower_mems_id[i], tower_mems_id_np[i]):
                        feed_dict[id] = id_np

                sess.run(iterator.initializer, feed_dict={test_list: [encoded_input]})
                fetched = sess.run(fetches, feed_dict=feed_dict)

                tower_mems_np, output = fetched[:2]

                tower_mems_id_np = fetched[2]

                attn_prob = fetched[3]
                lookup_table = fetched[4]
                # print(attention_score)
                # print(np.array(lookup_table).shape)
                # print(np.array(tower_mems_id_np).shape)

                tmp_list = output[0][-1][0]
                tmp_list = tmp_list.tolist()

                # The six ways of handling the decoder output are listed below; keep the one you need and comment out the others
                # todo: take the top-1 token
                index = top_one_result(tmp_list)
                # todo: sample with diversity
                # index = gen_diversity(tmp_list)
                # todo: generate based on a keyword
                # index = gen_on_keyword(tmp_Vocab, '喜', tmp_list, lookup_table)

                # # todo: visualize the candidate tokens
                # visualize_prob(tmp_Vocab, tmp_list,
                # '../exp_result/{}/candidates'.format(dataset_name+'mem_len500'), len(input_text))

                # # # todo: visualize attention per layer
                # visualize_attention_per_layer(tmp_Vocab, tower_mems_id_np, attn_prob, index,
                #                               '../exp_result/{}/attention_per_layer'.format(dataset_name+'mem_len500'),
                #                               len(input_text))

                # # # todo: visualize attention per head
                # visualize_attention_per_head(tmp_Vocab, tower_mems_id_np, attn_prob, index,
                #                              '../exp_result/{}/attention_per_head'.format(dataset_name+'_repeat'),
                #                              len(input_text))

                input_text += tmp_Vocab.get_sym(index) if tmp_Vocab.get_sym(index) != '<eos>' else '\n'
                encoded_input = [index]

            print(input_text)

            with open('{}.txt'.format(dataset_name), 'a') as f:
                f.write('output:\n')
                f.write(input_text+'\n')
                f.write('-'*100+'\n')
Example #30
def main(unused_argv):
    del unused_argv  # Unused

    tf.logging.set_verbosity(tf.logging.INFO)

    # Get corpus info
    corpus_info = data_utils.get_corpus_info(FLAGS.corpus_info_path)
    n_token = corpus_info["vocab_size"]
    cutoffs = corpus_info["cutoffs"][1:-1]
    tf.logging.info("n_token {}".format(n_token))

    tmp_Vocab = Vocab(special=["<bos>", "<eos>", "<UNK>"])
    tmp_Vocab.count_file("../data/{}/train.txt".format(FLAGS.dataset),
                         add_eos=False)
    tmp_Vocab.build_vocab()

    if FLAGS.do_sent_ppl_pred:
        encoded_txt_input = []
        txt_input = []
        input_csv = []
        with open(FLAGS.input_file_dir, "r") as read_file:
            csv_reader = csv.reader(read_file)
            for line in csv_reader:
                if len(line[0].strip()) != 0:
                    input_csv.append(line)

            for i in range(1, len(input_csv)):
                txt_input.append(input_csv[i][0].strip())
                encoded_txt_input.append(list(tmp_Vocab.encode_sents(input_csv[i][0].strip(), \
                    add_eos=True, ordered=True)))

        encoded_txt_input = [
            line[:FLAGS.limit_len] if len(line) > FLAGS.limit_len else line
            for line in encoded_txt_input
        ]
        encoded_txt_input = np.array(encoded_txt_input)

        input_csv[0].append("ppl")

        pool = multiprocessing.Pool(FLAGS.multiprocess)

        parti_len = len(encoded_txt_input) // FLAGS.multiprocess
        pro_res_l = []

        for i in range(FLAGS.multiprocess):
            print("Setting process-%s" % i)
            ### TODO: add a step here to control which gpu:xx is used (when gpu:1 is full, fall back to the next one)

            if i + 1 == FLAGS.multiprocess:
                end = len(encoded_txt_input)
            else:
                end = (i + 1) * parti_len
            pro_res_l.append(pool.apply_async(sent_ppl, \
                args=(encoded_txt_input[i*parti_len:end], n_token, cutoffs, "/gpu:1")))

        res_l = []

        for i in range(len(pro_res_l)):
            proc_i_res = pro_res_l[i].get()
            res_l.extend(proc_i_res)

        pool.close()
        pool.join()
        print('All subprocesses done.')

        tf.logging.info('#time: {}'.format(time.time()))

        for i in range(1, len(input_csv)):
            input_csv[i].append(res_l[i - 1])
        output_df = pd.DataFrame(input_csv[1:], columns=input_csv[0])
        output_df.to_csv(FLAGS.output_file_dir,
                         sep=",",
                         index=False,
                         encoding="utf-8-sig")

        with open("non_batch_ref_output.txt", "w") as write_res:
            for i in range(len(txt_input)):
                write_res.write(txt_input[i] + " " +
                                str(encoded_txt_input[i]) + " " +
                                str(res_l[i]) + "\n")

        # Check whether the length of result is right; Make sure multiprocess work well
        print(len(res_l))

    elif FLAGS.do_sent_gen:
        txt_gen_list = []
        with open(FLAGS.input_txt_dir, "r") as read_txt:
            for input_txt in read_txt:
                if len(input_txt.strip()) != 0:
                    txt_gen_list.append(
                        sent_gen(tmp_Vocab, input_txt.strip(), n_token,
                                 cutoffs, "/gpu:1"))

        with open("sent_generation.txt", "w") as write_res:
            for line in txt_gen_list:
                write_res.write(line + "\n")