def setVocab(self, args):
    """Load the cached source vocabulary, or build it and cache it.

    The cache path is ``./datasets/vocab_<args.dataname>.bin``. If that
    file exists it is read with ``Vocabulary.load``. Otherwise a new
    vocabulary is created with ``Vocabulary.new`` from the word lists of
    ``args.train_source`` and ``args.n_vocab``, written to the cache path
    with ``save``, and two diagnostic lines are printed.

    Args:
        args: Object exposing ``dataname``, ``train_source`` and
            ``n_vocab`` attributes.

    Returns:
        The vocabulary object; it is also stored on ``self.vocab``.
    """
    vocab_name = "./datasets/vocab_{}.bin".format(args.dataname)

    if os.path.exists(vocab_name):
        src_vocab = Vocabulary.load(vocab_name)
    else:
        # Collect the distinct words with a set comprehension instead of a
        # nested list comprehension run purely for its side effects (which
        # allocated throwaway lists of None).
        distinct_words = {
            word
            for word_arr in gens.word_list(args.train_source)
            for word in word_arr
        }
        # Only used for the diagnostic print below; the "+ 3" presumably
        # accounts for reserved tokens -- TODO confirm against Vocabulary.
        n_vocab = len(distinct_words) + 3
        print("n_vocab:{}".format(n_vocab))
        print("arg_vocab:{}".format(args.n_vocab))

        # word_list is called a second time rather than reusing the
        # iterable consumed above, which may be single-pass.
        src_vocab = Vocabulary.new(
            gens.word_list(args.train_source), args.n_vocab)
        src_vocab.save(vocab_name)

    self.vocab = src_vocab
    return src_vocab
def getBatchGen_test(self, args, is_shuffle=True):
    """Yield batches of encoded test sequences.

    Every sequence returned by ``gens.word_list(args.test_source)`` is
    converted element by element with ``self.vocab.stoi``. The complete
    encoded set is held in memory before the first batch is produced.
    The sequences are then handed, optionally in random order, to
    ``gens.batch`` together with ``args.test_batchsize``, and whatever
    that call yields is re-yielded unchanged.

    Args:
        args: Object exposing ``test_source`` and ``test_batchsize``.
        is_shuffle: When true, the sequence order is randomised with
            ``random.shuffle`` before batching.

    Yields:
        Each item produced by ``gens.batch``.
    """
    encoded = []
    for tokens in gens.word_list(args.test_source):
        encoded.append([self.vocab.stoi(token) for token in tokens])

    # Shuffle a list of positions rather than the encoded data itself.
    order = list(range(len(encoded)))
    if is_shuffle:
        random.shuffle(order)

    batches = gens.batch(
        (encoded[position] for position in order),
        args.test_batchsize,
    )
    for batch in batches:
        yield batch