def __init__(self, dataset_path, ngram_size, vocab_path):
    """Read an n-gram file and expose it as int32 Theano variables.

    Every line of ``dataset_path`` is split on whitespace and its tokens
    are converted to ids with ``VocabManager.get_ids_given_word_list``.
    The id rows are stacked into one array; the last column becomes the
    target (``shared_y``) and the remaining columns the context
    (``shared_x``).

    Args:
        dataset_path: Path of the UTF-8 encoded n-gram file, one n-gram
            per line.
        ngram_size: Accepted for interface compatibility; this
            constructor does not read it.
        vocab_path: Path handed to ``VocabManager`` to build the
            word-to-id mapping.

    Attributes set:
        num_sentences: Initialised to 0 and not updated here.
        num_samples: Number of rows (lines) read from the dataset.
        shared_x: Context ids, a Theano shared variable cast to int32.
        shared_y: Target ids, a Theano shared variable cast to int32.
    """
    L.info("Initializing dataset from: " + dataset_path)
    vocab = VocabManager(vocab_path)
    self.num_sentences = 0

    # The context manager guarantees the file is closed even when the
    # vocabulary lookup or the decoding of a line raises.
    with codecs.open(dataset_path, 'r', encoding="UTF-8") as dataset:
        ngrams_list = [
            vocab.get_ids_given_word_list(line.split()) for line in dataset
        ]

    # The 2-D slicing below needs a rectangular array, i.e. every line must
    # yield the same number of ids.
    data = np.asarray(ngrams_list)
    x = data[:, 0:-1]
    y = data[:, -1]

    self.num_samples = y.shape[0]
    self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
# Load the n-grams to classify and the vocabulary used to turn predicted ids
# back into words.
from dlm.io.ngramsReader import NgramsReader
from dlm.io.vocabReader import VocabManager

testset = NgramsReader(
    dataset_path=args.input_path,
    ngram_size=classifier.ngram_size,
    vocab_path=args.vocab_path
)
vocab = VocabManager(args.vocab_path)

# Optional restricted vocabulary: one entry per line, surrounding whitespace
# stripped. Both lists stay empty when no path is supplied.
restricted_ids = []
restricted_vocab = []
if args.restricted_vocab_path:
    with open(args.restricted_vocab_path) as restricted_vocab_file:
        restricted_vocab.extend(entry.strip() for entry in restricted_vocab_file)
    restricted_ids = vocab.get_ids_given_word_list(restricted_vocab)

# Build the evaluator over the test set and the classifier.
# NOTE: `eval` here is a module exposing `Evaluator`; it shadows the builtin.
evaluator = eval.Evaluator(testset, classifier)

# Write one predicted word per sample, in sample order.
if args.output_path:
    with open(args.output_path, "w") as output:
        for sample_index in xrange(testset._get_num_samples()):
            predicted_id = evaluator.get_class(sample_index, restricted_ids)
            predicted_word = vocab.get_word_given_id(predicted_id)
            output.write(predicted_word + "\n")
# Dataset and vocabulary setup for classification.
from dlm.io.ngramsReader import NgramsReader
from dlm.io.vocabReader import VocabManager

vocab_path = args.vocab_path
testset = NgramsReader(dataset_path=args.input_path,
                       ngram_size=classifier.ngram_size,
                       vocab_path=vocab_path)
vocab = VocabManager(vocab_path)

# Restricted vocabulary (optional): read the stripped lines of the file and
# convert them to ids; without a path both lists remain empty.
restricted_vocab = []
restricted_ids = []
if args.restricted_vocab_path:
    with open(args.restricted_vocab_path) as restricted_vocab_file:
        restricted_vocab = [raw.strip() for raw in restricted_vocab_file]
    restricted_ids = vocab.get_ids_given_word_list(restricted_vocab)

# Evaluator wrapping the test set and the classifier.
# NOTE: `eval` refers to a module that provides `Evaluator`, not the builtin.
evaluator = eval.Evaluator(testset, classifier)

# Emit the predicted word for every sample, one per line. The generator is
# consumed lazily, so each sample is classified right before it is written.
if args.output_path:
    with open(args.output_path, 'w') as output:
        output.writelines(
            vocab.get_word_given_id(evaluator.get_class(idx, restricted_ids)) + '\n'
            for idx in xrange(testset._get_num_samples())
        )