Example #1
0
	def __init__(self, dataset_path, ngram_size, vocab_path):
		"""Load an n-gram dataset and expose it as int32 theano shared variables.

		dataset_path -- path to a UTF-8 text file with one whitespace-separated
		                n-gram per line
		ngram_size   -- expected n-gram length (currently unused in this method;
		                kept for interface compatibility with callers)
		vocab_path   -- path to the vocabulary file used to map words to ids
		"""
		L.info("Initializing dataset from: " + dataset_path)

		vocab = VocabManager(vocab_path)

		# NOTE(review): num_sentences is initialized but never updated in this
		# method -- confirm whether it was meant to count input lines.
		self.num_sentences = 0

		# Map each line of tokens to its list of vocabulary ids.
		# `with` guarantees the file is closed even if id lookup raises.
		with codecs.open(dataset_path, 'r', encoding="UTF-8") as dataset:
			ngrams_list = [
				vocab.get_ids_given_word_list(line.split())
				for line in dataset
			]

		data = np.asarray(ngrams_list)

		# Split each n-gram into context (all words but the last) and target
		# (the last word).
		x = data[:, 0:-1]
		y = data[:, -1]
		self.num_samples = y.shape[0]

		# Wrap as theano shared variables, cast to int32 for embedding lookups.
		self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
Example #2
0
#

from dlm.io.ngramsReader import NgramsReader
from dlm.io.vocabReader import VocabManager

# Read the test n-grams using the classifier's n-gram size; the vocabulary
# manager maps ids back to word strings for output.
testset = NgramsReader(dataset_path=args.input_path, ngram_size=classifier.ngram_size, vocab_path=args.vocab_path)
vocab = VocabManager(args.vocab_path)

## Loading restricted vocab (optional): when a restricted-vocabulary file is
## given (one word per line), predictions are limited to the ids of its words.
restricted_ids = []
restricted_vocab = []
if args.restricted_vocab_path:
    with open(args.restricted_vocab_path) as restricted_vocab_file:
        for line in restricted_vocab_file:
            restricted_vocab.append(line.strip())
    restricted_ids = vocab.get_ids_given_word_list(restricted_vocab)


#########################
## Compiling theano function
#

evaluator = eval.Evaluator(testset, classifier)


# For each test sample, write the predicted word (the class chosen by the
# evaluator, optionally restricted to restricted_ids) to the output file,
# one word per line.
# NOTE(review): `xrange` implies Python 2; `_get_num_samples` is a private
# accessor of the reader -- confirm this is the intended public API.
if args.output_path:
    with open(args.output_path, "w") as output:
        for i in xrange(testset._get_num_samples()):
            out = evaluator.get_class(i, restricted_ids)
            output.write(vocab.get_word_given_id(out) + "\n")
Example #3
0
#

from dlm.io.ngramsReader import NgramsReader
from dlm.io.vocabReader import VocabManager

# Read the test n-grams using the classifier's n-gram size; the vocabulary
# manager maps ids back to word strings for output.
testset = NgramsReader(dataset_path=args.input_path,
                       ngram_size=classifier.ngram_size,
                       vocab_path=args.vocab_path)
vocab = VocabManager(args.vocab_path)

## Loading restricted vocab (optional): when a restricted-vocabulary file is
## given (one word per line), predictions are limited to the ids of its words.
restricted_ids = []
restricted_vocab = []
if args.restricted_vocab_path:
    with open(args.restricted_vocab_path) as restricted_vocab_file:
        for line in restricted_vocab_file:
            restricted_vocab.append(line.strip())
    restricted_ids = vocab.get_ids_given_word_list(restricted_vocab)

#########################
## Compiling theano function
#

evaluator = eval.Evaluator(testset, classifier)

# For each test sample, write the predicted word (the class chosen by the
# evaluator, optionally restricted to restricted_ids) to the output file,
# one word per line.
# NOTE(review): `xrange` implies Python 2; `_get_num_samples` is a private
# accessor of the reader -- confirm this is the intended public API.
if args.output_path:
    with open(args.output_path, 'w') as output:
        for i in xrange(testset._get_num_samples()):
            out = evaluator.get_class(i, restricted_ids)
            output.write(vocab.get_word_given_id(out) + '\n')