Example #1
	def __init__(self, dataset_path, ngram_size, vocab_path):

		L.info("Initializing dataset from: " + dataset_path)

		vocab = VocabManager(vocab_path)

		curr_index = 0
		self.num_sentences = 0

		ngrams_list = []
		dataset = codecs.open(dataset_path, 'r', encoding="UTF-8")
		for line in dataset:
			tokens = line.split()
			ngrams = vocab.get_ids_given_word_list(tokens)
			ngrams_list.append(ngrams)
			curr_index += 1
		dataset.close()

		data = np.asarray(ngrams_list)

		x = data[:,0:-1]
		y = data[:,-1]
		self.num_samples = y.shape[0]

		self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
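
The casted shared variables built above are typically consumed through Theano's index-based givens pattern, slicing one minibatch per call. Below is a minimal, self-contained sketch of that pattern; the toy data, batch_size, and variable names are assumptions for illustration, not part of the example.

import numpy as np
import theano
import theano.tensor as T

# Stand-ins for the reader's shared_x / shared_y (3-gram data, 6 samples);
# the values are made up for illustration.
data = np.asarray([[1, 2, 3], [2, 3, 4], [3, 4, 5],
                   [4, 5, 6], [5, 6, 7], [6, 7, 8]])
shared_x = T.cast(theano.shared(data[:, 0:-1], borrow=True), 'int32')
shared_y = T.cast(theano.shared(data[:, -1], borrow=True), 'int32')

index = T.lscalar('index')   # minibatch index
batch_size = 2               # hypothetical batch size

x = T.imatrix('x')           # context word ids
y = T.ivector('y')           # target word ids

# Any symbolic cost built from x and y could serve as the outputs here;
# the function simply returns the sliced minibatch to keep the sketch small.
get_batch = theano.function(
    inputs=[index],
    outputs=[x, y],
    givens={
        x: shared_x[index * batch_size:(index + 1) * batch_size],
        y: shared_y[index * batch_size:(index + 1) * batch_size],
    },
)

print(get_batch(0))   # first minibatch of contexts and targets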
Example #2
File: lookuptable.py  Project: tamhd/corelm
	def initialize(self, emb_path, vocab_path):
		L.info('Initializing lookup table')
		vm = VocabManager(vocab_path)
		w2v = W2VEmbReader(emb_path)
		U.xassert(w2v.get_emb_dim() == self.emb_matrix.shape[1], 'The embeddings dimension does not match with the given word embeddings')
		for i in range(self.emb_matrix.shape[0]):
			vec = w2v.get_emb_given_word(vm.get_word_given_id(i))
			if vec:
				self.emb_matrix[i] = vec
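
The loop above overwrites a row of emb_matrix only when the reader returns a vector, so get_emb_given_word is presumably expected to return something falsy for out-of-vocabulary words. A hypothetical stand-in for that interface is sketched below, assuming a plain-text embedding file with one word followed by its vector per line; it is not the project's actual W2VEmbReader.

class SimpleEmbReader(object):
    """Hypothetical stand-in exposing the two calls used above."""

    def __init__(self, emb_path):
        self.word_to_vec = {}
        with open(emb_path, 'r') as f:
            for line in f:
                parts = line.rstrip().split()
                if len(parts) < 2:
                    continue  # skip blank or malformed lines
                self.word_to_vec[parts[0]] = [float(v) for v in parts[1:]]
        self.dim = len(next(iter(self.word_to_vec.values())))

    def get_emb_dim(self):
        return self.dim

    def get_emb_given_word(self, word):
        # Returning None for unknown words keeps the "if vec:" guard above meaningful.
        return self.word_to_vec.get(word)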
Example #3
def augment(model_path, input_nbest_path, vocab_path, output_nbest_path):
    classifier = MLP(model_path=model_path)
    evaluator = eval.Evaluator(None, classifier)

    vocab = VocabManager(vocab_path)

    ngram_size = classifier.ngram_size

    def get_ngrams(tokens):
        for i in range(ngram_size - 1):
            tokens.insert(0, '<s>')
        if vocab.has_end_padding:
            tokens.append('</s>')
        indices = vocab.get_ids_given_word_list(tokens)
        return U.get_all_windows(indices, ngram_size)

    input_nbest = NBestList(input_nbest_path, mode='r')
    output_nbest = NBestList(output_nbest_path, mode='w')

    L.info('Augmenting: ' + input_nbest_path)

    start_time = time.time()

    counter = 0
    cache = dict()
    for group in input_nbest:
        ngram_list = []
        for item in group:
            tokens = item.hyp.split()
            ngrams = get_ngrams(tokens)
            for ngram in ngrams:
                if str(ngram) not in cache:
                    ngram_list.append(ngram)
                    cache[str(ngram)] = 1000  # placeholder, overwritten with the real log-prob below
        if len(ngram_list) > 0:
            ngram_array = np.asarray(ngram_list, dtype='int32')
            ngram_log_prob_list = evaluator.get_ngram_log_prob(
                ngram_array[:, 0:-1], ngram_array[:, -1])
            for i in range(len(ngram_list)):
                cache[str(ngram_list[i])] = ngram_log_prob_list[i]
        for item in group:
            tokens = item.hyp.split()
            ngrams = get_ngrams(tokens)
            sum_ngram_log_prob = 0
            for ngram in ngrams:
                sum_ngram_log_prob += cache[str(ngram)]
            item.append_feature(sum_ngram_log_prob)
            output_nbest.write(item)
        counter += 1  # n-best groups processed so far
    output_nbest.close()

    L.info("Ran for %.2fs" % (time.time() - start_time))
Example #4
	def __init__(self, dataset_path, is_nbest, ngram_size, vocab_path):
		
		L.info("Initializing dataset from: " + dataset_path)
		
		vocab = VocabManager(vocab_path)
		
		def get_ngrams(tokens):
			for i in range(ngram_size - 1):
				tokens.insert(0, '<s>')
			if vocab.has_end_padding:
				tokens.append('</s>')
			indices = vocab.get_ids_given_word_list(tokens)
			return U.get_all_windows(indices, ngram_size)
		
		starts_list = []
		curr_index = 0
		curr_start_index = 0
		self.num_sentences = 0
		
		ngrams_list = []
		if is_nbest == True:
			nbest = NBestList(dataset_path)
			for group in nbest:
				for item in group:
					tokens = item.hyp.split()
					starts_list.append(curr_start_index)
					ngrams = get_ngrams(tokens)
					ngrams_list += ngrams
					curr_start_index += len(ngrams)
		else:
			dataset = codecs.open(dataset_path, 'r', encoding="UTF-8")
			for line in dataset:
				tokens = line.split()
				starts_list.append(curr_start_index)
				ngrams = get_ngrams(tokens)
				ngrams_list += ngrams
				curr_start_index += len(ngrams)
			dataset.close()
		
		self.num_sentences = len(starts_list)
		
		data = np.asarray(ngrams_list)
		starts_list.append(curr_start_index)
		starts_array = np.asarray(starts_list)
		
		x = data[:,0:-1]
		y = data[:,-1]
		
		self.num_samples = y.shape[0]
		
		self.shared_starts = T.cast(theano.shared(starts_array, borrow=True), 'int64')
		self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
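
Both this reader and the augment function above rely on U.get_all_windows to turn a padded id sequence into fixed-size n-grams. The helper below is a plain-Python sketch of the behavior assumed here (every contiguous window, with the last element of each window being the predicted word); it is not the project's actual implementation.

def all_windows(indices, size):
    # e.g. all_windows([1, 2, 3, 4], 3) -> [[1, 2, 3], [2, 3, 4]]
    return [indices[i:i + size] for i in range(len(indices) - size + 1)]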
Example #5
#########################
## Loading model
#

classifier = MLP(model_path=args.model_path)

#########################
## Loading dataset
#

from dlm.io.ngramsReader import NgramsReader
from dlm.io.vocabReader import VocabManager

testset = NgramsReader(dataset_path=args.input_path, ngram_size=classifier.ngram_size, vocab_path=args.vocab_path)
vocab = VocabManager(args.vocab_path)

## Loading restricted vocab
restricted_ids = []
restricted_vocab = []
if args.restricted_vocab_path:
    with open(args.restricted_vocab_path) as restricted_vocab_file:
        for line in restricted_vocab_file:
            restricted_vocab.append(line.strip())
    restricted_ids = vocab.get_ids_given_word_list(restricted_vocab)


#########################
## Compiling theano function
#
Example #6
## Loading model
#

classifier = MLP(model_path=args.model_path)

#########################
## Loading dataset
#

from dlm.io.ngramsReader import NgramsReader
from dlm.io.vocabReader import VocabManager

testset = NgramsReader(dataset_path=args.input_path,
                       ngram_size=classifier.ngram_size,
                       vocab_path=args.vocab_path)
vocab = VocabManager(args.vocab_path)

## Loading restricted vocab
restricted_ids = []
restricted_vocab = []
if args.restricted_vocab_path:
    with open(args.restricted_vocab_path) as restricted_vocab_file:
        for line in restricted_vocab_file:
            restricted_vocab.append(line.strip())
    restricted_ids = vocab.get_ids_given_word_list(restricted_vocab)

#########################
## Compiling theano function
#

evaluator = eval.Evaluator(testset, classifier)
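
Assuming the Evaluator compiled here exposes the same get_ngram_log_prob call used in the augment example above, per-sentence scores could be queried roughly as follows; the sentence is made up for illustration.

import numpy as np

n = classifier.ngram_size
tokens = ['<s>'] * (n - 1) + ['this', 'is', 'a', 'test']
if vocab.has_end_padding:
    tokens.append('</s>')
ids = vocab.get_ids_given_word_list(tokens)
ngrams = np.asarray([ids[i:i + n] for i in range(len(ids) - n + 1)], dtype='int32')
log_probs = evaluator.get_ngram_log_prob(ngrams[:, 0:-1], ngrams[:, -1])
print(sum(log_probs))  # total log-probability of the sentence under the model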
Example #7
import theano.tensor as T

#########################
## Loading model
#

classifier = MLP(model_path=args.model_path)

#########################
## Loading dataset
#

from dlm.io.ngramsReader import NgramsReader
from dlm.io.vocabReader import VocabManager
testset = NgramsReader(dataset_path=args.input_path, ngram_size=classifier.ngram_size, vocab_path=args.vocab_path)
vocab = VocabManager(args.vocab_path)

## Loading restricted vocab
restricted_ids = []
restricted_vocab = []
if args.restricted_vocab_path:
	with open(args.restricted_vocab_path) as restricted_vocab_file:
		for line in restricted_vocab_file:
			restricted_vocab.append(line.strip())
	restricted_ids = vocab.get_ids_given_word_list(restricted_vocab)


#########################
## Compiling theano function
#