def __init__(self, dataset_path, ngram_size, vocab_path):
    L.info("Initializing dataset from: " + dataset_path)
    vocab = VocabManager(vocab_path)
    curr_index = 0
    self.num_sentences = 0
    ngrams_list = []
    # Each input line is expected to hold exactly one n-gram; map its words
    # to vocabulary ids.
    dataset = codecs.open(dataset_path, 'r', encoding="UTF-8")
    for line in dataset:
        tokens = line.split()
        ngrams = vocab.get_ids_given_word_list(tokens)
        ngrams_list.append(ngrams)
        curr_index += 1
    dataset.close()
    data = np.asarray(ngrams_list)
    x = data[:, 0:-1]   # context word ids (first ngram_size - 1 columns)
    y = data[:, -1]     # target word id (last column)
    self.num_samples = y.shape[0]
    self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
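# Usage sketch (illustrative, not from the original source). This constructor
# appears to belong to the NgramsReader class imported elsewhere via
# `from dlm.io.ngramsReader import NgramsReader`; the file paths below are
# hypothetical placeholders.
from dlm.io.ngramsReader import NgramsReader

testset = NgramsReader(dataset_path='work/test.ngrams',  # one n-gram per line
                       ngram_size=5,
                       vocab_path='work/vocab.txt')
# testset.shared_x: int32 matrix of (ngram_size - 1) context ids per sample
# testset.shared_y: int32 vector of target word ids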
def initialize(self, emb_path, vocab_path):
    L.info('Initializing lookup table')
    vm = VocabManager(vocab_path)
    w2v = W2VEmbReader(emb_path)
    U.xassert(
        w2v.get_emb_dim() == self.emb_matrix.shape[1],
        'Embedding dimension mismatch between the model and the given word embeddings'
    )
    # Overwrite each row of the embedding matrix with the pretrained vector
    # of the corresponding vocabulary word, when one is available.
    for i in range(self.emb_matrix.shape[0]):
        vec = w2v.get_emb_given_word(vm.get_word_given_id(i))
        if vec is not None:  # explicit None check; uncovered rows are left untouched
            self.emb_matrix[i] = vec
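# Usage sketch (illustrative). It assumes initialize() is a method of a
# lookup-table layer whose emb_matrix is a (vocab_size x emb_dim) numpy array;
# the class name LookupTableLayer, its constructor arguments, and the paths
# are hypothetical.
lookup = LookupTableLayer(vocab_size=10000, emb_dim=50)
lookup.initialize(emb_path='work/embeddings.txt', vocab_path='work/vocab.txt')
# Rows covered by the pretrained embeddings file are overwritten in place;
# all other rows keep their original (e.g. random) initialization.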
def augment(model_path, input_nbest_path, vocab_path, output_nbest_path):
    classifier = MLP(model_path=model_path)
    evaluator = eval.Evaluator(None, classifier)
    vocab = VocabManager(vocab_path)
    ngram_size = classifier.ngram_size

    def get_ngrams(tokens):
        # Pad the hypothesis with start (and optionally end) symbols, map the
        # words to vocabulary ids, and return all n-gram windows.
        for i in range(ngram_size - 1):
            tokens.insert(0, '<s>')
        if vocab.has_end_padding:
            tokens.append('</s>')
        indices = vocab.get_ids_given_word_list(tokens)
        return U.get_all_windows(indices, ngram_size)

    input_nbest = NBestList(input_nbest_path, mode='r')
    output_nbest = NBestList(output_nbest_path, mode='w')
    L.info('Augmenting: ' + input_nbest_path)
    start_time = time.time()
    counter = 0
    cache = dict()  # maps str(ngram) -> n-gram log probability
    for group in input_nbest:
        # First pass: collect the n-grams of this group that are not cached yet.
        ngram_list = []
        for item in group:
            tokens = item.hyp.split()
            ngrams = get_ngrams(tokens)
            for ngram in ngrams:
                if str(ngram) not in cache:
                    ngram_list.append(ngram)
                    cache[str(ngram)] = 1000  # placeholder, overwritten below
        # Score all uncached n-grams of the group in a single batch.
        if len(ngram_list) > 0:
            ngram_array = np.asarray(ngram_list, dtype='int32')
            ngram_log_prob_list = evaluator.get_ngram_log_prob(
                ngram_array[:, 0:-1], ngram_array[:, -1])
            for i in range(len(ngram_list)):
                cache[str(ngram_list[i])] = ngram_log_prob_list[i]
        # Second pass: sum the cached log probabilities of each hypothesis and
        # append the sum as a new feature.
        for item in group:
            tokens = item.hyp.split()
            ngrams = get_ngrams(tokens)
            sum_ngram_log_prob = 0
            for ngram in ngrams:
                sum_ngram_log_prob += cache[str(ngram)]
            item.append_feature(sum_ngram_log_prob)
            output_nbest.write(item)
            counter += 1
    output_nbest.close()
    L.info("Ran for %.2fs" % (time.time() - start_time))
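# Usage sketch (illustrative). The paths are hypothetical placeholders; the
# call rescores every hypothesis of the input n-best list with the neural LM
# and writes a copy that carries one extra feature, the summed n-gram log
# probability of the hypothesis.
augment(model_path='work/model',
        input_nbest_path='work/dev.nbest',
        vocab_path='work/vocab.txt',
        output_nbest_path='work/dev.augmented.nbest')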
def __init__(self, dataset_path, is_nbest, ngram_size, vocab_path):
    L.info("Initializing dataset from: " + dataset_path)
    vocab = VocabManager(vocab_path)

    def get_ngrams(tokens):
        # Pad the sentence with start (and optionally end) symbols, map the
        # words to vocabulary ids, and return all n-gram windows.
        for i in range(ngram_size - 1):
            tokens.insert(0, '<s>')
        if vocab.has_end_padding:
            tokens.append('</s>')
        indices = vocab.get_ids_given_word_list(tokens)
        return U.get_all_windows(indices, ngram_size)

    starts_list = []        # index of the first n-gram of each sentence
    curr_start_index = 0
    self.num_sentences = 0
    ngrams_list = []
    if is_nbest:
        # Input is an n-best list: every hypothesis is treated as one sentence.
        nbest = NBestList(dataset_path)
        for group in nbest:
            for item in group:
                tokens = item.hyp.split()
                starts_list.append(curr_start_index)
                ngrams = get_ngrams(tokens)
                ngrams_list += ngrams
                curr_start_index += len(ngrams)
    else:
        # Input is plain text, one sentence per line.
        dataset = codecs.open(dataset_path, 'r', encoding="UTF-8")
        for line in dataset:
            tokens = line.split()
            starts_list.append(curr_start_index)
            ngrams = get_ngrams(tokens)
            ngrams_list += ngrams
            curr_start_index += len(ngrams)
        dataset.close()
    self.num_sentences = len(starts_list)
    data = np.asarray(ngrams_list)
    starts_list.append(curr_start_index)    # sentinel: total number of n-grams
    starts_array = np.asarray(starts_list)
    x = data[:, 0:-1]   # context word ids
    y = data[:, -1]     # target word ids
    self.num_samples = y.shape[0]
    self.shared_starts = T.cast(theano.shared(starts_array, borrow=True), 'int64')
    self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
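# Usage sketch (illustrative). It assumes this constructor belongs to the
# plain-text / n-best dataset reader of dlm.io; the class name TextReader and
# the paths are hypothetical.
trainset = TextReader(dataset_path='work/train.txt',
                      is_nbest=False,
                      ngram_size=5,
                      vocab_path='work/vocab.txt')
# trainset.shared_starts marks where each sentence's n-grams begin inside
# shared_x / shared_y, with one extra sentinel entry equal to the total
# number of n-grams.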
import theano.tensor as T

#########################
## Loading model
#
classifier = MLP(model_path=args.model_path)

#########################
## Loading dataset
#
from dlm.io.ngramsReader import NgramsReader
from dlm.io.vocabReader import VocabManager

testset = NgramsReader(dataset_path=args.input_path, ngram_size=classifier.ngram_size, vocab_path=args.vocab_path)
vocab = VocabManager(args.vocab_path)

## Loading restricted vocab
restricted_ids = []
restricted_vocab = []
if args.restricted_vocab_path:
    with open(args.restricted_vocab_path) as restricted_vocab_file:
        for line in restricted_vocab_file:
            restricted_vocab.append(line.strip())
    restricted_ids = vocab.get_ids_given_word_list(restricted_vocab)

#########################
## Compiling theano function
#
evaluator = eval.Evaluator(testset, classifier)
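# Illustrative argument parsing for the `args` object used above. The flag
# names and defaults are assumptions made for this sketch, not the script's
# actual command-line interface.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model-path', dest='model_path', required=True)
parser.add_argument('--input-path', dest='input_path', required=True)
parser.add_argument('--vocab-path', dest='vocab_path', required=True)
parser.add_argument('--restricted-vocab-path', dest='restricted_vocab_path', default=None)
args = parser.parse_args()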